From d1c5a010f3891ec74549007776c01423a0f07a1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sun, 15 Sep 2024 10:32:42 +0200 Subject: [PATCH 01/26] Write null_count per column chunk --- src/lib/ParquetOutFile.cpp | 11 ++++ src/read-metadata.cpp | 7 ++- .../_snaps/write-parquet-statistics.md | 60 +++++++++++++++++++ .../testthat/test-write-parquet-statistics.R | 15 +++++ 4 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 tests/testthat/_snaps/write-parquet-statistics.md create mode 100644 tests/testthat/test-write-parquet-statistics.R diff --git a/src/lib/ParquetOutFile.cpp b/src/lib/ParquetOutFile.cpp index 51f9bb3..2cbf37f 100644 --- a/src/lib/ParquetOutFile.cpp +++ b/src/lib/ParquetOutFile.cpp @@ -547,6 +547,10 @@ void ParquetOutFile::write_column(uint32_t idx, int64_t from, int64_t until) { uint32_t col_start = pfile.tellp(); // we increase this as needed cmd->__set_total_uncompressed_size(0); + Statistics stat; + // we increase this as we write + stat.__set_null_count(0); + cmd->__set_statistics(stat); if (encodings[idx] == Encoding::RLE_DICTIONARY) { uint32_t dictionary_page_offset = pfile.tellp(); write_dictionary_page(idx, from, until); @@ -673,6 +677,7 @@ void ParquetOutFile::write_data_page(uint32_t idx, int64_t rg_from, int64_t rg_until, uint64_t page_from, uint64_t page_until) { ColumnMetaData *cmd = &(column_meta_data[idx]); + Statistics *stat = &(cmd->statistics); SchemaElement se = schemas[idx + 1]; PageHeader ph; DataPageHeaderV2 dph2; @@ -852,6 +857,7 @@ void ParquetOutFile::write_data_page(uint32_t idx, int64_t rg_from, cmd->__set_total_uncompressed_size( cmd->total_uncompressed_size + rle_size + 4 ); + stat->__set_null_count(stat->null_count + page_num_values - num_present); // 4. write data to file write_present_data_(pfile, idx, data_size, num_present, page_from, page_until); @@ -905,6 +911,7 @@ void ParquetOutFile::write_data_page(uint32_t idx, int64_t rg_from, cmd->__set_total_uncompressed_size( cmd->total_uncompressed_size + rle_size ); + stat->__set_null_count(stat->null_count + page_num_values - num_present); } else if (se.repetition_type == FieldRepetitionType::OPTIONAL && encodings[idx] == Encoding::RLE_DICTIONARY && @@ -961,6 +968,7 @@ void ParquetOutFile::write_data_page(uint32_t idx, int64_t rg_from, cmd->__set_total_uncompressed_size( cmd->total_uncompressed_size + rle2_size ); + stat->__set_null_count(stat->null_count + page_num_values - num_present); } else if (se.repetition_type == FieldRepetitionType::OPTIONAL && encodings[idx] == Encoding::RLE_DICTIONARY && @@ -1021,6 +1029,7 @@ void ParquetOutFile::write_data_page(uint32_t idx, int64_t rg_from, cmd->__set_total_uncompressed_size( cmd->total_uncompressed_size + rle2_size ); + stat->__set_null_count(stat->null_count + page_num_values - num_present); } else if (se.repetition_type == FieldRepetitionType::REQUIRED && encodings[idx] == Encoding::RLE && @@ -1132,6 +1141,7 @@ void ParquetOutFile::write_data_page(uint32_t idx, int64_t rg_from, cmd->__set_total_uncompressed_size( cmd->total_uncompressed_size + rle2_size ); + stat->__set_null_count(stat->null_count + page_num_values - num_present); } else if (se.repetition_type == FieldRepetitionType::OPTIONAL && encodings[idx] == Encoding::RLE && @@ -1183,6 +1193,7 @@ void ParquetOutFile::write_data_page(uint32_t idx, int64_t rg_from, cmd->__set_total_uncompressed_size( cmd->total_uncompressed_size + rle2_size ); + stat->__set_null_count(stat->null_count + page_num_values - num_present); } } diff --git a/src/read-metadata.cpp b/src/read-metadata.cpp index 3b067e8..1e478c2 100644 --- a/src/read-metadata.cpp +++ b/src/read-metadata.cpp @@ -301,7 +301,8 @@ SEXP convert_column_chunks(const char *file_name, "data_page_offset", "index_page_offset", "dictionary_page_offset", - // TODO: statistics + "null_count", + // TODO: more statistics // TODO: encoding_stats "" }; @@ -334,6 +335,7 @@ SEXP convert_column_chunks(const char *file_name, SET_VECTOR_ELT(rccs, 16, safe_allocvector_real(nccs, &uwtoken)); // data_page_offset SET_VECTOR_ELT(rccs, 17, safe_allocvector_real(nccs, &uwtoken)); // index_page_offset SET_VECTOR_ELT(rccs, 18, safe_allocvector_real(nccs, &uwtoken)); // dictionary_page_offset + SET_VECTOR_ELT(rccs, 19, safe_allocvector_real(nccs, &uwtoken)); // statistics.null_count SEXP rfile_name = PROTECT(safe_mkchar(file_name, &uwtoken)); @@ -375,6 +377,9 @@ SEXP convert_column_chunks(const char *file_name, cmd.__isset.index_page_offset ? cmd.index_page_offset : NA_REAL; REAL(VECTOR_ELT(rccs, 18))[idx] = cmd.__isset.dictionary_page_offset ? cmd.dictionary_page_offset : NA_REAL; + REAL(VECTOR_ELT(rccs, 19))[idx] = + cmd.__isset.statistics && cmd.statistics.__isset.null_count ? + cmd.statistics.null_count : NA_REAL; idx++; } diff --git a/tests/testthat/_snaps/write-parquet-statistics.md b/tests/testthat/_snaps/write-parquet-statistics.md new file mode 100644 index 0000000..6a26e09 --- /dev/null +++ b/tests/testthat/_snaps/write-parquet-statistics.md @@ -0,0 +1,60 @@ +# null_count is written + + Code + as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]][, c("row_group", + "column", "null_count")]) + Output + row_group column null_count + 1 0 0 1 + 2 0 1 1 + 3 0 2 1 + 4 0 3 1 + 5 0 4 1 + 6 0 5 1 + 7 0 6 1 + 8 0 7 1 + 9 0 8 1 + 10 0 9 1 + 11 0 10 0 + 12 0 11 0 + 13 0 12 0 + 14 1 0 0 + 15 1 1 0 + 16 1 2 0 + 17 1 3 0 + 18 1 4 0 + 19 1 5 0 + 20 1 6 0 + 21 1 7 0 + 22 1 8 0 + 23 1 9 0 + 24 1 10 1 + 25 1 11 1 + 26 1 12 1 + 27 2 0 0 + 28 2 1 0 + 29 2 2 0 + 30 2 3 0 + 31 2 4 0 + 32 2 5 0 + 33 2 6 0 + 34 2 7 0 + 35 2 8 0 + 36 2 9 0 + 37 2 10 0 + 38 2 11 0 + 39 2 12 0 + 40 3 0 0 + 41 3 1 0 + 42 3 2 0 + 43 3 3 0 + 44 3 4 0 + 45 3 5 0 + 46 3 6 0 + 47 3 7 0 + 48 3 8 0 + 49 3 9 0 + 50 3 10 0 + 51 3 11 0 + 52 3 12 0 + diff --git a/tests/testthat/test-write-parquet-statistics.R b/tests/testthat/test-write-parquet-statistics.R new file mode 100644 index 0000000..c4bd7bc --- /dev/null +++ b/tests/testthat/test-write-parquet-statistics.R @@ -0,0 +1,15 @@ +test_that("null_count is written", { + tmp <- tempfile(fileext = ".parquet") + on.exit(unlink(tmp), add = TRUE) + df <- test_df(missing = TRUE) + write_parquet( + df, tmp, + options = parquet_options(num_rows_per_row_group = 10) + ) + expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) + expect_snapshot( + as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]][ + , c("row_group", "column", "null_count") + ]) + ) +}) From 4a1fc576ecba0c10e699505447ae22f17c3df585 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sun, 15 Sep 2024 19:32:42 +0200 Subject: [PATCH 02/26] Report min/max stats in `read_parquet_metadata()` --- R/parquet-metadata.R | 27 +++++++++++++++++++++++++++ man/read_parquet_metadata.Rd | 26 ++++++++++++++++++++++++++ src/protect.cpp | 5 +++++ src/protect.h | 5 +++++ src/read-metadata.cpp | 24 ++++++++++++++++++++++++ 5 files changed, 87 insertions(+) diff --git a/R/parquet-metadata.R b/R/parquet-metadata.R index 50c5612..b0ae9ae 100644 --- a/R/parquet-metadata.R +++ b/R/parquet-metadata.R @@ -126,6 +126,21 @@ format_schema_result <- function(mtd, sch, options) { #' integer for the root node, and `NA` for a leaf node. # ------------------------------------------------------------------------- #' * `$row_groups`: a data frame, information about the row groups. +#' Some important columns: +#' - `file_name`: file name. +#' - `id`: row group id, integer from zero to number of row groups +#' minus one. +#' - `total_byte_size`: total uncompressed size of all column data. +#' - `num_rows`: number of rows. +#' - `file_offset`: where the row group starts in the file. This is +#' optional, so it might be `NA`. +#' - `total_compressed_size`: total byte size of all compressed +#' (and potentially encrypted) column data in this row group. +#' This is optional, so it might be `NA`. +#' - `ordinal`: ordinal position of the row group in the file, starting +#' from zero. This is optional, so it might be `NA`. If `NA`, then +#' the order of the row groups is as they appear in the metadata. +# ------------------------------------------------------------------------- #' * `$column_chunks`: a data frame, information about all column chunks, #' across all row groups. Some important columns: #' - `file_name`: file name. @@ -155,6 +170,16 @@ format_schema_result <- function(mtd, sch, options) { #' - `dictionary_page_offset`: absolute position of the first #' dictionary page of the column chunk in the file, or `NA` if there #' are no dictionary pages. +#' - `min_value`: list column of raw vectors, the minimum value of the +#' column, in binary. If `NULL`, then then it is not specified. +#' This column is experimental. +#' - `max_value`: list column of raw vectors, the maximum value of the +#' column, in binary. If `NULL`, then then it is not specified. +#' This column is experimental. +#' - `is_min_value_exact`: whether the minimum value is an actual +#' value of a column, or a bound. It may be `NA`. +#' - `is_max_value_exact`: whether the maximum value is an actual +#' value of a column, or a bound. It may be `NA`. #' #' @export #' @seealso [read_parquet_info()] for a much shorter summary. @@ -191,6 +216,8 @@ read_parquet_metadata <- function(file, options = parquet_options()) { res$column_chunks$codec <- names(codecs)[res$column_chunks$codec + 1L] res$column_chunks$encodings <- I(res$column_chunks$encodings) res$column_chunks$path_in_schema <- I(res$column_chunks$path_in_schema) + res$column_chunks$min_value <- I(res$column_chunks$min_value) + res$column_chunks$max_value <- I(res$column_chunks$max_value) res$column_chunks <- as.data.frame(res$column_chunks) class(res$column_chunks) <- c("tbl", class(res$column_chunks)) diff --git a/man/read_parquet_metadata.Rd b/man/read_parquet_metadata.Rd index 0034132..73b8582 100644 --- a/man/read_parquet_metadata.Rd +++ b/man/read_parquet_metadata.Rd @@ -53,6 +53,22 @@ additional entries, e.g. \code{bit_width}, \code{is_signed}, etc. integer for the root node, and \code{NA} for a leaf node. } \item \verb{$row_groups}: a data frame, information about the row groups. +Some important columns: +\itemize{ +\item \code{file_name}: file name. +\item \code{id}: row group id, integer from zero to number of row groups +minus one. +\item \code{total_byte_size}: total uncompressed size of all column data. +\item \code{num_rows}: number of rows. +\item \code{file_offset}: where the row group starts in the file. This is +optional, so it might be \code{NA}. +\item \code{total_compressed_size}: total byte size of all compressed +(and potentially encrypted) column data in this row group. +This is optional, so it might be \code{NA}. +\item \code{ordinal}: ordinal position of the row group in the file, starting +from zero. This is optional, so it might be \code{NA}. If \code{NA}, then +the order of the row groups is as they appear in the metadata. +} \item \verb{$column_chunks}: a data frame, information about all column chunks, across all row groups. Some important columns: \itemize{ @@ -83,6 +99,16 @@ the column chunk in the file, or \code{NA} if there are no index pages. \item \code{dictionary_page_offset}: absolute position of the first dictionary page of the column chunk in the file, or \code{NA} if there are no dictionary pages. +\item \code{min_value}: list column of raw vectors, the minimum value of the +column, in binary. If \code{NULL}, then then it is not specified. +This column is experimental. +\item \code{max_value}: list column of raw vectors, the maximum value of the +column, in binary. If \code{NULL}, then then it is not specified. +This column is experimental. +\item \code{is_min_value_exact}: whether the minimum value is an actual +value of a column, or a bound. It may be \code{NA}. +\item \code{is_max_value_exact}: whether the maximum value is an actual +value of a column, or a bound. It may be \code{NA}. } } } diff --git a/src/protect.cpp b/src/protect.cpp index 9d67d21..6a8ce6c 100644 --- a/src/protect.cpp +++ b/src/protect.cpp @@ -17,6 +17,11 @@ SEXP wrapped_intsxp(void *len) { return Rf_allocVector(INTSXP, *xlen); } +SEXP wrapped_lglsxp(void *len) { + R_xlen_t *xlen = (R_xlen_t*) len; + return Rf_allocVector(LGLSXP, *xlen); +} + SEXP wrapped_realsxp(void *len) { R_xlen_t *xlen = (R_xlen_t*) len; return Rf_allocVector(REALSXP, *xlen); diff --git a/src/protect.h b/src/protect.h index 9804c1a..bd8e7b6 100644 --- a/src/protect.h +++ b/src/protect.h @@ -46,6 +46,7 @@ void throw_error(void *err, Rboolean jump); SEXP wrapped_rawsxp(void *len); SEXP wrapped_intsxp(void *len); +SEXP wrapped_lglsxp(void *len); SEXP wrapped_realsxp(void *len); SEXP wrapped_strsxp(void *len); SEXP wrapped_vecsxp(void *len); @@ -66,6 +67,10 @@ inline SEXP safe_allocvector_int(R_xlen_t len, SEXP *uwt) { return R_UnwindProtect(wrapped_intsxp, &len, throw_error, uwt, *uwt); } +inline SEXP safe_allocvector_lgl(R_xlen_t len, SEXP *uwt) { + return R_UnwindProtect(wrapped_lglsxp, &len, throw_error, uwt, *uwt); +} + inline SEXP safe_allocvector_real(R_xlen_t len, SEXP *uwt) { return R_UnwindProtect(wrapped_realsxp, &len, throw_error, uwt, *uwt); } diff --git a/src/read-metadata.cpp b/src/read-metadata.cpp index 1e478c2..412729f 100644 --- a/src/read-metadata.cpp +++ b/src/read-metadata.cpp @@ -302,6 +302,10 @@ SEXP convert_column_chunks(const char *file_name, "index_page_offset", "dictionary_page_offset", "null_count", + "min_value", + "max_value", + "is_min_value_exact", + "is_max_value_exact", // TODO: more statistics // TODO: encoding_stats "" @@ -336,6 +340,10 @@ SEXP convert_column_chunks(const char *file_name, SET_VECTOR_ELT(rccs, 17, safe_allocvector_real(nccs, &uwtoken)); // index_page_offset SET_VECTOR_ELT(rccs, 18, safe_allocvector_real(nccs, &uwtoken)); // dictionary_page_offset SET_VECTOR_ELT(rccs, 19, safe_allocvector_real(nccs, &uwtoken)); // statistics.null_count + SET_VECTOR_ELT(rccs, 20, safe_allocvector_vec(nccs, &uwtoken)); // statistics.min_value + SET_VECTOR_ELT(rccs, 21, safe_allocvector_vec(nccs, &uwtoken)); // statistics.max_value + SET_VECTOR_ELT(rccs, 22, safe_allocvector_lgl(nccs, &uwtoken)); // statistics.is_min_value_exact + SET_VECTOR_ELT(rccs, 23, safe_allocvector_lgl(nccs, &uwtoken)); // statistics.is_max_value_exact SEXP rfile_name = PROTECT(safe_mkchar(file_name, &uwtoken)); @@ -380,6 +388,22 @@ SEXP convert_column_chunks(const char *file_name, REAL(VECTOR_ELT(rccs, 19))[idx] = cmd.__isset.statistics && cmd.statistics.__isset.null_count ? cmd.statistics.null_count : NA_REAL; + if (cmd.__isset.statistics && cmd.statistics.__isset.min_value) { + size_t vl = cmd.statistics.min_value.size(); + SET_VECTOR_ELT(VECTOR_ELT(rccs, 20), idx, safe_allocvector_raw(vl, &uwtoken)); + memcpy(RAW(VECTOR_ELT(VECTOR_ELT(rccs, 20), idx)), cmd.statistics.min_value.data(), vl); + } + if (cmd.__isset.statistics && cmd.statistics.__isset.max_value) { + size_t vl = cmd.statistics.max_value.size(); + SET_VECTOR_ELT(VECTOR_ELT(rccs, 21), idx, safe_allocvector_raw(vl, &uwtoken)); + memcpy(RAW(VECTOR_ELT(VECTOR_ELT(rccs, 21), idx)), cmd.statistics.max_value.data(), vl); + } + LOGICAL(VECTOR_ELT(rccs, 22))[idx] = + cmd.__isset.statistics && cmd.statistics.__isset.is_min_value_exact ? + cmd.statistics.is_min_value_exact : NA_LOGICAL; + LOGICAL(VECTOR_ELT(rccs, 23))[idx] = + cmd.__isset.statistics && cmd.statistics.__isset.is_max_value_exact ? + cmd.statistics.is_max_value_exact : NA_LOGICAL; idx++; } From d6ab47bcdd4fae3c1139bb39f0a7cc3d2958ca73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sun, 15 Sep 2024 19:35:54 +0200 Subject: [PATCH 03/26] Improve docs [ci skip] --- R/parquet-metadata.R | 2 ++ man/read_parquet_metadata.Rd | 2 ++ 2 files changed, 4 insertions(+) diff --git a/R/parquet-metadata.R b/R/parquet-metadata.R index b0ae9ae..0f07466 100644 --- a/R/parquet-metadata.R +++ b/R/parquet-metadata.R @@ -170,6 +170,8 @@ format_schema_result <- function(mtd, sch, options) { #' - `dictionary_page_offset`: absolute position of the first #' dictionary page of the column chunk in the file, or `NA` if there #' are no dictionary pages. +#' - `null_count`: the number of missing values in the column chunk. +#' It may be `NA`. #' - `min_value`: list column of raw vectors, the minimum value of the #' column, in binary. If `NULL`, then then it is not specified. #' This column is experimental. diff --git a/man/read_parquet_metadata.Rd b/man/read_parquet_metadata.Rd index 73b8582..2a7320c 100644 --- a/man/read_parquet_metadata.Rd +++ b/man/read_parquet_metadata.Rd @@ -99,6 +99,8 @@ the column chunk in the file, or \code{NA} if there are no index pages. \item \code{dictionary_page_offset}: absolute position of the first dictionary page of the column chunk in the file, or \code{NA} if there are no dictionary pages. +\item \code{null_count}: the number of missing values in the column chunk. +It may be \code{NA}. \item \code{min_value}: list column of raw vectors, the minimum value of the column, in binary. If \code{NULL}, then then it is not specified. This column is experimental. From a669d9032f9fb769f875be2a09cd638e94e3c3ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Mon, 16 Sep 2024 08:08:19 +0200 Subject: [PATCH 04/26] Update test snapshot --- tests/testthat/_snaps/parquet-metadata.md | 42 +++++++++++++++-------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/tests/testthat/_snaps/parquet-metadata.md b/tests/testthat/_snaps/parquet-metadata.md index e706280..152dc58 100644 --- a/tests/testthat/_snaps/parquet-metadata.md +++ b/tests/testthat/_snaps/parquet-metadata.md @@ -106,20 +106,34 @@ 11 275 2879 NA 12 275 3154 NA 13 21 3429 NA - dictionary_page_offset - 1 NA - 2 NA - 3 NA - 4 NA - 5 NA - 6 NA - 7 NA - 8 NA - 9 NA - 10 NA - 11 NA - 12 NA - 13 NA + dictionary_page_offset null_count min_value max_value is_min_value_exact + 1 NA 0 NA + 2 NA 0 NA + 3 NA 0 NA + 4 NA 0 NA + 5 NA 0 NA + 6 NA 0 NA + 7 NA 0 NA + 8 NA 0 NA + 9 NA 0 NA + 10 NA 0 NA + 11 NA 0 NA + 12 NA 0 NA + 13 NA 0 NA + is_max_value_exact + 1 NA + 2 NA + 3 NA + 4 NA + 5 NA + 6 NA + 7 NA + 8 NA + 9 NA + 10 NA + 11 NA + 12 NA + 13 NA --- From 1b5d7ec6a34a9bcb6af99713b9039d44892d64cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Mon, 16 Sep 2024 08:08:35 +0200 Subject: [PATCH 05/26] Writer: pass row group & page to callbacks So they can calculate min and max values easily. --- src/lib/ParquetOutFile.cpp | 70 ++++++++++++++++++++++---------------- src/lib/ParquetOutFile.h | 49 ++++++++++++++++---------- src/write.cpp | 52 ++++++++++++++++++---------- 3 files changed, 105 insertions(+), 66 deletions(-) diff --git a/src/lib/ParquetOutFile.cpp b/src/lib/ParquetOutFile.cpp index 2cbf37f..bfcda2e 100644 --- a/src/lib/ParquetOutFile.cpp +++ b/src/lib/ParquetOutFile.cpp @@ -246,6 +246,8 @@ void ParquetOutFile::write_data_( std::ostream &file, uint32_t idx, uint32_t size, + uint32_t group, + uint32_t page, uint64_t from, uint64_t until) { @@ -254,28 +256,28 @@ void ParquetOutFile::write_data_( parquet::Type::type type = se.type; switch (type) { case Type::INT32: - write_int32(file, idx, from, until, se); + write_int32(file, idx, group, page, from, until, se); break; case Type::INT64: - write_int64(file, idx, from, until, se); + write_int64(file, idx, group, page, from, until, se); break; case Type::INT96: - write_int96(file, idx, from, until, se); + write_int96(file, idx, group, page, from, until, se); break; case Type::FLOAT: - write_float(file, idx, from, until, se); + write_float(file, idx, group, page, from, until, se); break; case Type::DOUBLE: - write_double(file, idx, from, until, se); + write_double(file, idx, group, page, from, until, se); break; case Type::BYTE_ARRAY: - write_byte_array(file, idx, from, until, se); + write_byte_array(file, idx, group, page, from, until, se); break; case Type::FIXED_LEN_BYTE_ARRAY: - write_fixed_len_byte_array(file, idx, from, until, se); + write_fixed_len_byte_array(file, idx, group, page, from, until, se); break; case Type::BOOLEAN: - write_boolean(file, idx, from, until); + write_boolean(file, idx, group, page, from, until); break; default: throw runtime_error("Cannot write unknown column type"); // # nocov @@ -300,6 +302,8 @@ void ParquetOutFile::write_present_data_( uint32_t idx, uint32_t size, uint32_t num_present, + uint32_t group, + uint32_t page, uint64_t from, uint64_t until) { @@ -308,25 +312,25 @@ void ParquetOutFile::write_present_data_( parquet::Type::type type = se.type; switch (type) { case Type::INT32: - write_int32(file, idx, from, until, se); + write_int32(file, idx, group, page, from, until, se); break; case Type::INT64: - write_int64(file, idx, from, until, se); + write_int64(file, idx, group, page, from, until, se); break; case Type::INT96: - write_int96(file, idx, from, until, se); + write_int96(file, idx, group, page, from, until, se); break; case Type::FLOAT: - write_float(file, idx, from, until, se); + write_float(file, idx, group, page, from, until, se); break; case Type::DOUBLE: - write_double(file, idx, from, until, se); + write_double(file, idx, group, page, from, until, se); break; case Type::BYTE_ARRAY: - write_byte_array(file, idx, from, until, se); + write_byte_array(file, idx, group, page, from, until, se); break; case Type::FIXED_LEN_BYTE_ARRAY: - write_fixed_len_byte_array(file, idx, from, until, se); + write_fixed_len_byte_array(file, idx, group, page, from, until, se); break; case Type::BOOLEAN: write_present_boolean(file, idx, num_present, from, until); @@ -510,11 +514,12 @@ void ParquetOutFile::write() { // write int64_t from = row_group_starts[idx]; int64_t until = idx < row_group_starts.size() - 1 ? row_group_starts[idx + 1] : num_rows; - int64_t total_size = write_columns(from, until); + int64_t total_size = write_columns(idx, from, until); // row group metadata vector ccs; for (uint32_t idx = 0; idx < num_cols; idx++) { + Statistics stat; ColumnChunk cc; cc.__set_file_offset(column_meta_data[idx].data_page_offset); cc.__set_meta_data(column_meta_data[idx]); @@ -531,17 +536,19 @@ void ParquetOutFile::write() { pfile_.close(); } -int64_t ParquetOutFile::write_columns(int64_t from, int64_t until) { +int64_t ParquetOutFile::write_columns(uint32_t group, int64_t from, + int64_t until) { uint32_t start = pfile.tellp(); for (uint32_t idx = 0; idx < num_cols; idx++) { - write_column(idx, from, until); + write_column(idx, group, from, until); } uint32_t end = pfile.tellp(); // return total size return end - start; } -void ParquetOutFile::write_column(uint32_t idx, int64_t from, int64_t until) { +void ParquetOutFile::write_column(uint32_t idx, uint32_t group, + int64_t from, int64_t until) { ColumnMetaData *cmd = &(column_meta_data[idx]); SchemaElement se = schemas[idx + 1]; uint32_t col_start = pfile.tellp(); @@ -557,7 +564,7 @@ void ParquetOutFile::write_column(uint32_t idx, int64_t from, int64_t until) { cmd->__set_dictionary_page_offset(dictionary_page_offset); } uint32_t data_offset = pfile.tellp(); - write_data_pages(idx, from, until); + write_data_pages(idx, group, from, until); int32_t column_bytes = ((int32_t) pfile.tellp()) - col_start; cmd->__set_num_values(until - from); cmd->__set_total_compressed_size(column_bytes); @@ -623,8 +630,8 @@ void ParquetOutFile::write_dictionary_page(uint32_t idx, int64_t from, } } -void ParquetOutFile::write_data_pages(uint32_t idx, int64_t from, - int64_t until) { +void ParquetOutFile::write_data_pages(uint32_t idx, uint32_t group, + int64_t from, int64_t until) { SchemaElement se = schemas[idx + 1]; int64_t rg_num_rows = until - from; @@ -669,11 +676,12 @@ void ParquetOutFile::write_data_pages(uint32_t idx, int64_t from, if (page_until > until) { page_until = until; } - write_data_page(idx, from, until, page_from, page_until); + write_data_page(idx, group, i, from, until, page_from, page_until); } } -void ParquetOutFile::write_data_page(uint32_t idx, int64_t rg_from, +void ParquetOutFile::write_data_page(uint32_t idx, uint32_t group, + uint32_t page, int64_t rg_from, int64_t rg_until, uint64_t page_from, uint64_t page_until) { ColumnMetaData *cmd = &(column_meta_data[idx]); @@ -721,7 +729,7 @@ void ParquetOutFile::write_data_page(uint32_t idx, int64_t rg_from, ph.__set_data_page_header_v2(dph2); } write_page_header(idx, ph); - write_data_(pfile, idx, data_size, page_from, page_until); + write_data_(pfile, idx, data_size, group, page, page_from, page_until); } else if (se.repetition_type == FieldRepetitionType::REQUIRED && encodings[idx] == Encoding::PLAIN && @@ -735,7 +743,7 @@ void ParquetOutFile::write_data_page(uint32_t idx, int64_t rg_from, buf_unc.reset(data_size); std::unique_ptr os0 = std::unique_ptr(new std::ostream(&buf_unc)); - write_data_(*os0, idx, data_size, page_from, page_until); + write_data_(*os0, idx, data_size, group, page, page_from, page_until); // 2. compress buf_unc to buf_com size_t cdata_size = compress(cmd->codec, buf_unc, data_size, buf_com); @@ -860,7 +868,8 @@ void ParquetOutFile::write_data_page(uint32_t idx, int64_t rg_from, stat->__set_null_count(stat->null_count + page_num_values - num_present); // 4. write data to file - write_present_data_(pfile, idx, data_size, num_present, page_from, page_until); + write_present_data_(pfile, idx, data_size, num_present, group, page, + page_from, page_until); } else if (se.repetition_type == FieldRepetitionType::OPTIONAL && encodings[idx] == Encoding::PLAIN && @@ -891,7 +900,8 @@ void ParquetOutFile::write_data_page(uint32_t idx, int64_t rg_from, std::unique_ptr os1 = std::unique_ptr(new std::ostream(&buf_com)); buf_com.skip(rle_size); - write_present_data_(*os1, idx, data_size, num_present, page_from, page_until); + write_present_data_(*os1, idx, data_size, num_present, group, page, + page_from, page_until); // 4. compress buf_com to buf_unc // for data page v2, the def levels are not compressed! @@ -1040,7 +1050,7 @@ void ParquetOutFile::write_data_page(uint32_t idx, int64_t rg_from, buf_unc.reset(data_size); std::unique_ptr os0 = std::unique_ptr(new std::ostream(&buf_unc)); - write_boolean_as_int(*os0, idx, page_from, page_until); + write_boolean_as_int(*os0, idx, group, page, page_from, page_until); // 2. RLE encode buf_unc to buf_com uint32_t rle_size = rle_encode( @@ -1070,7 +1080,7 @@ void ParquetOutFile::write_data_page(uint32_t idx, int64_t rg_from, buf_unc.reset(data_size); std::unique_ptr os0 = std::unique_ptr(new std::ostream(&buf_unc)); - write_boolean_as_int(*os0, idx, page_from, page_until); + write_boolean_as_int(*os0, idx, group, page, page_from, page_until); // 2. RLE encode buf_unc to buf_com uint32_t rle_size = rle_encode( diff --git a/src/lib/ParquetOutFile.h b/src/lib/ParquetOutFile.h index 4adce01..b0efa1a 100644 --- a/src/lib/ParquetOutFile.h +++ b/src/lib/ParquetOutFile.h @@ -35,25 +35,33 @@ class ParquetOutFile { // write out various parquet types, these must be implemented in // the subclass - virtual void write_int32(std::ostream &file, uint32_t idx, uint64_t from, - uint64_t until, parquet::SchemaElement &sel) = 0; - virtual void write_int64(std::ostream &file, uint32_t idx, uint64_t from, - uint64_t until, parquet::SchemaElement &sel) = 0; - virtual void write_int96(std::ostream &file, uint32_t idx, uint64_t from, - uint64_t until, parquet::SchemaElement &sel) = 0; - virtual void write_float(std::ostream &file, uint32_t idx, uint64_t from, - uint64_t until, parquet::SchemaElement &sel) = 0; - virtual void write_double(std::ostream &file, uint32_t idx, uint64_t from, - uint64_t until, parquet::SchemaElement &sel) = 0; + virtual void write_int32(std::ostream &file, uint32_t idx, uint32_t group, + uint32_t page, uint64_t from, uint64_t until, + parquet::SchemaElement &sel) = 0; + virtual void write_int64(std::ostream &file, uint32_t idx, uint32_t group, + uint32_t page, uint64_t from, uint64_t until, + parquet::SchemaElement &sel) = 0; + virtual void write_int96(std::ostream &file, uint32_t idx, uint32_t group, + uint32_t page, uint64_t from, uint64_t until, + parquet::SchemaElement &sel) = 0; + virtual void write_float(std::ostream &file, uint32_t idx, uint32_t group, + uint32_t page, uint64_t from, uint64_t until, + parquet::SchemaElement &sel) = 0; + virtual void write_double(std::ostream &file, uint32_t idx, uint32_t group, + uint32_t page, uint64_t from, uint64_t until, + parquet::SchemaElement &sel) = 0; virtual void write_byte_array(std::ostream &file, uint32_t idx, - uint64_t from, uint64_t until, + uint32_t group, uint32_t page, uint64_t from, + uint64_t until, parquet::SchemaElement &sel) = 0; virtual void write_fixed_len_byte_array(std::ostream &file, uint32_t idx, + uint32_t group, uint32_t page, uint64_t from, uint64_t until, parquet::SchemaElement &sel) = 0; - virtual void write_boolean(std::ostream &file, uint32_t idx, - uint64_t from, uint64_t until) = 0; + virtual void write_boolean(std::ostream &file, uint32_t idx, uint32_t group, + uint32_t page, uint64_t from, uint64_t until) = 0; virtual void write_boolean_as_int(std::ostream &file, uint32_t idx, + uint32_t group, uint32_t page, uint64_t from, uint64_t until) = 0; // callbacks for missing values @@ -113,19 +121,24 @@ class ParquetOutFile { void init_column_meta_data(); // return total size - int64_t write_columns(int64_t from, int64_t until); - void write_column(uint32_t idx, int64_t from, int64_t until); + int64_t write_columns(uint32_t group, int64_t from, int64_t until); + void write_column(uint32_t idx, uint32_t group, int64_t from, + int64_t until); void write_dictionary_page(uint32_t idx, int64_t from, int64_t until); - void write_data_pages(uint32_t idx, int64_t from, int64_t until); - void write_data_page(uint32_t idx, int64_t rg_from, int64_t rg_until, + void write_data_pages(uint32_t idx, uint32_t group, int64_t from, + int64_t until); + void write_data_page(uint32_t idx, uint32_t group, uint32_t page, + int64_t rg_from, int64_t rg_until, uint64_t from, uint64_t until); void write_page_header(uint32_t idx, parquet::PageHeader &ph); void write_footer(); void write_data_(std::ostream &file, uint32_t idx, uint32_t size, - uint64_t from, uint64_t until); + uint32_t group, uint32_t page, uint64_t from, + uint64_t until); void write_present_data_(std::ostream &file, uint32_t idx, uint32_t size, uint32_t num_present, + uint32_t group, uint32_t page, uint64_t from, uint64_t until); void write_dictionary_(std::ostream &file, uint32_t idx, uint32_t size, parquet::SchemaElement &sel, int64_t from, diff --git a/src/write.cpp b/src/write.cpp index 71eb8c9..23a573f 100644 --- a/src/write.cpp +++ b/src/write.cpp @@ -105,27 +105,33 @@ class RParquetOutFile : public ParquetOutFile { int compsession_level, std::vector &row_groups ); - void write_int32(std::ostream &file, uint32_t idx, uint64_t from, - uint64_t until, parquet::SchemaElement &sel); - void write_int64(std::ostream &file, uint32_t idx, uint64_t from, - uint64_t until, parquet::SchemaElement &sel); - void write_int96(std::ostream &file, uint32_t idx, uint64_t from, - uint64_t until, parquet::SchemaElement &sel); - void write_float(std::ostream &file, uint32_t idx, uint64_t from, - uint64_t until, parquet::SchemaElement &sel); - void write_double(std::ostream &file, uint32_t idx, uint64_t from, - uint64_t until, parquet::SchemaElement &sel); - void write_byte_array(std::ostream &file, uint32_t id, uint64_t from, - uint64_t until, parquet::SchemaElement &sel); + void write_int32(std::ostream &file, uint32_t idx, uint32_t group, + uint32_t page, uint64_t from, uint64_t until, + parquet::SchemaElement &sel); + void write_int64(std::ostream &file, uint32_t idx, uint32_t group, + uint32_t page, uint64_t from, uint64_t until, + parquet::SchemaElement &sel); + void write_int96(std::ostream &file, uint32_t idx, uint32_t group, + uint32_t page, uint64_t from, uint64_t until, + parquet::SchemaElement &sel); + void write_float(std::ostream &file, uint32_t idx, uint32_t group, + uint32_t page, uint64_t from, uint64_t until, + parquet::SchemaElement &sel); + void write_double(std::ostream &file, uint32_t idx, uint32_t group, + uint32_t page, uint64_t from, uint64_t until, + parquet::SchemaElement &sel); + void write_byte_array(std::ostream &file, uint32_t idx, uint32_t group, + uint32_t page, uint64_t from, uint64_t until, + parquet::SchemaElement &sel); void write_fixed_len_byte_array(std::ostream &file, uint32_t id, - uint64_t from, uint64_t until, - parquet::SchemaElement &sel); + uint32_t group, uint32_t page, uint64_t from, + uint64_t until, parquet::SchemaElement &sel); uint32_t get_size_byte_array(uint32_t idx, uint32_t num_present, uint64_t from, uint64_t until); - void write_boolean(std::ostream &file, uint32_t idx, uint64_t from, - uint64_t until); - void write_boolean_as_int(std::ostream &file, uint32_t idx, - uint64_t from, uint64_t until); + void write_boolean(std::ostream &file, uint32_t idx, uint32_t group, + uint32_t page, uint64_t from, uint64_t until); + void write_boolean_as_int(std::ostream &file, uint32_t idx, uint32_t group, + uint32_t page, uint64_t from, uint64_t until); uint32_t write_present(std::ostream &file, uint32_t idx, uint64_t from, uint64_t until); @@ -755,6 +761,7 @@ void write_double_int32(std::ostream &file, SEXP col, uint32_t idx, } void RParquetOutFile::write_int32(std::ostream &file, uint32_t idx, + uint32_t group, uint32_t page, uint64_t from, uint64_t until, parquet::SchemaElement &sel) { SEXP col = VECTOR_ELT(df, idx); @@ -978,6 +985,7 @@ void write_double_int64(std::ostream &file, SEXP col, uint32_t idx, } void RParquetOutFile::write_int64(std::ostream &file, uint32_t idx, + uint32_t group, uint32_t page, uint64_t from, uint64_t until, parquet::SchemaElement &sel) { // This is double in R, so we need to convert @@ -1019,6 +1027,7 @@ void RParquetOutFile::write_int64(std::ostream &file, uint32_t idx, } void RParquetOutFile::write_int96(std::ostream &file, uint32_t idx, + uint32_t group, uint32_t page, uint64_t from, uint64_t until, parquet::SchemaElement &sel) { // This is double in R, so we need to convert @@ -1058,6 +1067,7 @@ void RParquetOutFile::write_int96(std::ostream &file, uint32_t idx, } void RParquetOutFile::write_float(std::ostream &file, uint32_t idx, + uint32_t group, uint32_t page, uint64_t from, uint64_t until, parquet::SchemaElement &sel) { SEXP col = VECTOR_ELT(df, idx); @@ -1083,6 +1093,7 @@ void RParquetOutFile::write_float(std::ostream &file, uint32_t idx, } void RParquetOutFile::write_double(std::ostream &file, uint32_t idx, + uint32_t group, uint32_t page, uint64_t from, uint64_t until, parquet::SchemaElement &sel) { SEXP col = VECTOR_ELT(df, idx); @@ -1112,6 +1123,7 @@ void RParquetOutFile::write_double(std::ostream &file, uint32_t idx, } void RParquetOutFile::write_byte_array(std::ostream &file, uint32_t idx, + uint32_t group, uint32_t page, uint64_t from, uint64_t until, parquet::SchemaElement &sel) { SEXP col = VECTOR_ELT(df, idx); @@ -1259,6 +1271,7 @@ static bool parse_uuid(const char *c, char *u, char *t) { void RParquetOutFile::write_fixed_len_byte_array( std::ostream &file, uint32_t idx, + uint32_t group, uint32_t page, uint64_t from, uint64_t until, parquet::SchemaElement &sel) { @@ -1399,6 +1412,7 @@ void write_boolean_impl(std::ostream &file, SEXP col, } void RParquetOutFile::write_boolean(std::ostream &file, uint32_t idx, + uint32_t group, uint32_t page, uint64_t from, uint64_t until) { SEXP col = VECTOR_ELT(df, idx); if (TYPEOF(col) != LGLSXP) { @@ -1413,6 +1427,8 @@ void RParquetOutFile::write_boolean(std::ostream &file, uint32_t idx, void RParquetOutFile::write_boolean_as_int(std::ostream &file, uint32_t idx, + uint32_t group, + uint32_t page, uint64_t from, uint64_t until) { SEXP col = VECTOR_ELT(df, idx); From 55f126f32dbfe8faecfdeecb162cdf3cbf5a5693 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Mon, 16 Sep 2024 11:59:33 +0200 Subject: [PATCH 06/26] Min/max for PLAIN INT32 and DATE --- R/options.R | 12 +- man/parquet_options.Rd | 9 +- src/lib/ParquetOutFile.cpp | 11 +- src/lib/ParquetOutFile.h | 10 ++ src/write.cpp | 141 +++++++++++++++++- .../_snaps/write-parquet-statistics.md | 14 ++ .../testthat/test-write-parquet-statistics.R | 31 ++++ 7 files changed, 217 insertions(+), 11 deletions(-) diff --git a/R/options.R b/R/options.R index 275d17c..7ae1f5f 100644 --- a/R/options.R +++ b/R/options.R @@ -34,6 +34,11 @@ #' metadata to the file [write_parquet()]. #' @param write_data_page_version Data version to write by default. #' Possible values are 1 and 2. Default is 1. +#' @param write_minmax_values Whether to write minimum and maximum values +#' per row group, for data types that support this in [write_parquet()]. +#' However, nanoparquet currently does not support minimum and maximum +#' values for the `DECIMAL` and `FLOAT16` logical types. Currently the +#' default is `TRUE`. #' #' @return List of nanoparquet options. #' @@ -55,7 +60,8 @@ parquet_options <- function( num_rows_per_row_group = getOption("nanoparquet.num_rows_per_row_group", 10000000L), use_arrow_metadata = getOption("nanoparquet.use_arrow_metadata", TRUE), write_arrow_metadata = getOption("nanoparquet.write_arrow_metadata", TRUE), - write_data_page_version = getOption("nanoparquet.write_data_page_version", 1L) + write_data_page_version = getOption("nanoparquet.write_data_page_version", 1L), + write_minmax_values = getOption("nanoparquet.write_minmax_values", TRUE) ) { stopifnot(is.character(class)) stopifnot(is_flag(use_arrow_metadata)) @@ -66,6 +72,7 @@ parquet_options <- function( identical(write_data_page_version, 1L) || identical(write_data_page_version, 2L) ) + stopifnot(is_flag(write_minmax_values)) num_rows_per_row_group <- as_count( num_rows_per_row_group, "num_rows_per_row_group" @@ -86,6 +93,7 @@ parquet_options <- function( num_rows_per_row_group = num_rows_per_row_group, use_arrow_metadata = use_arrow_metadata, write_arrow_metadata = write_arrow_metadata, - write_data_page_version = as.integer(write_data_page_version) + write_data_page_version = as.integer(write_data_page_version), + write_minmax_values = write_minmax_values ) } diff --git a/man/parquet_options.Rd b/man/parquet_options.Rd index b0c0a1c..c93140a 100644 --- a/man/parquet_options.Rd +++ b/man/parquet_options.Rd @@ -10,7 +10,8 @@ parquet_options( num_rows_per_row_group = getOption("nanoparquet.num_rows_per_row_group", 10000000L), use_arrow_metadata = getOption("nanoparquet.use_arrow_metadata", TRUE), write_arrow_metadata = getOption("nanoparquet.write_arrow_metadata", TRUE), - write_data_page_version = getOption("nanoparquet.write_data_page_version", 1L) + write_data_page_version = getOption("nanoparquet.write_data_page_version", 1L), + write_minmax_values = getOption("nanoparquet.write_minmax_values", TRUE) ) } \arguments{ @@ -56,6 +57,12 @@ metadata to the file \code{\link[=write_parquet]{write_parquet()}}.} \item{write_data_page_version}{Data version to write by default. Possible values are 1 and 2. Default is 1.} + +\item{write_minmax_values}{Whether to write minimum and maximum values +per row group, for data types that support this in \code{\link[=write_parquet]{write_parquet()}}. +However, nanoparquet currently does not support minimum and maximum +values for the \code{DECIMAL} and \code{FLOAT16} logical types. Currently the +default is \code{TRUE}.} } \value{ List of nanoparquet options. diff --git a/src/lib/ParquetOutFile.cpp b/src/lib/ParquetOutFile.cpp index bfcda2e..c4a11a8 100644 --- a/src/lib/ParquetOutFile.cpp +++ b/src/lib/ParquetOutFile.cpp @@ -514,12 +514,12 @@ void ParquetOutFile::write() { // write int64_t from = row_group_starts[idx]; int64_t until = idx < row_group_starts.size() - 1 ? row_group_starts[idx + 1] : num_rows; + write_row_group(idx); int64_t total_size = write_columns(idx, from, until); // row group metadata vector ccs; for (uint32_t idx = 0; idx < num_cols; idx++) { - Statistics stat; ColumnChunk cc; cc.__set_file_offset(column_meta_data[idx].data_page_offset); cc.__set_meta_data(column_meta_data[idx]); @@ -569,6 +569,15 @@ void ParquetOutFile::write_column(uint32_t idx, uint32_t group, cmd->__set_num_values(until - from); cmd->__set_total_compressed_size(column_bytes); cmd->__set_data_page_offset(data_offset); + // min-max values + std::string min_value, max_value; + if (get_group_minmax_values(idx, group, se, min_value, max_value)) { + Statistics *stat = &cmd->statistics; + stat->__set_min_value(min_value); + stat->__set_max_value(max_value); + stat->__set_is_min_value_exact(true); + stat->__set_is_max_value_exact(true); + } } void ParquetOutFile::write_page_header(uint32_t idx, PageHeader &ph) { diff --git a/src/lib/ParquetOutFile.h b/src/lib/ParquetOutFile.h index b0efa1a..02f833b 100644 --- a/src/lib/ParquetOutFile.h +++ b/src/lib/ParquetOutFile.h @@ -33,6 +33,11 @@ class ParquetOutFile { void add_key_value_metadata(std::string key, std::string value); void write(); + // This makes the write inherently sequential and we might remove it + // latest. Currently, it makes it easier to keep track of minimume and + // maximum values per row group. + virtual void write_row_group(uint32_t group) = 0; + // write out various parquet types, these must be implemented in // the subclass virtual void write_int32(std::ostream &file, uint32_t idx, uint32_t group, @@ -95,6 +100,11 @@ class ParquetOutFile { uint64_t page_from, uint64_t page_until) = 0; + virtual bool get_group_minmax_values(uint32_t idx, uint32_t group, + parquet::SchemaElement &sel, + std::string &min_value, + std::string &max_value) = 0; + int data_page_version = 1; private: diff --git a/src/write.cpp b/src/write.cpp index 23a573f..07ec5b7 100644 --- a/src/write.cpp +++ b/src/write.cpp @@ -89,6 +89,7 @@ extern "C" { SEXP nanoparquet_create_dict(SEXP x, SEXP rlen); SEXP nanoparquet_create_dict_idx_(SEXP x, SEXP from, SEXP until); SEXP nanoparquet_avg_run_length(SEXP x, SEXP rlen); +static SEXP get_list_element(SEXP list, const char *str); } class RParquetOutFile : public ParquetOutFile { @@ -105,6 +106,7 @@ class RParquetOutFile : public ParquetOutFile { int compsession_level, std::vector &row_groups ); + void write_row_group(uint32_t group); void write_int32(std::ostream &file, uint32_t idx, uint32_t group, uint32_t page, uint64_t from, uint64_t until, parquet::SchemaElement &sel); @@ -154,6 +156,12 @@ class RParquetOutFile : public ParquetOutFile { int64_t rg_from, int64_t rg_until, uint64_t page_from, uint64_t page_until); + // statistics + bool get_group_minmax_values(uint32_t idx, uint32_t group, + parquet::SchemaElement &sel, + std::string &min_value, + std::string &max_value); + void write( SEXP dfsxp, SEXP dim, @@ -171,11 +179,24 @@ class RParquetOutFile : public ParquetOutFile { SEXP dicts_from = R_NilValue; ByteBuffer present; + bool write_minmax_values; + std::vector is_minmax_supported; + std::vector min_values; + std::vector max_values; + std::vector has_minmax_value; + void create_dictionary(uint32_t idx, int64_t from, int64_t until); // for LGLSXP this mean RLE encoding bool should_use_dict_encoding(uint32_t idx); parquet::Encoding::type detect_encoding(uint32_t idx, parquet::SchemaElement &sel, int32_t renc); + + void write_integer_int32(std::ostream &file, SEXP col, uint32_t idx, + uint64_t from, uint64_t until, + parquet::SchemaElement &sel); + void write_double_int32_time(std::ostream &file, SEXP col, uint32_t idx, + uint64_t from, uint64_t until, + parquet::SchemaElement &sel, double factor); }; RParquetOutFile::RParquetOutFile( @@ -583,23 +604,56 @@ void write_integer_int32_dec(std::ostream & file, SEXP col, uint64_t from, } } -void write_integer_int32(std::ostream &file, SEXP col, uint32_t idx, - uint64_t from, uint64_t until, - parquet::SchemaElement &sel) { +void RParquetOutFile::write_row_group(uint32_t group) { + if (write_minmax_values) { + std::fill(min_values.begin(), min_values.end(), std::string()); + std::fill(max_values.begin(), max_values.end(), std::string()); + std::fill(has_minmax_value.begin(), has_minmax_value.end(), false); + } +} + +#define GRAB_MIN(idx, t) ((t*) min_values[idx].data()) +#define GRAB_MAX(idx, t) ((t*) max_values[idx].data()) +#define SAVE_MIN(idx, val, t) do { \ + min_values[idx] = std::string((const char*) &val, sizeof(t)); \ + min_value = (t*) min_values[idx].data(); } while (0) +#define SAVE_MAX(idx, val, t) do { \ + max_values[idx] = std::string((const char*) &val, sizeof(t)); \ + max_value = (t*) max_values[idx].data(); } while (0) + +void RParquetOutFile::write_integer_int32(std::ostream &file, SEXP col, + uint32_t idx, + uint64_t from, uint64_t until, + parquet::SchemaElement &sel) { bool is_signed = TRUE; int bit_width = 32; if (sel.__isset.logicalType && sel.logicalType.__isset.INTEGER) { is_signed = sel.logicalType.INTEGER.isSigned; bit_width = sel.logicalType.INTEGER.bitWidth; } + + bool minmax = write_minmax_values && is_minmax_supported[idx]; + int32_t *min_value = 0, *max_value = 0; + if (minmax && has_minmax_value[idx]) { + min_value = GRAB_MIN(idx, int32_t); + max_value = GRAB_MAX(idx, int32_t); + } + if (bit_width == 32) { - if (sel.repetition_type == parquet::FieldRepetitionType::REQUIRED) { + if (!write_minmax_values && + sel.repetition_type == parquet::FieldRepetitionType::REQUIRED) { uint64_t len = until - from; file.write((const char *) (INTEGER(col) + from), sizeof(int) * len); } else { for (uint64_t i = from; i < until; i++) { int32_t val = INTEGER(col)[i]; if (val == NA_INTEGER) continue; + if (minmax && (min_value == 0 || val < *min_value)) { + SAVE_MIN(idx, val, int32_t); + } + if (minmax && (max_value == 0 || val > *max_value)) { + SAVE_MAX(idx, val, int32_t); + } file.write((const char*) &val, sizeof(int32_t)); } } @@ -628,9 +682,16 @@ void write_integer_int32(std::ostream &file, SEXP col, uint32_t idx, w, (is_signed ? "" : "U"), bit_width, val, idx + 1, i + 1 ); } + if (minmax && (min_value == 0 || val < *min_value)) { + SAVE_MIN(idx, val, int32_t); + } + if (minmax && (max_value == 0 || val > *max_value)) { + SAVE_MAX(idx, val, int32_t); + } file.write((const char *) &val, sizeof(int32_t)); } } + has_minmax_value[idx] = has_minmax_value[idx] || min_value != 0; } void write_double_int32_dec(std::ostream &file, SEXP col, uint64_t from, @@ -666,15 +727,31 @@ void write_double_int32_dec(std::ostream &file, SEXP col, uint64_t from, } } -void write_double_int32_time(std::ostream &file, SEXP col, uint32_t idx, - uint64_t from, uint64_t until, - parquet::SchemaElement &sel, double factor) { +void RParquetOutFile::write_double_int32_time(std::ostream &file, SEXP col, + uint32_t idx, uint64_t from, + uint64_t until, + parquet::SchemaElement &sel, + double factor) { + int32_t *min_value = 0, *max_value = 0; + bool minmax = write_minmax_values && is_minmax_supported[idx]; + if (minmax && has_minmax_value[idx]) { + min_value = GRAB_MIN(idx, int32_t); + max_value = GRAB_MAX(idx, int32_t); + } + for (uint64_t i = from; i < until; i++) { double val = REAL(col)[i]; if (R_IsNA(val)) continue; int32_t ival = val * factor; + if (minmax && (min_value == 0 || ival < *min_value)) { + SAVE_MIN(idx, ival, int32_t); + } + if (minmax && (max_value == 0 || ival > *max_value)) { + SAVE_MAX(idx, ival, int32_t); + } file.write((const char *)&ival, sizeof(int32_t)); } + has_minmax_value[idx] = has_minmax_value[idx] || min_value != 0; } void write_double_int32(std::ostream &file, SEXP col, uint32_t idx, @@ -2189,6 +2266,23 @@ void RParquetOutFile::write_dictionary_indices( } } +bool RParquetOutFile::get_group_minmax_values(uint32_t idx, uint32_t group, + parquet::SchemaElement &sel, + std::string &min_value, + std::string &max_value) { + + if (!is_minmax_supported[idx]) { + return false; + } else if (!has_minmax_value[idx]) { + // maybe all values are missing + return false; + } else { + min_value = min_values[idx]; + max_value = max_values[idx]; + return true; + } +} + void nanoparquet_map_to_parquet_type( SEXP x, SEXP options, @@ -2433,6 +2527,13 @@ void RParquetOutFile::write( R_xlen_t nr = INTEGER(dim)[0]; set_num_rows(nr); R_xlen_t nc = INTEGER(dim)[1]; + + write_minmax_values = LOGICAL(get_list_element(options, "write_minmax_values"))[0]; + is_minmax_supported = std::vector(nc, false); + has_minmax_value.resize(nc); + min_values.resize(nc); + max_values.resize(nc); + for (R_xlen_t idx = 0; idx < nc; idx++) { SEXP col = VECTOR_ELT(dfsxp, idx); bool req = LOGICAL(required)[idx]; @@ -2469,6 +2570,32 @@ void RParquetOutFile::write( } } + if (!write_minmax_values) { + // nothing to do + } if (sel.__isset.logicalType) { + parquet::LogicalType < = sel.logicalType; + is_minmax_supported[idx] = lt.__isset.DATE || lt.__isset.INTEGER; + // TODO: support the rest + // is_minmax_supported[idx] = + // lt.__isset.STRING || lt.__isset.ENUM || + // lt.__isset.TIME || lt.__isset.TIMESTAMP || + // lt.__isset.JSON || lt.__isset.BSON || lt.__isset.UUID || + // lt.__isset.DECIMAL || lt.isset.FLOAT16; + } else { + switch(sel.type) { + // case parquet::Type::BOOLEAN: + case parquet::Type::INT32: + // case parquet::Type::INT64: + // case parquet::Type::FLOAT: + // case parquet::Type::DOUBLE: + is_minmax_supported[idx] = true; + break; + default: + is_minmax_supported[idx] = false; + break; + } + } + int32_t ienc = INTEGER(encoding)[idx]; parquet::Encoding::type enc = detect_encoding(idx, sel, ienc); schema_add_column(sel, enc); diff --git a/tests/testthat/_snaps/write-parquet-statistics.md b/tests/testthat/_snaps/write-parquet-statistics.md index 6a26e09..7317750 100644 --- a/tests/testthat/_snaps/write-parquet-statistics.md +++ b/tests/testthat/_snaps/write-parquet-statistics.md @@ -58,3 +58,17 @@ 51 3 11 0 52 3 12 0 +# min/max for integers + + Code + mtd[mtd$column == 2, c("row_group", "column", "min_value", "max_value")] + Output + row_group column min_value max_value + 3 0 2 04, 00, .... 04, 00, .... + 16 1 2 04, 00, .... 04, 00, .... + 29 2 2 06, 00, .... 06, 00, .... + 42 3 2 06, 00, .... 08, 00, .... + 55 4 2 08, 00, .... 08, 00, .... + 68 5 2 08, 00, .... 08, 00, .... + 81 6 2 08, 00, .... 08, 00, .... + diff --git a/tests/testthat/test-write-parquet-statistics.R b/tests/testthat/test-write-parquet-statistics.R index c4bd7bc..254d864 100644 --- a/tests/testthat/test-write-parquet-statistics.R +++ b/tests/testthat/test-write-parquet-statistics.R @@ -13,3 +13,34 @@ test_that("null_count is written", { ]) ) }) + +test_that("min/max for integers", { + tmp <- tempfile(fileext = ".parquet") + on.exit(unlink(tmp), add = TRUE) + df <- test_df(missing = TRUE) + df <- df[order(df$cyl), ] + rownames(df) <- NULL + + write_parquet( + df, tmp, + encoding = "PLAIN", + options = parquet_options(num_rows_per_row_group = 5) + ) + expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) + mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) + expect_snapshot( + mtd[mtd$column == 2, c("row_group", "column", "min_value", "max_value")] + ) + + # dictionary + write_parquet( + df, tmp, + encoding = ifelse(map_chr(df, class) == "logical", "PLAIN", "RLE_DICTIONARY"), + options = parquet_options(num_rows_per_row_group = 5) + ) + expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) + mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) + expect_snapshot( + mtd[mtd$column == 2, c("row_group", "column", "min_value", "max_value")] + ) +}) From b6ada24f67d31a636571149f88711ebc93f1f304 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Mon, 16 Sep 2024 12:24:39 +0200 Subject: [PATCH 07/26] Work around an unrelated crash --- tests/testthat/_snaps/write-parquet-statistics.md | 14 ++++++++++++++ tests/testthat/test-write-parquet-statistics.R | 3 ++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/testthat/_snaps/write-parquet-statistics.md b/tests/testthat/_snaps/write-parquet-statistics.md index 7317750..282cfb8 100644 --- a/tests/testthat/_snaps/write-parquet-statistics.md +++ b/tests/testthat/_snaps/write-parquet-statistics.md @@ -72,3 +72,17 @@ 68 5 2 08, 00, .... 08, 00, .... 81 6 2 08, 00, .... 08, 00, .... +--- + + Code + mtd[mtd$column == 2, c("row_group", "column", "min_value", "max_value")] + Output + row_group column min_value max_value + 3 0 2 + 16 1 2 + 29 2 2 + 42 3 2 + 55 4 2 + 68 5 2 + 81 6 2 + diff --git a/tests/testthat/test-write-parquet-statistics.R b/tests/testthat/test-write-parquet-statistics.R index 254d864..e48b04f 100644 --- a/tests/testthat/test-write-parquet-statistics.R +++ b/tests/testthat/test-write-parquet-statistics.R @@ -33,9 +33,10 @@ test_that("min/max for integers", { ) # dictionary + enc <- ifelse(map_chr(df, class) == "integer", "RLE_DICTIONARY", "PLAIN") write_parquet( df, tmp, - encoding = ifelse(map_chr(df, class) == "logical", "PLAIN", "RLE_DICTIONARY"), + encoding = enc, options = parquet_options(num_rows_per_row_group = 5) ) expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) From 87a368915c36dc9f588c580ea33e28a72968c903 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Mon, 16 Sep 2024 12:26:27 +0200 Subject: [PATCH 08/26] Min/max for integer dictionaries --- src/dictionary-encoding.cpp | 21 +++++-- src/write.cpp | 16 +++--- tests/testthat/_snaps/parquet-metadata.md | 56 +++++++++---------- .../_snaps/write-parquet-statistics.md | 16 +++--- 4 files changed, 60 insertions(+), 49 deletions(-) diff --git a/src/dictionary-encoding.cpp b/src/dictionary-encoding.cpp index e2edb03..3b4bc2c 100644 --- a/src/dictionary-encoding.cpp +++ b/src/dictionary-encoding.cpp @@ -122,13 +122,19 @@ uint64_t create_dict_real_idx(double* values, int *dict, int *idx, uint64_t len) } template -uint64_t create_dict_idx(T* values, int *dict, int *idx, uint64_t len, T naval) { +uint64_t create_dict_idx(T* values, int *dict, int *idx, uint64_t len, + T naval, T &minval, T &maxval) { std::unordered_map mm; mm.reserve(len * 2); T *begin = values; T *end = begin + len; int n = 0; + if (begin < end) { + minval = *begin; + maxval = *begin; + } + for (int i = 0; begin < end; begin++, i++) { if (*begin == naval) { idx[i] = NA_INTEGER; @@ -136,6 +142,8 @@ uint64_t create_dict_idx(T* values, int *dict, int *idx, uint64_t len, T naval) } auto it = mm.find(*begin); if (it == mm.end()) { + if (*begin < minval) minval = *begin; + if (*begin > maxval) maxval = *begin; mm.insert(std::make_pair(*begin, n)); idx[i] = n; dict[n] = i; @@ -185,12 +193,13 @@ SEXP nanoparquet_create_dict_idx_(SEXP x, SEXP from, SEXP until) { SEXP dict = PROTECT(Rf_allocVector(INTSXP, len)); int *idict = INTEGER(dict); int *iidx = INTEGER(idx); + int imin, imax; switch (TYPEOF(x)) { case LGLSXP: - dictlen = create_dict_idx(LOGICAL(x) + cfrom, iidx, idict, len, NA_LOGICAL); + dictlen = create_dict_idx(LOGICAL(x) + cfrom, iidx, idict, len, NA_LOGICAL, imin, imax); break; case INTSXP: - dictlen = create_dict_idx(INTEGER(x) + cfrom, idict, iidx, len, NA_INTEGER); + dictlen = create_dict_idx(INTEGER(x) + cfrom, idict, iidx, len, NA_INTEGER, imin, imax); break; case REALSXP: dictlen = create_dict_real_idx(REAL(x) + cfrom, idict, iidx, len); @@ -204,9 +213,13 @@ SEXP nanoparquet_create_dict_idx_(SEXP x, SEXP from, SEXP until) { break; } - SEXP res = PROTECT(Rf_allocVector(VECSXP, 2)); + SEXP res = PROTECT(Rf_allocVector(VECSXP, TYPEOF(x) == INTSXP ? 4 : 2)); SET_VECTOR_ELT(res, 0, dict); SET_VECTOR_ELT(res, 1, idx); + if (TYPEOF(x) == INTSXP) { + SET_VECTOR_ELT(res, 2, Rf_ScalarInteger(imin)); + SET_VECTOR_ELT(res, 3, Rf_ScalarInteger(imax)); + } if (dictlen < len) { SET_VECTOR_ELT(res, 0, Rf_xlengthgets(dict, dictlen)); diff --git a/src/write.cpp b/src/write.cpp index 07ec5b7..8bb8c58 100644 --- a/src/write.cpp +++ b/src/write.cpp @@ -229,6 +229,13 @@ void RParquetOutFile::create_dictionary(uint32_t idx, int64_t from, SET_VECTOR_ELT(dicts, idx, d); INTEGER(dicts_from)[idx] = from; UNPROTECT(3); + if (write_minmax_values && Rf_length(d) == 4 && + is_minmax_supported[idx] && Rf_xlength(col) > 0 && + !Rf_isNull(VECTOR_ELT(d, 2)) && !Rf_isNull(VECTOR_ELT(d, 3))) { + has_minmax_value[idx] = true; + min_values[idx] = std::string((const char*) INTEGER(VECTOR_ELT(d, 2)), sizeof(int32_t)); + max_values[idx] = std::string((const char*) INTEGER(VECTOR_ELT(d, 3)), sizeof(int32_t)); + } } static const char *enc_[] = { @@ -1244,15 +1251,6 @@ void RParquetOutFile::write_byte_array(std::ostream &file, uint32_t idx, } break; } - case INTSXP: { - int32_t precision, scale; - bool isdec = is_decimal(sel, precision, scale); - - break; - } - case REALSXP: { - break; - } default: Rf_errorcall( // # nocov nanoparquet_call, // # nocov diff --git a/tests/testthat/_snaps/parquet-metadata.md b/tests/testthat/_snaps/parquet-metadata.md index 152dc58..72de511 100644 --- a/tests/testthat/_snaps/parquet-metadata.md +++ b/tests/testthat/_snaps/parquet-metadata.md @@ -106,34 +106,34 @@ 11 275 2879 NA 12 275 3154 NA 13 21 3429 NA - dictionary_page_offset null_count min_value max_value is_min_value_exact - 1 NA 0 NA - 2 NA 0 NA - 3 NA 0 NA - 4 NA 0 NA - 5 NA 0 NA - 6 NA 0 NA - 7 NA 0 NA - 8 NA 0 NA - 9 NA 0 NA - 10 NA 0 NA - 11 NA 0 NA - 12 NA 0 NA - 13 NA 0 NA - is_max_value_exact - 1 NA - 2 NA - 3 NA - 4 NA - 5 NA - 6 NA - 7 NA - 8 NA - 9 NA - 10 NA - 11 NA - 12 NA - 13 NA + dictionary_page_offset null_count min_value max_value + 1 NA 0 + 2 NA 0 + 3 NA 0 04, 00, .... 08, 00, .... + 4 NA 0 + 5 NA 0 + 6 NA 0 + 7 NA 0 + 8 NA 0 + 9 NA 0 + 10 NA 0 + 11 NA 0 + 12 NA 0 + 13 NA 0 + is_min_value_exact is_max_value_exact + 1 NA NA + 2 NA NA + 3 TRUE TRUE + 4 NA NA + 5 NA NA + 6 NA NA + 7 NA NA + 8 NA NA + 9 NA NA + 10 NA NA + 11 NA NA + 12 NA NA + 13 NA NA --- diff --git a/tests/testthat/_snaps/write-parquet-statistics.md b/tests/testthat/_snaps/write-parquet-statistics.md index 282cfb8..0b5b045 100644 --- a/tests/testthat/_snaps/write-parquet-statistics.md +++ b/tests/testthat/_snaps/write-parquet-statistics.md @@ -77,12 +77,12 @@ Code mtd[mtd$column == 2, c("row_group", "column", "min_value", "max_value")] Output - row_group column min_value max_value - 3 0 2 - 16 1 2 - 29 2 2 - 42 3 2 - 55 4 2 - 68 5 2 - 81 6 2 + row_group column min_value max_value + 3 0 2 04, 00, .... 04, 00, .... + 16 1 2 04, 00, .... 04, 00, .... + 29 2 2 06, 00, .... 06, 00, .... + 42 3 2 06, 00, .... 08, 00, .... + 55 4 2 08, 00, .... 08, 00, .... + 68 5 2 08, 00, .... 08, 00, .... + 81 6 2 08, 00, .... 08, 00, .... From 5dae2d80f8357319250a4570db2362ca5258a30f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Mon, 16 Sep 2024 15:13:03 +0200 Subject: [PATCH 09/26] Fix a bug with string dicts and row groups --- src/write.cpp | 2 +- tests/testthat/test-write-parquet-row-groups.R | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/write.cpp b/src/write.cpp index 8bb8c58..8d4b09a 100644 --- a/src/write.cpp +++ b/src/write.cpp @@ -1732,7 +1732,7 @@ uint32_t RParquetOutFile::get_size_dictionary( int *beg = INTEGER(dictidx); int *end = beg + len; for (; beg < end; beg++) { - const char *c = CHAR(STRING_ELT(col, *beg)); + const char *c = CHAR(STRING_ELT(col, *beg + from)); size += strlen(c); } return size; diff --git a/tests/testthat/test-write-parquet-row-groups.R b/tests/testthat/test-write-parquet-row-groups.R index 4c8d302..ff94c58 100644 --- a/tests/testthat/test-write-parquet-row-groups.R +++ b/tests/testthat/test-write-parquet-row-groups.R @@ -96,3 +96,16 @@ test_that("non-factors write local dictionary", { } }) }) + +test_that("strings in a dictionary", { + tmp <- tempfile(fileext = ".parquet") + on.exit(unlink(tmp), add = TRUE) + + df <- test_df() + write_parquet( + df, tmp, + encoding = c(large = "RLE", "RLE_DICTIONARY"), + options = parquet_options(num_rows_per_row_group=10) + ) + expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) +}) From 12016fa9585f39b016a097084d8f5b3a3c519279 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Mon, 16 Sep 2024 15:56:01 +0200 Subject: [PATCH 10/26] Test case for writing min/max for DATEs --- .../_snaps/write-parquet-statistics.md | 22 ++++++++++++++++++ .../testthat/test-write-parquet-statistics.R | 23 +++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/tests/testthat/_snaps/write-parquet-statistics.md b/tests/testthat/_snaps/write-parquet-statistics.md index 0b5b045..1f47179 100644 --- a/tests/testthat/_snaps/write-parquet-statistics.md +++ b/tests/testthat/_snaps/write-parquet-statistics.md @@ -86,3 +86,25 @@ 68 5 2 08, 00, .... 08, 00, .... 81 6 2 08, 00, .... 08, 00, .... +# min/max for DATEs + + Code + as.data.frame(read_parquet_schema(tmp)[, -1]) + Output + name r_type type type_length repetition_type converted_type logical_type + 1 schema NA + 2 day Date INT32 NA REQUIRED DATE DATE + 3 count integer INT32 NA REQUIRED INT_32 INT, 32,.... + num_children scale precision field_id + 1 2 NA NA NA + 2 NA NA NA NA + 3 NA NA NA NA + Code + as.Date(map_int(minv, readBin, what = "integer", n = 1)) + Output + [1] "2024-09-06" "2024-09-08" "2024-09-10" "2024-09-12" "2024-09-14" + Code + as.Date(map_int(maxv, readBin, what = "integer", n = 1)) + Output + [1] "2024-09-07" "2024-09-09" "2024-09-11" "2024-09-13" "2024-09-15" + diff --git a/tests/testthat/test-write-parquet-statistics.R b/tests/testthat/test-write-parquet-statistics.R index e48b04f..2f9557a 100644 --- a/tests/testthat/test-write-parquet-statistics.R +++ b/tests/testthat/test-write-parquet-statistics.R @@ -45,3 +45,26 @@ test_that("min/max for integers", { mtd[mtd$column == 2, c("row_group", "column", "min_value", "max_value")] ) }) + +test_that("min/max for DATEs", { + tmp <- tempfile(fileext = ".parquet") + on.exit(unlink(tmp), add = TRUE) + + df <- data.frame( + day = rep(as.Date("2024-09-16") - 10:1, each = 10), + count = 1:100 + ) + write_parquet( + df, tmp, + options = parquet_options(num_rows_per_row_group = 20) + ) + expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) + mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) + minv <- mtd[mtd$column == 0, "min_value"] + maxv <- mtd[mtd$column == 0, "max_value"] + expect_snapshot({ + as.data.frame(read_parquet_schema(tmp)[,-1]) + as.Date(map_int(minv, readBin, what = "integer", n = 1)) + as.Date(map_int(maxv, readBin, what = "integer", n = 1)) + }) +}) From f71bf85cbc3fb8897d242fbdd26755473156dabf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Mon, 16 Sep 2024 17:29:54 +0200 Subject: [PATCH 11/26] Fix tests on older R --- tests/testthat/_snaps/write-parquet-statistics.md | 4 ++-- tests/testthat/test-write-parquet-statistics.R | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/testthat/_snaps/write-parquet-statistics.md b/tests/testthat/_snaps/write-parquet-statistics.md index 1f47179..4750c5e 100644 --- a/tests/testthat/_snaps/write-parquet-statistics.md +++ b/tests/testthat/_snaps/write-parquet-statistics.md @@ -100,11 +100,11 @@ 2 NA NA NA NA 3 NA NA NA NA Code - as.Date(map_int(minv, readBin, what = "integer", n = 1)) + as.Date(map_int(minv, readBin, what = "integer", n = 1), origin = "1970-01-01") Output [1] "2024-09-06" "2024-09-08" "2024-09-10" "2024-09-12" "2024-09-14" Code - as.Date(map_int(maxv, readBin, what = "integer", n = 1)) + as.Date(map_int(maxv, readBin, what = "integer", n = 1), origin = "1970-01-01") Output [1] "2024-09-07" "2024-09-09" "2024-09-11" "2024-09-13" "2024-09-15" diff --git a/tests/testthat/test-write-parquet-statistics.R b/tests/testthat/test-write-parquet-statistics.R index 2f9557a..a67f6e7 100644 --- a/tests/testthat/test-write-parquet-statistics.R +++ b/tests/testthat/test-write-parquet-statistics.R @@ -64,7 +64,7 @@ test_that("min/max for DATEs", { maxv <- mtd[mtd$column == 0, "max_value"] expect_snapshot({ as.data.frame(read_parquet_schema(tmp)[,-1]) - as.Date(map_int(minv, readBin, what = "integer", n = 1)) - as.Date(map_int(maxv, readBin, what = "integer", n = 1)) + as.Date(map_int(minv, readBin, what = "integer", n = 1), origin = "1970-01-01") + as.Date(map_int(maxv, readBin, what = "integer", n = 1), origin = "1970-01-01") }) }) From d9996591d778dca0ee3b1fa57106b1cedacdd0ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Fri, 20 Sep 2024 14:09:14 +0200 Subject: [PATCH 12/26] Test min/max when writing uncompressed data --- .../_snaps/write-parquet-statistics.md | 46 +++++++++--- .../testthat/test-write-parquet-statistics.R | 72 +++++++++---------- 2 files changed, 73 insertions(+), 45 deletions(-) diff --git a/tests/testthat/_snaps/write-parquet-statistics.md b/tests/testthat/_snaps/write-parquet-statistics.md index 4750c5e..8634745 100644 --- a/tests/testthat/_snaps/write-parquet-statistics.md +++ b/tests/testthat/_snaps/write-parquet-statistics.md @@ -61,7 +61,7 @@ # min/max for integers Code - mtd[mtd$column == 2, c("row_group", "column", "min_value", "max_value")] + do(compression = "snappy") Output row_group column min_value max_value 3 0 2 04, 00, .... 04, 00, .... @@ -75,7 +75,35 @@ --- Code - mtd[mtd$column == 2, c("row_group", "column", "min_value", "max_value")] + do(compression = "uncompressed") + Output + row_group column min_value max_value + 3 0 2 04, 00, .... 04, 00, .... + 16 1 2 04, 00, .... 04, 00, .... + 29 2 2 06, 00, .... 06, 00, .... + 42 3 2 06, 00, .... 08, 00, .... + 55 4 2 08, 00, .... 08, 00, .... + 68 5 2 08, 00, .... 08, 00, .... + 81 6 2 08, 00, .... 08, 00, .... + +--- + + Code + do(encoding = enc, compression = "snappy") + Output + row_group column min_value max_value + 3 0 2 04, 00, .... 04, 00, .... + 16 1 2 04, 00, .... 04, 00, .... + 29 2 2 06, 00, .... 06, 00, .... + 42 3 2 06, 00, .... 08, 00, .... + 55 4 2 08, 00, .... 08, 00, .... + 68 5 2 08, 00, .... 08, 00, .... + 81 6 2 08, 00, .... 08, 00, .... + +--- + + Code + do(encoding = enc, compression = "uncompressed") Output row_group column min_value max_value 3 0 2 04, 00, .... 04, 00, .... @@ -89,8 +117,9 @@ # min/max for DATEs Code - as.data.frame(read_parquet_schema(tmp)[, -1]) + do() Output + [[1]] name r_type type type_length repetition_type converted_type logical_type 1 schema NA 2 day Date INT32 NA REQUIRED DATE DATE @@ -99,12 +128,11 @@ 1 2 NA NA NA 2 NA NA NA NA 3 NA NA NA NA - Code - as.Date(map_int(minv, readBin, what = "integer", n = 1), origin = "1970-01-01") - Output + + [[2]] [1] "2024-09-06" "2024-09-08" "2024-09-10" "2024-09-12" "2024-09-14" - Code - as.Date(map_int(maxv, readBin, what = "integer", n = 1), origin = "1970-01-01") - Output + + [[3]] [1] "2024-09-07" "2024-09-09" "2024-09-11" "2024-09-13" "2024-09-15" + diff --git a/tests/testthat/test-write-parquet-statistics.R b/tests/testthat/test-write-parquet-statistics.R index a67f6e7..78f2128 100644 --- a/tests/testthat/test-write-parquet-statistics.R +++ b/tests/testthat/test-write-parquet-statistics.R @@ -21,50 +21,50 @@ test_that("min/max for integers", { df <- df[order(df$cyl), ] rownames(df) <- NULL - write_parquet( - df, tmp, - encoding = "PLAIN", - options = parquet_options(num_rows_per_row_group = 5) - ) - expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) - mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) - expect_snapshot( + do <- function(encoding = "PLAIN",...) { + write_parquet( + df, tmp, + encoding = encoding, + options = parquet_options(num_rows_per_row_group = 5), + ... + ) + expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) + mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) mtd[mtd$column == 2, c("row_group", "column", "min_value", "max_value")] - ) + } + expect_snapshot(do(compression = "snappy")) + expect_snapshot(do(compression = "uncompressed")) # dictionary enc <- ifelse(map_chr(df, class) == "integer", "RLE_DICTIONARY", "PLAIN") - write_parquet( - df, tmp, - encoding = enc, - options = parquet_options(num_rows_per_row_group = 5) - ) - expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) - mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) - expect_snapshot( - mtd[mtd$column == 2, c("row_group", "column", "min_value", "max_value")] - ) + expect_snapshot(do(encoding = enc, compression = "snappy")) + expect_snapshot(do(encoding = enc, compression = "uncompressed")) }) test_that("min/max for DATEs", { tmp <- tempfile(fileext = ".parquet") on.exit(unlink(tmp), add = TRUE) - df <- data.frame( - day = rep(as.Date("2024-09-16") - 10:1, each = 10), - count = 1:100 - ) - write_parquet( - df, tmp, - options = parquet_options(num_rows_per_row_group = 20) - ) - expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) - mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) - minv <- mtd[mtd$column == 0, "min_value"] - maxv <- mtd[mtd$column == 0, "max_value"] - expect_snapshot({ - as.data.frame(read_parquet_schema(tmp)[,-1]) - as.Date(map_int(minv, readBin, what = "integer", n = 1), origin = "1970-01-01") - as.Date(map_int(maxv, readBin, what = "integer", n = 1), origin = "1970-01-01") - }) + do <- function(...) { + df <- data.frame( + day = rep(as.Date("2024-09-16") - 10:1, each = 10), + count = 1:100 + ) + write_parquet( + df, tmp, + options = parquet_options(num_rows_per_row_group = 20), + ... + ) + expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) + mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) + minv <- mtd[mtd$column == 0, "min_value"] + maxv <- mtd[mtd$column == 0, "max_value"] + list( + as.data.frame(read_parquet_schema(tmp)[, -1]), + as.Date(map_int(minv, readBin, what = "integer", n = 1), origin = "1970-01-01"), + as.Date(map_int(maxv, readBin, what = "integer", n = 1), origin = "1970-01-01") + ) + } + + expect_snapshot(do()) }) From d46c91ce37b6dc37545326461d3ef6363fd2328c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Fri, 20 Sep 2024 14:12:20 +0200 Subject: [PATCH 13/26] Minor refactor Move function to better place. --- src/write.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/write.cpp b/src/write.cpp index 8d4b09a..d069fca 100644 --- a/src/write.cpp +++ b/src/write.cpp @@ -526,6 +526,14 @@ static const char *type_names[] = { "an S4 object" }; +void RParquetOutFile::write_row_group(uint32_t group) { + if (write_minmax_values) { + std::fill(min_values.begin(), min_values.end(), std::string()); + std::fill(max_values.begin(), max_values.end(), std::string()); + std::fill(has_minmax_value.begin(), has_minmax_value.end(), false); + } +} + static bool is_decimal(parquet::SchemaElement &sel, int32_t &precision, int32_t &scale) { if (sel.__isset.logicalType && sel.logicalType.__isset.DECIMAL) { @@ -611,14 +619,6 @@ void write_integer_int32_dec(std::ostream & file, SEXP col, uint64_t from, } } -void RParquetOutFile::write_row_group(uint32_t group) { - if (write_minmax_values) { - std::fill(min_values.begin(), min_values.end(), std::string()); - std::fill(max_values.begin(), max_values.end(), std::string()); - std::fill(has_minmax_value.begin(), has_minmax_value.end(), false); - } -} - #define GRAB_MIN(idx, t) ((t*) min_values[idx].data()) #define GRAB_MAX(idx, t) ((t*) max_values[idx].data()) #define SAVE_MIN(idx, val, t) do { \ From 62623424edd2ac059a99fdf6362bfc455ff73b09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 21 Sep 2024 06:50:10 +0200 Subject: [PATCH 14/26] Fixes for writing min/max for integers - fix double -> int32 conversion - fixes for NA values --- R/utils.R | 2 + src/dictionary-encoding.cpp | 58 +++-- src/lib/ParquetOutFile.cpp | 12 +- src/lib/ParquetOutFile.h | 4 +- src/write.cpp | 80 +++++-- .../_snaps/write-parquet-statistics.md | 226 +++++++++++++++--- .../testthat/test-write-parquet-statistics.R | 99 +++++++- 7 files changed, 407 insertions(+), 74 deletions(-) diff --git a/R/utils.R b/R/utils.R index af2be45..7d273c0 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,5 +1,7 @@ `%||%` <- function(l, r) if (is.null(l)) r else l +`%&&%` <- function(l, r) if (is.null(l)) NULL else r + is_rcmd_check <- function() { if (identical(Sys.getenv("NOT_CRAN"), "true")) { FALSE diff --git a/src/dictionary-encoding.cpp b/src/dictionary-encoding.cpp index 3b4bc2c..fa6cb15 100644 --- a/src/dictionary-encoding.cpp +++ b/src/dictionary-encoding.cpp @@ -95,20 +95,30 @@ uint64_t create_dict_ptr_idx(void** values, int *dict, int *idx, return n; } -uint64_t create_dict_real_idx(double* values, int *dict, int *idx, uint64_t len) { +uint64_t create_dict_real_idx(double* values, int *dict, int *idx, + uint64_t len, double &minval, + double &maxval, bool &hasminmax) { std::unordered_map mm; mm.reserve(len * 2); double *begin = values; double *end = begin + len; int n = 0; + hasminmax = false; + for (int i = 0; begin < end; begin++, i++) { if (R_IsNA(*begin)) { idx[i] = NA_INTEGER; continue; } + if (!hasminmax) { + hasminmax = true; + minval = maxval = *begin; + } auto it = mm.find(*begin); if (it == mm.end()) { + if (*begin < minval) minval = *begin; + if (*begin > maxval) maxval = *begin; mm.insert(std::make_pair(*begin, n)); idx[i] = n; dict[n] = i; @@ -123,23 +133,24 @@ uint64_t create_dict_real_idx(double* values, int *dict, int *idx, uint64_t len) template uint64_t create_dict_idx(T* values, int *dict, int *idx, uint64_t len, - T naval, T &minval, T &maxval) { + T naval, T &minval, T &maxval, bool &hasminmax) { std::unordered_map mm; mm.reserve(len * 2); T *begin = values; T *end = begin + len; int n = 0; - if (begin < end) { - minval = *begin; - maxval = *begin; - } + hasminmax = false; for (int i = 0; begin < end; begin++, i++) { if (*begin == naval) { idx[i] = NA_INTEGER; continue; } + if (!hasminmax) { + hasminmax = true; + minval = maxval = *begin; + } auto it = mm.find(*begin); if (it == mm.end()) { if (*begin < minval) minval = *begin; @@ -194,18 +205,32 @@ SEXP nanoparquet_create_dict_idx_(SEXP x, SEXP from, SEXP until) { int *idict = INTEGER(dict); int *iidx = INTEGER(idx); int imin, imax; + double dmin, dmax; + bool hasminmax = false; switch (TYPEOF(x)) { case LGLSXP: - dictlen = create_dict_idx(LOGICAL(x) + cfrom, iidx, idict, len, NA_LOGICAL, imin, imax); + dictlen = create_dict_idx( + LOGICAL(x) + cfrom, iidx, idict, len, NA_LOGICAL, + imin, imax, hasminmax + ); break; case INTSXP: - dictlen = create_dict_idx(INTEGER(x) + cfrom, idict, iidx, len, NA_INTEGER, imin, imax); + dictlen = create_dict_idx( + INTEGER(x) + cfrom, idict, iidx, len, NA_INTEGER, + imin, imax, hasminmax + ); break; case REALSXP: - dictlen = create_dict_real_idx(REAL(x) + cfrom, idict, iidx, len); + dictlen = create_dict_real_idx( + REAL(x) + cfrom, idict, iidx, len, + dmin, dmax, hasminmax + ); break; case STRSXP: { - dictlen = create_dict_ptr_idx((void**)(STRING_PTR_RO(x) + cfrom), idict, iidx, len, (void*) NA_STRING); + dictlen = create_dict_ptr_idx( + (void**)(STRING_PTR_RO(x) + cfrom), idict, iidx, len, + (void*) NA_STRING + ); break; } default: @@ -213,12 +238,17 @@ SEXP nanoparquet_create_dict_idx_(SEXP x, SEXP from, SEXP until) { break; } - SEXP res = PROTECT(Rf_allocVector(VECSXP, TYPEOF(x) == INTSXP ? 4 : 2)); + SEXP res = PROTECT(Rf_allocVector(VECSXP, hasminmax ? 4 : 2)); SET_VECTOR_ELT(res, 0, dict); SET_VECTOR_ELT(res, 1, idx); - if (TYPEOF(x) == INTSXP) { - SET_VECTOR_ELT(res, 2, Rf_ScalarInteger(imin)); - SET_VECTOR_ELT(res, 3, Rf_ScalarInteger(imax)); + if (hasminmax) { + if (TYPEOF(x) == INTSXP) { + SET_VECTOR_ELT(res, 2, Rf_ScalarInteger(imin)); + SET_VECTOR_ELT(res, 3, Rf_ScalarInteger(imax)); + } else if (TYPEOF(x) == REALSXP) { + SET_VECTOR_ELT(res, 2, Rf_ScalarReal(dmin)); + SET_VECTOR_ELT(res, 3, Rf_ScalarReal(dmax)); + } } if (dictlen < len) { diff --git a/src/lib/ParquetOutFile.cpp b/src/lib/ParquetOutFile.cpp index c4a11a8..0da5337 100644 --- a/src/lib/ParquetOutFile.cpp +++ b/src/lib/ParquetOutFile.cpp @@ -602,7 +602,7 @@ void ParquetOutFile::write_dictionary_page(uint32_t idx, int64_t from, // Uncompresed size of the dictionary in bytes uint32_t dict_size = get_size_dictionary(idx, se, from, until); // Number of entries in the dicitonary - uint32_t num_dict_values = get_num_values_dictionary(idx, from, until); + uint32_t num_dict_values = get_num_values_dictionary(idx, se, from, until); // Init page header PageHeader ph; @@ -650,7 +650,7 @@ void ParquetOutFile::write_data_pages(uint32_t idx, uint32_t group, total_size = calculate_column_data_size(idx, rg_num_rows, from, until); } else { // estimate the max RLE length - uint32_t num_values = get_num_values_dictionary(idx, from, until); + uint32_t num_values = get_num_values_dictionary(idx, se, from, until); uint8_t bit_width = ceil(log2((double) num_values)); total_size = MaxRleBpSizeSimple(rg_num_rows, bit_width); } @@ -778,7 +778,7 @@ void ParquetOutFile::write_data_page(uint32_t idx, uint32_t group, page_from, page_until); // 2. RLE encode buf_unc to buf_com - uint32_t num_dict_values = get_num_values_dictionary(idx, rg_from, rg_until); + uint32_t num_dict_values = get_num_values_dictionary(idx, se, rg_from, rg_until); uint8_t bit_width = ceil(log2((double) num_dict_values)); uint32_t rle_size = rle_encode( buf_unc, @@ -814,7 +814,7 @@ void ParquetOutFile::write_data_page(uint32_t idx, uint32_t group, page_from, page_until); // 2. RLE encode buf_unc to buf_com - uint32_t num_dict_values = get_num_values_dictionary(idx, rg_from, rg_until); + uint32_t num_dict_values = get_num_values_dictionary(idx, se, rg_from, rg_until); uint8_t bit_width = ceil(log2((double) num_dict_values)); uint32_t rle_size = rle_encode( buf_unc, @@ -962,7 +962,7 @@ void ParquetOutFile::write_data_page(uint32_t idx, uint32_t group, page_from, page_until); // 4. append RLE buf_unc to buf_com - uint32_t num_dict_values = get_num_values_dictionary(idx, rg_from, rg_until); + uint32_t num_dict_values = get_num_values_dictionary(idx, se, rg_from, rg_until); uint8_t bit_width = ceil(log2((double) num_dict_values)); uint32_t rle2_size = rle_encode( buf_unc, @@ -1019,7 +1019,7 @@ void ParquetOutFile::write_data_page(uint32_t idx, uint32_t group, page_from, page_until); // 4. append RLE buf_unc to buf_com - uint32_t num_dict_values = get_num_values_dictionary(idx, rg_from, rg_until); + uint32_t num_dict_values = get_num_values_dictionary(idx, se, rg_from, rg_until); uint8_t bit_width = ceil(log2((double) num_dict_values)); uint32_t rle2_size = rle_encode( buf_unc, diff --git a/src/lib/ParquetOutFile.h b/src/lib/ParquetOutFile.h index 02f833b..34782d8 100644 --- a/src/lib/ParquetOutFile.h +++ b/src/lib/ParquetOutFile.h @@ -85,7 +85,9 @@ class ParquetOutFile { virtual uint32_t get_size_byte_array(uint32_t idx, uint32_t num_present, uint64_t from, uint64_t until) = 0; - virtual uint32_t get_num_values_dictionary(uint32_t idx, int64_t from, + virtual uint32_t get_num_values_dictionary(uint32_t idx, + parquet::SchemaElement &sel, + int64_t from, int64_t until) = 0; virtual uint32_t get_size_dictionary(uint32_t idx, parquet::SchemaElement &sel, diff --git a/src/write.cpp b/src/write.cpp index d069fca..9900295 100644 --- a/src/write.cpp +++ b/src/write.cpp @@ -145,8 +145,9 @@ class RParquetOutFile : public ParquetOutFile { uint64_t until); // for dictionaries - uint32_t get_num_values_dictionary(uint32_t idx, int64_t form, - int64_t until); + uint32_t get_num_values_dictionary(uint32_t idx, + parquet::SchemaElement &sel, + int64_t form, int64_t until); uint32_t get_size_dictionary(uint32_t idx, parquet::SchemaElement &type, int64_t from, int64_t until); void write_dictionary(std::ostream &file, uint32_t idx, @@ -185,7 +186,8 @@ class RParquetOutFile : public ParquetOutFile { std::vector max_values; std::vector has_minmax_value; - void create_dictionary(uint32_t idx, int64_t from, int64_t until); + void create_dictionary(uint32_t idx, int64_t from, int64_t until, + parquet::SchemaElement &sel); // for LGLSXP this mean RLE encoding bool should_use_dict_encoding(uint32_t idx); parquet::Encoding::type @@ -197,6 +199,9 @@ class RParquetOutFile : public ParquetOutFile { void write_double_int32_time(std::ostream &file, SEXP col, uint32_t idx, uint64_t from, uint64_t until, parquet::SchemaElement &sel, double factor); + void write_double_int32(std::ostream &file, SEXP col, uint32_t idx, + uint64_t from, uint64_t until, + parquet::SchemaElement &sel); }; RParquetOutFile::RParquetOutFile( @@ -216,7 +221,8 @@ RParquetOutFile::RParquetOutFile( } void RParquetOutFile::create_dictionary(uint32_t idx, int64_t from, - int64_t until) { + int64_t until, + parquet::SchemaElement &sel) { if (!Rf_isNull(VECTOR_ELT(dicts, idx)) && INTEGER(dicts_from)[idx] == from) { return; @@ -233,8 +239,20 @@ void RParquetOutFile::create_dictionary(uint32_t idx, int64_t from, is_minmax_supported[idx] && Rf_xlength(col) > 0 && !Rf_isNull(VECTOR_ELT(d, 2)) && !Rf_isNull(VECTOR_ELT(d, 3))) { has_minmax_value[idx] = true; - min_values[idx] = std::string((const char*) INTEGER(VECTOR_ELT(d, 2)), sizeof(int32_t)); - max_values[idx] = std::string((const char*) INTEGER(VECTOR_ELT(d, 3)), sizeof(int32_t)); + if (TYPEOF(VECTOR_ELT(d, 2)) == INTSXP) { + min_values[idx] = std::string((const char*) INTEGER(VECTOR_ELT(d, 2)), sizeof(int32_t)); + max_values[idx] = std::string((const char*) INTEGER(VECTOR_ELT(d, 3)), sizeof(int32_t)); + } else if (TYPEOF(VECTOR_ELT(d, 2)) == REALSXP) { + if (sel.type == parquet::Type::INT32) { + int32_t min = REAL(VECTOR_ELT(d, 2))[0]; + int32_t max = REAL(VECTOR_ELT(d, 3))[0]; + min_values[idx] = std::string((const char*) &min, sizeof(int32_t)); + max_values[idx] = std::string((const char*) &max, sizeof(int32_t)); + } else if (sel.type == parquet::Type::DOUBLE) { + min_values[idx] = std::string((const char*) REAL(VECTOR_ELT(d, 2)), sizeof(double)); + max_values[idx] = std::string((const char*) REAL(VECTOR_ELT(d, 3)), sizeof(double)); + } + } } } @@ -761,9 +779,10 @@ void RParquetOutFile::write_double_int32_time(std::ostream &file, SEXP col, has_minmax_value[idx] = has_minmax_value[idx] || min_value != 0; } -void write_double_int32(std::ostream &file, SEXP col, uint32_t idx, - uint64_t from, uint64_t until, - parquet::SchemaElement &sel) { +void RParquetOutFile::write_double_int32(std::ostream &file, SEXP col, + uint32_t idx, uint64_t from, + uint64_t until, + parquet::SchemaElement &sel) { bool is_signed = TRUE; int bit_width = 32; if (sel.__isset.logicalType && sel.logicalType.__isset.INTEGER) { @@ -771,6 +790,13 @@ void write_double_int32(std::ostream &file, SEXP col, uint32_t idx, bit_width = sel.logicalType.INTEGER.bitWidth; } if (is_signed) { + int32_t *min_value = 0, *max_value = 0; + bool minmax = write_minmax_values && is_minmax_supported[idx]; + if (minmax && has_minmax_value[idx]) { + min_value = GRAB_MIN(idx, int32_t); + max_value = GRAB_MAX(idx, int32_t); + } + int32_t min, max; switch (bit_width) { case 8: @@ -799,9 +825,23 @@ void write_double_int32(std::ostream &file, SEXP col, uint32_t idx, ); } int32_t ival = val; + if (minmax && (min_value == 0 || ival < *min_value)) { + SAVE_MIN(idx, ival, int32_t); + } + if (minmax && (max_value == 0 || ival > *max_value)) { + SAVE_MAX(idx, ival, int32_t); + } file.write((const char *)&ival, sizeof(int32_t)); } + has_minmax_value[idx] = has_minmax_value[idx] || min_value != 0; } else { + uint32_t *min_value = 0, *max_value = 0; + bool minmax = write_minmax_values && is_minmax_supported[idx]; + if (minmax && has_minmax_value[idx]) { + min_value = GRAB_MIN(idx, uint32_t); + max_value = GRAB_MAX(idx, uint32_t); + } + uint32_t max; switch (bit_width) { case 8: @@ -838,9 +878,16 @@ void write_double_int32(std::ostream &file, SEXP col, uint32_t idx, val, idx + 1, i + 1 ); } - int32_t ival = val; - file.write((const char *)&ival, sizeof(int32_t)); + uint32_t uival = val; + if (minmax && (min_value == 0 || uival < *min_value)) { + SAVE_MIN(idx, uival, uint32_t); + } + if (minmax && (max_value == 0 || uival > *max_value)) { + SAVE_MAX(idx, uival, uint32_t); + } + file.write((const char *)&uival, sizeof(uint32_t)); } + has_minmax_value[idx] = has_minmax_value[idx] || min_value != 0; } } @@ -1642,13 +1689,14 @@ void RParquetOutFile::write_present_boolean( uint32_t RParquetOutFile::get_num_values_dictionary( uint32_t idx, + parquet::SchemaElement &sel, int64_t from, int64_t until) { SEXP col = VECTOR_ELT(df, idx); if (Rf_inherits(col, "factor")) { return Rf_nlevels(col); } else { - create_dictionary(idx, from, until); + create_dictionary(idx, from, until, sel); return Rf_length(VECTOR_ELT(VECTOR_ELT(dicts, idx), 0)); } } @@ -1675,7 +1723,7 @@ uint32_t RParquetOutFile::get_size_dictionary( UNPROTECT(1); return size; } else { - create_dictionary(idx, from, until); + create_dictionary(idx, from, until, sel); SEXP dictidx = VECTOR_ELT(VECTOR_ELT(dicts, idx), 0); if (type == parquet::Type::INT32) { return Rf_xlength(dictidx) * sizeof(int); @@ -1694,7 +1742,7 @@ uint32_t RParquetOutFile::get_size_dictionary( break; } case REALSXP: { - create_dictionary(idx, from, until); + create_dictionary(idx, from, until, sel); SEXP dict = VECTOR_ELT(VECTOR_ELT(dicts, idx), 0); if (type == parquet::Type::DOUBLE) { return Rf_xlength(dict) * sizeof(double); @@ -1719,7 +1767,7 @@ uint32_t RParquetOutFile::get_size_dictionary( } case STRSXP: { // need to count the length of the stings that are indexed in dict - create_dictionary(idx, from, until); + create_dictionary(idx, from, until, sel); SEXP dictidx = VECTOR_ELT(VECTOR_ELT(dicts, idx), 0); R_xlen_t len = Rf_xlength(dictidx); bool is_uuid = sel.__isset.logicalType && sel.logicalType.__isset.UUID; @@ -1740,7 +1788,7 @@ uint32_t RParquetOutFile::get_size_dictionary( } case LGLSXP: { // this does not happen, no dictionaries for BOOLEAN, makes no sense - create_dictionary(idx, from, until); // # nocov + create_dictionary(idx, from, until, sel); // # nocov SEXP dictidx = VECTOR_ELT(VECTOR_ELT(dicts, idx), 0); // # nocov R_xlen_t l = Rf_xlength(dictidx); // # nocov return l / 8 + (l % 8 > 0); // # nocov diff --git a/tests/testthat/_snaps/write-parquet-statistics.md b/tests/testthat/_snaps/write-parquet-statistics.md index 8634745..f27685a 100644 --- a/tests/testthat/_snaps/write-parquet-statistics.md +++ b/tests/testthat/_snaps/write-parquet-statistics.md @@ -63,56 +63,72 @@ Code do(compression = "snappy") Output - row_group column min_value max_value - 3 0 2 04, 00, .... 04, 00, .... - 16 1 2 04, 00, .... 04, 00, .... - 29 2 2 06, 00, .... 06, 00, .... - 42 3 2 06, 00, .... 08, 00, .... - 55 4 2 08, 00, .... 08, 00, .... - 68 5 2 08, 00, .... 08, 00, .... - 81 6 2 08, 00, .... 08, 00, .... + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + --- Code do(compression = "uncompressed") Output - row_group column min_value max_value - 3 0 2 04, 00, .... 04, 00, .... - 16 1 2 04, 00, .... 04, 00, .... - 29 2 2 06, 00, .... 06, 00, .... - 42 3 2 06, 00, .... 08, 00, .... - 55 4 2 08, 00, .... 08, 00, .... - 68 5 2 08, 00, .... 08, 00, .... - 81 6 2 08, 00, .... 08, 00, .... + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + --- Code do(encoding = enc, compression = "snappy") Output - row_group column min_value max_value - 3 0 2 04, 00, .... 04, 00, .... - 16 1 2 04, 00, .... 04, 00, .... - 29 2 2 06, 00, .... 06, 00, .... - 42 3 2 06, 00, .... 08, 00, .... - 55 4 2 08, 00, .... 08, 00, .... - 68 5 2 08, 00, .... 08, 00, .... - 81 6 2 08, 00, .... 08, 00, .... + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + --- Code do(encoding = enc, compression = "uncompressed") Output - row_group column min_value max_value - 3 0 2 04, 00, .... 04, 00, .... - 16 1 2 04, 00, .... 04, 00, .... - 29 2 2 06, 00, .... 06, 00, .... - 42 3 2 06, 00, .... 08, 00, .... - 55 4 2 08, 00, .... 08, 00, .... - 68 5 2 08, 00, .... 08, 00, .... - 81 6 2 08, 00, .... 08, 00, .... + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + # min/max for DATEs @@ -122,7 +138,7 @@ [[1]] name r_type type type_length repetition_type converted_type logical_type 1 schema NA - 2 day Date INT32 NA REQUIRED DATE DATE + 2 day Date INT32 NA OPTIONAL DATE DATE 3 count integer INT32 NA REQUIRED INT_32 INT, 32,.... num_children scale precision field_id 1 2 NA NA NA @@ -136,3 +152,147 @@ [1] "2024-09-07" "2024-09-09" "2024-09-11" "2024-09-13" "2024-09-15" +# min/max for double -> signed integers + + Code + do(compression = "snappy") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(compression = "uncompressed") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "snappy") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "uncompressed") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +# min/max for double -> unsigned integers + + Code + do(compression = "snappy") + Output + [[1]] + [1] 1 1 0 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(compression = "uncompressed") + Output + [[1]] + [1] 1 1 0 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "snappy") + Output + [[1]] + [1] 1 1 0 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "uncompressed") + Output + [[1]] + [1] 1 1 0 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + diff --git a/tests/testthat/test-write-parquet-statistics.R b/tests/testthat/test-write-parquet-statistics.R index 78f2128..0deade4 100644 --- a/tests/testthat/test-write-parquet-statistics.R +++ b/tests/testthat/test-write-parquet-statistics.R @@ -17,9 +17,16 @@ test_that("null_count is written", { test_that("min/max for integers", { tmp <- tempfile(fileext = ".parquet") on.exit(unlink(tmp), add = TRUE) - df <- test_df(missing = TRUE) - df <- df[order(df$cyl), ] - rownames(df) <- NULL + df <- data.frame(x = c( + sample(1:5), + sample(c(1:3, -100L, 100L)), + sample(c(-1000L, NA_integer_, 1000L, NA_integer_, NA_integer_)), + rep(NA_integer_, 3) + )) + + as_int <- function(x) { + sapply(x, function(xx) xx %&&% readBin(xx, what = "integer") %||% NA_integer_) + } do <- function(encoding = "PLAIN",...) { write_parquet( @@ -30,7 +37,12 @@ test_that("min/max for integers", { ) expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) - mtd[mtd$column == 2, c("row_group", "column", "min_value", "max_value")] + list( + as_int(mtd[["min_value"]]), + as_int(mtd[["max_value"]]), + mtd[["is_min_value_exact"]], + mtd[["is_max_value_exact"]] + ) } expect_snapshot(do(compression = "snappy")) expect_snapshot(do(compression = "uncompressed")) @@ -50,6 +62,7 @@ test_that("min/max for DATEs", { day = rep(as.Date("2024-09-16") - 10:1, each = 10), count = 1:100 ) + df$day[c(1, 20, 25, 40)] <- as.Date(NA_character_) write_parquet( df, tmp, options = parquet_options(num_rows_per_row_group = 20), @@ -68,3 +81,81 @@ test_that("min/max for DATEs", { expect_snapshot(do()) }) + +test_that("min/max for double -> signed integers", { + tmp <- tempfile(fileext = ".parquet") + on.exit(unlink(tmp), add = TRUE) + df <- data.frame(x = as.double(c( + sample(1:5), + sample(c(1:3, -100L, 100L)), + sample(c(-1000L, NA_integer_, 1000L, NA_integer_, NA_integer_)), + rep(NA_integer_, 3) + ))) + + as_int <- function(x) { + sapply(x, function(xx) xx %&&% readBin(xx, what = "integer") %||% NA_integer_) + } + + do <- function(encoding = "PLAIN",...) { + write_parquet( + df, tmp, + schema = parquet_schema(x = "INT32"), + encoding = encoding, + options = parquet_options(num_rows_per_row_group = 5), + ... + ) + expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) + mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) + list( + as_int(mtd[["min_value"]]), + as_int(mtd[["max_value"]]), + mtd[["is_min_value_exact"]], + mtd[["is_max_value_exact"]] + ) + } + expect_snapshot(do(compression = "snappy")) + expect_snapshot(do(compression = "uncompressed")) + + # dictionary + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy")) + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed")) +}) + +test_that("min/max for double -> unsigned integers", { + tmp <- tempfile(fileext = ".parquet") + on.exit(unlink(tmp), add = TRUE) + df <- data.frame(x = as.double(c( + sample(1:5), + sample(c(1:3, 1L, 100L)), + sample(c(0L, NA_integer_, 1000L, NA_integer_, NA_integer_)), + rep(NA_integer_, 3) + ))) + + as_int <- function(x) { + sapply(x, function(xx) xx %&&% readBin(xx, what = "integer") %||% NA_integer_) + } + + do <- function(encoding = "PLAIN",...) { + write_parquet( + df, tmp, + schema = parquet_schema(x = "UINT_32"), + encoding = encoding, + options = parquet_options(num_rows_per_row_group = 5), + ... + ) + expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) + mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) + list( + as_int(mtd[["min_value"]]), + as_int(mtd[["max_value"]]), + mtd[["is_min_value_exact"]], + mtd[["is_max_value_exact"]] + ) + } + expect_snapshot(do(compression = "snappy")) + expect_snapshot(do(compression = "uncompressed")) + + # dictionary + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy")) + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed")) +}) From 4b128d0e6397cad236e43f5410565819bfb9f453 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 21 Sep 2024 07:14:49 +0200 Subject: [PATCH 15/26] Fix writing min/max for TIME --- src/write.cpp | 73 +++++++++++-------- .../_snaps/write-parquet-statistics.md | 72 ++++++++++++++++++ .../testthat/test-write-parquet-statistics.R | 41 +++++++++++ 3 files changed, 157 insertions(+), 29 deletions(-) diff --git a/src/write.cpp b/src/write.cpp index 9900295..1517b42 100644 --- a/src/write.cpp +++ b/src/write.cpp @@ -220,6 +220,30 @@ RParquetOutFile::RParquetOutFile( ParquetOutFile(stream, codec, compression_level, row_groups) { } +static bool is_time(parquet::SchemaElement &sel, double &factor) { + factor = 1.0; + if (sel.__isset.logicalType && sel.logicalType.__isset.TIME) { + auto unit = sel.logicalType.TIME.unit; + if (unit.__isset.MILLIS) { + factor = 1000; + } else if (unit.__isset.MICROS) { + factor = 1000 * 1000; + } else if (unit.__isset.NANOS) { + factor = 1000 * 1000 * 1000; + } + return true; + } else if (sel.__isset.converted_type) { + if (sel.converted_type == parquet::ConvertedType::TIME_MILLIS) { + factor = 1000; + return true; + } else if (sel.converted_type == parquet::ConvertedType::TIME_MICROS) { + factor = 1000 * 1000; + return true; + } + } + return false; +} + void RParquetOutFile::create_dictionary(uint32_t idx, int64_t from, int64_t until, parquet::SchemaElement &sel) { @@ -243,11 +267,25 @@ void RParquetOutFile::create_dictionary(uint32_t idx, int64_t from, min_values[idx] = std::string((const char*) INTEGER(VECTOR_ELT(d, 2)), sizeof(int32_t)); max_values[idx] = std::string((const char*) INTEGER(VECTOR_ELT(d, 3)), sizeof(int32_t)); } else if (TYPEOF(VECTOR_ELT(d, 2)) == REALSXP) { - if (sel.type == parquet::Type::INT32) { - int32_t min = REAL(VECTOR_ELT(d, 2))[0]; - int32_t max = REAL(VECTOR_ELT(d, 3))[0]; + double factor; + bool istime = is_time(sel, factor); + if (istime) { + if (sel.type == parquet::Type::INT32) { + int32_t min = REAL(VECTOR_ELT(d, 2))[0] * factor; + int32_t max = REAL(VECTOR_ELT(d, 3))[0] * factor; min_values[idx] = std::string((const char*) &min, sizeof(int32_t)); max_values[idx] = std::string((const char*) &max, sizeof(int32_t)); + } else { + int64_t min = REAL(VECTOR_ELT(d, 2))[0] * factor; + int64_t max = REAL(VECTOR_ELT(d, 3))[0] * factor; + min_values[idx] = std::string((const char*) &min, sizeof(int64_t)); + max_values[idx] = std::string((const char*) &max, sizeof(int64_t)); + } + } else if (sel.type == parquet::Type::INT32) { + int32_t min = REAL(VECTOR_ELT(d, 2))[0]; + int32_t max = REAL(VECTOR_ELT(d, 3))[0]; + min_values[idx] = std::string((const char*) &min, sizeof(int32_t)); + max_values[idx] = std::string((const char*) &max, sizeof(int32_t)); } else if (sel.type == parquet::Type::DOUBLE) { min_values[idx] = std::string((const char*) REAL(VECTOR_ELT(d, 2)), sizeof(double)); max_values[idx] = std::string((const char*) REAL(VECTOR_ELT(d, 3)), sizeof(double)); @@ -580,30 +618,6 @@ static bool is_decimal(parquet::SchemaElement &sel, int32_t &precision, } } -static bool is_time(parquet::SchemaElement &sel, double &factor) { - factor = 1.0; - if (sel.__isset.logicalType && sel.logicalType.__isset.TIME) { - auto unit = sel.logicalType.TIME.unit; - if (unit.__isset.MILLIS) { - factor = 1000; - } else if (unit.__isset.MICROS) { - factor = 1000 * 1000; - } else if (unit.__isset.NANOS) { - factor = 1000 * 1000 * 1000; - } - return true; - } else if (sel.__isset.converted_type) { - if (sel.converted_type == parquet::ConvertedType::TIME_MILLIS) { - factor = 1000; - return true; - } else if (sel.converted_type == parquet::ConvertedType::TIME_MICROS) { - factor = 1000 * 1000; - return true; - } - } - return false; -} - void write_integer_int32_dec(std::ostream & file, SEXP col, uint64_t from, uint64_t until, int32_t precision, int32_t scale) { @@ -2620,11 +2634,12 @@ void RParquetOutFile::write( // nothing to do } if (sel.__isset.logicalType) { parquet::LogicalType < = sel.logicalType; - is_minmax_supported[idx] = lt.__isset.DATE || lt.__isset.INTEGER; + is_minmax_supported[idx] = lt.__isset.DATE || lt.__isset.INTEGER || + lt.__isset.TIME; // TODO: support the rest // is_minmax_supported[idx] = // lt.__isset.STRING || lt.__isset.ENUM || - // lt.__isset.TIME || lt.__isset.TIMESTAMP || + // lt.__isset.TIMESTAMP || // lt.__isset.JSON || lt.__isset.BSON || lt.__isset.UUID || // lt.__isset.DECIMAL || lt.isset.FLOAT16; } else { diff --git a/tests/testthat/_snaps/write-parquet-statistics.md b/tests/testthat/_snaps/write-parquet-statistics.md index f27685a..502c944 100644 --- a/tests/testthat/_snaps/write-parquet-statistics.md +++ b/tests/testthat/_snaps/write-parquet-statistics.md @@ -296,3 +296,75 @@ [1] TRUE TRUE TRUE NA +# minmax for double -> INT32 TIME(MULLIS) + + Code + do(compression = "snappy") + Output + [[1]] + [1] 1000 -100000 -1000000 NA + + [[2]] + [1] 5000 100000 1000000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(compression = "uncompressed") + Output + [[1]] + [1] 1000 -100000 -1000000 NA + + [[2]] + [1] 5000 100000 1000000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "snappy") + Output + [[1]] + [1] 1000 -100000 -1000000 NA + + [[2]] + [1] 5000 100000 1000000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "uncompressed") + Output + [[1]] + [1] 1000 -100000 -1000000 NA + + [[2]] + [1] 5000 100000 1000000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + diff --git a/tests/testthat/test-write-parquet-statistics.R b/tests/testthat/test-write-parquet-statistics.R index 0deade4..497730a 100644 --- a/tests/testthat/test-write-parquet-statistics.R +++ b/tests/testthat/test-write-parquet-statistics.R @@ -159,3 +159,44 @@ test_that("min/max for double -> unsigned integers", { expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy")) expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed")) }) + +test_that("minmax for double -> INT32 TIME(MULLIS)", { + tmp <- tempfile(fileext = ".parquet") + on.exit(unlink(tmp), add = TRUE) + # IDK what's the point of signed TIME, but it seems to be allowed, the + # sort order is signed + df <- data.frame(x = hms::as_hms(c( + sample(1:5), + sample(c(1:3, -100L, 100L)), + sample(c(-1000L, NA_integer_, 1000L, NA_integer_, NA_integer_)), + rep(NA_integer_, 3) + ))) + + as_int <- function(x) { + sapply(x, function(xx) xx %&&% readBin(xx, what = "integer") %||% NA_integer_) + } + + do <- function(encoding = "PLAIN",...) { + write_parquet( + df, tmp, + schema = parquet_schema(x = "TIME_MILLIS"), + encoding = encoding, + options = parquet_options(num_rows_per_row_group = 5), + ... + ) + expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) + mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) + list( + as_int(mtd[["min_value"]]), + as_int(mtd[["max_value"]]), + mtd[["is_min_value_exact"]], + mtd[["is_max_value_exact"]] + ) + } + expect_snapshot(do(compression = "snappy")) + expect_snapshot(do(compression = "uncompressed")) + + # dictionary + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy")) + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed")) +}) From 98b64256fc6e63e861b7a2f69e6cb61c8ec99de4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 21 Sep 2024 07:28:27 +0200 Subject: [PATCH 16/26] Write min/max for DOUBLE --- src/write.cpp | 22 +++++- .../_snaps/write-parquet-statistics.md | 76 ++++++++++++++++++- .../testthat/test-write-parquet-statistics.R | 43 ++++++++++- 3 files changed, 133 insertions(+), 8 deletions(-) diff --git a/src/write.cpp b/src/write.cpp index 1517b42..b55497b 100644 --- a/src/write.cpp +++ b/src/write.cpp @@ -679,7 +679,7 @@ void RParquetOutFile::write_integer_int32(std::ostream &file, SEXP col, } if (bit_width == 32) { - if (!write_minmax_values && + if (!minmax && sel.repetition_type == parquet::FieldRepetitionType::REQUIRED) { uint64_t len = until - from; file.write((const char *) (INTEGER(col) + from), sizeof(int) * len); @@ -1255,15 +1255,31 @@ void RParquetOutFile::write_double(std::ostream &file, uint32_t idx, "Internal nanoparquet error, row index too large" ); } - if (sel.repetition_type == parquet::FieldRepetitionType::REQUIRED) { + + bool minmax = write_minmax_values && is_minmax_supported[idx]; + double *min_value = 0, *max_value = 0; + if (minmax && has_minmax_value[idx]) { + min_value = GRAB_MIN(idx, double); + max_value = GRAB_MAX(idx, double); + } + + if (!minmax && + sel.repetition_type == parquet::FieldRepetitionType::REQUIRED) { uint64_t len = until - from; file.write((const char *) (REAL(col) + from), sizeof(double) * len); } else { for (uint64_t i = from; i < until; i++) { double val = REAL(col)[i]; if (R_IsNA(val)) continue; + if (minmax && (min_value == 0 || val < *min_value)) { + SAVE_MIN(idx, val, double); + } + if (minmax && (max_value == 0 || val > *max_value)) { + SAVE_MAX(idx, val, double); + } file.write((const char*) &val, sizeof(double)); } + has_minmax_value[idx] = has_minmax_value[idx] || min_value != 0; } } @@ -2648,7 +2664,7 @@ void RParquetOutFile::write( case parquet::Type::INT32: // case parquet::Type::INT64: // case parquet::Type::FLOAT: - // case parquet::Type::DOUBLE: + case parquet::Type::DOUBLE: is_minmax_supported[idx] = true; break; default: diff --git a/tests/testthat/_snaps/write-parquet-statistics.md b/tests/testthat/_snaps/write-parquet-statistics.md index 502c944..1106c07 100644 --- a/tests/testthat/_snaps/write-parquet-statistics.md +++ b/tests/testthat/_snaps/write-parquet-statistics.md @@ -97,7 +97,7 @@ --- Code - do(encoding = enc, compression = "snappy") + do(encoding = "RLE_DICTIONARY", compression = "snappy") Output [[1]] [1] 1 -100 -1000 NA @@ -115,7 +115,7 @@ --- Code - do(encoding = enc, compression = "uncompressed") + do(encoding = "RLE_DICTIONARY", compression = "uncompressed") Output [[1]] [1] 1 -100 -1000 NA @@ -368,3 +368,75 @@ [1] TRUE TRUE TRUE NA +# min/max for DOUBLE + + Code + do(compression = "snappy") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(compression = "uncompressed") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "snappy") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "uncompressed") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + diff --git a/tests/testthat/test-write-parquet-statistics.R b/tests/testthat/test-write-parquet-statistics.R index 497730a..78afc0d 100644 --- a/tests/testthat/test-write-parquet-statistics.R +++ b/tests/testthat/test-write-parquet-statistics.R @@ -48,9 +48,8 @@ test_that("min/max for integers", { expect_snapshot(do(compression = "uncompressed")) # dictionary - enc <- ifelse(map_chr(df, class) == "integer", "RLE_DICTIONARY", "PLAIN") - expect_snapshot(do(encoding = enc, compression = "snappy")) - expect_snapshot(do(encoding = enc, compression = "uncompressed")) + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy")) + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed")) }) test_that("min/max for DATEs", { @@ -200,3 +199,41 @@ test_that("minmax for double -> INT32 TIME(MULLIS)", { expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy")) expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed")) }) + +test_that("min/max for DOUBLE", { + tmp <- tempfile(fileext = ".parquet") + on.exit(unlink(tmp), add = TRUE) + df <- data.frame(x = as.double(c( + sample(1:5), + sample(c(1:3, -100, 100)), + sample(c(-1000, NA_real_, 1000, NA_real_, NA_real_)), + rep(NA_real_, 3) + ))) + + as_dbl <- function(x) { + sapply(x, function(xx) xx %&&% readBin(xx, what = "double") %||% NA_real_) + } + + do <- function(encoding = "PLAIN",...) { + write_parquet( + df, tmp, + encoding = encoding, + options = parquet_options(num_rows_per_row_group = 5), + ... + ) + expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) + mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) + list( + as_dbl(mtd[["min_value"]]), + as_dbl(mtd[["max_value"]]), + mtd[["is_min_value_exact"]], + mtd[["is_max_value_exact"]] + ) + } + expect_snapshot(do(compression = "snappy")) + expect_snapshot(do(compression = "uncompressed")) + + # dictionary + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy")) + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed")) +}) From 6dc4840857487265ee8385618efa19154c7983e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 21 Sep 2024 08:51:54 +0200 Subject: [PATCH 17/26] Min/max for FLOAT --- src/rwrapper.cpp | 7 ++ src/write.cpp | 22 +++++- .../_snaps/write-parquet-statistics.md | 72 +++++++++++++++++++ .../testthat/test-write-parquet-statistics.R | 39 ++++++++++ 4 files changed, 139 insertions(+), 1 deletion(-) diff --git a/src/rwrapper.cpp b/src/rwrapper.cpp index 4e4943b..ec3ae23 100644 --- a/src/rwrapper.cpp +++ b/src/rwrapper.cpp @@ -56,6 +56,12 @@ SEXP zstd_uncompress_raw(SEXP x, SEXP ucl); SEXP test_memstream(); +SEXP read_float(SEXP x) { + float *f = (float*) RAW(x); + double d = *f; + return Rf_ScalarReal(d); +} + SEXP is_asan_() { #if defined(__has_feature) # if __has_feature(address_sanitizer) // for clang @@ -122,6 +128,7 @@ static const R_CallMethodDef R_CallDef[] = { CALLDEF(zstd_uncompress_raw, 2), CALLDEF(test_memstream, 0), + CALLDEF(read_float, 1), CALLDEF(is_asan_, 0), CALLDEF(is_ubsan_, 0), diff --git a/src/write.cpp b/src/write.cpp index b55497b..59ce788 100644 --- a/src/write.cpp +++ b/src/write.cpp @@ -289,6 +289,11 @@ void RParquetOutFile::create_dictionary(uint32_t idx, int64_t from, } else if (sel.type == parquet::Type::DOUBLE) { min_values[idx] = std::string((const char*) REAL(VECTOR_ELT(d, 2)), sizeof(double)); max_values[idx] = std::string((const char*) REAL(VECTOR_ELT(d, 3)), sizeof(double)); + } else if (sel.type == parquet::Type::FLOAT) { + float min = REAL(VECTOR_ELT(d, 2))[0]; + float max = REAL(VECTOR_ELT(d, 3))[0]; + min_values[idx] = std::string((const char*) &min, sizeof(float)); + max_values[idx] = std::string((const char*) &max, sizeof(float)); } } } @@ -1229,12 +1234,27 @@ void RParquetOutFile::write_float(std::ostream &file, uint32_t idx, "Internal nanoparquet error, row index too large" ); } + + bool minmax = write_minmax_values && is_minmax_supported[idx]; + float *min_value = 0, *max_value = 0; + if (minmax && has_minmax_value[idx]) { + min_value = GRAB_MIN(idx, float); + max_value = GRAB_MAX(idx, float); + } + for (uint64_t i = from; i < until; i++) { double val = REAL(col)[i]; if (R_IsNA(val)) continue; float el = val; + if (minmax && (min_value == 0 || el< *min_value)) { + SAVE_MIN(idx, el, float); + } + if (minmax && (max_value == 0 || el > *max_value)) { + SAVE_MAX(idx, el, float); + } file.write((const char*) &el, sizeof(float)); } + has_minmax_value[idx] = has_minmax_value[idx] || min_value != 0; } void RParquetOutFile::write_double(std::ostream &file, uint32_t idx, @@ -2663,7 +2683,7 @@ void RParquetOutFile::write( // case parquet::Type::BOOLEAN: case parquet::Type::INT32: // case parquet::Type::INT64: - // case parquet::Type::FLOAT: + case parquet::Type::FLOAT: case parquet::Type::DOUBLE: is_minmax_supported[idx] = true; break; diff --git a/tests/testthat/_snaps/write-parquet-statistics.md b/tests/testthat/_snaps/write-parquet-statistics.md index 1106c07..df3bc2d 100644 --- a/tests/testthat/_snaps/write-parquet-statistics.md +++ b/tests/testthat/_snaps/write-parquet-statistics.md @@ -440,3 +440,75 @@ [1] TRUE TRUE TRUE NA +# min/max for FLOAT + + Code + do(compression = "snappy") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(compression = "uncompressed") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "snappy") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "uncompressed") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + diff --git a/tests/testthat/test-write-parquet-statistics.R b/tests/testthat/test-write-parquet-statistics.R index 78afc0d..3dbf879 100644 --- a/tests/testthat/test-write-parquet-statistics.R +++ b/tests/testthat/test-write-parquet-statistics.R @@ -237,3 +237,42 @@ test_that("min/max for DOUBLE", { expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy")) expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed")) }) + +test_that("min/max for FLOAT", { + tmp <- tempfile(fileext = ".parquet") + on.exit(unlink(tmp), add = TRUE) + df <- data.frame(x = as.double(c( + sample(1:5), + sample(c(1:3, -100, 100)), + sample(c(-1000, NA_real_, 1000, NA_real_, NA_real_)), + rep(NA_real_, 3) + ))) + + as_flt <- function(x) { + sapply(x, function(xx) xx %&&% .Call(read_float, xx) %||% NA_real_) + } + + do <- function(encoding = "PLAIN",...) { + write_parquet( + df, tmp, + schema = parquet_schema(x = "FLOAT"), + encoding = encoding, + options = parquet_options(num_rows_per_row_group = 5), + ... + ) + expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) + mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) + list( + as_flt(mtd[["min_value"]]), + as_flt(mtd[["max_value"]]), + mtd[["is_min_value_exact"]], + mtd[["is_max_value_exact"]] + ) + } + expect_snapshot(do(compression = "snappy")) + expect_snapshot(do(compression = "uncompressed")) + + # dictionary + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy")) + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed")) +}) From 649b7fbf24722d99d6b504579d6c3d6c043c80b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 21 Sep 2024 09:42:36 +0200 Subject: [PATCH 18/26] Write min/max for INT64 --- src/rwrapper.cpp | 7 + src/write.cpp | 136 +++++++++++++++-- .../_snaps/write-parquet-statistics.md | 144 ++++++++++++++++++ .../testthat/test-write-parquet-statistics.R | 78 ++++++++++ 4 files changed, 352 insertions(+), 13 deletions(-) diff --git a/src/rwrapper.cpp b/src/rwrapper.cpp index ec3ae23..34ba469 100644 --- a/src/rwrapper.cpp +++ b/src/rwrapper.cpp @@ -62,6 +62,12 @@ SEXP read_float(SEXP x) { return Rf_ScalarReal(d); } +SEXP read_int64(SEXP x) { + int64_t *f = (int64_t*) RAW(x); + double d = *f; + return Rf_ScalarReal(d); +} + SEXP is_asan_() { #if defined(__has_feature) # if __has_feature(address_sanitizer) // for clang @@ -129,6 +135,7 @@ static const R_CallMethodDef R_CallDef[] = { CALLDEF(test_memstream, 0), CALLDEF(read_float, 1), + CALLDEF(read_int64, 1), CALLDEF(is_asan_, 0), CALLDEF(is_ubsan_, 0), diff --git a/src/write.cpp b/src/write.cpp index 59ce788..e8155c1 100644 --- a/src/write.cpp +++ b/src/write.cpp @@ -202,6 +202,14 @@ class RParquetOutFile : public ParquetOutFile { void write_double_int32(std::ostream &file, SEXP col, uint32_t idx, uint64_t from, uint64_t until, parquet::SchemaElement &sel); + void write_integer_int64(std::ostream &file, SEXP col, uint32_t idx, + uint64_t from, uint64_t until); + void write_double_int64(std::ostream &file, SEXP col, uint32_t idx, + uint64_t from, uint64_t until, + parquet::SchemaElement &sel); + void write_double_int64_time(std::ostream &file, SEXP col, uint32_t idx, + uint64_t from, uint64_t until, + parquet::SchemaElement &sel, double factor); }; RParquetOutFile::RParquetOutFile( @@ -264,8 +272,17 @@ void RParquetOutFile::create_dictionary(uint32_t idx, int64_t from, !Rf_isNull(VECTOR_ELT(d, 2)) && !Rf_isNull(VECTOR_ELT(d, 3))) { has_minmax_value[idx] = true; if (TYPEOF(VECTOR_ELT(d, 2)) == INTSXP) { - min_values[idx] = std::string((const char*) INTEGER(VECTOR_ELT(d, 2)), sizeof(int32_t)); - max_values[idx] = std::string((const char*) INTEGER(VECTOR_ELT(d, 3)), sizeof(int32_t)); + if (sel.type == parquet::Type::INT32) { + min_values[idx] = std::string((const char*) INTEGER(VECTOR_ELT(d, 2)), sizeof(int32_t)); + max_values[idx] = std::string((const char*) INTEGER(VECTOR_ELT(d, 3)), sizeof(int32_t)); + } else if (sel.type == parquet::Type::INT64) { + int64_t min = INTEGER(VECTOR_ELT(d, 2))[0]; + int64_t max = INTEGER(VECTOR_ELT(d, 3))[0]; + min_values[idx] = std::string((const char*) &min, sizeof(int64_t)); + max_values[idx] = std::string((const char*) &max, sizeof(int64_t)); + } else { + Rf_error("Unknown type when writing out INTSXP min/max values, internal error"); + } } else if (TYPEOF(VECTOR_ELT(d, 2)) == REALSXP) { double factor; bool istime = is_time(sel, factor); @@ -294,7 +311,16 @@ void RParquetOutFile::create_dictionary(uint32_t idx, int64_t from, float max = REAL(VECTOR_ELT(d, 3))[0]; min_values[idx] = std::string((const char*) &min, sizeof(float)); max_values[idx] = std::string((const char*) &max, sizeof(float)); + } else if (sel.type == parquet::Type::INT64) { + int64_t min = REAL(VECTOR_ELT(d, 2))[0]; + int64_t max = REAL(VECTOR_ELT(d, 3))[0]; + min_values[idx] = std::string((const char*) &min, sizeof(int64_t)); + max_values[idx] = std::string((const char*) &max, sizeof(int64_t)); + } else { + Rf_error("Unknown type when writing out REALSXP min/max values, internal error"); } + } else { + Rf_error("Unknown R type when writing out min/max values, internal error"); } } } @@ -988,16 +1014,31 @@ void write_integer_int64_dec(std::ostream &file, SEXP col, uint64_t from, } } -void write_integer_int64(std::ostream &file, SEXP col, uint64_t from, - uint64_t until) { +void RParquetOutFile::write_integer_int64(std::ostream &file, SEXP col, + uint32_t idx, + uint64_t from, uint64_t until) { + + int64_t *min_value = 0, *max_value = 0; + bool minmax = write_minmax_values && is_minmax_supported[idx]; + if (minmax && has_minmax_value[idx]) { + min_value = GRAB_MIN(idx, int64_t); + max_value = GRAB_MAX(idx, int64_t); + } for (uint64_t i = from; i < until; i++) { int32_t val = INTEGER(col)[i]; if (val == NA_INTEGER) continue; int64_t el = val; + if (minmax && (min_value == 0 || el < *min_value)) { + SAVE_MIN(idx, el, int64_t); + } + if (minmax && (max_value == 0 || el > *max_value)) { + SAVE_MAX(idx, el, int64_t); + } file.write((const char*) &el, sizeof(int64_t)); } - } + has_minmax_value[idx] = has_minmax_value[idx] || min_value != 0; +} void write_double_int64_dec(std::ostream &file, SEXP col, uint64_t from, uint64_t until, int32_t precision, @@ -1031,20 +1072,37 @@ void write_integer_int64(std::ostream &file, SEXP col, uint64_t from, } } -void write_double_int64_time(std::ostream &file, SEXP col, uint32_t idx, - uint64_t from, uint64_t until, - parquet::SchemaElement &sel, double factor) { +void RParquetOutFile::write_double_int64_time(std::ostream &file, SEXP col, + uint32_t idx, uint64_t from, + uint64_t until, + parquet::SchemaElement &sel, + double factor) { + int64_t *min_value = 0, *max_value = 0; + bool minmax = write_minmax_values && is_minmax_supported[idx]; + if (minmax && has_minmax_value[idx]) { + min_value = GRAB_MIN(idx, int64_t); + max_value = GRAB_MAX(idx, int64_t); + } + for (uint64_t i = from; i < until; i++) { double val = REAL(col)[i]; if (R_IsNA(val)) continue; int64_t ival = val * factor; + if (minmax && (min_value == 0 || ival < *min_value)) { + SAVE_MIN(idx, ival, int64_t); + } + if (minmax && (max_value == 0 || ival > *max_value)) { + SAVE_MAX(idx, ival, int64_t); + } file.write((const char *)&ival, sizeof(int64_t)); } + has_minmax_value[idx] = has_minmax_value[idx] || min_value != 0; } -void write_double_int64(std::ostream &file, SEXP col, uint32_t idx, - uint64_t from, uint64_t until, - parquet::SchemaElement &sel) { +void RParquetOutFile::write_double_int64(std::ostream &file, SEXP col, + uint32_t idx, uint64_t from, + uint64_t until, + parquet::SchemaElement &sel) { if (Rf_inherits(col, "POSIXct")) { int64_t fact = 1; if (sel.__isset.logicalType && sel.logicalType.__isset.TIMESTAMP) { @@ -1063,19 +1121,45 @@ void write_double_int64(std::ostream &file, SEXP col, uint32_t idx, fact = 1000 * 1000; } } + int64_t *min_value = 0, *max_value = 0; + bool minmax = write_minmax_values && is_minmax_supported[idx]; + if (minmax && has_minmax_value[idx]) { + min_value = GRAB_MIN(idx, int64_t); + max_value = GRAB_MAX(idx, int64_t); + } for (uint64_t i = from; i < until; i++) { double val = REAL(col)[i]; if (R_IsNA(val)) continue; int64_t el = val * fact; + if (minmax && (min_value == 0 || el < *min_value)) { + SAVE_MIN(idx, el, int64_t); + } + if (minmax && (max_value == 0 || el > *max_value)) { + SAVE_MAX(idx, el, int64_t); + } file.write((const char *)&el, sizeof(int64_t)); } + has_minmax_value[idx] = has_minmax_value[idx] || min_value != 0; } else if (Rf_inherits(col, "difftime")) { + int64_t *min_value = 0, *max_value = 0; + bool minmax = write_minmax_values && is_minmax_supported[idx]; + if (minmax && has_minmax_value[idx]) { + min_value = GRAB_MIN(idx, int64_t); + max_value = GRAB_MAX(idx, int64_t); + } for (uint64_t i = from; i < until; i++) { double val = REAL(col)[i]; if (R_IsNA(val)) continue; int64_t el = val * 1000 * 1000 * 1000; + if (minmax && (min_value == 0 || el < *min_value)) { + SAVE_MIN(idx, el, int64_t); + } + if (minmax && (max_value == 0 || el > *max_value)) { + SAVE_MAX(idx, el, int64_t); + } file.write((const char *)&el, sizeof(int64_t)); } + has_minmax_value[idx] = has_minmax_value[idx] || min_value != 0; } else { bool is_signed = TRUE; int bit_width = 64; @@ -1091,6 +1175,12 @@ void write_double_int64(std::ostream &file, SEXP col, uint32_t idx, } if (is_signed) { double min = -pow(2, 63), max = -(min+1); + int64_t *min_value = 0, *max_value = 0; + bool minmax = write_minmax_values && is_minmax_supported[idx]; + if (minmax && has_minmax_value[idx]) { + min_value = GRAB_MIN(idx, int64_t); + max_value = GRAB_MAX(idx, int64_t); + } for (uint64_t i = from; i < until; i++) { double val = REAL(col)[i]; if (R_IsNA(val)) continue; @@ -1104,10 +1194,23 @@ void write_double_int64(std::ostream &file, SEXP col, uint32_t idx, ); } int64_t el = val; + if (minmax && (min_value == 0 || el < *min_value)) { + SAVE_MIN(idx, el, int64_t); + } + if (minmax && (max_value == 0 || el > *max_value)) { + SAVE_MAX(idx, el, int64_t); + } file.write((const char *)&el, sizeof(int64_t)); } + has_minmax_value[idx] = has_minmax_value[idx] || min_value != 0; } else { double max = pow(2, 64) - 1; + uint64_t *min_value = 0, *max_value = 0; + bool minmax = write_minmax_values && is_minmax_supported[idx]; + if (minmax && has_minmax_value[idx]) { + min_value = GRAB_MIN(idx, uint64_t); + max_value = GRAB_MAX(idx, uint64_t); + } for (uint64_t i = from; i < until; i++) { double val = REAL(col)[i]; if (R_IsNA(val)) continue; @@ -1128,8 +1231,15 @@ void write_double_int64(std::ostream &file, SEXP col, uint32_t idx, ); } uint64_t el = val; + if (minmax && (min_value == 0 || el < *min_value)) { + SAVE_MIN(idx, el, uint64_t); + } + if (minmax && (max_value == 0 || el > *max_value)) { + SAVE_MAX(idx, el, uint64_t); + } file.write((const char *)&el, sizeof(uint64_t)); } + has_minmax_value[idx] = has_minmax_value[idx] || min_value != 0; } } } @@ -1155,7 +1265,7 @@ void RParquetOutFile::write_int64(std::ostream &file, uint32_t idx, if (isdec) { write_integer_int64_dec(file, col, from, until, precision, scale); } else { - write_integer_int64(file, col, from, until); + write_integer_int64(file, col, idx, from, until); } break; case REALSXP: @@ -2682,7 +2792,7 @@ void RParquetOutFile::write( switch(sel.type) { // case parquet::Type::BOOLEAN: case parquet::Type::INT32: - // case parquet::Type::INT64: + case parquet::Type::INT64: case parquet::Type::FLOAT: case parquet::Type::DOUBLE: is_minmax_supported[idx] = true; diff --git a/tests/testthat/_snaps/write-parquet-statistics.md b/tests/testthat/_snaps/write-parquet-statistics.md index df3bc2d..1c01b59 100644 --- a/tests/testthat/_snaps/write-parquet-statistics.md +++ b/tests/testthat/_snaps/write-parquet-statistics.md @@ -512,3 +512,147 @@ [1] TRUE TRUE TRUE NA +# min/max for integer -> INT64 + + Code + do(compression = "snappy") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(compression = "uncompressed") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "snappy") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "uncompressed") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +# min/max for REALSXP -> INT64 + + Code + do(compression = "snappy") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(compression = "uncompressed") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "snappy") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "uncompressed") + Output + [[1]] + [1] 1 -100 -1000 NA + + [[2]] + [1] 5 100 1000 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + diff --git a/tests/testthat/test-write-parquet-statistics.R b/tests/testthat/test-write-parquet-statistics.R index 3dbf879..8b47d35 100644 --- a/tests/testthat/test-write-parquet-statistics.R +++ b/tests/testthat/test-write-parquet-statistics.R @@ -276,3 +276,81 @@ test_that("min/max for FLOAT", { expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy")) expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed")) }) + +test_that("min/max for integer -> INT64", { + tmp <- tempfile(fileext = ".parquet") + on.exit(unlink(tmp), add = TRUE) + df <- data.frame(x = c( + sample(1:5), + sample(c(1:3, -100L, 100L)), + sample(c(-1000L, NA_integer_, 1000L, NA_integer_, NA_integer_)), + rep(NA_integer_, 3) + )) + + as_int64 <- function(x) { + sapply(x, function(xx) xx %&&% .Call(read_int64, xx) %||% NA_real_) + } + + do <- function(encoding = "PLAIN",...) { + write_parquet( + df, tmp, + schema = parquet_schema(x = "INT64"), + encoding = encoding, + options = parquet_options(num_rows_per_row_group = 5), + ... + ) + expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) + mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) + list( + as_int64(mtd[["min_value"]]), + as_int64(mtd[["max_value"]]), + mtd[["is_min_value_exact"]], + mtd[["is_max_value_exact"]] + ) + } + expect_snapshot(do(compression = "snappy")) + expect_snapshot(do(compression = "uncompressed")) + + # dictionary + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy")) + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed")) +}) + +test_that("min/max for REALSXP -> INT64", { + tmp <- tempfile(fileext = ".parquet") + on.exit(unlink(tmp), add = TRUE) + df <- data.frame(x = as.double(c( + sample(1:5), + sample(c(1:3, -100L, 100L)), + sample(c(-1000L, NA_integer_, 1000L, NA_integer_, NA_integer_)), + rep(NA_integer_, 3) + ))) + + as_int64 <- function(x) { + sapply(x, function(xx) xx %&&% .Call(read_int64, xx) %||% NA_real_) + } + + do <- function(encoding = "PLAIN",...) { + write_parquet( + df, tmp, + schema = parquet_schema(x = "INT64"), + encoding = encoding, + options = parquet_options(num_rows_per_row_group = 5), + ... + ) + expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) + mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) + list( + as_int64(mtd[["min_value"]]), + as_int64(mtd[["max_value"]]), + mtd[["is_min_value_exact"]], + mtd[["is_max_value_exact"]] + ) + } + expect_snapshot(do(compression = "snappy")) + expect_snapshot(do(compression = "uncompressed")) + + # dictionary + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy")) + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed")) +}) From 16d3c1ee26f66d897a13b34a20dda0947f88a710 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 21 Sep 2024 09:44:15 +0200 Subject: [PATCH 19/26] Fix metadata snapshot tests --- tests/testthat/test-parquet-metadata.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-parquet-metadata.R b/tests/testthat/test-parquet-metadata.R index 3c76c48..2c4d870 100644 --- a/tests/testthat/test-parquet-metadata.R +++ b/tests/testthat/test-parquet-metadata.R @@ -18,7 +18,7 @@ test_that("parquet_metadata", { as.data.frame(mtd$file_meta_data) as.data.frame(mtd$schema) as.data.frame(mtd$row_groups) - as.data.frame(mtd$column_chunks) + as.data.frame(mtd$column_chunks[, 1:20]) }) sch <- read_parquet_schema("test.parquet") From ca834cebdda0892f0ec218dbae682474f638a108 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 21 Sep 2024 09:52:18 +0200 Subject: [PATCH 20/26] Ooops, really fix snapshots --- tests/testthat/_snaps/parquet-metadata.md | 44 ++++++++--------------- 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/tests/testthat/_snaps/parquet-metadata.md b/tests/testthat/_snaps/parquet-metadata.md index 72de511..b9dc1fe 100644 --- a/tests/testthat/_snaps/parquet-metadata.md +++ b/tests/testthat/_snaps/parquet-metadata.md @@ -48,7 +48,7 @@ ordinal 1 NA Code - as.data.frame(mtd$column_chunks) + as.data.frame(mtd$column_chunks[, 1:20]) Output file_name row_group column file_path file_offset offset_index_offset 1 test.parquet 0 0 4 NA @@ -106,34 +106,20 @@ 11 275 2879 NA 12 275 3154 NA 13 21 3429 NA - dictionary_page_offset null_count min_value max_value - 1 NA 0 - 2 NA 0 - 3 NA 0 04, 00, .... 08, 00, .... - 4 NA 0 - 5 NA 0 - 6 NA 0 - 7 NA 0 - 8 NA 0 - 9 NA 0 - 10 NA 0 - 11 NA 0 - 12 NA 0 - 13 NA 0 - is_min_value_exact is_max_value_exact - 1 NA NA - 2 NA NA - 3 TRUE TRUE - 4 NA NA - 5 NA NA - 6 NA NA - 7 NA NA - 8 NA NA - 9 NA NA - 10 NA NA - 11 NA NA - 12 NA NA - 13 NA NA + dictionary_page_offset null_count + 1 NA 0 + 2 NA 0 + 3 NA 0 + 4 NA 0 + 5 NA 0 + 6 NA 0 + 7 NA 0 + 8 NA 0 + 9 NA 0 + 10 NA 0 + 11 NA 0 + 12 NA 0 + 13 NA 0 --- From 105c7d15bb0ab3a05986469d59025fabe9f5bbd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 21 Sep 2024 09:52:29 +0200 Subject: [PATCH 21/26] Fix compilation on Linux --- src/rwrapper.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/rwrapper.cpp b/src/rwrapper.cpp index 34ba469..e96f18c 100644 --- a/src/rwrapper.cpp +++ b/src/rwrapper.cpp @@ -1,3 +1,5 @@ +#include + #include extern "C" { From 07338d15379f5747d958900f0fc2c8b6fa9ba103 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 21 Sep 2024 10:03:49 +0200 Subject: [PATCH 22/26] Better error from min/max conversion --- src/write.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/write.cpp b/src/write.cpp index e8155c1..b11ce99 100644 --- a/src/write.cpp +++ b/src/write.cpp @@ -281,7 +281,11 @@ void RParquetOutFile::create_dictionary(uint32_t idx, int64_t from, min_values[idx] = std::string((const char*) &min, sizeof(int64_t)); max_values[idx] = std::string((const char*) &max, sizeof(int64_t)); } else { - Rf_error("Unknown type when writing out INTSXP min/max values, internal error"); + Rf_errorcall( + nanoparquet_call, + "Cannot convert an integer vector to Parquet type %s.", + parquet::_Type_VALUES_TO_NAMES.at(sel.type) + ); } } else if (TYPEOF(VECTOR_ELT(d, 2)) == REALSXP) { double factor; @@ -317,7 +321,11 @@ void RParquetOutFile::create_dictionary(uint32_t idx, int64_t from, min_values[idx] = std::string((const char*) &min, sizeof(int64_t)); max_values[idx] = std::string((const char*) &max, sizeof(int64_t)); } else { - Rf_error("Unknown type when writing out REALSXP min/max values, internal error"); + Rf_errorcall( + nanoparquet_call, + "Cannot convert a double vector to Parquet type %s.", + parquet::_Type_VALUES_TO_NAMES.at(sel.type) + ); } } else { Rf_error("Unknown R type when writing out min/max values, internal error"); From 3d0f67b3e0556ed4e8007c64bf51eba0f8184ffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 21 Sep 2024 15:08:18 +0200 Subject: [PATCH 23/26] Implement min/max for various string (BYTE_ARRAY) types --- src/dictionary-encoding.cpp | 46 ++- src/write.cpp | 58 +++- .../_snaps/write-parquet-statistics.md | 288 ++++++++++++++++++ .../testthat/test-write-parquet-statistics.R | 61 ++++ 4 files changed, 440 insertions(+), 13 deletions(-) diff --git a/src/dictionary-encoding.cpp b/src/dictionary-encoding.cpp index fa6cb15..bcbb366 100644 --- a/src/dictionary-encoding.cpp +++ b/src/dictionary-encoding.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include "protect.h" @@ -68,21 +69,48 @@ uint64_t create_dict(T* values, uint64_t len, T naval) { return n; } -uint64_t create_dict_ptr_idx(void** values, int *dict, int *idx, - uint64_t len, void *naval) { +static inline bool STR_LESS(SEXP sc, SEXP set) { + const char *c = CHAR(sc), *et = CHAR(set); + size_t l = strlen(c), el = strlen(et); + if (l == 0) return el > 0; + if (el == 0) return false; + int res = memcmp(c, et, l < el ? l : el); + return res < 0 || (res == 0 && l < el); +} + +static inline bool STR_MORE(SEXP sc, SEXP set) { + const char *c = CHAR(sc), *et = CHAR(set); + size_t l = strlen(c), el = strlen(et); + if (l == 0) return false; + if (el == 0) return true; + int res = memcmp(c, et, l < el ? l : el); + return res > 0 || (res == 0 && l > el); +} + +uint64_t create_dict_str_idx(const SEXP* values, int *dict, int *idx, + uint64_t len, SEXP naval, SEXP &minval, + SEXP &maxval, bool &hasminmax) { std::unordered_map mm; mm.reserve(len * 2); - void **begin = values; - void **end = begin + len; + SEXP *begin = (SEXP*) values; + SEXP *end = (SEXP*) begin + len; int n = 0; + hasminmax = false; + for (int i = 0; begin < end; begin++, i++) { if (*begin == naval) { idx[i] = NA_INTEGER; continue; } + if (!hasminmax) { + hasminmax = true; + minval = maxval = *begin; + } auto it = mm.find(*begin); if (it == mm.end()) { + if (STR_LESS(*begin, minval)) minval = *begin; + if (STR_MORE(*begin, maxval)) maxval = *begin; mm.insert(std::make_pair(*begin, n)); idx[i] = n; dict[n] = i; @@ -206,6 +234,7 @@ SEXP nanoparquet_create_dict_idx_(SEXP x, SEXP from, SEXP until) { int *iidx = INTEGER(idx); int imin, imax; double dmin, dmax; + SEXP smin = R_NilValue, smax = R_NilValue; bool hasminmax = false; switch (TYPEOF(x)) { case LGLSXP: @@ -227,9 +256,9 @@ SEXP nanoparquet_create_dict_idx_(SEXP x, SEXP from, SEXP until) { ); break; case STRSXP: { - dictlen = create_dict_ptr_idx( - (void**)(STRING_PTR_RO(x) + cfrom), idict, iidx, len, - (void*) NA_STRING + dictlen = create_dict_str_idx( + STRING_PTR_RO(x) + cfrom, idict, iidx, len, NA_STRING, + smin, smax, hasminmax ); break; } @@ -248,6 +277,9 @@ SEXP nanoparquet_create_dict_idx_(SEXP x, SEXP from, SEXP until) { } else if (TYPEOF(x) == REALSXP) { SET_VECTOR_ELT(res, 2, Rf_ScalarReal(dmin)); SET_VECTOR_ELT(res, 3, Rf_ScalarReal(dmax)); + } else if (TYPEOF(x) == STRSXP) { + SET_VECTOR_ELT(res, 2, smin); + SET_VECTOR_ELT(res, 3, smax); } } diff --git a/src/write.cpp b/src/write.cpp index b11ce99..82c156a 100644 --- a/src/write.cpp +++ b/src/write.cpp @@ -327,6 +327,11 @@ void RParquetOutFile::create_dictionary(uint32_t idx, int64_t from, parquet::_Type_VALUES_TO_NAMES.at(sel.type) ); } + } else if (TYPEOF(VECTOR_ELT(d, 2)) == CHARSXP) { + const char *min = CHAR(VECTOR_ELT(d, 2)); + const char *max = CHAR(VECTOR_ELT(d, 3)); + min_values[idx] = std::string(min, strlen(min)); + max_values[idx] = std::string(max, strlen(min)); } else { Rf_error("Unknown R type when writing out min/max values, internal error"); } @@ -1421,6 +1426,33 @@ void RParquetOutFile::write_double(std::ostream &file, uint32_t idx, } } +static inline bool STR_LESS(const char *c, size_t l, std::string &etalon) { + size_t el = etalon.size(); + // "" is less than anything but "" + if (l == 0) return el > 0; + // otherwise anything is more than "" + if (el == 0) return false; + int res = memcmp(c, etalon.data(), l < el ? l : el); + return res < 0 || (res == 0 && l < el); +} + +static inline bool STR_MORE(const char *c, size_t l, std::string &etalon) { + size_t el = etalon.size(); + // "" is not more than anything + if (l == 0) return false; + // othwrwise anything is more than "" + if (el == 0) return true; + int res = memcmp(c, etalon.data(), l < el ? l : el); + return res > 0 || (res == 0 && l > el); +} + +#define SAVE_MIN_STR(idx, c, l) do { \ + min_values[idx] = std::string((c), (l)); \ + min_value = &min_values[idx]; } while (0) +#define SAVE_MAX_STR(idx, c, l) do { \ + max_values[idx] = std::string((c), (l)); \ + max_value = &max_values[idx]; } while (0) + void RParquetOutFile::write_byte_array(std::ostream &file, uint32_t idx, uint32_t group, uint32_t page, uint64_t from, uint64_t until, @@ -1435,6 +1467,13 @@ void RParquetOutFile::write_byte_array(std::ostream &file, uint32_t idx, switch (TYPEOF(col)) { case STRSXP: { + bool minmax = write_minmax_values && is_minmax_supported[idx]; + std::string *min_value = nullptr, *max_value = nullptr; + if (minmax && has_minmax_value[idx]) { + min_value = &min_values[idx]; + max_value = &max_values[idx]; + } + for (uint64_t i = from; i < until; i++) { SEXP el = STRING_ELT(col, i); if (el == NA_STRING) { @@ -1442,9 +1481,16 @@ void RParquetOutFile::write_byte_array(std::ostream &file, uint32_t idx, } const char *c = CHAR(el); uint32_t len1 = strlen(c); + if (minmax && (min_value == nullptr || STR_LESS(c, len1, *min_value))) { + SAVE_MIN_STR(idx, c, len1); + } + if (minmax && (max_value == nullptr || STR_MORE(c, len1, *max_value))) { + SAVE_MAX_STR(idx, c, len1); + } file.write((const char *)&len1, 4); file.write(c, len1); } + has_minmax_value[idx] = has_minmax_value[idx] || min_value != nullptr; break; } case VECSXP: { @@ -2789,13 +2835,11 @@ void RParquetOutFile::write( } if (sel.__isset.logicalType) { parquet::LogicalType < = sel.logicalType; is_minmax_supported[idx] = lt.__isset.DATE || lt.__isset.INTEGER || - lt.__isset.TIME; + lt.__isset.TIME || lt.__isset.STRING || lt.__isset.ENUM || + lt.__isset.JSON || lt.__isset.BSON; // TODO: support the rest - // is_minmax_supported[idx] = - // lt.__isset.STRING || lt.__isset.ENUM || - // lt.__isset.TIMESTAMP || - // lt.__isset.JSON || lt.__isset.BSON || lt.__isset.UUID || - // lt.__isset.DECIMAL || lt.isset.FLOAT16; + // is_minmax_supported[idx] = lt.__isset.TIMESTAMP || + // lt.__isset.UUID || lt.__isset.DECIMAL || lt.isset.FLOAT16; } else { switch(sel.type) { // case parquet::Type::BOOLEAN: @@ -2803,6 +2847,8 @@ void RParquetOutFile::write( case parquet::Type::INT64: case parquet::Type::FLOAT: case parquet::Type::DOUBLE: + // case parquet::Type::BYTE_ARRAY; + // case parquet::Type::FIXED_LEN_BYTE_ARRAY; is_minmax_supported[idx] = true; break; default: diff --git a/tests/testthat/_snaps/write-parquet-statistics.md b/tests/testthat/_snaps/write-parquet-statistics.md index 1c01b59..ca7e154 100644 --- a/tests/testthat/_snaps/write-parquet-statistics.md +++ b/tests/testthat/_snaps/write-parquet-statistics.md @@ -656,3 +656,291 @@ [1] TRUE TRUE TRUE NA +# min/max for STRING + + Code + do(compression = "snappy") + Output + [[1]] + [1] "a" "!!!" "!" NA + + [[2]] + [1] "e" "~~~" "~" NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(compression = "uncompressed") + Output + [[1]] + [1] "a" "!!!" "!" NA + + [[2]] + [1] "e" "~~~" "~" NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "snappy") + Output + [[1]] + [1] "a" "!!!" "!" NA + + [[2]] + [1] "e" "~~~" "~" NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "uncompressed") + Output + [[1]] + [1] "a" "!!!" "!" NA + + [[2]] + [1] "e" "~~~" "~" NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(compression = "snappy", type = "JSON") + Output + [[1]] + [1] "a" "!!!" "!" NA + + [[2]] + [1] "e" "~~~" "~" NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(compression = "uncompressed", type = "JSON") + Output + [[1]] + [1] "a" "!!!" "!" NA + + [[2]] + [1] "e" "~~~" "~" NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "snappy", type = "JSON") + Output + [[1]] + [1] "a" "!!!" "!" NA + + [[2]] + [1] "e" "~~~" "~" NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "uncompressed", type = "JSON") + Output + [[1]] + [1] "a" "!!!" "!" NA + + [[2]] + [1] "e" "~~~" "~" NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(compression = "snappy", type = "BSON") + Output + [[1]] + [1] "a" "!!!" "!" NA + + [[2]] + [1] "e" "~~~" "~" NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(compression = "uncompressed", type = "BSON") + Output + [[1]] + [1] "a" "!!!" "!" NA + + [[2]] + [1] "e" "~~~" "~" NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "snappy", type = "BSON") + Output + [[1]] + [1] "a" "!!!" "!" NA + + [[2]] + [1] "e" "~~~" "~" NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "uncompressed", type = "BSON") + Output + [[1]] + [1] "a" "!!!" "!" NA + + [[2]] + [1] "e" "~~~" "~" NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(compression = "snappy", type = "ENUM") + Output + [[1]] + [1] "a" "!!!" "!" NA + + [[2]] + [1] "e" "~~~" "~" NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(compression = "uncompressed", type = "ENUM") + Output + [[1]] + [1] "a" "!!!" "!" NA + + [[2]] + [1] "e" "~~~" "~" NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "snappy", type = "ENUM") + Output + [[1]] + [1] "a" "!!!" "!" NA + + [[2]] + [1] "e" "~~~" "~" NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(encoding = "RLE_DICTIONARY", compression = "uncompressed", type = "ENUM") + Output + [[1]] + [1] "a" "!!!" "!" NA + + [[2]] + [1] "e" "~~~" "~" NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + diff --git a/tests/testthat/test-write-parquet-statistics.R b/tests/testthat/test-write-parquet-statistics.R index 8b47d35..7917ec9 100644 --- a/tests/testthat/test-write-parquet-statistics.R +++ b/tests/testthat/test-write-parquet-statistics.R @@ -354,3 +354,64 @@ test_that("min/max for REALSXP -> INT64", { expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy")) expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed")) }) + +test_that("min/max for STRING", { + tmp <- tempfile(fileext = ".parquet") + on.exit(unlink(tmp), add = TRUE) + df <- data.frame(x = c( + sample(letters[1:5]), + sample(c(letters[1:3], "!!!", "~~~")), + sample(c("!", NA_character_, "~", NA_character_, NA_character_)), + rep(NA_character_, 3) + )) + + as_str <- function(x) { + sapply(x, function(xx) xx %&&% rawToChar(xx) %||% NA_character_) + } + + do <- function(encoding = "PLAIN", type = "STRING", ...) { + write_parquet( + df, tmp, + schema = parquet_schema(x = type), + encoding = encoding, + options = parquet_options(num_rows_per_row_group = 5), + ... + ) + expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) + expect_equal(read_parquet_schema(tmp)$logical_type[[2]]$type, type) + mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) + list( + as_str(mtd[["min_value"]]), + as_str(mtd[["max_value"]]), + mtd[["is_min_value_exact"]], + mtd[["is_max_value_exact"]] + ) + } + expect_snapshot(do(compression = "snappy")) + expect_snapshot(do(compression = "uncompressed")) + + # dictionary + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy")) + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed")) + + expect_snapshot(do(compression = "snappy", type = "JSON")) + expect_snapshot(do(compression = "uncompressed", type = "JSON")) + + # dictionary + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy", type = "JSON")) + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed", type = "JSON")) + + expect_snapshot(do(compression = "snappy", type = "BSON")) + expect_snapshot(do(compression = "uncompressed", type = "BSON")) + + # dictionary + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy", type = "BSON")) + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed", type = "BSON")) + + expect_snapshot(do(compression = "snappy", type = "ENUM")) + expect_snapshot(do(compression = "uncompressed", type = "ENUM")) + + # dictionary + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy", type = "ENUM")) + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed", type = "ENUM")) +}) From f9e54281ab42a111d121c4cb14c0e038a60adf72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sun, 22 Sep 2024 11:30:49 +0200 Subject: [PATCH 24/26] Min/max for TIMESTAMP --- src/write.cpp | 31 +++++---------- .../_snaps/write-parquet-statistics.md | 36 +++++++++++++++++ .../testthat/test-write-parquet-statistics.R | 39 +++++++++++++++++++ 3 files changed, 85 insertions(+), 21 deletions(-) diff --git a/src/write.cpp b/src/write.cpp index 82c156a..461229e 100644 --- a/src/write.cpp +++ b/src/write.cpp @@ -1116,6 +1116,13 @@ void RParquetOutFile::write_double_int64(std::ostream &file, SEXP col, uint32_t idx, uint64_t from, uint64_t until, parquet::SchemaElement &sel) { + int64_t *min_value = 0, *max_value = 0; + bool minmax = write_minmax_values && is_minmax_supported[idx]; + if (minmax && has_minmax_value[idx]) { + min_value = GRAB_MIN(idx, int64_t); + max_value = GRAB_MAX(idx, int64_t); + } + if (Rf_inherits(col, "POSIXct")) { int64_t fact = 1; if (sel.__isset.logicalType && sel.logicalType.__isset.TIMESTAMP) { @@ -1134,12 +1141,6 @@ void RParquetOutFile::write_double_int64(std::ostream &file, SEXP col, fact = 1000 * 1000; } } - int64_t *min_value = 0, *max_value = 0; - bool minmax = write_minmax_values && is_minmax_supported[idx]; - if (minmax && has_minmax_value[idx]) { - min_value = GRAB_MIN(idx, int64_t); - max_value = GRAB_MAX(idx, int64_t); - } for (uint64_t i = from; i < until; i++) { double val = REAL(col)[i]; if (R_IsNA(val)) continue; @@ -1154,12 +1155,6 @@ void RParquetOutFile::write_double_int64(std::ostream &file, SEXP col, } has_minmax_value[idx] = has_minmax_value[idx] || min_value != 0; } else if (Rf_inherits(col, "difftime")) { - int64_t *min_value = 0, *max_value = 0; - bool minmax = write_minmax_values && is_minmax_supported[idx]; - if (minmax && has_minmax_value[idx]) { - min_value = GRAB_MIN(idx, int64_t); - max_value = GRAB_MAX(idx, int64_t); - } for (uint64_t i = from; i < until; i++) { double val = REAL(col)[i]; if (R_IsNA(val)) continue; @@ -1188,12 +1183,6 @@ void RParquetOutFile::write_double_int64(std::ostream &file, SEXP col, } if (is_signed) { double min = -pow(2, 63), max = -(min+1); - int64_t *min_value = 0, *max_value = 0; - bool minmax = write_minmax_values && is_minmax_supported[idx]; - if (minmax && has_minmax_value[idx]) { - min_value = GRAB_MIN(idx, int64_t); - max_value = GRAB_MAX(idx, int64_t); - } for (uint64_t i = from; i < until; i++) { double val = REAL(col)[i]; if (R_IsNA(val)) continue; @@ -2836,10 +2825,10 @@ void RParquetOutFile::write( parquet::LogicalType < = sel.logicalType; is_minmax_supported[idx] = lt.__isset.DATE || lt.__isset.INTEGER || lt.__isset.TIME || lt.__isset.STRING || lt.__isset.ENUM || - lt.__isset.JSON || lt.__isset.BSON; + lt.__isset.JSON || lt.__isset.BSON || lt.__isset.TIMESTAMP; // TODO: support the rest - // is_minmax_supported[idx] = lt.__isset.TIMESTAMP || - // lt.__isset.UUID || lt.__isset.DECIMAL || lt.isset.FLOAT16; + // is_minmax_supported[idx] = lt.__isset.UUID || + // lt.__isset.DECIMAL || lt.isset.FLOAT16; } else { switch(sel.type) { // case parquet::Type::BOOLEAN: diff --git a/tests/testthat/_snaps/write-parquet-statistics.md b/tests/testthat/_snaps/write-parquet-statistics.md index ca7e154..4048500 100644 --- a/tests/testthat/_snaps/write-parquet-statistics.md +++ b/tests/testthat/_snaps/write-parquet-statistics.md @@ -944,3 +944,39 @@ [1] TRUE TRUE TRUE NA +# min/max for REALSXP -> TIMESTAMP (INT64) + + Code + do(compression = "snappy") + Output + [[1]] + [1] 1e+06 -1e+08 -1e+09 NA + + [[2]] + [1] 5e+06 1e+08 1e+09 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + +--- + + Code + do(compression = "uncompressed") + Output + [[1]] + [1] 1e+06 -1e+08 -1e+09 NA + + [[2]] + [1] 5e+06 1e+08 1e+09 NA + + [[3]] + [1] TRUE TRUE TRUE NA + + [[4]] + [1] TRUE TRUE TRUE NA + + diff --git a/tests/testthat/test-write-parquet-statistics.R b/tests/testthat/test-write-parquet-statistics.R index 7917ec9..55719ee 100644 --- a/tests/testthat/test-write-parquet-statistics.R +++ b/tests/testthat/test-write-parquet-statistics.R @@ -415,3 +415,42 @@ test_that("min/max for STRING", { expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy", type = "ENUM")) expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed", type = "ENUM")) }) + +test_that("min/max for REALSXP -> TIMESTAMP (INT64)", { + tmp <- tempfile(fileext = ".parquet") + on.exit(unlink(tmp), add = TRUE) + now <- 0L + df <- data.frame(x = .POSIXct(as.double(c( + sample(now + 1:5), + sample(c(now + c(1:3, -100L, 100L))), + sample(c(now - 1000L, NA_integer_, now + 1000L, NA_integer_, NA_integer_)), + rep(NA_integer_, 3) + )), tz = "UTC")) + + as_int64 <- function(x) { + sapply(x, function(xx) xx %&&% .Call(read_int64, xx) %||% NA_real_) + } + + do <- function(encoding = "PLAIN",...) { + write_parquet( + df, tmp, + encoding = encoding, + options = parquet_options(num_rows_per_row_group = 5), + ... + ) + expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) + mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) + list( + as_int64(mtd[["min_value"]]), + as_int64(mtd[["max_value"]]), + mtd[["is_min_value_exact"]], + mtd[["is_max_value_exact"]] + ) + } + expect_snapshot(do(compression = "snappy")) + expect_snapshot(do(compression = "uncompressed")) +return() + # dictionary + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "snappy")) + expect_snapshot(do(encoding = "RLE_DICTIONARY", compression = "uncompressed")) +}) From 3f96d6c25524abac26fb7c5cca94f07c9e73801d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sun, 22 Sep 2024 11:33:45 +0200 Subject: [PATCH 25/26] Document types without min/max write support --- R/options.R | 6 ++++-- man/parquet_options.Rd | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/R/options.R b/R/options.R index 7ae1f5f..2bb44fc 100644 --- a/R/options.R +++ b/R/options.R @@ -37,8 +37,10 @@ #' @param write_minmax_values Whether to write minimum and maximum values #' per row group, for data types that support this in [write_parquet()]. #' However, nanoparquet currently does not support minimum and maximum -#' values for the `DECIMAL` and `FLOAT16` logical types. Currently the -#' default is `TRUE`. +#' values for the `DECIMAL`, `UUID` and `FLOAT16` logical types and the +#' `BOOLEAN`, `BYTE_ARRAY` and `FIXED_LEN_BYTE_ARRAY` primitive types +#' if they are writing without a logical type. Currently the default +#' is `TRUE`. #' #' @return List of nanoparquet options. #' diff --git a/man/parquet_options.Rd b/man/parquet_options.Rd index c93140a..9c0863f 100644 --- a/man/parquet_options.Rd +++ b/man/parquet_options.Rd @@ -61,8 +61,10 @@ Possible values are 1 and 2. Default is 1.} \item{write_minmax_values}{Whether to write minimum and maximum values per row group, for data types that support this in \code{\link[=write_parquet]{write_parquet()}}. However, nanoparquet currently does not support minimum and maximum -values for the \code{DECIMAL} and \code{FLOAT16} logical types. Currently the -default is \code{TRUE}.} +values for the \code{DECIMAL}, \code{UUID} and \code{FLOAT16} logical types and the +\code{BOOLEAN}, \code{BYTE_ARRAY} and \code{FIXED_LEN_BYTE_ARRAY} primitive types +if they are writing without a logical type. Currently the default +is \code{TRUE}.} } \value{ List of nanoparquet options. From e43fc3710d67019ad98fb76d558a34e1667cad62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sun, 22 Sep 2024 11:48:19 +0200 Subject: [PATCH 26/26] NEWS for writing min/max [CI skip] --- NEWS.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/NEWS.md b/NEWS.md index cd1148c..44c7c17 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,6 +16,8 @@ `read_parquet_schema()` or the new `infer_parquet_schema()` function instead. +* Other improvements: + - The new `parquet_schema()` function creates a Parquet schema from scratch. You can use this schema as the new `schema` argument of `write_parquet()`, to specify how the columns of a data frame should @@ -25,6 +27,10 @@ at most 10 million rows into a single row group. You can choose the row groups manually with the `row_groups` argument. + - `write_parquet()` now writes minimum and maximum values per row group + for most types. See `?parquet_options()` for turning this off. It also + writes out the number of non-missing values. + - Newly supported type conversions in `write_parquet()` via the schema argument: