Merge branch 'main' into refactor-read

r-lib · Jul 26, 2024 · a9e1829 · a9e1829
2 parents 4d7edd0 + e2925e3
commit a9e1829
Show file tree

Hide file tree

Showing 25 changed files with 15,291 additions and 245 deletions.
diff --git a/R/nanoparquet-package.R b/R/nanoparquet-package.R
@@ -22,3 +22,15 @@
 #' @seealso [nanoparquet-package] for options that modify the type
 #' mappings.
 NULL
+
+#' @name parquet-encodings
+#' @title Parquet encodings
+#' @description
+#' Various Parquet encodings
+#'
+#' @details
+#' ```{r, child = "tools/encodings.Rmd"}
+#' ```
+#' @seealso [write_parquet()] on how to select a non-default encoding when
+#' writing Parquet files.
+NULL
diff --git a/R/porcelain.R b/R/porcelain.R
@@ -243,7 +243,7 @@ dict_encode <- function(x, n = length(x)) {
 }
 
 dict_encode_idx <- function(x) {
-	.Call(nanoparquet_create_dict_idx, x)
+	.Call(nanoparquet_create_dict_idx, x, sys.call())
 }
 
 lgl_avg_run_length <- function(x, n = length(x)) {

diff --git a/R/schema.R b/R/schema.R
@@ -149,7 +149,8 @@ parquet_schema_create <- function(types) {
 parquet_type <- function(type, type_length = NULL, bit_width = NULL,
                          is_signed = NULL, precision = NULL, scale = NULL,
                          is_adjusted_utc = NULL, unit = NULL,
-                         primitive_type = NULL, repetition_type = NULL) {
+                         primitive_type = NULL, repetition_type = NULL,
+                         encoding = NULL) {
 
   fixed_len_byte_array <- function() {
     stopifnot(

diff --git a/R/utils.R b/R/utils.R
@@ -60,3 +60,8 @@ any_na <- function(x) {
     anyNA(x)
   }
 }
+
+is_named <- function(x) {  # TRUE when `x` has a names attribute with no NA entries
+  nm <- names(x)
+  !is.null(nm) && !anyNA(nm)  # empty-string names ("") are not NA, so they still count
+}
diff --git a/R/write-parquet.R b/R/write-parquet.R
@@ -15,6 +15,32 @@
 #'   use the schema of a Parquet file.
 #' @param compression Compression algorithm to use. Currently `"snappy"`
 #'   (the default), `"gzip"`, `"zstd"`, and `"uncompressed"` are supported.
+#' @param encoding Encoding to use. Possible values:
+#'   * If `NULL`, the appropriate encoding is selected automatically:
+#'     `RLE` or `PLAIN` for `BOOLEAN` columns, `RLE_DICTIONARY` for other
+#'     columns with many repeated values, and `PLAIN` otherwise.
+#'   * If it is a single (unnamed) character string, then it'll be used
+#'     for all columns.
+#'   * If it is an unnamed character vector of encoding names of the same
+#'     length as the number of columns in the data frame, then those
+#'     encodings will be used for each column.
+#'   * If it is a named character vector, then the names must be unique
+#'     and each name must match a column name, to specify the encoding of
+#'     that column. The special empty name (`""`) applies to the rest of
+#'     the columns. If there is no empty name, the rest of the columns
+#'     will use the default encoding.
+#'
+#'   If `NA_character_` is specified for a column, the default encoding is
+#'   used for the column.
+#'
+#'   If a specified encoding is invalid for a certain column type,
+#'   or nanoparquet does not implement it, `write_parquet()` throws an
+#'   error.
+#'
+#'   This version of nanoparquet supports the following encodings:
+#'   `r paste("\u0060", names(nanoparquet:::encodings), "\u0060", collapse = ", ")`.
+#'
+#'   See [parquet-encodings] for more about encodings.
 #' @param metadata Additional key-value metadata to add to the file.
 #'   This must be a named character vector, or a data frame with columns
 #'   character columns called `key` and `value`.
@@ -34,6 +60,7 @@ write_parquet <- function(
   file,
   schema = NULL,
   compression = c("snappy", "gzip", "zstd", "uncompressed"),
+  encoding = NULL,
   metadata = NULL,
   options = parquet_options()) {
 
@@ -121,7 +148,9 @@ write_parquet <- function(
     ifelse(hasna[is.na(rt)], "OPITONAL", "REQUIRED")
   required <- schema[["repetition_type"]] == "REQUIRED"
 
-  res <- .Call(
+encoding <- parse_encoding(encoding, x)
+
+res <- .Call(
     nanoparquet_write,
     x,
     file,
@@ -130,7 +159,9 @@ write_parquet <- function(
     metadata,
     required,
     options,
-    schema
+    schema,
+    encodings[encoding],
+    sys.call()
   )
 
   if (is.null(res)) {
@@ -139,3 +170,36 @@ write_parquet <- function(
     res
   }
 }
+
+parse_encoding <- function(encoding, x) {  # expand `encoding` to one entry per element of `x`, named by names(x)
+  stopifnot(  # validate the type and the encoding names before branching
+    "`encoding` must be `NULL` or a character vector" =
+      is.null(encoding) || is.character(encoding),
+    "`encoding` contains at least one unknown encoding" =
+      all(is.na(encoding) | encoding %in% names(encodings))  # NA entries pass; `encodings` is defined outside this function
+  )
+
+  if (is.null(encoding)) {  # nothing specified: NA_character_ for every element of `x`
+    structure(rep(NA_character_, length(x)), names = names(x))

+  } else if (is_named(encoding)) {  # named vector: entries keyed by name, "" acts as the fallback
+    stopifnot(
+      "names of `encoding` must be unique" =
+        !anyDuplicated(names(encoding)),
+      "names of `encoding` must match names of `x`" =
+        all(names(encoding) %in% c(names(x), ""))  # "" is allowed in addition to the names of `x`
+    )
+    def <- c(encoding[names(encoding) == ""], NA_character_)[1]  # the ""-named entry if present, otherwise NA
+    encoding <- encoding[names(encoding) != ""]  # keep only the entries that name an element of `x`
+    res <- structure(rep(def, length(x)), names = names(x))  # start from the fallback everywhere
+    res[names(encoding)] <- encoding  # then overwrite the explicitly named entries
+    res

+  } else if (length(encoding) == 1) {  # a single value is repeated for every element of `x`
+    structure(rep(encoding, length(x)), names = names(x))

+  } else {  # otherwise values are taken by position, so the lengths must agree
+    stopifnot(length(encoding) == length(x))
+    structure(encoding, names = names(x))
+  }
+}
diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -1,10 +1,10 @@
-url: https://r-lib.github.io/nanoparquet
+url: https://nanoparquet.r-lib.org/
 template:
   package: tidytemplate
   bootstrap: 5
   includes:
     in_header: |
-      <script defer data-domain="r-lib.github.io/nanoparquet,all.tidyverse.org" src="https://plausible.io/js/plausible.js"></script>
+      <script defer data-domain="nanoparquet.r-lib.org,all.tidyverse.org" src="https://plausible.io/js/plausible.js"></script>
 
 destination: docs
 
@@ -19,6 +19,7 @@ reference:
   contents:
   - infer_parquet_schema
   - nanoparquet-types
+  - parquet-encodings
   - parquet_schema
 
 - title: Read Parquet files

diff --git a/man/parquet-encodings.Rd b/man/parquet-encodings.Rd
diff --git a/man/write_parquet.Rd b/man/write_parquet.Rd