Skip to content

Commit

Permalink
Merge branch 'main' into refactor-read
Browse files Browse the repository at this point in the history
  • Loading branch information
gaborcsardi committed Jul 26, 2024
2 parents 4d7edd0 + e2925e3 commit a9e1829
Show file tree
Hide file tree
Showing 25 changed files with 15,291 additions and 245 deletions.
12 changes: 12 additions & 0 deletions R/nanoparquet-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,15 @@
#' @seealso [nanoparquet-package] for options that modify the type
#' mappings.
NULL

#' @name parquet-encodings
#' @title Parquet encodings
#' @description
#' Various Parquet encodings
#'
#' @details
#' ```{r, child = "tools/encodings.Rmd"}
#' ```
#' @seealso [write_parquet()] on how to select a non-default encoding when
#' writing Parquet files.
NULL
2 changes: 1 addition & 1 deletion R/porcelain.R
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ dict_encode <- function(x, n = length(x)) {
}

# Thin wrapper that forwards `x` to the native `nanoparquet_create_dict_idx`
# routine through `.Call()`.
# NOTE(review): two `.Call()` lines are visible here, which looks like a
# rendered diff (the old one-argument call followed by its replacement that
# also passes `sys.call()`) -- confirm which line is live in the real file.
dict_encode_idx <- function(x) {
.Call(nanoparquet_create_dict_idx, x)
.Call(nanoparquet_create_dict_idx, x, sys.call())
}

lgl_avg_run_length <- function(x, n = length(x)) {
Expand Down
3 changes: 2 additions & 1 deletion R/schema.R
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,8 @@ parquet_schema_create <- function(types) {
parquet_type <- function(type, type_length = NULL, bit_width = NULL,
is_signed = NULL, precision = NULL, scale = NULL,
is_adjusted_utc = NULL, unit = NULL,
primitive_type = NULL, repetition_type = NULL) {
primitive_type = NULL, repetition_type = NULL,
encoding = NULL) {

fixed_len_byte_array <- function() {
stopifnot(
Expand Down
5 changes: 5 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,8 @@ any_na <- function(x) {
anyNA(x)
}
}

# Does `x` have a names attribute without any missing entries?
# Returns a single `TRUE`/`FALSE`. Empty-string names (`""`) are not treated
# as missing; only `NA` names (or having no names at all) give `FALSE`.
is_named <- function(x) {
  labels <- names(x)
  if (is.null(labels)) {
    return(FALSE)
  }
  !anyNA(labels)
}
68 changes: 66 additions & 2 deletions R/write-parquet.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,32 @@
#' use the schema of a Parquet file.
#' @param compression Compression algorithm to use. Currently `"snappy"`
#' (the default), `"gzip"`, `"zstd"`, and `"uncompressed"` are supported.
#' @param encoding Encoding to use. Possible values:
#' * If `NULL`, the appropriate encoding is selected automatically:
#' `RLE` or `PLAIN` for `BOOLEAN` columns, `RLE_DICTIONARY` for other
#' columns with many repeated values, and `PLAIN` otherwise.
#' * If it is a single (unnamed) character string, then it'll be used
#' for all columns.
#' * If it is an unnamed character vector of encoding names of the same
#' length as the number of columns in the data frame, then those
#' encodings will be used for each column.
#' * If it is a named character vector, then the names must be unique
#' and each name must match a column name, to specify the encoding of
#' that column. The special empty name (`""`) applies to the rest of
#' the columns. If there is no empty name, the rest of the columns
#' will use the default encoding.
#'
#' If `NA_character_` is specified for a column, the default encoding is
#' used for the column.
#'
#' If a specified encoding is invalid for a certain column type,
#' or nanoparquet does not implement it, `write_parquet()` throws an
#' error.
#'
#' This version of nanoparquet supports the following encodings:
#' `r paste("\u0060", names(nanoparquet:::encodings), "\u0060", collapse = ", ")`.
#'
#' See [parquet-encodings] for more about encodings.
#' @param metadata Additional key-value metadata to add to the file.
#' This must be a named character vector, or a data frame with
#' character columns called `key` and `value`.
Expand All @@ -34,6 +60,7 @@ write_parquet <- function(
file,
schema = NULL,
compression = c("snappy", "gzip", "zstd", "uncompressed"),
encoding = NULL,
metadata = NULL,
options = parquet_options()) {

Expand Down Expand Up @@ -121,7 +148,9 @@ write_parquet <- function(
ifelse(hasna[is.na(rt)], "OPITONAL", "REQUIRED")
required <- schema[["repetition_type"]] == "REQUIRED"

res <- .Call(
encoding <- parse_encoding(encoding, x)

res <- .Call(
nanoparquet_write,
x,
file,
Expand All @@ -130,7 +159,9 @@ write_parquet <- function(
metadata,
required,
options,
schema
schema,
encodings[encoding],
sys.call()
)

if (is.null(res)) {
Expand All @@ -139,3 +170,36 @@ write_parquet <- function(
res
}
}

# Normalize the `encoding` argument into a character vector with one entry
# per element (column) of `x`, named by `names(x)`. An `NA_character_` entry
# means that no explicit encoding was requested for that column.
#
# Accepted forms of `encoding`:
# * `NULL`: every column gets `NA_character_`.
# * A named character vector: names must be unique and must each be either
#   a name of `x` or the empty string `""`. The `""` entry (if any) is the
#   value used for all columns that are not named explicitly; without it
#   those columns get `NA_character_`.
# * An unnamed character vector of length 1: used for every column.
# * Any other unnamed character vector: must have exactly one entry per
#   column of `x`, and is matched to the columns by position.
#
# Every non-`NA` value must be one of `names(encodings)`.
# Throws an error (via `stopifnot()`) if any of these conditions fail.
parse_encoding <- function(encoding, x) {
  stopifnot(
    "`encoding` must be `NULL` or a character vector" =
      is.null(encoding) || is.character(encoding),
    "`encoding` contains at least one unknown encoding" =
      all(is.na(encoding) | encoding %in% names(encodings))
  )

  if (is.null(encoding)) {
    structure(rep(NA_character_, length(x)), names = names(x))

  } else if (is_named(encoding)) {
    stopifnot(
      "names of `encoding` must be unique" =
        !anyDuplicated(names(encoding)),
      "names of `encoding` must match names of `x`" =
        all(names(encoding) %in% c(names(x), ""))
    )
    # The entry with the empty name is the fallback; appending NA makes
    # `[1]` yield NA when no such entry exists.
    def <- c(encoding[names(encoding) == ""], NA_character_)[1]
    encoding <- encoding[names(encoding) != ""]
    res <- structure(rep(def, length(x)), names = names(x))
    res[names(encoding)] <- encoding
    res

  } else if (length(encoding) == 1) {
    structure(rep(encoding, length(x)), names = names(x))

  } else {
    # Use a descriptive message, consistent with the other checks above,
    # instead of the default "length(encoding) == length(x) is not TRUE".
    stopifnot(
      "unnamed `encoding` must have length 1 or one entry per column of `x`" =
        length(encoding) == length(x)
    )
    structure(encoding, names = names(x))
  }
}
5 changes: 3 additions & 2 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
url: https://r-lib.github.io/nanoparquet
url: https://nanoparquet.r-lib.org/
template:
package: tidytemplate
bootstrap: 5
includes:
in_header: |
<script defer data-domain="r-lib.github.io/nanoparquet,all.tidyverse.org" src="https://plausible.io/js/plausible.js"></script>
<script defer data-domain="nanoparquet.r-lib.org,all.tidyverse.org" src="https://plausible.io/js/plausible.js"></script>
destination: docs

Expand All @@ -19,6 +19,7 @@ reference:
contents:
- infer_parquet_schema
- nanoparquet-types
- parquet-encodings
- parquet_schema

- title: Read Parquet files
Expand Down
138 changes: 138 additions & 0 deletions man/parquet-encodings.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 30 additions & 0 deletions man/write_parquet.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit a9e1829

Please sign in to comment.