Merge branch 'release'

fstpackage · Apr 2, 2020 · 7ed8702 · 7ed8702
2 parents 8834ea7 + 3d8b99e
commit 7ed8702
Show file tree

Hide file tree

Showing 131 changed files with 12,493 additions and 7,280 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,7 +1,10 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
 \.o$
+^_pkgdown\.yml$
+^CONDUCT\.md$
 \.dll$
+^docs$
 \.a$
 \.Rmd$
 LZ4/LICENSE$
@@ -10,6 +13,7 @@ LZ4/LICENSE$
 \.TMP$
 \.png$
 \.yml$
-dataset\.fst$
+^dataset\.fst$
 ^res - readme\.fst$
 ^_pkgdown\.yml$
+^CRAN-RELEASE$
diff --git a/.gitignore b/.gitignore
@@ -18,3 +18,5 @@
 .Rproj.user
 *.TMP
 /revdep/*
+/docs/*
+/src-i386/
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -1,4 +1,6 @@
 
+# Contributions
+
 Contributions to `fst` are welcome from anyone and are best sent as pull requests on [the GitHub repository](https://github.com/fstpackage/fst/). This page provides some instructions to potential contributors about how to add to the package.
 
 1. Contributions can be submitted as [a pull request](https://help.github.com/articles/creating-a-pull-request/) on GitHub by forking or cloning the [fst repository](https://github.com/fstpackage/fst/), making changes and submitting the pull request.

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -5,8 +5,8 @@ Description: Multithreaded serialization of compressed data frames using the
     'fst' format. The 'fst' format allows for random access of stored data and
     compression with the LZ4 and ZSTD compressors created by Yann Collet. The ZSTD
     compression library is owned by Facebook Inc.
-Version: 0.9.0
-Date: 2019-04-02
+Version: 0.9.2
+Date: 2020-03-31
 Authors@R: c(
     person("Mark", "Klik", email = "markklik@gmail.com", role = c("aut", "cre", "cph")),
     person("Yann", "Collet", role = c("ctb", "cph"),
@@ -19,7 +19,7 @@ Imports:
     Rcpp
 LinkingTo: Rcpp
 SystemRequirements: little-endian platform
-RoxygenNote: 6.1.1
+RoxygenNote: 7.1.0
 Suggests:
     testthat,
     bit64,
@@ -28,6 +28,7 @@ Suggests:
     nanotime,
     crayon
 License: AGPL-3 | file LICENSE
+Encoding: UTF-8
 Copyright: This package includes sources from the LZ4 library written
     by Yann Collet, sources of the ZSTD library owned by Facebook, Inc.
     and sources of the fstlib library owned by Mark Klik

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,31 @@
 
-# fst 0.9.0 (April 2, 2019)
+# fst 0.9.2
+
+Version 0.9.2 of the `fst` package brings support for zero-row table serialization and compression for long vectors. In
+addition, `fst` was prepared for the change in the default value of the `stringsAsFactors` argument of `data.frame()` in
+R 4.0.0.
+
+## Library updates
+
+* Library `fstlib` updated to version 0.1.6
+* Library LZ4 updated to version 1.9.2
+* Library ZSTD updated to version 1.4.4
+
+## Enhancements
+
+* Incorrect column selection gets a more informative error message (issue #138, thanks Jean-Luc and @Moohan for reporting).
+* Long raw vectors can be hashed with `hash_fst()` (issue #202).
+* Empty tables are serialized correctly using `write_fst()` (issue #99).
+
+## Bugs solved
+
+* Ellipsis is forwarded in method `str.fst_table()` (issue #219, thanks @nbenn for reporting).
+* Coloring is turned off for terminals that don't support it or when package `crayon` is not installed (issue #198, thanks @muschellij2 for reporting and the code fix).
+* Method `metadata_fst()` correctly displays the key of the data table if column names are not in alphabetical order (issue #199, thanks @renkun-ken for the pull request).
+* The `stringsAsFactors` argument defaults to `FALSE` for upcoming R 4.0.0 (issue #234, thanks @CRAN for reporting).
+
+
+# fst 0.9.0
 
 Version 0.9.0 of the `fst` package addresses the request from CRAN maintainers to fix issues identified by rchk. These issues result from PROTECT / UNPROTECT pairs called in the constructor / destructor pairs of C++ classes. rchk (rightfully) warns about those because it can't determine from the code if pairs are properly matched. With this submission the relevant SEXP classes are protected by containing them in SEXP classes that are already PROTECTED, which allows for removal of the PROTECT / UNPROTECT pairs in question.
 
@@ -21,7 +47,8 @@ As of `fst` version 0.9.0, support for fst files generated with `fst` package ve
 
 Various documentation issues have been fixed (thanks @ginberg and @renkun-ken for the pull requests).
 
-# fst 0.8.10 (December 14, 2018)
+
+# fst 0.8.10
 
 Version 0.8.10 of the `fst` package is an intermediate release designed to update the incorporated C++ libraries
 to their latest versions and to fix reported issues. Also, per request of CRAN maintainers, the OpenMP build option was moved to the correct flag in the Makevars file, resolving a warning in the package check.
@@ -47,11 +74,12 @@ to their latest versions and to fix reported issues. Also, per request of CRAN m
 * Documentation updates (issue #158, thanks @HughParsonage for submitting)
 
 
-# fst 0.8.8 (June 6, 2018)
+# fst 0.8.8
 
 Version 0.8.8 of the `fst` package is an intermediate release designed to fix valgrind warnings reported on CRAN builds (per request of CRAN maintainers). These warnings were due to `fst` writing uninitialized data buffers to file, which was done to maximize speed. To fix these warnings (and for safety), all memory blocks are now initialized to zero before being written to disk.
 
-# fst 0.8.6 (May 15, 2018)
+
+# fst 0.8.6
 
 Version 0.8.6 of the `fst` package brings clearer printing of `fst_table` objects. It also includes optimizations for controlling the number of threads used by the package during reads and writes and after a fork has ended. The `LZ4` and `ZSTD` compression libraries are updated to their latest (and fastest) releases. UTF-8 encoded column names are now correctly stored in the `fst` format.
 
@@ -147,7 +175,7 @@ The v0.8.4 release brings a `data.frame` interface to the `fst` package. Column
 
 * A data.table warning message was given on modification of columns of a sorted table. _Thanks @martinblostein._
 
-* Stack imbalance warnings under centain conditions. _Thanks @ryankennedyio_
+* Stack imbalance warnings under certain conditions. _Thanks @ryankennedyio_
 
 ## Benchmarks
 

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -21,12 +21,12 @@ fsthasher <- function(rawVec, seed, blockHash) {
     .Call(`_fst_fsthasher`, rawVec, seed, blockHash)
 }
 
-fstcomp <- function(rawVec, compressor, compression, hash) {
-    .Call(`_fst_fstcomp`, rawVec, compressor, compression, hash)
+fstcomp <- function(rawVec, compressor, compression, hash, r_container) {
+    .Call(`_fst_fstcomp`, rawVec, compressor, compression, hash, r_container)
 }
 
-fstdecomp <- function(rawVec) {
-    .Call(`_fst_fstdecomp`, rawVec)
+fstdecomp <- function(rawVec, r_container) {
+    .Call(`_fst_fstdecomp`, rawVec, r_container)
 }
 
 getnrofthreads <- function() {

diff --git a/R/compress.R b/R/compress.R
@@ -44,7 +44,8 @@ compress_fst <- function(x, compressor = "ZSTD", compression = 0, hash = FALSE)
     stop("Parameter x is not set to a raw vector.")
   }
 
-  compressed_vec <- fstcomp(x, compressor, as.integer(compression), hash)
+  container <- as.list(1)
+  compressed_vec <- fstcomp(x, compressor, as.integer(compression), hash, container)
 
   if (inherits(compressed_vec, "fst_error")) {
     stop(compressed_vec)
@@ -66,7 +67,8 @@ decompress_fst <- function(x) {
     stop("Parameter x should be a raw vector with compressed data.")
   }
 
-  decompressed_vec <- fstdecomp(x)
+  container <- as.list(1)
+  decompressed_vec <- fstdecomp(x, container)
 
   if (inherits(decompressed_vec, "fst_error")) {
     stop(decompressed_vec)

diff --git a/R/fst.R b/R/fst.R
@@ -48,18 +48,18 @@
 #' x <- data.frame(A = 1:10000, B = sample(c(TRUE, FALSE, NA), 10000, replace = TRUE))
 #'
 #' # Default compression
-#' write_fst(x, "dataset.fst")  # filesize: 17 KB
-#' y <- read_fst("dataset.fst") # read fst file
-#'
+#' fst_file <- tempfile(fileext = ".fst")
+#' write_fst(x, fst_file)  # filesize: 17 KB
+#' y <- read_fst(fst_file) # read fst file
+#'
 #' # Maximum compression
-#' write_fst(x, "dataset.fst", 100)  # fileSize: 4 KB
-#' y <- read_fst("dataset.fst") # read fst file
+#' write_fst(x, fst_file, 100)  # fileSize: 4 KB
+#' y <- read_fst(fst_file) # read fst file
 #'
 #' # Random access
-#' y <- read_fst("dataset.fst", "B") # read selection of columns
-#' y <- read_fst("dataset.fst", "A", 100, 200) # read selection of columns and rows
+#' y <- read_fst(fst_file, "B") # read selection of columns
+#' y <- read_fst(fst_file, "A", 100, 200) # read selection of columns and rows
 #' @export
-#' @md
 write_fst <- function(x, path, compress = 50, uniform_encoding = TRUE) {
   if (!is.character(path)) stop("Please specify a correct path.")
 
@@ -92,10 +92,11 @@ write_fst <- function(x, path, compress = 50, uniform_encoding = TRUE) {
 #'   Last = sample(LETTERS, 10))
 #'
 #' # Write to fst file
-#' write_fst(x, "dataset.fst")
+#' fst_file <- tempfile(fileext = ".fst")
+#' write_fst(x, fst_file)
 #'
 #' # Display meta information
-#' metadata_fst("dataset.fst")
+#' metadata_fst(fst_file)
 #' @export
 metadata_fst <- function(path, old_format = FALSE) {
 
@@ -112,13 +113,13 @@ metadata_fst <- function(path, old_format = FALSE) {
     stop(metadata)
   }
 
-  colInfo <- list(path = full_path, nrOfRows = metadata$nrOfRows,
+  col_info <- list(path = full_path, nrOfRows = metadata$nrOfRows,
     keys = metadata$keyNames, columnNames = metadata$colNames,
     columnBaseTypes = metadata$colBaseType, keyColIndex = metadata$keyColIndex,
     columnTypes = metadata$colType)
-  class(colInfo) <- "fstmetadata"
+  class(col_info) <- "fstmetadata"
 
-  colInfo
+  col_info
 }
 
 
@@ -132,22 +133,42 @@ print.fstmetadata <- function(x, ...) {
     "IDate", "ITime", "double", "Date", "POSIXct", "difftime", "ITime", "logical", "integer64",
     "nanotime", "raw")
 
-  colNames <- format(encodeString(x$columnNames, quote = "'"))
-
-  # Table has no key columns
+  # table has no key columns
   if (is.null(x$keys)) {
-    cat(paste0("* ", colNames, ": ", types[x$columnTypes], "\n"), sep = "")
+    column_names <- format(encodeString(x$columnNames, quote = "'"))
+    cat(paste0("* ", column_names, ": ", types[x$columnTypes], "\n"), sep = "")
     return(invisible(NULL))
   }
 
-  # Table has key columns
-  keys <- data.frame(k = x$keys, count = 1:length(x$keys))
-  colTab <- data.frame(k = x$columnNames, o = 1:length(x$columnNames))
-  colTab <- merge(colTab, keys, "k", all.x = TRUE)
-  colTab$l <- paste0(" (key ", colTab$count, ")")
-  colTab[is.na(colTab$count), "l"] <- ""
+  # table has key columns
+  keys <- data.frame(
+    k = x$keys,
+    count = seq_along(x$keys),
+    stringsAsFactors = FALSE)
+
+  col_info <- data.frame(
+    k = x$columnNames,
+    o = seq_along(x$columnNames),
+    t = types[x$columnTypes],
+    stringsAsFactors = FALSE)
+
+  # merge keys to correct column
+  col_info <- merge(col_info, keys, "k", all.x = TRUE, sort = FALSE)
+
+  col_info$k <- format(encodeString(col_info$k, quote = "'"))
+  col_info$l <- paste0(" (key ", col_info$count, ")")
+  col_info[is.na(col_info$count), "l"] <- ""
 
-  cat(paste0("* ", colNames, ": ", types[x$columnTypes], colTab$l, "\n"), sep = "")
+  col_info <- col_info[order(col_info$count, col_info$o), ]  # key columns at the top
+
+  cat(paste0("* ", col_info$k, ": ", col_info$t, col_info$l, "\n"), sep = "")
+}
+
+
+#' @rdname write_fst
+#' @export
+write.fst <- function(x, path, compress = 50, uniform_encoding = TRUE) {
+  write_fst(x, path, compress, uniform_encoding)
 }
 
 
@@ -163,8 +184,8 @@ print.fstmetadata <- function(x, ...) {
 #' converted with fst package versions 0.8.0 to 0.8.10.
 #'
 #' @export
-read_fst <- function(path, columns = NULL, from = 1, to = NULL, as.data.table = FALSE, old_format = FALSE) {
-  fileName <- normalizePath(path, mustWork = FALSE)
+read_fst <- function(path, columns = NULL, from = 1, to = NULL, as.data.table = FALSE, old_format = FALSE) {  # nolint
+  file_name <- normalizePath(path, mustWork = FALSE)
 
   if (!is.null(columns)) {
     if (!is.character(columns)) {
@@ -191,21 +212,27 @@ read_fst <- function(path, columns = NULL, from = 1, to = NULL, as.data.table =
     " lower than 0.8.0 should be read (and rewritten) using fst package versions <= 0.8.10.")
   }
 
-  res <- fstretrieve(fileName, columns, from, to)
+  res <- fstretrieve(file_name, columns, from, to)
 
   if (inherits(res, "fst_error")) {
     stop(res)
   }
 
+  # long vectors are not yet supported by data.table, tibbles or data.frames,
+  # so return a list instead
+  nr_of_rows <- length(res$resTable[[1]])
+  if (nr_of_rows >= 2 ^ 31) {
+    return(res$resTable)
+  }
 
   if (as.data.table) {
     if (!requireNamespace("data.table", quietly = TRUE)) {
       stop("Please install package data.table when using as.data.table = TRUE")
     }
 
-    keyNames <- res$keyNames
+    key_names <- res$keyNames
     res <- data.table::setDT(res$resTable)  # nolint
-    if (length(keyNames) > 0) data.table::setattr(res, "sorted", keyNames)
+    if (length(key_names) > 0) data.table::setattr(res, "sorted", key_names)
     return(res)
   }
 
@@ -214,10 +241,10 @@ read_fst <- function(path, columns = NULL, from = 1, to = NULL, as.data.table =
   # use setters from data.table to improve performance
   if (requireNamespace("data.table", quietly = TRUE)) {
     data.table::setattr(res_table, "class", "data.frame")
-    data.table::setattr(res_table, "row.names", 1:length(res_table[[1L]]))
+    data.table::setattr(res_table, "row.names", seq_len(length(res_table[[1L]])))
   } else {
     class(res_table) <- "data.frame"
-    attr(res_table, "row.names") <- 1:length(res_table[[1L]])
+    attr(res_table, "row.names") <- seq_len(length(res_table[[1L]]))
   }
 
   res_table
@@ -226,20 +253,13 @@ read_fst <- function(path, columns = NULL, from = 1, to = NULL, as.data.table =
 
 #' @rdname write_fst
 #' @export
-write.fst <- function(x, path, compress = 50, uniform_encoding = TRUE) {
-  write_fst(x, path, compress, uniform_encoding)
-}
-
-
-#' @rdname write_fst
-#' @export
-read.fst <- function(path, columns = NULL, from = 1, to = NULL, as.data.table = FALSE, old_format = FALSE) {
+read.fst <- function(path, columns = NULL, from = 1, to = NULL, as.data.table = FALSE, old_format = FALSE) {  # nolint
   read_fst(path, columns, from, to, as.data.table, old_format)
 }
 
 
 #' @rdname metadata_fst
 #' @export
-fst.metadata <- function(path, old_format = FALSE) {
+fst.metadata <- function(path, old_format = FALSE) {  # nolint
   metadata_fst(path, old_format)
 }