diff --git a/.travis.yml b/.travis.yml
index 2730186..71ba91b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,8 +20,7 @@ before_install:
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install llvm &&
export PATH="/usr/local/opt/llvm/bin:$PATH" &&
export LDFLAGS="-L/usr/local/opt/llvm/lib" &&
- export CPPFLAGS="-I/usr/local/opt/llvm/include" &&
- export PKG_CXXFLAGS="-O3 -Wall -pedantic"; fi
+ export CPPFLAGS="-I/usr/local/opt/llvm/include"; fi
r_packages:
- covr
@@ -31,12 +30,9 @@ r_packages:
- testthat
- data.table
-addons:
- apt:
- update: true
-
after_success:
- - Rscript -e 'library(covr); codecov(quiet = FALSE)'
+ - test $TRAVIS_OS_NAME == "linux" &&
+ travis_wait Rscript -e 'library(covr); codecov(quiet = FALSE)'
env:
global:
diff --git a/DESCRIPTION b/DESCRIPTION
index 10b4e2c..2d18005 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -5,8 +5,8 @@ Description: Multithreaded serialization of compressed data frames using the
'fst' format. The 'fst' format allows for random access of stored data and
compression with the LZ4 and ZSTD compressors created by Yann Collet. The ZSTD
compression library is owned by Facebook Inc.
-Version: 0.8.10
-Date: 2018-12-13
+Version: 0.9.0
+Date: 2019-04-02
Authors@R: c(
person("Mark", "Klik", email = "markklik@gmail.com", role = c("aut", "cre", "cph")),
person("Yann", "Collet", role = c("ctb", "cph"),
diff --git a/NEWS.md b/NEWS.md
index b1723b4..2ce0f1b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,26 @@
+# fst 0.9.0 (April 2, 2019)
+
+Version 0.9.0 of the `fst` package addresses the request from CRAN maintainers to fix issues identified by rchk. These issues result from PROTECT / UNPROTECT pairs called in the constructor / destructor pairs of C++ classes. rchk (rightfully) warns about those because it can't determine from the code if pairs are properly matched. With this submission the relevant SEXP classes are protected by containing them in SEXP classes that are already PROTECTED, which allows for removal of the PROTECT / UNPROTECT pairs in question.
+
+As of `fst` version 0.9.0, support for fst files generated with `fst` package versions lower than 0.8.0 has been deprecated. This significantly reduces the (C++) code base and prepares `fst` for future code changes.
+
+## Library updates
+
+* Library `fstlib` updated to version 0.1.1
+
+## Enhancements
+
+* Method `setnrofthreads` returns invisible result to avoid printing unwanted output (thanks @renkun-ken for the pull request)
+
+## Bugs solved
+
+* Empty subsets can be selected using `fst::fst` (thanks @renkun-ken for reporting)
+
+## Documentation
+
+Various documentation issues have been fixed (thanks @ginberg and @renkun-ken for the pull requests).
+
# fst 0.8.10 (December 14, 2018)
Version 0.8.10 of the `fst` package is an intermediate release designed to update the incorporated C++ libraries
@@ -39,7 +61,7 @@ Version 0.8.6 of the `fst` package brings clearer printing of `fst_table` object
* User has more control over the number of threads used by fst. Option 'fst_threads' can now be used to initialize the number of threads when the package is first loaded (issue #132, thanks to @karldw for the pull request).
-* Option 'fst_restore_after_fork' can be used to select the threading behaviour after a fork has ended. Like the `data.table` package, `fst` switches back to a single thread when a fork is detected (using OpenMP in a fork can lead to problems). Unlike `data.table`, the `fst` package restores the number of threads to it's previous setting when the fork ends. If this leads to unexpected problems, the user can set the 'fst_restore_after_fork' option to FALSE to disable that.
+* Option 'fst_restore_after_fork' can be used to select the threading behavior after a fork has ended. Like the `data.table` package, `fst` switches back to a single thread when a fork is detected (using OpenMP in a fork can lead to problems). Unlike `data.table`, the `fst` package restores the number of threads to its previous setting when the fork ends. If this leads to unexpected problems, the user can set the 'fst_restore_after_fork' option to FALSE to disable that.
## Bugs solved
diff --git a/R/RcppExports.R b/R/RcppExports.R
index d48e814..c922ceb 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -9,12 +9,12 @@ fststore <- function(fileName, table, compression, uniformEncoding) {
.Call(`_fst_fststore`, fileName, table, compression, uniformEncoding)
}
-fstmetadata <- function(fileName, oldFormat) {
- .Call(`_fst_fstmetadata`, fileName, oldFormat)
+fstmetadata <- function(fileName) {
+ .Call(`_fst_fstmetadata`, fileName)
}
-fstretrieve <- function(fileName, columnSelection, startRow, endRow, oldFormat) {
- .Call(`_fst_fstretrieve`, fileName, columnSelection, startRow, endRow, oldFormat)
+fstretrieve <- function(fileName, columnSelection, startRow, endRow) {
+ .Call(`_fst_fstretrieve`, fileName, columnSelection, startRow, endRow)
}
fsthasher <- function(rawVec, seed, blockHash) {
diff --git a/R/fst.R b/R/fst.R
index 684e3a2..4ac5aa3 100644
--- a/R/fst.R
+++ b/R/fst.R
@@ -80,7 +80,8 @@ write_fst <- function(x, path, compress = 50, uniform_encoding = TRUE) {
#' Method for checking basic properties of the dataset stored in \code{path}.
#'
#' @param path path to fst file
-#' @param old_format use TRUE to read fst files generated with a fst package version lower than v0.8.0
+#' @param old_format must be FALSE, the old fst file format is deprecated and can only be read and
+#' converted with fst package versions 0.8.0 to 0.8.10.
#' @return Returns a list with meta information on the stored dataset in \code{path}.
#' Has class \code{fstmetadata}.
#' @examples
@@ -97,13 +98,15 @@ write_fst <- function(x, path, compress = 50, uniform_encoding = TRUE) {
#' metadata_fst("dataset.fst")
#' @export
metadata_fst <- function(path, old_format = FALSE) {
- if (!is.logical(old_format)) {
- stop("A logical value is expected for parameter 'old_format'.")
+
+ if (old_format != FALSE) {
+ stop("Parameter old_format is depricated, fst files written with fst package version",
+ " lower than 0.8.0 should be read (and rewritten) using fst package versions <= 0.8.10.")
}
full_path <- normalizePath(path, mustWork = FALSE)
- metadata <- fstmetadata(full_path, old_format)
+ metadata <- fstmetadata(full_path)
if (inherits(metadata, "fst_error")) {
stop(metadata)
@@ -150,13 +153,14 @@ print.fstmetadata <- function(x, ...) {
#' @rdname write_fst
#'
-#' @param columns Column names to read. The default is to read all all columns.
+#' @param columns Column names to read. The default is to read all columns.
#' @param from Read data starting from this row number.
#' @param to Read data up until this row number. The default is to read to the last row of the stored dataset.
#' @param as.data.table If TRUE, the result will be returned as a \code{data.table} object. Any keys set on
#' dataset \code{x} before writing will be retained. This allows for storage of sorted datasets. This option
#' requires \code{data.table} package to be installed.
-#' @param old_format use TRUE to read fst files generated with a fst package version lower than v0.8.0
+#' @param old_format must be FALSE, the old fst file format is deprecated and can only be read and
+#' converted with fst package versions 0.8.0 to 0.8.10.
#'
#' @export
read_fst <- function(path, columns = NULL, from = 1, to = NULL, as.data.table = FALSE, old_format = FALSE) {
@@ -182,11 +186,12 @@ read_fst <- function(path, columns = NULL, from = 1, to = NULL, as.data.table =
to <- as.integer(to)
}
- if (!is.logical(old_format)) {
- stop("A logical value is expected for parameter 'old_format'.")
+ if (old_format != FALSE) {
+ stop("Parameter old_format is depricated, fst files written with fst package version",
+ " lower than 0.8.0 should be read (and rewritten) using fst package versions <= 0.8.10.")
}
- res <- fstretrieve(fileName, columns, from, to, old_format)
+ res <- fstretrieve(fileName, columns, from, to)
if (inherits(res, "fst_error")) {
stop(res)
diff --git a/R/fst_table.R b/R/fst_table.R
index 6cd7ad9..fb90742 100644
--- a/R/fst_table.R
+++ b/R/fst_table.R
@@ -63,6 +63,12 @@
#' }
fst <- function(path, old_format = FALSE) {
+ # old format is deprecated as of v0.9.0
+ if (old_format != FALSE) {
+ stop("Parameter old_format is depricated, fst files written with fst package version",
+ " lower than 0.8.0 should be read (and rewritten) using fst package versions <= 0.8.10.")
+ }
+
# wrap in a list so that additional elements can be added if required
ft <- list(
meta = metadata_fst(path, old_format),
@@ -341,40 +347,65 @@ as.list.fst_table <- function(x, ...) {
}
+# drop to lower dimension when drop = TRUE
+return_drop <- function(x, drop) {
+
+ if (!drop | ncol(x) > 1) return(x)
+
+ x[[1]]
+}
+
+
#' @export
-`[.fst_table` <- function(x, i, j, drop = FALSE) {
- if (drop) {
- warning("drop ignored", call. = FALSE)
+`[.fst_table` <- function(x, i, j, drop) {
+
+ # check for old_format in case an 'old' fst_table object was deserialized
+ if (.subset2(x, "old_format") != FALSE) {
+ stop("fst files written with fst package version",
+ " lower than 0.8.0 should be read (and rewritten) using fst package versions <= 0.8.10.")
}
meta_info <- .subset2(x, "meta")
- # when only i is present, we do a column subsetting
+ # no additional arguments provided
if (missing(i) && missing(j)) {
- return(read_fst(meta_info$path, old_format = .subset2(x, "old_format")))
+
+ # never drop as with data.frame
+ return(read_fst(meta_info$path))
}
+
if (nargs() <= 2) {
- # return full table
+ # result is never dropped with 2 arguments
+
if (missing(i)) {
- # we have a j
+ # we have a named argument j
j <- .column_indexes_fst(meta_info, j)
- return(read_fst(meta_info$path, j, old_format = .subset2(x, "old_format")))
+ return(read_fst(meta_info$path, j))
}
# i is interpreted as j
j <- .column_indexes_fst(meta_info, i)
- return(read_fst(meta_info$path, j, old_format = .subset2(x, "old_format")))
+ return(read_fst(meta_info$path, j))
+ }
+
+ # drop dimension if single column selected and drop != FALSE
+ drop_dim <- FALSE
+
+ if (!missing(j) && length(j) == 1) {
+
+ if (!(!missing(drop) && drop == FALSE)) {
+ drop_dim <- TRUE
+ }
}
- # return all rows
+ # special case where i is interpreted as j: select all rows, never drop
- # special case where i is interpreted as j: select all rows
if (nargs() == 3 && !missing(drop) && !missing(i)) {
j <- .column_indexes_fst(meta_info, i)
- return(read_fst(meta_info$path, j, old_format = .subset2(x, "old_format")))
+ return(read_fst(meta_info$path, j))
}
# i and j not reversed
@@ -382,7 +413,10 @@ as.list.fst_table <- function(x, ...) {
# full columns
if (missing(i)) {
j <- .column_indexes_fst(meta_info, j)
- return(read_fst(meta_info$path, j, old_format = .subset2(x, "old_format")))
+ x <- read_fst(meta_info$path, j)
+
+ if (!drop_dim) return(x)
+ return(x[[1]])
}
@@ -397,29 +431,37 @@ as.list.fst_table <- function(x, ...) {
# cast to integer and determine row range
i <- as.integer(i)
- min_row <- min(i)
- max_row <- max(i)
- # boundary check
- if (min_row < 0) {
- stop("Row selection out of range")
- }
+ # empty row selection
+ if (length(i) == 0) {
+ min_row <- 1
+ max_row <- 1
+ } else {
+ min_row <- min(i)
+ max_row <- max(i)
- if (max_row > meta_info$nrOfRows) {
- stop("Row selection out of range")
+ # boundary check
+ if (min_row < 0) {
+ stop("Row selection out of range")
+ }
+
+ if (max_row > meta_info$nrOfRows) {
+ stop("Row selection out of range")
+ }
}
# column subset
# select all columns
if (missing(j)) {
- fst_data <- read_fst(meta_info$path, from = min_row, to = max_row, old_format = .subset2(x, "old_format"))
-
- return(fst_data[1 + i - min_row, ])
+ fst_data <- read_fst(meta_info$path, from = min_row, to = max_row)
+ x <- fst_data[1 + i - min_row, ] # row selection, no dropping
+ } else {
+ j <- .column_indexes_fst(meta_info, j)
+ fst_data <- read_fst(meta_info$path, j, from = min_row, to = max_row)
+ x <- fst_data[1 + i - min_row, , drop = FALSE] # row selection, no dropping
}
- j <- .column_indexes_fst(meta_info, j)
- fst_data <- read_fst(meta_info$path, j, from = min_row, to = max_row, old_format = .subset2(x, "old_format"))
-
- fst_data[1 + i - min_row, ]
+ if (!drop_dim) return(x)
+ return(x[[1]])
}
diff --git a/R/openmp.R b/R/openmp.R
index 00da659..0697d2b 100644
--- a/R/openmp.R
+++ b/R/openmp.R
@@ -30,7 +30,7 @@
#' specific requirements. As a default, \code{fst} uses a number of threads equal to the number of
#' logical cores in the system.
#'
-#' The number of threads can also be set with \code{option(fst_threads = N)}.
+#' The number of threads can also be set with \code{options(fst_threads = N)}.
#' NOTE: This option is only read when the package's namespace is first loaded, with commands like
#' \code{library}, \code{require}, or \code{::}. If you have already used one of these, you
#' must use \code{threads_fst} to set the number of threads.
@@ -62,5 +62,5 @@ threads_fst <- function(nr_of_threads = NULL, reset_after_fork = NULL) {
return(getnrofthreads())
}
- setnrofthreads(nr_of_threads)
+ invisible(setnrofthreads(nr_of_threads))
}
diff --git a/README.Rmd b/README.Rmd
index cbdbbf6..2bcf3aa 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -17,7 +17,7 @@ knitr::opts_chunk$set(
[![Linux/OSX Build Status](https://travis-ci.org/fstpackage/fst.svg?branch=develop)](https://travis-ci.org/fstpackage/fst)
-[![WIndows Build status](https://ci.appveyor.com/api/projects/status/6g6kp8onpb26jhnm/branch/develop?svg=true)](https://ci.appveyor.com/project/fstpackage/fst/branch/develop)
+[![Windows Build status](https://ci.appveyor.com/api/projects/status/6g6kp8onpb26jhnm/branch/develop?svg=true)](https://ci.appveyor.com/project/fstpackage/fst/branch/develop)
[![License: AGPL v3](https://img.shields.io/badge/License-AGPL%20v3-blue.svg)](https://www.gnu.org/licenses/agpl-3.0)
[![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/fst)](https://cran.r-project.org/package=fst)
[![codecov](https://codecov.io/gh/fstpackage/fst/branch/develop/graph/badge.svg)](https://codecov.io/gh/fstpackage/fst)
diff --git a/README.md b/README.md
index cf6a642..9d4ac1b 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
[![Linux/OSX Build
Status](https://travis-ci.org/fstpackage/fst.svg?branch=develop)](https://travis-ci.org/fstpackage/fst)
-[![WIndows Build
+[![Windows Build
status](https://ci.appveyor.com/api/projects/status/6g6kp8onpb26jhnm/branch/develop?svg=true)](https://ci.appveyor.com/project/fstpackage/fst/branch/develop)
[![License: AGPL
v3](https://img.shields.io/badge/License-AGPL%20v3-blue.svg)](https://www.gnu.org/licenses/agpl-3.0)
diff --git a/cran-checklist.md b/cran-checklist.md
index cf43103..20093eb 100644
--- a/cran-checklist.md
+++ b/cran-checklist.md
@@ -10,16 +10,17 @@
- AppVeyor (Windows Server)
- latest R dev version on Windows
* Build packages with dependencies on fst
-* Start release branch from develop
+* Merge develop branch into release branch
* Bump version to even value in DESCRIPTION and check package startup message
* Update README.Rmd and verify generated README.md on Github (release)
+* Update cran-comments.md
* Update NEWS.md and make sure to remove '(in development)' in the version title
and update the version number
* Credit all GitHub contributions in NEWS.md
* Build docs folder using pkgdown::build_site()
* Merge branch release into master
* Submit to CRAN
-
+* Commit the fstpackage.github.io repository with the latest docs
# After releasing to CRAN
@@ -27,5 +28,5 @@
* Go to the repository release page and create a new release with tag version vx.y.z.
Copy and paste the contents of the relevant NEWS.md section into the release notes.
* Add '(in development)' to version title in NEWS.md and update to odd version number
-* Bump version to odd value and check package startup message
+* Check package startup message
* Merge release branch into develop
diff --git a/cran-comments.md b/cran-comments.md
index 2f0cba8..9b01c15 100644
--- a/cran-comments.md
+++ b/cran-comments.md
@@ -1,35 +1,31 @@
## Submission
-This submission of fst adresses Prof. Ripley's request to move the OpenMP flag to PKG_CXXFLAGS.
-In addition several minor issues have been resolved to increase package stability and the
-libraries on which fst depends (fstlib, LZ4 and ZSTD) are updated to their latest version.
+This submission of fst (v0.9.0) addresses Dr. Kurt Hornik's request to fix issues identified by rchk. These issues result from PROTECT / UNPROTECT pairs called in the constructor / destructor pairs of C++ classes. rchk (rightfully) warns about those because it can't determine from the code if pairs are properly matched. With this submission the relevant SEXP classes are protected by containing them in SEXP classes that are already PROTECTED, which allows for removal of the PROTECT / UNPROTECT pairs in question.
-## Test environments
+Two false-positive warnings remain, both reported in fst_compress.cpp. The flagged code was checked thoroughly to confirm that it is stable.
+
+In addition, with this submission, support for fst files generated with package versions before 0.8.0 has been deprecated, significantly reducing the (C++) code base.
+
+## Test environments
* OSX on travis-ci
* Ubuntu 14.04 on travis-ci
* Ubuntu 18.10 locally
* Ubuntu 18.10 locally using clang-6.0
-* docker with the rocker/r-devel-ubsan-clang instrumented image
-* docker with the rocker/r-devel-san instrumented image
-* Windows 10 local R 3.5.1
+* Docker with the rocker/r-devel-ubsan-clang instrumented image
+* Docker with the rocker/r-devel-san instrumented image
+* Windows 10 local R 3.5.3
* Windows 10 local R-dev 3.6.0 pre-release
-* Windows Server 2012 R2 x64 (build 9600) on AppVeyor R 3.5.1
-* R-Devel 3.6.0 build on Windows 10
+* Windows Server 2012 R2 x64 (build 9600) on AppVeyor (R 3.5.3)
+* Singularity-container package for running rchk on Ubuntu 18.10
+* Valgrind on Ubuntu 18.10.
+* Rhub (only on systems that support OpenMP)
## R CMD check results
There were no ERRORs or WARNINGs.
-On some platforms a note is generated with R CMD check:
- installed size is 7.0Mb
-The install size on different platforms varies significantly, from 1.42 MB (windows 10) to more than 7 MB on fedora.
-
-## Valgrind
-
-To reproduce the CRAN valgrind report, an instrumented (level 2) build of R was constructed on a fresh Ubuntu 16.04 image using config.site and configure parameters as specified in the memtests README file on CRAN. That build shows no valgrind warnings using the current submision.
-
## Downstream dependencies
I have run R CMD check on downstream dependencies and found no issues:
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
index c02f58a..4f60fa8 100644
--- a/docs/pkgdown.yml
+++ b/docs/pkgdown.yml
@@ -1,4 +1,4 @@
-pandoc: 2.3.1
+pandoc: '2.5'
pkgdown: 1.3.0
pkgdown_sha: ~
articles: []
diff --git a/man/fst.Rd b/man/fst.Rd
index f990ea4..d760a90 100644
--- a/man/fst.Rd
+++ b/man/fst.Rd
@@ -9,7 +9,8 @@ fst(path, old_format = FALSE)
\arguments{
\item{path}{path to fst file}
-\item{old_format}{use TRUE to read fst files generated with a fst package version lower than v0.8.0}
+\item{old_format}{must be FALSE, the old fst file format is deprecated and can only be read and
+converted with fst package versions 0.8.0 to 0.8.10.}
}
\value{
An object of class \code{fst_table}
diff --git a/man/metadata_fst.Rd b/man/metadata_fst.Rd
index ca840ae..7ea0fff 100644
--- a/man/metadata_fst.Rd
+++ b/man/metadata_fst.Rd
@@ -12,7 +12,8 @@ fst.metadata(path, old_format = FALSE)
\arguments{
\item{path}{path to fst file}
-\item{old_format}{use TRUE to read fst files generated with a fst package version lower than v0.8.0}
+\item{old_format}{must be FALSE, the old fst file format is deprecated and can only be read and
+converted with fst package versions 0.8.0 to 0.8.10.}
}
\value{
Returns a list with meta information on the stored dataset in \code{path}.
diff --git a/man/threads_fst.Rd b/man/threads_fst.Rd
index 2d7cf5e..792b200 100644
--- a/man/threads_fst.Rd
+++ b/man/threads_fst.Rd
@@ -31,7 +31,7 @@ specific requirements. As a default, \code{fst} uses a number of threads equal t
logical cores in the system.
}
\details{
-The number of threads can also be set with \code{option(fst_threads = N)}.
+The number of threads can also be set with \code{options(fst_threads = N)}.
NOTE: This option is only read when the package's namespace is first loaded, with commands like
\code{library}, \code{require}, or \code{::}. If you have already used one of these, you
must use \code{threads_fst} to set the number of threads.
diff --git a/man/write_fst.Rd b/man/write_fst.Rd
index 988d194..9b9a053 100644
--- a/man/write_fst.Rd
+++ b/man/write_fst.Rd
@@ -32,7 +32,7 @@ If \code{uniform.encoding} is set to \code{FALSE}, no such assumption will be ma
to the same encoding. The latter is a relatively expensive operation and will reduce write performance for
character columns.}
-\item{columns}{Column names to read. The default is to read all all columns.}
+\item{columns}{Column names to read. The default is to read all columns.}
\item{from}{Read data starting from this row number.}
@@ -42,7 +42,8 @@ character columns.}
dataset \code{x} before writing will be retained. This allows for storage of sorted datasets. This option
requires \code{data.table} package to be installed.}
-\item{old_format}{use TRUE to read fst files generated with a fst package version lower than v0.8.0}
+\item{old_format}{must be FALSE, the old fst file format is deprecated and can only be read and
+converted with fst package versions 0.8.0 to 0.8.10.}
}
\value{
\code{read_fst} returns a data frame with the selected columns and rows. \code{write_fst}
diff --git a/src/Makevars b/src/Makevars
index 111b2ee..fb2d027 100644
--- a/src/Makevars
+++ b/src/Makevars
@@ -15,11 +15,10 @@ LIBZSTD = fstcore/ZSTD/common/entropy_common.o fstcore/ZSTD/common/error_private
fstcore/ZSTD/common/pool.o fstcore/ZSTD/compress/zstd_opt.o fstcore/ZSTD/dictBuilder/zdict.o \
fstcore/ZSTD/compress/zstd_double_fast.o fstcore/ZSTD/compress/hist.o fstcore/ZSTD/dictBuilder/fastcover.o
LIBCOMPRESSION = fstcore/compression/compression.o fstcore/compression/compressor.o
-LIBFRAME = fstcore/interface/openmphelper.o fstcore/interface/fststore.o fstcore_v1/logical/logical_v4.o \
- fstcore/logical/logical_v10.o fstcore_v1/integer/integer_v2.o fstcore/integer/integer_v8.o fstcore/byte/byte_v12.o \
- fstcore_v1/double/double_v3.o fstcore/double/double_v9.o fstcore_v1/character/character_v1.o fstcore/character/character_v6.o \
- fstcore_v1/factor/factor_v5.o fstcore/factor/factor_v7.o fstcore/blockstreamer/blockstreamer_v2.o \
- fstcore_v1/blockstreamer/blockstreamer_v1.o fstcore/integer64/integer64_v11.o
+LIBFRAME = fstcore/interface/openmphelper.o fstcore/interface/fststore.o fstcore/logical/logical_v10.o \
+ fstcore/integer/integer_v8.o fstcore/byte/byte_v12.o fstcore/double/double_v9.o fstcore/character/character_v6.o \
+ fstcore/factor/factor_v7.o fstcore/blockstreamer/blockstreamer_v2.o \
+ fstcore/integer64/integer64_v11.o
$(SHLIB): libLZ4.a libZSTD.a libCOMPRESSION.a libFRAME.a
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 121152d..9ebd106 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -30,20 +30,19 @@ BEGIN_RCPP
END_RCPP
}
// fstmetadata
-SEXP fstmetadata(Rcpp::String fileName, SEXP oldFormat);
-RcppExport SEXP _fst_fstmetadata(SEXP fileNameSEXP, SEXP oldFormatSEXP) {
+SEXP fstmetadata(Rcpp::String fileName);
+RcppExport SEXP _fst_fstmetadata(SEXP fileNameSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< Rcpp::String >::type fileName(fileNameSEXP);
- Rcpp::traits::input_parameter< SEXP >::type oldFormat(oldFormatSEXP);
- rcpp_result_gen = Rcpp::wrap(fstmetadata(fileName, oldFormat));
+ rcpp_result_gen = Rcpp::wrap(fstmetadata(fileName));
return rcpp_result_gen;
END_RCPP
}
// fstretrieve
-SEXP fstretrieve(Rcpp::String fileName, SEXP columnSelection, SEXP startRow, SEXP endRow, SEXP oldFormat);
-RcppExport SEXP _fst_fstretrieve(SEXP fileNameSEXP, SEXP columnSelectionSEXP, SEXP startRowSEXP, SEXP endRowSEXP, SEXP oldFormatSEXP) {
+SEXP fstretrieve(Rcpp::String fileName, SEXP columnSelection, SEXP startRow, SEXP endRow);
+RcppExport SEXP _fst_fstretrieve(SEXP fileNameSEXP, SEXP columnSelectionSEXP, SEXP startRowSEXP, SEXP endRowSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
@@ -51,8 +50,7 @@ BEGIN_RCPP
Rcpp::traits::input_parameter< SEXP >::type columnSelection(columnSelectionSEXP);
Rcpp::traits::input_parameter< SEXP >::type startRow(startRowSEXP);
Rcpp::traits::input_parameter< SEXP >::type endRow(endRowSEXP);
- Rcpp::traits::input_parameter< SEXP >::type oldFormat(oldFormatSEXP);
- rcpp_result_gen = Rcpp::wrap(fstretrieve(fileName, columnSelection, startRow, endRow, oldFormat));
+ rcpp_result_gen = Rcpp::wrap(fstretrieve(fileName, columnSelection, startRow, endRow));
return rcpp_result_gen;
END_RCPP
}
diff --git a/src/flex_store.cpp b/src/flex_store.cpp
index 1920026..30a5a79 100644
--- a/src/flex_store.cpp
+++ b/src/flex_store.cpp
@@ -40,9 +40,9 @@
#include
#include
#include
+#include
#include
-#include
using namespace std;
@@ -91,7 +91,11 @@ SEXP fststore(String fileName, SEXP table, SEXP compression, SEXP uniformEncodin
return fst_error("Parameter compression should be an integer value between 0 and 100");
}
- FstTable fstTable(table, *LOGICAL(uniformEncoding));
+ // avoid using PROTECT statements in C++ classes (which generate rchk errors)
+ // this PROTECTED container can be used to hold any R object safely
+ SEXP r_container = PROTECT(Rf_allocVector(VECSXP, 1));
+
+ FstTable fstTable(table, *LOGICAL(uniformEncoding), r_container);
FstStore fstStore(fileName.get_cstring());
try
@@ -100,81 +104,54 @@ SEXP fststore(String fileName, SEXP table, SEXP compression, SEXP uniformEncodin
}
catch (const std::runtime_error& e)
{
+ UNPROTECT(1);
return fst_error(e.what());
}
+ UNPROTECT(1);
return R_NilValue;
}
-SEXP fstmetadata(String fileName, SEXP oldFormat)
+SEXP fstmetadata(String fileName)
{
FstStore fstStore(fileName.get_cstring());
std::unique_ptr columnFactory(new ColumnFactory());
- // Character vector case 1: 2 2
- // Integer vector case 2: 4 5
- // Real vector case 3: 5 10
- // Logical vector case 4: 6 15
- // Factor vector case 5: 3 3
-
- // use fst format v0.7.2
- if (*LOGICAL(oldFormat) != 0)
- {
- try
- {
- List resOld = fstMeta_v1(fileName); // scans further for safety
-
- IntegerVector typeVec = resOld[4];
- IntegerVector attributeVec(typeVec.size());
-
- // change type ordering
- int types[6] { 0, 2, 4, 5, 6, 3};
- int attributeTypes[6] { 0, 2, 5, 10, 15, 3};
-
- for (int pos = 0; pos < typeVec.size(); pos++)
- {
- int oldType = typeVec[pos];
- typeVec[pos] = types[oldType];
- attributeVec[pos] = attributeTypes[oldType];
- }
-
- resOld[4] = typeVec;
- resOld.push_back(attributeVec, "colType");
-
- return resOld; // scans further for safety
- }
- catch(...)
- {
- return fst_error("An unknown C++ error occured in the legacy fstlib library. Please rewrite your fst files using the latest version of the fst package.");
- }
- }
+ // to hold the column names
+ SEXP list_container = PROTECT(Rf_allocVector(VECSXP, 1));
+ StringVectorContainer* str_container = new StringVectorContainer(list_container);
+ std::unique_ptr col_names(str_container);
// use fst format >= v0.8.0
try
{
- fstStore.fstMeta(columnFactory.get());
-
+ fstStore.fstMeta(columnFactory.get(), col_names.get());
}
catch (const std::runtime_error& e)
{
+ UNPROTECT(1); // list_container
+
// We may be looking at a fst v0.7.2 file format, this unsafe code will be removed later
if (std::strcmp(e.what(), FSTERROR_NON_FST_FILE) == 0)
{
return fst_error("File header information does not contain the fst format marker. "
"If this is a fst file generated with package version older than v0.8.0, "
- "you can read your file by using 'old_format = TRUE'.");
+ "please read (and re-write) your file using fst package versions 0.8.0 to 0.8.10.");
}
return fst_error(e.what());
}
catch(...)
{
+ UNPROTECT(1); // list_container
+
return fst_error("An unknown C++ error occured in the fstlib library.");
}
// R internals part TODO: speed up this code
- SEXP colNames = ((BlockReaderChar*) fstStore.blockReader)->StrVector();
+ // SEXP colNames = ((BlockReaderChar*) fstStore.blockReader)->StrVector();
+ SEXP colNames = str_container->StrVector();
// Convert column info to integer vector
// IntegerVector colTypeVec(fstStore.nrOfCols);
@@ -205,7 +182,7 @@ SEXP fstmetadata(String fileName, SEXP oldFormat)
keyColIndex[col] = fstStore.keyColPos[col];
}
- UNPROTECT(1); // keyNames
+ UNPROTECT(2); // keyNames, list_container
retList = List::create(
_["nNofCols"] = fstStore.nrOfCols,
@@ -220,6 +197,8 @@ SEXP fstmetadata(String fileName, SEXP oldFormat)
}
else
{
+ UNPROTECT(1); // list_container
+
retList = List::create(
_["nrOfCols"] = fstStore.nrOfCols,
_["nrOfRows"] = *fstStore.p_nrOfRows,
@@ -234,9 +213,19 @@ SEXP fstmetadata(String fileName, SEXP oldFormat)
}
-SEXP fstretrieve(String fileName, SEXP columnSelection, SEXP startRow, SEXP endRow, SEXP oldFormat)
+SEXP fstretrieve(String fileName, SEXP columnSelection, SEXP startRow, SEXP endRow)
{
- FstTable tableReader;
+ // avoid using PROTECT statements in C++ classes (which generate rchk errors)
+ // this PROTECTED container can be used to hold any R object safely
+ SEXP r_container = PROTECT(Rf_allocVector(VECSXP, 1));
+
+ // to hold the column names
+ SEXP list_container = PROTECT(Rf_allocVector(VECSXP, 1));
+ StringVectorContainer* str_container = new StringVectorContainer(list_container);
+ std::unique_ptr col_names(str_container);
+
+ FstTable tableReader(r_container);
+
std::unique_ptr columnFactory(new ColumnFactory());
FstStore fstStore(fileName.get_cstring());
@@ -252,8 +241,9 @@ SEXP fstretrieve(String fileName, SEXP columnSelection, SEXP startRow, SEXP endR
vector keyIndex;
- std::unique_ptr colNames(new StringArray());
+ // use fst format >= v0.8.0
std::unique_ptr colSelection;
+ std::unique_ptr colNames(new StringArray());
if (!Rf_isNull(columnSelection))
{
@@ -261,53 +251,42 @@ SEXP fstretrieve(String fileName, SEXP columnSelection, SEXP startRow, SEXP endR
colSelection->SetArray(columnSelection);
}
- // use fst format v0.7.2
- if (*LOGICAL(oldFormat) != 0)
- {
- try
- {
- SEXP res = fstRead_v1(fileName, columnSelection, startRow, endRow);
-
- return res;
- }
- catch(...)
- {
- return fst_error("An unknown C++ error occured in the legacy fstlib library. "
- "Please rewrite your fst files using the latest version of the fst package.");
- }
- }
-
- // use fst format >= v0.8.0
try
{
- fstStore.fstRead(tableReader, colSelection.get(), sRow, eRow, columnFactory.get(), keyIndex, &*colNames);
+ fstStore.fstRead(tableReader, colSelection.get(), sRow, eRow, columnFactory.get(), keyIndex, &*colNames, col_names.get());
}
catch (const std::runtime_error& e)
{
// We may be looking at a fst v0.7.2 file format, this unsafe code will be removed later
if (std::strcmp(e.what(), FSTERROR_NON_FST_FILE) == 0)
{
+ UNPROTECT(2); // r_container, list_container
return fst_error("File header information does not contain the fst format marker. "
"If this is a fst file generated with package version older than v0.8.0, "
- "you can read your file by using 'old_format = TRUE'.");
+ "please read (and re-write) your file using fst package versions 0.8.0 to 0.8.10.");
}
// re-throw uncatched errors
+ UNPROTECT(2); // r_container, list_container
return fst_error(e.what());
}
catch(...)
{
+ UNPROTECT(2); // r_container, list_container
return fst_error("An unknown C++ error occured in the fstlib library");
}
- SEXP colNameVec = colNames->StrVector();
+ SEXP colNameVec = tableReader.GetColNames();
+
+ // SEXP colNameVec = colNames->StrVector();
// Generalize to full atributes
- Rf_setAttrib(tableReader.resTable, R_NamesSymbol, colNameVec);
+ SEXP resTable = PROTECT(tableReader.ResTable());
+ Rf_setAttrib(resTable, R_NamesSymbol, colNameVec);
+ UNPROTECT(1);
// Convert keyIndex to keyNames
- SEXP keyNames;
- PROTECT(keyNames = Rf_allocVector(STRSXP, keyIndex.size()));
+ SEXP keyNames = PROTECT(Rf_allocVector(STRSXP, keyIndex.size()));
int count = 0;
@@ -316,10 +295,10 @@ SEXP fstretrieve(String fileName, SEXP columnSelection, SEXP startRow, SEXP endR
SET_STRING_ELT(keyNames, count++, STRING_ELT(colNameVec, *keyIt));
}
- UNPROTECT(1);
+ UNPROTECT(3); // r_container, keyNames, list_container
return List::create(
_["keyNames"] = keyNames,
_["keyIndex"] = keyIndex,
- _["resTable"] = tableReader.resTable);
+ _["resTable"] = tableReader.ResTable());
}
diff --git a/src/flex_store.h b/src/flex_store.h
index 5cd859a..989046f 100644
--- a/src/flex_store.h
+++ b/src/flex_store.h
@@ -35,10 +35,10 @@ int fstlib_version();
SEXP fststore(Rcpp::String fileName, SEXP table, SEXP compression, SEXP uniformEncoding);
// [[Rcpp::export]]
-SEXP fstmetadata(Rcpp::String fileName, SEXP oldFormat);
+SEXP fstmetadata(Rcpp::String fileName);
// [[Rcpp::export]]
-SEXP fstretrieve(Rcpp::String fileName, SEXP columnSelection, SEXP startRow, SEXP endRow, SEXP oldFormat);
+SEXP fstretrieve(Rcpp::String fileName, SEXP columnSelection, SEXP startRow, SEXP endRow);
#endif // FASTSTORE_H
diff --git a/src/flex_store_v1.cpp b/src/flex_store_v1.cpp
deleted file mode 100644
index 081cc31..0000000
--- a/src/flex_store_v1.cpp
+++ /dev/null
@@ -1,542 +0,0 @@
-/*
- fst - R package for ultra fast storage and retrieval of datasets
-
- Copyright (C) 2017-present, Mark AJ Klik
-
- This file is part of the fst R package.
-
- The fst R package is free software: you can redistribute it and/or modify it
- under the terms of the GNU Affero General Public License version 3 as
- published by the Free Software Foundation.
-
- The fst R package is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License
- for more details.
-
- You should have received a copy of the GNU Affero General Public License along
- with the fst R package. If not, see .
-
- You can contact the author at:
- - fst R package source repository : https://github.com/fstpackage/fst
-*/
-
-
-#include
-
-#include
-#include
-
-#include
-
-#include
-
-#include
-#include
-#include
-#include
-#include
-
-
-using namespace std;
-using namespace Rcpp;
-
-
-// Scans further than necessary for safety!!!
-List fstMeta_v1(String fileName)
-{
- // Open file
- ifstream myfile;
- myfile.open(fileName.get_cstring(), ios::binary);
-
- // Additional check compared to CRAN v0.7.2
- if (myfile.fail())
- {
- myfile.close();
- ::Rf_error("There was an error opening the fst file. Please check for a correct filename.");
- }
-
-
- // Read column size
- short int colSizes[2];
- myfile.read((char*) &colSizes, 2 * sizeof(short int));
-
-
- // Additional checks compared to CRAN v0.7.2
- if ((colSizes[0] < 0) | (colSizes[1] < 0))
- {
- myfile.close();
- ::Rf_error("Unrecognised file type, are you sure this is a fst file?");
- }
-
- short int nrOfCols = colSizes[0];
- short int keyLength = colSizes[1] & 32767;
-
-
- // Read key column index
- short int *keyColumns = new short int[keyLength + 1];
- myfile.read((char*) keyColumns, keyLength * sizeof(short int)); // may be of length zero
-
- // Additional checks compared to CRAN v0.7.2
- if (keyLength > 0)
- {
- for (int keyColCount = 0; keyColCount < keyLength; ++keyColCount)
- {
- if ((keyColumns[keyColCount] < 0) | (keyColumns[keyColCount] >= nrOfCols))
- {
- myfile.close();
- delete[] keyColumns;
- ::Rf_error("Error reading file header, are you sure this is a fst file?");
- }
- }
- }
-
-
- // Read column types
- short int *colTypes = new short int[nrOfCols];
- myfile.read((char*) colTypes, nrOfCols * sizeof(short int));
-
- // Additional checks compared to CRAN v0.7.2
- for (int colCount = 0; colCount < nrOfCols; ++colCount)
- {
- if ((colTypes[colCount] < 0) | (colTypes[colCount] > 5))
- {
- myfile.close();
- delete[] colTypes;
- delete[] keyColumns;
- ::Rf_error("Error reading file header, are you sure this is a fst file?");
- }
- }
-
-
- // Convert to integer vector
- IntegerVector colTypeVec(nrOfCols);
- for (int col = 0; col != nrOfCols; ++col)
- {
- colTypeVec[col] = colTypes[col];
- }
-
-
- // Read block positions
- unsigned long long* allBlockPos = new unsigned long long[nrOfCols + 1];
- myfile.read((char*) allBlockPos, (nrOfCols + 1) * sizeof(unsigned long long));
-
- // Additional checks compared to CRAN v0.7.2
- for (int colCount = 2; colCount <= nrOfCols; ++colCount)
- {
- // block positions should be monotonically increasing
- if (allBlockPos[colCount] < allBlockPos[colCount - 1])
- {
- myfile.close();
- delete[] colTypes;
- delete[] keyColumns;
- delete[] allBlockPos;
- ::Rf_error("Error reading file header (blockPos), are you sure this is a fst file?");
- }
- }
-
- int nrOfRows = (int) allBlockPos[0];
-
- // Additional checks compared to CRAN v0.7.2
- if (nrOfRows <= 0)
- {
- myfile.close();
- delete[] colTypes;
- delete[] keyColumns;
- delete[] allBlockPos;
- ::Rf_error("Error reading file header (blockPos), are you sure this is a fst file?");
- }
-
-
- // Convert to numeric vector
- NumericVector blockPosVec(nrOfCols + 1);
- for (int col = 0; col != nrOfCols + 1; ++col)
- {
- blockPosVec[col] = (double) allBlockPos[col];
- }
-
-
- // Read column names
- SEXP colNames;
- PROTECT(colNames = Rf_allocVector(STRSXP, nrOfCols));
- unsigned long long offset = (nrOfCols + 1) * sizeof(unsigned long long) + (nrOfCols + keyLength + 2) * sizeof(short int);
- fdsReadCharVec_v1(myfile, colNames, offset, 0, (unsigned int) nrOfCols, (unsigned int) nrOfCols);
-
- // cleanup
- delete[] colTypes;
- delete[] allBlockPos;
- myfile.close();
-
- // add deprecated warning
- Rf_warning("This fst file was created with a beta version of the fst package. Please re-write the data as this format will not be supported in future releases.");
-
- if (keyLength > 0)
- {
- SEXP keyNames;
- PROTECT(keyNames = Rf_allocVector(STRSXP, keyLength));
- for (int i = 0; i < keyLength; ++i)
- {
- SET_STRING_ELT(keyNames, i, STRING_ELT(colNames, keyColumns[i]));
- }
-
- IntegerVector keyColIndex(keyLength);
- for (int col = 0; col != keyLength; ++col)
- {
- keyColIndex[col] = keyColumns[col];
- }
-
- UNPROTECT(2);
- delete[] keyColumns;
-
- return List::create(
- _["nrOfCols"] = nrOfCols,
- _["nrOfRows"] = nrOfRows,
- _["fstVersion"] = 0,
- _["keyLength"] = keyLength,
- _["colBaseType"] = colTypeVec,
- _["colNames"] = colNames,
- _["keyColIndex"] = keyColIndex,
- _["keyNames"] = keyNames);
- }
-
- UNPROTECT(1);
- delete[] keyColumns;
-
- return List::create(
- _["nrOfCols"] = nrOfCols,
- _["nrOfRows"] = nrOfRows,
- _["fstVersion"] = 0,
- _["keyLength"] = keyLength,
- _["colBaseType"] = colTypeVec,
- _["colNames"] = colNames);
-}
-
-
-SEXP fstRead_v1(String fileName, SEXP columnSelection, SEXP startRow, SEXP endRow)
-{
- // Open file
- ifstream myfile;
- myfile.open(fileName.get_cstring(), ios::binary);
-
- // Additional check compared to CRAN v0.7.2
- if (myfile.fail())
- {
- myfile.close();
- ::Rf_error("There was an error opening the fst file. Please check for a correct filename.");
- }
-
- // Read column size
- short int colSizes[2];
- myfile.read((char*) &colSizes, 2 * sizeof(short int));
-
-
- // Additional checks compared to CRAN v0.7.2
- if ((colSizes[0] < 0) | (colSizes[1] < 0))
- {
- myfile.close();
- ::Rf_error("Unrecognised file type, are you sure this is a fst file?");
- }
-
- short int nrOfCols = colSizes[0];
- short int keyLength = colSizes[1] & 32767;
-
-
- // Read key column index
- short int *keyColumns = new short int[keyLength + 1];
- myfile.read((char*) keyColumns, keyLength * sizeof(short int)); // may be of length zero
-
- // Additional checks compared to CRAN v0.7.2
- if (keyLength > 0)
- {
- for (int keyColCount = 0; keyColCount < keyLength; ++keyColCount)
- {
- if ((keyColumns[keyColCount] < 0) | (keyColumns[keyColCount] >= nrOfCols))
- {
- myfile.close();
- delete[] keyColumns;
- ::Rf_error("Error reading file header, are you sure this is a fst file?");
- }
- }
- }
-
- // Read column types
- short int *colTypes = new short int[nrOfCols];
- myfile.read((char*) colTypes, nrOfCols * sizeof(short int));
-
- // Additional checks compared to CRAN v0.7.2
- for (int colCount = 0; colCount < nrOfCols; ++colCount)
- {
- if ((colTypes[colCount] < 0) | (colTypes[colCount] > 5))
- {
- myfile.close();
- delete[] colTypes;
- delete[] keyColumns;
- ::Rf_error("Error reading file header, are you sure this is a fst file?");
- }
- }
-
-
- // Read block positions
- unsigned long long* allBlockPos = new unsigned long long[nrOfCols + 1];
- myfile.read((char*) allBlockPos, (nrOfCols + 1) * sizeof(unsigned long long));
-
- // Additional checks compared to CRAN v0.7.2
- for (int colCount = 2; colCount <= nrOfCols; ++colCount)
- {
- // block positions should be monotonically increasing
- if (allBlockPos[colCount] < allBlockPos[colCount - 1])
- {
- myfile.close();
- delete[] colTypes;
- delete[] keyColumns;
- delete[] allBlockPos;
- ::Rf_error("Error reading file header (blockPos), are you sure this is a fst file?");
- }
- }
-
- // Read column names
- SEXP colNames;
- PROTECT(colNames = Rf_allocVector(STRSXP, nrOfCols));
- unsigned long long offset = (nrOfCols + 1) * sizeof(unsigned long long) + (nrOfCols + keyLength + 2) * sizeof(short int);
- fdsReadCharVec_v1(myfile, colNames, offset, 0, (unsigned int) nrOfCols, (unsigned int) nrOfCols);
-
-
- // Determine column selection
- int *colIndex = new int[nrOfCols];
- int nrOfSelect = LENGTH(columnSelection);
-
- if (Rf_isNull(columnSelection))
- {
- for (int colNr = 0; colNr < nrOfCols; ++colNr)
- {
- colIndex[colNr] = colNr;
- }
- nrOfSelect = nrOfCols;
- }
- else // determine column numbers of column names
- {
- int equal;
- for (int colSel = 0; colSel < nrOfSelect; ++colSel)
- {
- equal = -1;
- const char* str1 = CHAR(STRING_ELT(columnSelection, colSel));
-
- for (int colNr = 0; colNr < nrOfCols; ++colNr)
- {
- const char* str2 = CHAR(STRING_ELT(colNames, colNr));
- if (strcmp(str1, str2) == 0)
- {
- equal = colNr;
- break;
- }
- }
-
- if (equal == -1)
- {
- delete[] colIndex;
- delete[] colTypes;
- delete[] keyColumns;
- delete[] allBlockPos;
- myfile.close();
- UNPROTECT(1);
- ::Rf_error("Selected column not found.");
- }
-
- colIndex[colSel] = equal;
- }
- }
-
-
- // Check range of selected rows
- int firstRow = INTEGER(startRow)[0] - 1;
- int nrOfRows = (int) allBlockPos[0];
-
- if (firstRow >= nrOfRows || firstRow < 0)
- {
- delete[] colIndex;
- delete[] colTypes;
- delete[] keyColumns;
- delete[] allBlockPos;
- myfile.close();
- UNPROTECT(1);
-
- if (firstRow < 0)
- {
- ::Rf_error("Parameter fromRow should have a positive value.");
- }
-
- ::Rf_error("Row selection is out of range.");
- }
-
- int length = nrOfRows - firstRow;
-
- // Determine vector length
- if (!Rf_isNull(endRow))
- {
- int lastRow = *INTEGER(endRow);
-
- if (lastRow <= firstRow)
- {
- delete[] colIndex;
- delete[] colTypes;
- delete[] keyColumns;
- delete[] allBlockPos;
- myfile.close();
- UNPROTECT(1);
- ::Rf_error("Parameter 'lastRow' should be equal to or larger than parameter 'fromRow'.");
- }
-
- length = min(lastRow - firstRow, nrOfRows - firstRow);
- }
-
- unsigned long long* blockPos = allBlockPos + 1;
-
- // Vector of selected column names
- SEXP selectedNames;
- PROTECT(selectedNames = Rf_allocVector(STRSXP, nrOfSelect));
-
- SEXP resTable;
- PROTECT(resTable = Rf_allocVector(VECSXP, nrOfSelect));
-
- SEXP colInfo;
- PROTECT(colInfo = Rf_allocVector(VECSXP, nrOfSelect));
-
- for (int colSel = 0; colSel < nrOfSelect; ++colSel)
- {
- int colNr = colIndex[colSel];
-
- if (colNr < 0 || colNr >= nrOfCols)
- {
- delete[] colIndex;
- delete[] colTypes;
- delete[] keyColumns;
- delete[] allBlockPos;
- myfile.close();
- UNPROTECT(4);
- ::Rf_error("Column selection is out of range.");
- }
-
- // Column name
- SEXP selName = STRING_ELT(colNames, colNr);
- SET_STRING_ELT(selectedNames, colSel, selName);
-
- unsigned long long pos = blockPos[colNr];
-
- SEXP singleColInfo;
-
- switch (colTypes[colNr])
- {
- // Character vector
- case 1:
- SEXP strVec;
- PROTECT(strVec = Rf_allocVector(STRSXP, length));
- singleColInfo = fdsReadCharVec_v1(myfile, strVec, pos, firstRow, length, nrOfRows);
- SET_VECTOR_ELT(resTable, colSel, strVec);
- UNPROTECT(1);
- break;
-
- // Integer vector
- case 2:
- SEXP intVec;
- PROTECT(intVec = Rf_allocVector(INTSXP, length));
- singleColInfo = fdsReadIntVec_v2(myfile, intVec, pos, firstRow, length, nrOfRows);
- SET_VECTOR_ELT(resTable, colSel, intVec);
- UNPROTECT(1);
- break;
-
- // Real vector
- case 3:
- SEXP realVec;
- PROTECT(realVec = Rf_allocVector(REALSXP, length));
- singleColInfo = fdsReadRealVec_v3(myfile, realVec, pos, firstRow, length, nrOfRows);
- SET_VECTOR_ELT(resTable, colSel, realVec);
- UNPROTECT(1);
- break;
-
- // Logical vector
- case 4:
- SEXP boolVec;
- PROTECT(boolVec = Rf_allocVector(LGLSXP, length));
- singleColInfo = fdsReadLogicalVec_v4(myfile, boolVec, pos, firstRow, length, nrOfRows);
- SET_VECTOR_ELT(resTable, colSel, boolVec);
- UNPROTECT(1);
- break;
-
- // Factor vector
- case 5:
- SEXP facVec;
- PROTECT(facVec = Rf_allocVector(INTSXP, length));
- singleColInfo = fdsReadFactorVec_v5(myfile, facVec, pos, firstRow, length, nrOfRows);
- SET_VECTOR_ELT(resTable, colSel, facVec);
- UNPROTECT(2); // level string was also generated
- break;
-
-
- default:
- delete[] colIndex;
- delete[] colTypes;
- delete[] keyColumns;
- delete[] allBlockPos;
- myfile.close();
- ::Rf_error("Unknown type found in column.");
- }
-
- SET_VECTOR_ELT(colInfo, colSel, singleColInfo);
- }
-
- myfile.close();
-
- Rf_setAttrib(resTable, R_NamesSymbol, selectedNames);
-
- int found = 0;
- for (int i = 0; i < keyLength; ++i)
- {
- for (int colSel = 0; colSel < nrOfSelect; ++colSel)
- {
- if (keyColumns[i] == colIndex[colSel]) // key present in result
- {
- ++found;
- break;
- }
- }
- }
-
- // Only keys present in result set
- if (found > 0)
- {
- SEXP keyNames;
- PROTECT(keyNames = Rf_allocVector(STRSXP, found));
- for (int i = 0; i < found; ++i)
- {
- SET_STRING_ELT(keyNames, i, STRING_ELT(colNames, keyColumns[i]));
- }
-
- // cleanup
- UNPROTECT(5);
- delete[] colIndex;
- delete[] colTypes;
- delete[] keyColumns;
- delete[] allBlockPos;
-
- return List::create(
- _["keyNames"] = keyNames,
- _["resTable"] = resTable,
- _["selectedNames"] = selectedNames,
- _["colInfo"] = colInfo);
- }
-
- UNPROTECT(4);
- delete[] colIndex;
- delete[] colTypes;
- delete[] keyColumns;
- delete[] allBlockPos;
-
- // add deprecated warning
- Rf_warning("This fst file was created with a beta version of the fst package. Please re-write the data as this format will not be supported in future releases.");
-
- return List::create(
- _["keyNames"] = R_NilValue,
- _["selectedNames"] = selectedNames,
- _["resTable"] = resTable,
- _["colInfo"] = colInfo);
-}
diff --git a/src/flex_store_v1.h b/src/flex_store_v1.h
deleted file mode 100644
index ae53a36..0000000
--- a/src/flex_store_v1.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- fst - R package for ultra fast storage and retrieval of datasets
-
- Copyright (C) 2017-present, Mark AJ Klik
-
- This file is part of the fst R package.
-
- The fst R package is free software: you can redistribute it and/or modify it
- under the terms of the GNU Affero General Public License version 3 as
- published by the Free Software Foundation.
-
- The fst R package is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License
- for more details.
-
- You should have received a copy of the GNU Affero General Public License along
- with the fst R package. If not, see .
-
- You can contact the author at:
- - fst R package source repository : https://github.com/fstpackage/fst
-*/
-
-
-#ifndef FASTSTORE_V1_H
-#define FASTSTORE_V1_H
-
-#include
-#include
-
-#include
-
-
-Rcpp::List fstMeta_v1(Rcpp::String fileName);
-
-SEXP fstRead_v1(Rcpp::String fileName, SEXP columnSelection, SEXP startRow, SEXP endRow);
-
-
-#endif // FASTSTORE_V1_H
diff --git a/src/fst_blockrunner_char.cpp b/src/fst_blockrunner_char.cpp
index 46fa5b9..2944b10 100644
--- a/src/fst_blockrunner_char.cpp
+++ b/src/fst_blockrunner_char.cpp
@@ -127,8 +127,8 @@ void BlockWriterChar::SetBuffersFromVec(unsigned long long startCount, unsigned
void BlockReaderChar::AllocateVec(unsigned long long vecLength)
{
- PROTECT(this->strVec = Rf_allocVector(STRSXP, vecLength));
- isProtected = true;
+ this->strVec = Rf_allocVector(STRSXP, vecLength);
+ // isProtected = true;
}
void BlockReaderChar::BufferToVec(unsigned long long nrOfElements, unsigned long long startElem,
diff --git a/src/fst_blockrunner_char.h b/src/fst_blockrunner_char.h
index 074871f..f1c0d27 100644
--- a/src/fst_blockrunner_char.h
+++ b/src/fst_blockrunner_char.h
@@ -105,18 +105,19 @@ class BlockWriterChar : public IStringWriter
class BlockReaderChar : public IStringColumn
{
SEXP strVec;
- bool isProtected;
+ // bool isProtected;
cetype_t strEncoding;
StringEncoding string_encoding = StringEncoding::NATIVE;
public:
BlockReaderChar()
{
- isProtected = false;
+ // isProtected = false;
}
- ~BlockReaderChar(){ if (isProtected) UNPROTECT(1); }
+ ~BlockReaderChar(){}
+ // make sure strVec is PROTECTED after creation
void AllocateVec(unsigned long long vecLength);
StringEncoding GetEncoding()
diff --git a/src/fst_column.h b/src/fst_column.h
index faafb58..16640ed 100644
--- a/src/fst_column.h
+++ b/src/fst_column.h
@@ -33,16 +33,15 @@ class StringArray : public IStringArray
{
SEXP strVec;
cetype_t strEncoding = cetype_t::CE_NATIVE;
- bool isProtected;
public:
- ~StringArray() { if (isProtected) UNPROTECT(1); }
+ ~StringArray() { }
+ // strVec should be PROTECTED after calling this method
void AllocateArray(unsigned int length)
{
- PROTECT(this->strVec = Rf_allocVector(STRSXP, length));
- isProtected = true;
+ this->strVec = Rf_allocVector(STRSXP, length);
}
// Use an existing SEXP
@@ -100,20 +99,50 @@ class FactorColumn : public IFactorColumn
public:
SEXP intVec;
std::unique_ptr blockReaderStrVecP;
- FstColumnAttribute columnAttribute;
+ // FstColumnAttribute columnAttribute;
- FactorColumn(int nrOfRows, FstColumnAttribute columnAttribute)
+ FactorColumn(int nrOfRows, int nrOfLevels, FstColumnAttribute columnAttribute)
{
- intVec = Rf_allocVector(INTSXP, nrOfRows);
- PROTECT(intVec);
+ intVec = PROTECT(Rf_allocVector(INTSXP, nrOfRows));
- this->columnAttribute = columnAttribute; // e.g. for 'FACTOR_ORDERED' specification
+ // this->columnAttribute = columnAttribute; // e.g. for 'FACTOR_ORDERED' specification
blockReaderStrVecP = std::unique_ptr(new BlockReaderChar());
+
+ BlockReaderChar* block_reader = blockReaderStrVecP.get();
+ block_reader->AllocateVec(nrOfLevels);
+ SEXP str_vector = PROTECT(block_reader->StrVector());
+
+ // set and PROTECT the levels attribute
+ SEXP level_str = PROTECT(Rf_mkString("levels"));
+ Rf_setAttrib(intVec, level_str, str_vector);
+ UNPROTECT(2); // level_str, str_vector
+
+ if (columnAttribute == FstColumnAttribute::FACTOR_ORDERED) // ordered factor
+ {
+ SEXP class_str = PROTECT(Rf_mkString("class"));
+ SEXP classes = PROTECT(Rf_allocVector(STRSXP, 2));
+
+ SET_STRING_ELT(classes, 0, Rf_mkChar("ordered"));
+ SET_STRING_ELT(classes, 1, Rf_mkChar("factor"));
+ Rf_setAttrib(intVec, class_str, classes);
+
+ UNPROTECT(2); // classes, class_str
+ }
+ else // unordered factor
+ {
+ SEXP class_str = PROTECT(Rf_mkString("class"));
+ SEXP factor_str = PROTECT(Rf_mkString("factor"));
+
+ Rf_setAttrib(intVec, class_str, factor_str);
+
+ UNPROTECT(2); // factor_str, class_str
+ }
+
+ UNPROTECT(1); // intVec
}
~FactorColumn()
{
- UNPROTECT(1);
};
int* LevelData()
@@ -125,11 +154,6 @@ class FactorColumn : public IFactorColumn
{
return blockReaderStrVecP.get();
}
-
- FstColumnAttribute Attribute()
- {
- return columnAttribute;
- }
};
@@ -141,12 +165,10 @@ class LogicalColumn : public ILogicalColumn
LogicalColumn(int nrOfRows)
{
boolVec = Rf_allocVector(LGLSXP, nrOfRows);
- PROTECT(boolVec);
}
~LogicalColumn()
{
- UNPROTECT(1);
}
int* Data()
@@ -163,8 +185,7 @@ class Int64Column : public IInt64Column
Int64Column(int nrOfRows, FstColumnAttribute columnAttribute, short int scale)
{
- int64Vec = Rf_allocVector(REALSXP, nrOfRows);
- PROTECT(int64Vec);
+ int64Vec = PROTECT(Rf_allocVector(REALSXP, nrOfRows));
// test for nanotime type
if (columnAttribute == FstColumnAttribute::INT_64_TIME_SECONDS)
@@ -179,22 +200,25 @@ class Int64Column : public IInt64Column
PROTECT(classAttr = Rf_mkString("nanotime"));
Rf_setAttrib(classAttr, Rf_mkString("package"), Rf_mkString("nanotime"));
- UNPROTECT(1); // necessary?
Rf_classgets(int64Vec, classAttr);
Rf_setAttrib(int64Vec, Rf_mkString(".S3Class"), Rf_mkString("integer64"));
SET_S4_OBJECT(int64Vec);
+ UNPROTECT(2); // int64Vec, classAttr
return;
}
// default int64 column type
- Rf_setAttrib(int64Vec, Rf_mkString("class"), Rf_mkString("integer64"));
+ SEXP int64_str = PROTECT(Rf_mkString("integer64"));
+
+ Rf_classgets(int64Vec, int64_str);
+
+ UNPROTECT(2); // int64_str, int64Vec
}
~Int64Column()
{
- UNPROTECT(1);
}
long long* Data()
@@ -250,6 +274,7 @@ class DoubleColumn : public IDoubleColumn
Rf_setAttrib(colVec, Rf_mkString("units"), Rf_mkString("secs"));
}
+ UNPROTECT(1); // colVec
return;
}
@@ -257,6 +282,7 @@ class DoubleColumn : public IDoubleColumn
if (columnAttribute == FstColumnAttribute::DOUBLE_64_DATE_DAYS)
{
Rf_classgets(colVec, Rf_mkString("Date"));
+ UNPROTECT(1); // colVec
return;
}
@@ -267,9 +293,11 @@ class DoubleColumn : public IDoubleColumn
if (scale != FstTimeScale::SECONDS)
{
+ UNPROTECT(1); // colVec
throw(std::runtime_error("ITime column with unknown scale detected"));
}
+ UNPROTECT(1); // colVec
return;
}
@@ -277,20 +305,23 @@ class DoubleColumn : public IDoubleColumn
// POSIXct type
if (columnAttribute == FstColumnAttribute::DOUBLE_64_TIMESTAMP_SECONDS)
{
- SEXP classes;
- PROTECT(classes = Rf_allocVector(STRSXP, 2));
+ SEXP classes = PROTECT(Rf_allocVector(STRSXP, 2));
+
SET_STRING_ELT(classes, 0, Rf_mkChar("POSIXct"));
SET_STRING_ELT(classes, 1, Rf_mkChar("POSIXt"));
- UNPROTECT(1);
Rf_classgets(colVec, classes);
+
+ UNPROTECT(2); // classes, colVec
+
return;
}
+
+ UNPROTECT(1); // colVec
}
~DoubleColumn()
{
- UNPROTECT(1);
}
double* Data()
@@ -305,12 +336,15 @@ class DoubleColumn : public IDoubleColumn
{
if (annotation.length() > 0)
{
- SEXP timeZone = Rf_ScalarString(Rf_mkCharLen(annotation.c_str(), annotation.length()));
+ SEXP timeZone = PROTECT(Rf_ScalarString(Rf_mkCharLen(annotation.c_str(), annotation.length())));
Rf_setAttrib(colVec, Rf_install("tzone"), timeZone);
+ UNPROTECT(1);
return;
}
- Rf_setAttrib(colVec, Rf_install("tzone"), Rf_mkString(""));
+ SEXP empty_str = PROTECT(Rf_mkString(""));
+ Rf_setAttrib(colVec, Rf_install("tzone"), empty_str);
+ UNPROTECT(1);
return;
}
}
@@ -326,8 +360,7 @@ class IntegerColumn : public IIntegerColumn
IntegerColumn(int nrOfRows, FstColumnAttribute columnAttribute, short int scale)
{
- colVec = Rf_allocVector(INTSXP, nrOfRows);
- PROTECT(colVec);
+ colVec = PROTECT(Rf_allocVector(INTSXP, nrOfRows));
// store for later reference
this->columnAttribute = columnAttribute;
@@ -359,32 +392,34 @@ class IntegerColumn : public IIntegerColumn
Rf_setAttrib(colVec, Rf_mkString("units"), Rf_mkString("secs"));
}
+ UNPROTECT(1); // colVec
return;
}
if (columnAttribute == FstColumnAttribute::INT_32_DATE_DAYS)
{
- SEXP classAttr;
+ SEXP classAttr = PROTECT(Rf_allocVector(STRSXP, 2));
- PROTECT(classAttr = Rf_allocVector(STRSXP, 2));
SET_STRING_ELT(classAttr, 0, Rf_mkChar("IDate"));
SET_STRING_ELT(classAttr, 1, Rf_mkChar("Date"));
- UNPROTECT(1);
Rf_classgets(colVec, classAttr);
+ UNPROTECT(2); // classAttr, colVec
+
return;
}
if (columnAttribute == FstColumnAttribute::INT_32_TIMESTAMP_SECONDS)
{
- SEXP classes;
- PROTECT(classes = Rf_allocVector(STRSXP, 2));
+ SEXP classes = PROTECT(Rf_allocVector(STRSXP, 2));
SET_STRING_ELT(classes, 0, Rf_mkChar("POSIXct"));
SET_STRING_ELT(classes, 1, Rf_mkChar("POSIXt"));
- UNPROTECT(1);
Rf_classgets(colVec, classes);
+
+ UNPROTECT(2); // classes, colVec
+
return;
}
@@ -394,17 +429,19 @@ class IntegerColumn : public IIntegerColumn
if (scale != FstTimeScale::SECONDS)
{
+ UNPROTECT(1); // colVec
throw(std::runtime_error("ITime column with unknown scale detected"));
}
+ UNPROTECT(1); // colVec
return;
}
+ UNPROTECT(1); // colVec
}
~IntegerColumn()
{
- UNPROTECT(1);
}
int* Data()
@@ -419,12 +456,18 @@ class IntegerColumn : public IIntegerColumn
{
if (annotation.length() > 0)
{
- Rf_setAttrib(colVec, Rf_install("tzone"),
- Rf_ScalarString(Rf_mkCharLen(annotation.c_str(), annotation.length())));
+ SEXP annotation_str = PROTECT(Rf_ScalarString(Rf_mkCharLen(annotation.c_str(), annotation.length())));
+
+ Rf_setAttrib(colVec, Rf_install("tzone"), annotation_str);
+
+ UNPROTECT(1); // annotation_str
return;
}
- Rf_setAttrib(colVec, Rf_install("tzone"), Rf_mkString(""));
+ SEXP empty_str = PROTECT(Rf_mkString(""));
+ Rf_setAttrib(colVec, Rf_install("tzone"), empty_str);
+
+ UNPROTECT(1); // empty_str
return;
}
}
@@ -438,13 +481,12 @@ class ByteColumn : public IByteColumn
ByteColumn(int nrOfRows)
{
+ // Note that this SEXP needs to be protected after creation
colVec = Rf_allocVector(RAWSXP, nrOfRows);
- PROTECT(colVec);
}
~ByteColumn()
{
- UNPROTECT(1);
}
char* Data()
diff --git a/src/fst_column_factory.h b/src/fst_column_factory.h
index 7a17dc6..d9095e4 100644
--- a/src/fst_column_factory.h
+++ b/src/fst_column_factory.h
@@ -43,9 +43,9 @@ class ColumnFactory : public IColumnFactory
public:
~ColumnFactory() {};
- IFactorColumn* CreateFactorColumn(int nrOfRows, FstColumnAttribute columnAttribute)
+ IFactorColumn* CreateFactorColumn(int nrOfRows, int nrOfLevels, FstColumnAttribute columnAttribute)
{
- return new FactorColumn(nrOfRows, columnAttribute);
+ return new FactorColumn(nrOfRows, nrOfLevels, columnAttribute);
}
ILogicalColumn* CreateLogicalColumn(int nrOfRows, FstColumnAttribute columnAttribute)
diff --git a/src/fst_compress.cpp b/src/fst_compress.cpp
index af7adf9..b85629b 100644
--- a/src/fst_compress.cpp
+++ b/src/fst_compress.cpp
@@ -89,17 +89,23 @@ SEXP fstcomp(SEXP rawVec, SEXP compressor, SEXP compression, SEXP hash)
Rf_error("Please specify true of false for parameter hash.");
}
- if (Rf_NonNullStringMatch(STRING_ELT(compressor, 0), Rf_mkChar("LZ4")))
+ SEXP lz4_str = PROTECT(Rf_mkChar("LZ4"));
+ SEXP zstd_str = PROTECT(Rf_mkChar("ZSTD"));
+
+ if (Rf_NonNullStringMatch(STRING_ELT(compressor, 0), lz4_str))
{
algo = COMPRESSION_ALGORITHM::ALGORITHM_LZ4;
- } else if (Rf_NonNullStringMatch(STRING_ELT(compressor, 0), Rf_mkChar("ZSTD")))
+ } else if (Rf_NonNullStringMatch(STRING_ELT(compressor, 0), zstd_str))
{
algo = COMPRESSION_ALGORITHM::ALGORITHM_ZSTD;
} else
{
+ UNPROTECT(2); // lz4_str and zstd_str
return fst_error("Unknown compression algorithm selected");
}
+ UNPROTECT(2); // lz4_str and zstd_str
+
FstCompressor fstcompressor(algo, *INTEGER(compression), (ITypeFactory*) typeFactoryP.get());
unsigned long long vecLength = Rf_xlength(rawVec);
@@ -109,6 +115,7 @@ SEXP fstcomp(SEXP rawVec, SEXP compressor, SEXP compression, SEXP hash)
try
{
+ // Creates an UNPROTECTED SEXP vector which needs to be protected before calling an allocating R API method
blobContainerP = std::unique_ptr(fstcompressor.CompressBlob(data, vecLength, *LOGICAL(hash)));
}
catch(const std::runtime_error& e)
@@ -122,6 +129,7 @@ SEXP fstcomp(SEXP rawVec, SEXP compressor, SEXP compression, SEXP hash)
SEXP resVec = ((BlobContainer*)(blobContainerP.get()))->RVector();
+ // IBlobContainer will be destructed upon exiting function
return resVec;
}
diff --git a/src/fst_string_vector_container.h b/src/fst_string_vector_container.h
new file mode 100644
index 0000000..a12d313
--- /dev/null
+++ b/src/fst_string_vector_container.h
@@ -0,0 +1,92 @@
+/*
+ fst - R package for ultra fast storage and retrieval of datasets
+
+ Copyright (C) 2017-present, Mark AJ Klik
+
+ This file is part of the fst R package.
+
+ The fst R package is free software: you can redistribute it and/or modify it
+ under the terms of the GNU Affero General Public License version 3 as
+ published by the Free Software Foundation.
+
+ The fst R package is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License
+ for more details.
+
+ You should have received a copy of the GNU Affero General Public License along
+ with the fst R package. If not, see .
+
+ You can contact the author at:
+ - fst R package source repository : https://github.com/fstpackage/fst
+*/
+
+#ifndef STRING_VECTOR_CONTAINER_H
+#define STRING_VECTOR_CONTAINER_H
+
+
+#include
+#include
+
+#include
+
+#include
+
+
+// Class is meant for smaller character vectors that need protection from garbage collection
+
+class StringVectorContainer : public IStringColumn
+{
+ SEXP container; // make sure the contained object is PROTECTED
+ std::unique_ptr str_vector_p;
+
+public:
+ StringVectorContainer(SEXP container_list)
+ {
+ container = container_list;
+ }
+
+ ~StringVectorContainer(){}
+
+ void AllocateVec(unsigned long long vecLength)
+ {
+ str_vector_p = std::unique_ptr(new BlockReaderChar()); // unprotected SEXP
+ BlockReaderChar* block_reader = str_vector_p.get();
+
+ // allocate
+ block_reader->AllocateVec(vecLength);
+
+ // make sure vector is protected
+ SEXP str_vec = block_reader->StrVector();
+ SET_VECTOR_ELT(container, 0, str_vec);
+ }
+
+ StringEncoding GetEncoding()
+ {
+ return str_vector_p.get()->GetEncoding();
+ }
+
+ void SetEncoding(StringEncoding fst_string_encoding)
+ {
+ str_vector_p.get()->SetEncoding(fst_string_encoding);
+ }
+
+ void BufferToVec(unsigned long long nrOfElements, unsigned long long startElem, unsigned long long endElem,
+ unsigned long long vecOffset, unsigned int* sizeMeta, char* buf)
+ {
+ str_vector_p.get()->BufferToVec(nrOfElements, startElem, endElem, vecOffset, sizeMeta, buf);
+ }
+
+ const char* GetElement(unsigned long long elementNr)
+ {
+ return str_vector_p.get()->GetElement(elementNr);
+ }
+
+ SEXP StrVector()
+ {
+ return VECTOR_ELT(container, 0);
+ }
+};
+
+
+#endif // STRING_VECTOR_CONTAINER_H
diff --git a/src/fst_table.cpp b/src/fst_table.cpp
index 216e198..0a3e5b4 100644
--- a/src/fst_table.cpp
+++ b/src/fst_table.cpp
@@ -35,11 +35,11 @@
using namespace Rcpp;
-FstTable::FstTable(SEXP &table, int uniformEncoding)
+FstTable::FstTable(SEXP &table, int uniformEncoding, SEXP r_container)
{
+ this->r_container = r_container;
this->rTable = &table;
this->nrOfCols = 0;
- this->isProtected = false;
this->uniformEncoding = uniformEncoding;
}
@@ -306,9 +306,17 @@ IStringWriter* FstTable::GetStringWriter(unsigned int colNr)
IStringWriter* FstTable::GetLevelWriter(unsigned int colNr)
{
cols = VECTOR_ELT(*rTable, colNr); // retrieve column vector
- cols = Rf_getAttrib(cols, Rf_mkString("levels"));
+
+ SEXP levels_str = PROTECT(Rf_mkString("levels"));
+ cols = PROTECT(Rf_getAttrib(cols, levels_str));
unsigned int nrOfFactorLevels = LENGTH(cols);
- return new BlockWriterChar(cols, nrOfFactorLevels, MAX_CHAR_STACK_SIZE, uniformEncoding);
+
+ IStringWriter* str_writer = new BlockWriterChar(cols, nrOfFactorLevels,
+ MAX_CHAR_STACK_SIZE, uniformEncoding);
+
+ UNPROTECT(2);
+
+ return str_writer;
}
@@ -341,28 +349,44 @@ inline unsigned int FindKey(StringVector colNameList, String item)
unsigned int FstTable::NrOfKeys()
{
- SEXP keyNames = Rf_getAttrib(*rTable, Rf_mkString("sorted"));
- if (Rf_isNull(keyNames)) return 0;
+ SEXP sorted_str = PROTECT(Rf_mkString("sorted"));
+ SEXP keyNames = PROTECT(Rf_getAttrib(*rTable, sorted_str));
+
+ if (Rf_isNull(keyNames)) {
+ UNPROTECT(2);
+ return 0;
+ }
+
+ unsigned int length = LENGTH(keyNames);
+
+ UNPROTECT(2);
- return LENGTH(keyNames);
+ return length;
}
void FstTable::GetKeyColumns(int* keyColPos)
{
- SEXP keyNames = Rf_getAttrib(*rTable, Rf_mkString("sorted"));
- if (Rf_isNull(keyNames)) return;
+ SEXP sorted_str = PROTECT(Rf_mkString("sorted")); // attribute name, protected while Rf_getAttrib runs
+ SEXP keyNames = PROTECT(Rf_getAttrib(*rTable, sorted_str));
+
+ if (Rf_isNull(keyNames)) { // no "sorted" attribute: keyColPos is left untouched
+ UNPROTECT(2); // sorted_str, keyNames
+ return;
+ }
int keyLength = LENGTH(keyNames);
// Find key column index numbers, if any
StringVector keyList(keyNames);
- SEXP colNames = Rf_getAttrib(*rTable, R_NamesSymbol);
+ SEXP colNames = PROTECT(Rf_getAttrib(*rTable, R_NamesSymbol)); // column names of the table, searched by FindKey below
for (int colSel = 0; colSel < keyLength; ++colSel)
{
keyColPos[colSel] = FindKey(colNames, keyList[colSel]);
}
+
+ UNPROTECT(3); // sorted_str, keyNames, colNames
}
@@ -373,15 +397,19 @@ void FstTable::InitTable(unsigned int nrOfCols, unsigned long long nrOfRows)
this->nrOfCols = nrOfCols;
this->nrOfRows = nrOfRows;
- this->resTable = Rf_allocVector(VECSXP, nrOfCols);
- PROTECT(resTable);
- isProtected = true;
+ SEXP resTable = Rf_allocVector(VECSXP, nrOfCols);
+
+ // storing the new vector in slot 0 of r_container keeps it reachable for the GC (no explicit PROTECT)
+ SET_VECTOR_ELT(this->r_container, 0, resTable);
}
void FstTable::SetStringColumn(IStringColumn* stringColumn, int colNr)
{
BlockReaderChar* sColumn = (BlockReaderChar*) stringColumn;
+
+ // the result table lives in slot 0 of r_container; store the column's string vector at colNr
+ SEXP resTable = VECTOR_ELT(this->r_container, 0);
SET_VECTOR_ELT(resTable, colNr, sColumn->StrVector());
}
@@ -389,6 +417,9 @@ void FstTable::SetStringColumn(IStringColumn* stringColumn, int colNr)
void FstTable::SetLogicalColumn(ILogicalColumn* logicalColumn, int colNr)
{
LogicalColumn* lColumn = (LogicalColumn*) logicalColumn;
+
+ // the result table lives in slot 0 of r_container; store the column's boolVec at colNr
+ SEXP resTable = VECTOR_ELT(this->r_container, 0);
SET_VECTOR_ELT(resTable, colNr, lColumn->boolVec);
}
@@ -396,6 +427,9 @@ void FstTable::SetLogicalColumn(ILogicalColumn* logicalColumn, int colNr)
void FstTable::SetInt64Column(IInt64Column* int64Column, int colNr)
{
Int64Column* i64Column = (Int64Column*) int64Column;
+
+ // the result table lives in slot 0 of r_container; store the column's int64Vec at colNr
+ SEXP resTable = VECTOR_ELT(this->r_container, 0);
SET_VECTOR_ELT(resTable, colNr, i64Column->int64Vec);
}
@@ -403,14 +437,9 @@ void FstTable::SetInt64Column(IInt64Column* int64Column, int colNr)
void FstTable::SetDoubleColumn(IDoubleColumn* doubleColumn, int colNr)
{
DoubleColumn* dColumn = (DoubleColumn*) doubleColumn;
- SET_VECTOR_ELT(resTable, colNr, dColumn->colVec);
-}
-
-void FstTable::SetDoubleColumn(IDoubleColumn* doubleColumn, int colNr, std::string &annotation)
-{
- DoubleColumn* dColumn = (DoubleColumn*) doubleColumn;
- dColumn->Annotate(annotation);
+ // the result table lives in slot 0 of r_container; store the column's colVec at colNr
+ SEXP resTable = VECTOR_ELT(this->r_container, 0);
SET_VECTOR_ELT(resTable, colNr, dColumn->colVec);
}
@@ -418,14 +447,9 @@ void FstTable::SetDoubleColumn(IDoubleColumn* doubleColumn, int colNr, std::stri
void FstTable::SetIntegerColumn(IIntegerColumn* integerColumn, int colNr)
{
IntegerColumn* iColumn = (IntegerColumn*) integerColumn;
- SET_VECTOR_ELT(resTable, colNr, iColumn->colVec);
-}
-
-void FstTable::SetIntegerColumn(IIntegerColumn* integerColumn, int colNr, std::string &annotation)
-{
- IntegerColumn* iColumn = (IntegerColumn*) integerColumn;
- iColumn->Annotate(annotation);
+ // the result table lives in slot 0 of r_container; store the column's colVec at colNr
+ SEXP resTable = VECTOR_ELT(this->r_container, 0);
SET_VECTOR_ELT(resTable, colNr, iColumn->colVec);
}
@@ -433,6 +457,9 @@ void FstTable::SetIntegerColumn(IIntegerColumn* integerColumn, int colNr, std::s
void FstTable::SetByteColumn(IByteColumn* byteColumn, int colNr)
{
ByteColumn* bColumn = (ByteColumn*) byteColumn;
+
+ // the result table lives in slot 0 of r_container; store the column's colVec at colNr
+ SEXP resTable = VECTOR_ELT(this->r_container, 0);
SET_VECTOR_ELT(resTable, colNr, bColumn->colVec);
}
@@ -440,34 +467,55 @@ void FstTable::SetByteColumn(IByteColumn* byteColumn, int colNr)
void FstTable::SetFactorColumn(IFactorColumn* factorColumn, int colNr)
{
FactorColumn* factColumn = (FactorColumn*) factorColumn;
- Rf_setAttrib(factColumn->intVec, Rf_mkString("levels"), factColumn->blockReaderStrVecP->StrVector());
+ // Disabled legacy code: "levels" and "class" attributes are no longer set in this method (presumably done when the factor column is created - confirm).
+ // SEXP level_str = PROTECT(Rf_mkString("levels"));
+ // Rf_setAttrib(factColumn->intVec, level_str, factColumn->blockReaderStrVecP->StrVector());
+ // UNPROTECT(1); // level_str
+ //
+ // if (factColumn->Attribute() == FstColumnAttribute::FACTOR_ORDERED) // ordered factor
+ // {
+ // SEXP classes;
+ // PROTECT(classes = Rf_allocVector(STRSXP, 2));
+ // SET_STRING_ELT(classes, 0, Rf_mkChar("ordered"));
+ // SET_STRING_ELT(classes, 1, Rf_mkChar("factor"));
+ // Rf_setAttrib(factColumn->intVec, Rf_mkString("class"), classes);
+ // UNPROTECT(1);
+ // }
+ // else // unordered factor
+ // {
+ // SEXP factor_str = PROTECT(Rf_mkString("factor"));
+ // Rf_setAttrib(factColumn->intVec, Rf_mkString("class"), factor_str);
+ // UNPROTECT(1);
+ // }
+
+ // the result table lives in slot 0 of r_container; store the factor's integer vector at colNr
+ SEXP resTable = VECTOR_ELT(this->r_container, 0);
+ SET_VECTOR_ELT(resTable, colNr, factColumn->intVec);
+}
- if (factColumn->Attribute() == FstColumnAttribute::FACTOR_ORDERED) // ordered factor
- {
- SEXP classes;
- PROTECT(classes = Rf_allocVector(STRSXP, 2));
- SET_STRING_ELT(classes, 0, Rf_mkChar("ordered"));
- SET_STRING_ELT(classes, 1, Rf_mkChar("factor"));
- Rf_setAttrib(factColumn->intVec, Rf_mkString("class"), classes);
- UNPROTECT(1);
- }
- else // unordered factor
- {
- Rf_setAttrib(factColumn->intVec, Rf_mkString("class"), Rf_mkString("factor"));
- }
- SET_VECTOR_ELT(resTable, colNr, factColumn->intVec);
+void FstTable::SetColNames(IStringArray* col_names)
+{
+ StringArray* colNames = (StringArray*) col_names; // downcast from the interface to the concrete StringArray
+ SEXP colNameVec = PROTECT(colNames->StrVector()); // protected across the Rf_setAttrib call below
+
+ // the result table lives in slot 0 of r_container; attach the names vector as its "names" attribute
+ SEXP resTable = VECTOR_ELT(this->r_container, 0);
+ Rf_setAttrib(resTable, R_NamesSymbol, colNameVec);
+ UNPROTECT(1); // colNameVec
}
-void FstTable::SetColNames()
+SEXP FstTable::GetColNames()
{
- // BlockReaderChar* blockReader = new BlockReaderChar();
- // return blockReader;
+ // the result table lives in slot 0 of r_container
+ SEXP resTable = VECTOR_ELT(this->r_container, 0);
+
+ return Rf_getAttrib(resTable, R_NamesSymbol); // the table's "names" attribute as returned by Rf_getAttrib
}
+
void FstTable::SetKeyColumns(int* keyColPos, unsigned int nrOfKeys)
{
}
-
diff --git a/src/fst_table.h b/src/fst_table.h
index 4eed4a8..2948d41 100644
--- a/src/fst_table.h
+++ b/src/fst_table.h
@@ -43,23 +43,31 @@ class FstTable : public IFstTable
{
// References to R objects
SEXP* rTable; // reference to R table structure (e.g. data.frame or data.table)
- SEXP cols; // reference to working column
+ SEXP cols; // reference to working column
+
+ // container for r objects, must be a VECSXP of length >= 1
+ SEXP r_container;
// Table metadata
unsigned int nrOfCols;
unsigned long long nrOfRows;
- bool isProtected;
int uniformEncoding;
public:
- SEXP resTable;
+ // use r_container for res_table
+ // SEXP resTable;
+
+ FstTable(SEXP r_container) : rTable(nullptr), cols(R_NilValue), nrOfCols(0), nrOfRows(0), uniformEncoding(0)
+ { // constructor without a source table: every member gets a defined value (the removed default ctor zeroed nrOfCols)
+ this->r_container = r_container; // must be a VECSXP of length >= 1; slot 0 holds the result table
+ }
- FstTable() { isProtected = false; nrOfCols = 0; }
+ FstTable(SEXP &table, int uniformEncoding, SEXP r_container);
- FstTable(SEXP &table, int uniformEncoding);
+ ~FstTable() {}
- ~FstTable() { if (isProtected) UNPROTECT(1); }
+ SEXP ResTable() { return VECTOR_ELT(r_container, 0); } // result table is kept in slot 0 of r_container (stored there by InitTable)
void InitTable(unsigned int nrOfCols, unsigned long long nrOfRows);
@@ -69,7 +77,7 @@ class FstTable : public IFstTable
void SetIntegerColumn(IIntegerColumn* integerColumn, int colNr);
- void SetIntegerColumn(IIntegerColumn* integerColumn, int colNr, std::string &annotation);
+ // void SetIntegerColumn(IIntegerColumn* integerColumn, int colNr, std::string &annotation);
void SetByteColumn(IByteColumn* byteColumn, int colNr);
@@ -77,8 +85,6 @@ class FstTable : public IFstTable
void SetDoubleColumn(IDoubleColumn * doubleColumn, int colNr);
- void SetDoubleColumn(IDoubleColumn * doubleColumn, int colNr, std::string &annotation);
-
void SetFactorColumn(IFactorColumn* factorColumn, int colNr);
void SetColNames();
@@ -110,6 +116,10 @@ class FstTable : public IFstTable
unsigned int NrOfColumns();
+ void SetColNames(IStringArray* col_names);
+
+ SEXP GetColNames();
+
unsigned long long NrOfRows();
};
diff --git a/src/fst_type_factory.h b/src/fst_type_factory.h
index 4ef323c..640a1e0 100644
--- a/src/fst_type_factory.h
+++ b/src/fst_type_factory.h
@@ -36,12 +36,12 @@ class BlobContainer : public IBlobContainer
public:
BlobContainer(unsigned long long size)
{
- PROTECT(rawVec = Rf_allocVector(RAWSXP, size));
+ // code was carefully examined to assert that no PROTECT is needed here
+ rawVec = Rf_allocVector(RAWSXP, size); // NOTE(review): rawVec is unprotected from here on; safe only if no R allocation can run before it is handed back to R - confirm at call sites
}
~BlobContainer()
{
- UNPROTECT(1);
}
unsigned char* Data()
@@ -79,4 +79,3 @@ class TypeFactory : public ITypeFactory
#endif // TYPE_FACTORY_H
-
diff --git a/src/fstcore/character/character_v6.cpp b/src/fstcore/character/character_v6.cpp
index e396132..9044eac 100644
--- a/src/fstcore/character/character_v6.cpp
+++ b/src/fstcore/character/character_v6.cpp
@@ -321,7 +321,7 @@ inline void ReadDataBlockCompressed_v6(istream& myfile, IStringColumn* blockRead
void fdsReadCharVec_v6(istream& myfile, IStringColumn* blockReader, unsigned long long blockPos, unsigned long long startRow,
- unsigned long long vecLength, unsigned long long size)
+ unsigned long long vecLength, unsigned long long size)
{
// Jump to startRow size
myfile.seekg(blockPos);
@@ -342,7 +342,7 @@ void fdsReadCharVec_v6(istream& myfile, IStringColumn* blockReader, unsigned lon
unsigned long long nrOfBlocks = 1 + endBlock - startBlock; // total number of blocks to read
// Create result vector
- blockReader->AllocateVec(vecLength);
+ // blockReader->AllocateVec(vecLength);
blockReader->SetEncoding(stringEncoding);
// Vector data is uncompressed
diff --git a/src/fstcore/factor/factor_v7.cpp b/src/fstcore/factor/factor_v7.cpp
index c468b0e..176257a 100644
--- a/src/fstcore/factor/factor_v7.cpp
+++ b/src/fstcore/factor/factor_v7.cpp
@@ -25,6 +25,7 @@
#include
#include
#include
+#include
// Framework headers
#include
@@ -35,8 +36,6 @@
#include
-// #include
-
using namespace std;
@@ -222,8 +221,8 @@ void fdsWriteFactorVec_v7(ofstream &myfile, int* intP, IStringWriter* blockRunne
// Parameter 'startRow' is zero based
// Data vector intP is expected to point to a memory block 4 * size bytes long
-void fdsReadFactorVec_v7(istream &myfile, IStringColumn* blockReader, int* intP, unsigned long long blockPos, unsigned long long startRow,
- unsigned long long length, unsigned long long size)
+void fdsReadFactorVec_v7(IFstTable &tableReader, istream &myfile, unsigned long long blockPos, unsigned long long startRow,
+ unsigned long long length, unsigned long long size, FstColumnAttribute col_attribute, IColumnFactory* columnFactory, int colSel)
{
// Jump to factor level
myfile.seekg(blockPos);
@@ -243,29 +242,34 @@ void fdsReadFactorVec_v7(istream &myfile, IStringColumn* blockReader, int* intP,
// Read level strings
- if (*nrOfLevels > 0)
+ std::unique_ptr factorColumnP(columnFactory->CreateFactorColumn(length, *nrOfLevels, col_attribute));
+ IFactorColumn* factorColumn = factorColumnP.get();
+
+ // add to table
+ tableReader.SetFactorColumn(factorColumn, colSel);
+
+ IStringColumn* blockReader = factorColumn->Levels();
+ int* intP = factorColumn->LevelData();
+
+ if (*nrOfLevels == 0)
{
- fdsReadCharVec_v6(myfile, blockReader, blockPos + HEADER_SIZE_FACTOR, 0, *nrOfLevels, *nrOfLevels); // get level strings
+ // All level values must be NA, so we need only the number of levels
+ for (unsigned int pos = 0; pos < length; pos++)
+ {
+ intP[pos] = FST_NA_INT;
+ }
}
else
{
- // Create empty level vector
- blockReader->AllocateVec(0);
+ // non-empty level vector
+ fdsReadCharVec_v6(myfile, blockReader, blockPos + HEADER_SIZE_FACTOR, 0, *nrOfLevels, *nrOfLevels); // get level strings
- // All level values must be NA, so we need only the number of levels
- for (unsigned int pos = 0; pos < length; pos++)
- {
- intP[pos] = FST_NA_INT;
- }
+ // Read level values
+ std::string annotation;
+ bool hasAnnotation;
- return;
+ fdsReadColumn_v2(myfile, reinterpret_cast(intP), *levelVecPos, startRow, length, size, 4, annotation, BATCH_SIZE_READ_FACTOR, hasAnnotation);
}
- // Read level values
- std::string annotation;
- bool hasAnnotation;
-
- fdsReadColumn_v2(myfile, reinterpret_cast(intP), *levelVecPos, startRow, length, size, 4, annotation, BATCH_SIZE_READ_FACTOR, hasAnnotation);
-
return;
}
diff --git a/src/fstcore/factor/factor_v7.h b/src/fstcore/factor/factor_v7.h
index 10e677f..debedf3 100644
--- a/src/fstcore/factor/factor_v7.h
+++ b/src/fstcore/factor/factor_v7.h
@@ -30,6 +30,8 @@
#include
#include
+#include
+#include
void fdsWriteFactorVec_v7(std::ofstream &myfile, int* intP, IStringWriter* blockRunner, unsigned long long size, unsigned int compression,
@@ -37,8 +39,8 @@ void fdsWriteFactorVec_v7(std::ofstream &myfile, int* intP, IStringWriter* block
// Parameter 'startRow' is zero based.
-void fdsReadFactorVec_v7(std::istream &myfile, IStringColumn* blockReader, int* intP, unsigned long long blockPos, unsigned long long startRow,
- unsigned long long length, unsigned long long size);
+void fdsReadFactorVec_v7(IFstTable &tableReader, std::istream &myfile, unsigned long long blockPos, unsigned long long startRow,
+ unsigned long long length, unsigned long long size, FstColumnAttribute col_attribute, IColumnFactory* columnFactory, int colSel);
#endif // FACTOR_v7_H
diff --git a/src/fstcore/interface/fstcompressor.h b/src/fstcore/interface/fstcompressor.h
index d26c781..c0e427a 100644
--- a/src/fstcore/interface/fstcompressor.h
+++ b/src/fstcore/interface/fstcompressor.h
@@ -325,7 +325,8 @@ class FstCompressor
}
- IBlobContainer* DecompressBlob(unsigned char* blobSource, const unsigned long long blobLength, const bool checkHashes = true) const
+ IBlobContainer* DecompressBlob(unsigned char* blobSource, const unsigned long long blobLength,
+ const bool checkHashes = true) const
{
Decompressor decompressor;
int nrOfThreads = GetFstThreads(); // available threads
@@ -352,8 +353,8 @@ class FstCompressor
unsigned int* headerHash = reinterpret_cast(blobSource);
unsigned int* blockSize = reinterpret_cast(blobSource + 4);
- unsigned int* version = reinterpret_cast(blobSource + 8);
-
+ unsigned int* version = reinterpret_cast(blobSource + 8);
+
unsigned int* algo = reinterpret_cast(blobSource + 12);
unsigned long long* vecLength = reinterpret_cast(blobSource + 16);
unsigned long long* hashResult = reinterpret_cast(blobSource + 24);
@@ -375,26 +376,26 @@ class FstCompressor
const unsigned int headHash = XXH32(&blobSource[4], headerSize - 4, FST_HASH_SEED); // header hash
- // header hash check
+ // header hash check
if (*headerHash != headHash)
{
throw(std::runtime_error(FSTERROR_COMP_HEADER));
}
// version check
- if (*version > FST_COMPRESS_VERSION)
- {
- throw(std::runtime_error(FSTERROR_COMP_FUTURE_VERSION));
- }
+ if (*version > FST_COMPRESS_VERSION)
+ {
+ throw(std::runtime_error(FSTERROR_COMP_FUTURE_VERSION));
+ }
- // Source vector has correct length
- if (blockOffsets[nrOfBlocks] != blobLength)
- {
- throw(std::runtime_error(FSTERROR_COMP_SIZE));
- }
+ // Source vector has correct length
+ if (blockOffsets[nrOfBlocks] != blobLength)
+ {
+ throw(std::runtime_error(FSTERROR_COMP_SIZE));
+ }
// Create result blob
- IBlobContainer* blob_container = typeFactory->CreateBlobContainer(*vecLength);
+ IBlobContainer* blob_container = typeFactory->CreateBlobContainer(*vecLength);
unsigned char* blob_data = blob_container->Data();
// Determine required number of threads
@@ -519,8 +520,8 @@ class FstCompressor
if (error)
{
- delete blob_container;
- throw(std::runtime_error(FSTERROR_COMP_STREAM));
+ delete blob_container;
+ throw(std::runtime_error(FSTERROR_COMP_STREAM));
}
return blob_container;
diff --git a/src/fstcore/interface/fststore.cpp b/src/fstcore/interface/fststore.cpp
index 3dcf060..8cfa8e5 100644
--- a/src/fstcore/interface/fststore.cpp
+++ b/src/fstcore/interface/fststore.cpp
@@ -124,7 +124,7 @@ using namespace std;
FstStore::FstStore(std::string fstFile)
{
this->fstFile = fstFile;
- this->blockReader = nullptr;
+ // this->blockReader = nullptr;
this->keyColPos = nullptr;
this->p_nrOfRows = nullptr;
metaDataBlock = nullptr;
@@ -136,7 +136,7 @@ FstStore::FstStore(std::string fstFile)
* \param myfile a stream to a fst file
* \param keyLength the number of key columns (output)
* \param nrOfColsFirstChunk the number of columns in the first chunkset (output)
- * \return
+ * \return
*/
inline unsigned int ReadHeader(ifstream &myfile, int &keyLength, int &nrOfColsFirstChunk)
{
@@ -207,7 +207,7 @@ inline void SetKeyIndex(vector &keyIndex, const int keyLength, const int nr
/**
* \brief Write a dataset to a fst file
* \param fstTable interface to a dataset
- * \param compress compression factor in the range 0 - 100
+ * \param compress compression factor in the range 0 - 100
*/
void FstStore::fstWrite(IFstTable &fstTable, const int compress) const
{
@@ -296,7 +296,7 @@ void FstStore::fstWrite(IFstTable &fstTable, const int compress) const
const bool isLittleEndian = (*reinterpret_cast(&endianTest)) == 0x67;
// Set table header parameters
-
+
*p_tableFlags = 0;
*p_tableVersion = FST_VERSION;
@@ -537,7 +537,7 @@ void FstStore::fstWrite(IFstTable &fstTable, const int compress) const
}
-void FstStore::fstMeta(IColumnFactory* columnFactory)
+void FstStore::fstMeta(IColumnFactory* columnFactory, IStringColumn* col_names)
{
// fst file stream using a stack buffer
ifstream myfile;
@@ -630,10 +630,8 @@ void FstStore::fstMeta(IColumnFactory* columnFactory)
// Read column names
const unsigned long long colNamesOffset = metaSize + TABLE_META_SIZE;
- blockReaderP = std::unique_ptr(columnFactory->CreateStringColumn(nrOfCols, FstColumnAttribute::NONE));
- blockReader = blockReaderP.get();
-
- fdsReadCharVec_v6(myfile, blockReader, colNamesOffset, 0, static_cast(nrOfCols), static_cast(nrOfCols));
+ col_names->AllocateVec(static_cast(nrOfCols));
+ fdsReadCharVec_v6(myfile, col_names, colNamesOffset, 0, static_cast(nrOfCols), static_cast(nrOfCols));
// cleanup
myfile.close();
@@ -641,7 +639,7 @@ void FstStore::fstMeta(IColumnFactory* columnFactory)
void FstStore::fstRead(IFstTable &tableReader, IStringArray* columnSelection, const long long startRow, const long long endRow,
- IColumnFactory* columnFactory, vector &keyIndex, IStringArray* selectedCols)
+ IColumnFactory* columnFactory, vector &keyIndex, IStringArray* selectedCols, IStringColumn* col_names)
{
// fst file stream using a stack buffer
ifstream myfile;
@@ -689,7 +687,7 @@ void FstStore::fstRead(IFstTable &tableReader, IStringArray* columnSelection, co
}
// Chunkset header [node C, free leaf of A or other chunkset header] [size: 80 + 8 * nrOfCols]
-
+
unsigned long long* p_chunksetHash = reinterpret_cast(&metaDataBlock[keyIndexHeaderSize]);
//unsigned int* p_chunksetHeaderVersion = reinterpret_cast(&metaDataBlock[keyIndexHeaderSize + 8]);
//int* p_chunksetFlags = reinterpret_cast(&metaDataBlock[keyIndexHeaderSize + 12]);
@@ -736,10 +734,11 @@ void FstStore::fstRead(IFstTable &tableReader, IStringArray* columnSelection, co
const unsigned long long colNamesOffset = metaSize + TABLE_META_SIZE;
- blockReaderP = std::unique_ptr(columnFactory->CreateStringColumn(nrOfCols, FstColumnAttribute::NONE));
- blockReader = blockReaderP.get();
+ // blockReaderP = std::unique_ptr(columnFactory->CreateStringColumn(nrOfCols, FstColumnAttribute::NONE));
+ // blockReader = blockReaderP.get();
- fdsReadCharVec_v6(myfile, blockReader, colNamesOffset, 0, static_cast(nrOfCols), static_cast(nrOfCols));
+ col_names->AllocateVec(static_cast(nrOfCols));
+ fdsReadCharVec_v6(myfile, col_names, colNamesOffset, 0, static_cast(nrOfCols), static_cast(nrOfCols));
// Size of chunkset index header plus data chunk header
const unsigned long long chunkIndexSize = CHUNK_INDEX_SIZE + DATA_INDEX_SIZE + 8 * nrOfCols;
@@ -822,7 +821,7 @@ void FstStore::fstRead(IFstTable &tableReader, IStringArray* columnSelection, co
for (int colNr = 0; colNr < nrOfCols; ++colNr)
{
- const char* str2 = blockReader->GetElement(colNr);
+ const char* str2 = col_names->GetElement(colNr);
if (strcmp(str1, str2) == 0)
{
equal = colNr;
@@ -894,8 +893,12 @@ void FstStore::fstRead(IFstTable &tableReader, IStringArray* columnSelection, co
{
std::unique_ptr stringColumnP(columnFactory->CreateStringColumn(length, static_cast(colAttributeTypes[colNr])));
IStringColumn* stringColumn = stringColumnP.get();
- fdsReadCharVec_v6(myfile, stringColumn, pos, firstRow, length, nrOfRows);
+
+ stringColumn->AllocateVec(static_cast(length));
tableReader.SetStringColumn(stringColumn, colSel);
+
+ fdsReadCharVec_v6(myfile, stringColumn, pos, firstRow, length, nrOfRows);
+
break;
}
@@ -905,17 +908,19 @@ void FstStore::fstRead(IFstTable &tableReader, IStringArray* columnSelection, co
std::unique_ptr integerColumnP(columnFactory->CreateIntegerColumn(length, static_cast(colAttributeTypes[colNr]), scale));
IIntegerColumn* integerColumn = integerColumnP.get();
+ tableReader.SetIntegerColumn(integerColumn, colSel);
+
std::string annotation = "";
bool hasAnnotation;
+
fdsReadIntVec_v8(myfile, integerColumn->Data(), pos, firstRow, length, nrOfRows, annotation, hasAnnotation);
if (hasAnnotation)
{
- tableReader.SetIntegerColumn(integerColumn, colSel, annotation);
+ integerColumn->Annotate(annotation);
break;
}
- tableReader.SetIntegerColumn(integerColumn, colSel);
break;
}
@@ -925,17 +930,18 @@ void FstStore::fstRead(IFstTable &tableReader, IStringArray* columnSelection, co
std::unique_ptr doubleColumnP(columnFactory->CreateDoubleColumn(length, static_cast(colAttributeTypes[colNr]), scale));
IDoubleColumn* doubleColumn = doubleColumnP.get();
+ tableReader.SetDoubleColumn(doubleColumn, colSel);
+
std::string annotation = "";
bool hasAnnotation;
+
fdsReadRealVec_v9(myfile, doubleColumn->Data(), pos, firstRow, length, nrOfRows, annotation, hasAnnotation);
if (hasAnnotation)
{
- tableReader.SetDoubleColumn(doubleColumn, colSel, annotation);
- break;
+ doubleColumn->Annotate(annotation);
}
- tableReader.SetDoubleColumn(doubleColumn, colSel);
break;
}
@@ -944,18 +950,17 @@ void FstStore::fstRead(IFstTable &tableReader, IStringArray* columnSelection, co
{
std::unique_ptr logicalColumnP(columnFactory->CreateLogicalColumn(length, static_cast(colAttributeTypes[colNr])));
ILogicalColumn* logicalColumn = logicalColumnP.get();
- fdsReadLogicalVec_v10(myfile, logicalColumn->Data(), pos, firstRow, length, nrOfRows);
tableReader.SetLogicalColumn(logicalColumn, colSel);
+ fdsReadLogicalVec_v10(myfile, logicalColumn->Data(), pos, firstRow, length, nrOfRows);
break;
}
// Factor vector
case 7:
{
- std::unique_ptr factorColumnP(columnFactory->CreateFactorColumn(length, static_cast(colAttributeTypes[colNr])));
- IFactorColumn* factorColumn = factorColumnP.get();
- fdsReadFactorVec_v7(myfile, factorColumn->Levels(), factorColumn->LevelData(), pos, firstRow, length, nrOfRows);
- tableReader.SetFactorColumn(factorColumn, colSel);
+ FstColumnAttribute col_attribute = static_cast(colAttributeTypes[colNr]);
+ fdsReadFactorVec_v7(tableReader, myfile, pos, firstRow, length, nrOfRows, col_attribute, columnFactory, colSel);
+
break;
}
@@ -964,8 +969,8 @@ void FstStore::fstRead(IFstTable &tableReader, IStringArray* columnSelection, co
{
std::unique_ptr int64ColumP(columnFactory->CreateInt64Column(length, static_cast(colAttributeTypes[colNr]), scale));
IInt64Column* int64Column = int64ColumP.get();
- fdsReadInt64Vec_v11(myfile, int64Column->Data(), pos, firstRow, length, nrOfRows);
tableReader.SetInt64Column(int64Column, colSel);
+ fdsReadInt64Vec_v11(myfile, int64Column->Data(), pos, firstRow, length, nrOfRows);
break;
}
@@ -974,8 +979,8 @@ void FstStore::fstRead(IFstTable &tableReader, IStringArray* columnSelection, co
{
std::unique_ptr byteColumnP(columnFactory->CreateByteColumn(length, static_cast(colAttributeTypes[colNr])));
IByteColumn* byteColumn = byteColumnP.get();
- fdsReadByteVec_v12(myfile, byteColumn->Data(), pos, firstRow, length, nrOfRows);
tableReader.SetByteColumn(byteColumn, colSel);
+ fdsReadByteVec_v12(myfile, byteColumn->Data(), pos, firstRow, length, nrOfRows);
break;
}
@@ -990,11 +995,15 @@ void FstStore::fstRead(IFstTable &tableReader, IStringArray* columnSelection, co
// Key index
SetKeyIndex(keyIndex, keyLength, nrOfSelect, keyColPos, colIndex);
- selectedCols->AllocateArray(nrOfSelect);
- selectedCols->SetEncoding(blockReader->GetEncoding());
+ // TODO: if all columns are selected, no copy is required!
+
+ selectedCols->AllocateArray(nrOfSelect); // allocate column names
+ tableReader.SetColNames(&*selectedCols); // set on result table
+
+ selectedCols->SetEncoding(col_names->GetEncoding());
for (int i = 0; i < nrOfSelect; ++i)
{
- selectedCols->SetElement(i, blockReader->GetElement(colIndex[i]));
+ selectedCols->SetElement(i, col_names->GetElement(colIndex[i]));
}
}
diff --git a/src/fstcore/interface/fststore.h b/src/fstcore/interface/fststore.h
index 386a79a..a98abaf 100644
--- a/src/fstcore/interface/fststore.h
+++ b/src/fstcore/interface/fststore.h
@@ -37,11 +37,8 @@ class FstStore
{
std::string fstFile;
std::unique_ptr metaDataBlockP;
- std::unique_ptr blockReaderP;
public:
- IStringColumn* blockReader;
-
unsigned long long* p_nrOfRows;
int* keyColPos;
@@ -67,10 +64,10 @@ class FstStore
*/
void fstWrite(IFstTable &fstTable, int compress) const;
- void fstMeta(IColumnFactory* columnFactory);
+ void fstMeta(IColumnFactory* columnFactory, IStringColumn* col_names);
void fstRead(IFstTable &tableReader, IStringArray* columnSelection, long long startRow, long long endRow,
- IColumnFactory* columnFactory, std::vector &keyIndex, IStringArray* selectedCols);
+ IColumnFactory* columnFactory, std::vector &keyIndex, IStringArray* selectedCols, IStringColumn* col_names);
};
diff --git a/src/fstcore/interface/icolumnfactory.h b/src/fstcore/interface/icolumnfactory.h
index fca042a..e3c93b7 100644
--- a/src/fstcore/interface/icolumnfactory.h
+++ b/src/fstcore/interface/icolumnfactory.h
@@ -32,7 +32,7 @@ class IColumnFactory
{
public:
virtual ~IColumnFactory() {};
- virtual IFactorColumn* CreateFactorColumn(int nrOfRows, FstColumnAttribute columnAttribute) = 0;
+ virtual IFactorColumn* CreateFactorColumn(int nrOfRows, int nrOfLevels, FstColumnAttribute columnAttribute) = 0;
virtual ILogicalColumn* CreateLogicalColumn(int nrOfRows, FstColumnAttribute columnAttribute) = 0;
virtual IDoubleColumn* CreateDoubleColumn(int nrOfRows, FstColumnAttribute columnAttribute, short int scale) = 0;
virtual IIntegerColumn* CreateIntegerColumn(int nrOfRows, FstColumnAttribute columnAttribute, short int scale) = 0;
diff --git a/src/fstcore/interface/ifstcolumn.h b/src/fstcore/interface/ifstcolumn.h
index 865813b..68e60cc 100644
--- a/src/fstcore/interface/ifstcolumn.h
+++ b/src/fstcore/interface/ifstcolumn.h
@@ -169,6 +169,7 @@ class IIntegerColumn
public:
virtual ~IIntegerColumn() {};
virtual int* Data() = 0;
+ virtual void Annotate(std::string annotation) = 0;
};
diff --git a/src/fstcore/interface/ifsttable.h b/src/fstcore/interface/ifsttable.h
index 2c0fd0b..d540d95 100644
--- a/src/fstcore/interface/ifsttable.h
+++ b/src/fstcore/interface/ifsttable.h
@@ -73,21 +73,18 @@ class IFstTable
virtual void SetLogicalColumn(ILogicalColumn* logicalColumn, int colNr) = 0;
- virtual void SetIntegerColumn(IIntegerColumn* integerColumn, int colNr, std::string &annotation) = 0;
-
virtual void SetIntegerColumn(IIntegerColumn* integerColumn, int colNr) = 0;
virtual void SetDoubleColumn(IDoubleColumn* doubleColumn, int colNr) = 0;
- virtual void SetDoubleColumn(IDoubleColumn* doubleColumn, int colNr, std::string &annotation) = 0;
-
virtual void SetFactorColumn(IFactorColumn* factorColumn, int colNr) = 0;
virtual void SetInt64Column(IInt64Column* int64Column, int colNr) = 0;
virtual void SetByteColumn(IByteColumn* byteColumn, int colNr) = 0;
-// virtual void SetColNames() = 0;
+ // use more efficient string container here (e.g. std::vector)
+ virtual void SetColNames(IStringArray* col_names) = 0;
virtual void SetKeyColumns(int* keyColPos, unsigned int nrOfKeys) = 0;
};
diff --git a/src/fstcore_v1/blockstreamer/blockstreamer_v1.cpp b/src/fstcore_v1/blockstreamer/blockstreamer_v1.cpp
deleted file mode 100644
index 3d94019..0000000
--- a/src/fstcore_v1/blockstreamer/blockstreamer_v1.cpp
+++ /dev/null
@@ -1,389 +0,0 @@
-/*
- fstlib - A C++ library for ultra fast storage and retrieval of datasets
-
- Copyright (C) 2017-present, Mark AJ Klik
-
- This file is part of fstlib.
-
- fstlib is free software: you can redistribute it and/or modify it under the
- terms of the GNU Affero General Public License version 3 as published by the
- Free Software Foundation.
-
- fstlib is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
- A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- details.
-
- You should have received a copy of the GNU Affero General Public License
- along with fstlib. If not, see .
-
- You can contact the author at:
- - fstlib source repository : https://github.com/fstpackage/fstlib
-*/
-
-
-// System libraries
-#include
-#include
-#include
-
-// R libraries
-#include
-
-// fst libraries
-#include
-#include
-#include
-
-using namespace std;
-using namespace Rcpp;
-
-#define COL_META_SIZE_V1 8
-#define PREF_BLOCK_SIZE_V1 16384
-
-
-// Read data compressed with a fixed ratio compressor from a stream
-// Note that repSize is assumed to be a multiple of elementSize
-inline SEXP fdsReadFixedCompStream_v1(ifstream &myfile, char* outVec, unsigned long long blockPos,
- unsigned int* meta, unsigned int startRow, int elementSize, unsigned int vecLength)
-{
- unsigned int compAlgo = meta[1]; // identifier of the fixed ratio compressor
- unsigned int repSize = fixedRatioSourceRepSize[(int) compAlgo]; // in bytes
- unsigned int targetRepSize = fixedRatioTargetRepSize[(int) compAlgo]; // in bytes
-
- // robustness: test for correct algo here
- if (repSize < 1)
- {
- // throw exception
- }
-
- // Determine random-access starting point
- unsigned int repSizeElement = repSize / elementSize;
- unsigned int startRep = startRow / repSizeElement;
- unsigned int endRep = (startRow + vecLength - 1) / repSizeElement;
-
- Decompressor decompressor; // decompressor
-
- if (startRep > 0)
- {
- myfile.seekg(blockPos + COL_META_SIZE_V1 + startRep * targetRepSize); // move to startRep
- }
-
- unsigned int startRowRep = startRep * repSizeElement;
- unsigned int startOffset = startRow - startRowRep; // rep-block offset in number of elements
-
- char* outP = outVec; // allow shifting of vector pointer
-
- // Process partial repetition block
- if (startOffset != 0)
- {
- char repBuf[MAX_TARGET_REP_SIZE]; // rep unit buffer for target
- char buf[MAX_SOURCE_REP_SIZE]; // rep unit buffer for source
-
- myfile.read(repBuf, targetRepSize); // read single repetition block
- int resSize = decompressor.Decompress(compAlgo, buf, repSize, repBuf, targetRepSize); // decompress repetition block
-
- if (startRep == endRep) // finished
- {
- // Skip first startOffset elements
- memcpy(outVec, &buf[elementSize * startOffset], elementSize * vecLength); // data range
-
- return List::create(
- _["meta[0]"] = meta[0],
- _["meta[1]"] = meta[1],
- _["repSize"] = repSize,
- _["targetRepSize"] = targetRepSize,
- _["startOffset"] = startOffset,
- _["startRep"] = startRep,
- _["vecLength"] = vecLength,
- _["elementSize"] = elementSize,
- _["repSizeElement"] = repSizeElement,
- _["resSize"] = resSize,
- _["blockPos"] = blockPos);
- }
-
- int length = repSizeElement - startOffset; // remaining elements
- memcpy(outVec, &buf[elementSize * startOffset], elementSize * length); // data range
- outP = &outVec[elementSize * length]; // outVec with correct offset
- ++startRep;
- }
-
- // Process in large blocks
-
- // Define prefered block sizes
- unsigned int nrOfRepsPerBlock = (PREF_BLOCK_SIZE_V1 / repSize);
- unsigned int nrOfReps = 1 + endRep - startRep; // remaining reps to read
- unsigned int nrOfFullBlocks = (nrOfReps - 1) / nrOfRepsPerBlock; // excluding last (partial) block
-
- unsigned int blockSize = nrOfRepsPerBlock * repSize; // block size in bytes
- unsigned int targetBlockSize = nrOfRepsPerBlock * targetRepSize; // block size in bytes
-
- char repBuf[MAX_TARGET_BUFFER]; // maximum size read buffer for PREF_BLOCK_SIZE_V1 source
-
-
- // Decompress full blocks
- for (unsigned int block = 0; block < nrOfFullBlocks; ++block)
- {
- myfile.read(repBuf, targetBlockSize);
- decompressor.Decompress(compAlgo, &outP[block * blockSize], blockSize, repBuf, targetBlockSize);
- }
-
- unsigned int remainReps = nrOfReps - nrOfRepsPerBlock * nrOfFullBlocks; // always > 0 including last rep unit
-
- // Read last block
- unsigned int lastBlockSize = remainReps * repSize; // block size in bytes
- unsigned int lastTargetBlockSize = remainReps * targetRepSize; // block size in bytes
- myfile.read(repBuf, lastTargetBlockSize);
-
- // Decompress all but last repetition block fully
- if (lastBlockSize != repSize)
- {
- decompressor.Decompress(compAlgo, &outP[nrOfFullBlocks * blockSize], lastBlockSize - repSize, repBuf, lastTargetBlockSize - targetRepSize);
- }
-
- // Last rep unit may be partial
- char buf[MAX_SOURCE_REP_SIZE]; // single rep unit buffer
- unsigned int nrOfElemsLastRep = startRow + vecLength - endRep * repSizeElement;
-
-
- int resSize = decompressor.Decompress(compAlgo, buf, repSize, &repBuf[lastTargetBlockSize - targetRepSize], targetRepSize); // decompress repetition block
- memcpy(&outP[nrOfFullBlocks * blockSize + lastBlockSize - repSize], buf, elementSize * nrOfElemsLastRep); // skip last elements if required
-
- return List::create(
- _["nrOfElemsLastRep"] = nrOfElemsLastRep,
- _["remainReps"] = remainReps,
- _["startRep"] = startRep,
- _["nrOfRepsPerBlock"] = nrOfRepsPerBlock,
- _["endRep"] = endRep,
- _["repSize"] = repSize,
- _["resSize"] = resSize,
- _["nrOfReps"] = nrOfReps,
- _["nrOfFullBlocks"] = nrOfFullBlocks,
- _["startRow"] = startRow,
- _["lastTargetBlockSize"] = lastTargetBlockSize);
-}
-
-
-SEXP fdsReadColumn_v1(ifstream &myfile, char* outVec, unsigned long long blockPos, unsigned startRow, unsigned length, unsigned size, int elementSize)
-{
- // Read header
- unsigned int compress[2];
- myfile.seekg(blockPos);
- myfile.read((char*) compress, COL_META_SIZE_V1);
-
- // Data is uncompressed or uses a fixed-ratio compressor (logical)
- if (compress[0] == 0)
- {
- if (compress[1] == 0) // uncompressed data
- {
- // Jump to startRow position
- if (startRow > 0) myfile.seekg(blockPos + elementSize * startRow + COL_META_SIZE_V1);
-
- // Read data
- myfile.read((char*) outVec, elementSize * length);
-
- return List::create(_["1"] = (int) 1);
- }
-
- // Stream uses a fixed-ratio compressor
-
- SEXP res = fdsReadFixedCompStream_v1(myfile, outVec, blockPos, compress, startRow, elementSize, length);
- return List::create(
- _["res"] = res);
- }
-
- // Data is compressed
-
- // unsigned int* maxCompSize = (unsigned int*) &compress[0]; // 4 algorithms in index
- unsigned int blockSizeElements = compress[1]; // number of elements per block
-
- // Number of compressed data blocks, the last block can be smaller than blockSizeElements
- int nrOfBlocks = 1 + (size - 1) / blockSizeElements;
-
- // Calculate startRow data block position
- int startBlock = startRow / blockSizeElements;
- int endBlock = (startRow + length - 1) / blockSizeElements;
- int startOffset = startRow % blockSizeElements;
-
- if (startBlock > 0)
- {
- myfile.seekg(blockPos + COL_META_SIZE_V1 + 10 * startBlock); // move to startBlock meta info
- }
-
- // Read block index (position pointer and algorithm for each block)
- char* blockIndex = new char[(2 + endBlock - startBlock) * 10]; // 1 long file pointer and 1 short algorithmID per block
- myfile.read(blockIndex, (2 + endBlock - startBlock) * 10);
-
- int blockSize = elementSize * blockSizeElements;
-
- // char compBuf[*maxCompSize]; // read buffer
- // char tmpBuf[blockSize]; // temporary buffer
- char compBuf[MAX_COMPRESSBOUND]; // maximum size needed in worst case scenario compression
- char tmpBuf[MAX_SIZE_COMPRESS_BLOCK]; // temporary buffer
-
- Decompressor decompressor;
-
- unsigned long long* blockPStart = (unsigned long long*) &blockIndex[0];
- unsigned long long* blockPEnd = (unsigned long long*) &blockIndex[10];
- unsigned long long compSize = *blockPEnd - *blockPStart;
- unsigned short* algo = (unsigned short*) &blockIndex[8];
-
- // Process single block and return
- if (startBlock == endBlock) // Read single block and subset result
- {
- if (*algo == 0) // no compression on this block
- {
- myfile.seekg(blockPos + *blockPStart + elementSize * startOffset); // move to block data position
- myfile.read((char*) outVec, length * elementSize);
-
- delete[] blockIndex;
- return List::create(
- _["startBlock"] = startBlock,
- _["endBlock"] = endBlock,
- _["startOffset"] = startOffset,
- _["blockPStart"] = (int) (blockPos + *blockPStart),
- _["blockPEnd"] = (int) (blockPos + *blockPEnd),
- _["blockPos"] = (int) blockPos,
- _["algo"] = (int) *algo);
- }
-
- // Data is compressed
- unsigned int curSize = blockSizeElements;
- if (startBlock == (nrOfBlocks - 1)) // test for last block
- {
- curSize = 1 + (size + blockSizeElements - 1) % blockSizeElements; // smaller last block size
- }
-
- myfile.seekg(blockPos + *blockPStart); // move to block data position
- myfile.read(compBuf, compSize);
-
- if (length == curSize)
- {
- decompressor.Decompress(*algo, outVec, elementSize * length, compBuf, compSize); // direct decompress
- }
- else
- {
- decompressor.Decompress(*algo, tmpBuf, elementSize * curSize, compBuf, compSize); // decompress in tmp buffer
- memcpy(outVec, &tmpBuf[elementSize * startOffset], elementSize * length); // data range
- }
-
- delete[] blockIndex;
- return List::create(
- _["curSize"] = curSize,
- _["startBlock"] = startBlock,
- _["endBlock"] = endBlock,
- _["startOffset"] = startOffset,
- _["blockPStart"] = (int) (blockPos + *blockPStart),
- _["blockPEnd"] = (int) (blockPos + *blockPEnd),
- _["blockPos"] = (int) blockPos,
- _["algo"] = (int) *algo);
- }
-
- // Calculations span at least two block
-
- // First block
- int subBlockSize = blockSizeElements - startOffset;
-
- if (*algo == 0) // no compression
- {
- myfile.seekg(blockPos + *blockPStart + elementSize * startOffset); // move to block data position
- myfile.read(outVec, elementSize * subBlockSize); // read first block data
- } else
- {
- myfile.seekg(blockPos + *blockPStart); // move to block data position
- myfile.read(compBuf, compSize);
-
- if (startOffset == 0) // full block
- {
- decompressor.Decompress(*algo, outVec, blockSize, compBuf, compSize);
- }
- else
- {
- decompressor.Decompress(*algo, tmpBuf, blockSize, compBuf, compSize);
- memcpy(outVec, &tmpBuf[elementSize * startOffset], elementSize * subBlockSize);
- }
- }
-
- int remain = (startRow + length) % blockSizeElements; // remaining required items in last block
- if (remain == 0) ++endBlock;
-
- int maxBlock = endBlock - startBlock;
- int outOffset = subBlockSize * elementSize; // position in output vector
-
- // Process middle blocks (if any)
- for (int blockCount = 1; blockCount < maxBlock; ++blockCount)
- {
- // Update meta pointers
- blockPStart = blockPEnd;
- blockPEnd = (unsigned long long*) &blockIndex[10 + 10 * blockCount];
- compSize = *blockPEnd - *blockPStart;
- algo = (unsigned short*) &blockIndex[8 + 10 * blockCount];
-
- if (*algo == 0) // no compression
- {
- myfile.read(&outVec[outOffset], blockSize); // read first block data
- } else
- {
- myfile.read(compBuf, compSize);
- decompressor.Decompress(*algo, &outVec[outOffset], blockSize, compBuf, compSize);
- }
-
- outOffset += blockSize; // update position in output vector
- }
-
- // No last block
- if (remain == 0) // no additional elements required
- {
- delete[] blockIndex;
- return List::create(
- _["Remain0"] = true,
- _["endBlock"] = endBlock,
- _["compSize"] = compSize,
- _["algo"] = (int) (*algo),
- _["maxBlock"] = maxBlock);
- }
-
- // Process last block
-
- // Update meta pointers
- blockPStart = blockPEnd;
- blockPEnd = (unsigned long long*) &blockIndex[10 + 10 * maxBlock];
- compSize = *blockPEnd - *blockPStart;
- algo = (unsigned short*) &blockIndex[8 + 10 * maxBlock];
-
- int curSize = blockSizeElements; // default block size
-
- if (*algo == 0) // no compression
- {
- myfile.read(&outVec[outOffset], elementSize * remain); // read remaining elements from block
- } else
- {
- myfile.read(compBuf, compSize);
-
- if (endBlock == (nrOfBlocks - 1)) // test for last block
- {
- curSize = 1 + (size + blockSizeElements - 1) % blockSizeElements; // smaller last block size
- }
-
- if (remain == curSize) // full last block
- {
- decompressor.Decompress(*algo, &outVec[outOffset], curSize * elementSize, compBuf, compSize);
- }
- else
- {
- decompressor.Decompress(*algo, tmpBuf, curSize * elementSize, compBuf, compSize); // define tmpBuf locally for speed ?
- memcpy((char*) &outVec[outOffset], tmpBuf, elementSize * remain);
- }
- }
-
- delete[] blockIndex;
- return List::create(
- _["compSize"] = compSize,
- _["algo"] = (int) (*algo),
- _["maxBlock"] = maxBlock,
- _["remain"] = remain,
- _["curSize"] = curSize,
- _["blockPStart"] = (int) *blockPStart,
- _["blockPEnd"] = (int) *blockPEnd);
-}
-
diff --git a/src/fstcore_v1/blockstreamer/blockstreamer_v1.h b/src/fstcore_v1/blockstreamer/blockstreamer_v1.h
deleted file mode 100644
index 8d3ec8f..0000000
--- a/src/fstcore_v1/blockstreamer/blockstreamer_v1.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- fstlib - A C++ library for ultra fast storage and retrieval of datasets
-
- Copyright (C) 2017-present, Mark AJ Klik
-
- This file is part of fstlib.
-
- fstlib is free software: you can redistribute it and/or modify it under the
- terms of the GNU Affero General Public License version 3 as published by the
- Free Software Foundation.
-
- fstlib is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
- A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- details.
-
- You should have received a copy of the GNU Affero General Public License
- along with fstlib. If not, see .
-
- You can contact the author at:
- - fstlib source repository : https://github.com/fstpackage/fstlib
-*/
-
-#ifndef BLOCKSTORE_H
-#define BLOCKSTORE_H
-
-#include // Rcpp header
-
-// Framework headers
-#include
-
-
-SEXP fdsReadColumn_v1(std::ifstream &myfile, char* outVec, unsigned long long blockPos, unsigned startRow, unsigned length, unsigned size, int elementSize);
-
-
-#endif // BLOCKSTORE_H
diff --git a/src/fstcore_v1/character/character_v1.cpp b/src/fstcore_v1/character/character_v1.cpp
deleted file mode 100644
index 9444c09..0000000
--- a/src/fstcore_v1/character/character_v1.cpp
+++ /dev/null
@@ -1,513 +0,0 @@
-/*
- fstlib - A C++ library for ultra fast storage and retrieval of datasets
-
- Copyright (C) 2017-present, Mark AJ Klik
-
- This file is part of fstlib.
-
- fstlib is free software: you can redistribute it and/or modify it under the
- terms of the GNU Affero General Public License version 3 as published by the
- Free Software Foundation.
-
- fstlib is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
- A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- details.
-
- You should have received a copy of the GNU Affero General Public License
- along with fstlib. If not, see .
-
- You can contact the author at:
- - fstlib source repository : https://github.com/fstpackage/fstlib
-*/
-
-
-#include
-#include
-
-#include
-
-#include
-#include
-#include
-
-
-// External libraries
-
-using namespace std;
-using namespace Rcpp;
-
-
-#define BLOCKSIZE_CHAR 2047 // number of characters in default compression block
-#define MAX_CHAR_STACK_SIZE 32768 // number of characters in default compression block
-#define CHAR_HEADER_SIZE 8 // meta data header size
-#define CHAR_INDEX_SIZE 16 // size of 1 index entry
-
-
-inline void ReadDataBlockInfo(SEXP &strVec, unsigned long long blockSize, unsigned int nrOfElements,
- unsigned int startElem, unsigned int endElem, unsigned int vecOffset, unsigned int* sizeMeta, char* buf, unsigned int nrOfNAInts)
-{
- unsigned int* strSizes = &sizeMeta[nrOfNAInts];
- unsigned int pos = 0;
-
- if (startElem != 0)
- {
- pos = strSizes[startElem - 1]; // offset previous element
- }
-
- // Test NA flag
- unsigned int flagNA = sizeMeta[nrOfNAInts - 1] & (1 << (nrOfElements % 32));
- if (flagNA == 0) // no NA's in vector
- {
- for (unsigned int blockElem = startElem; blockElem <= endElem; ++blockElem)
- {
- unsigned int newPos = strSizes[blockElem];
- SEXP curStr = Rf_mkCharLen(buf + pos, newPos - pos);
- SET_STRING_ELT(strVec, vecOffset + blockElem - startElem, curStr);
- pos = newPos; // update to new string offset
- }
-
- return;
- }
-
- // We process the datablock in cycles of 32 strings. This minimizes the impact of NA testing for vectors with a small number of NA's
-
- unsigned int startCycle = startElem / 32;
- unsigned int endCycle = endElem / 32;
- unsigned int cycleNAs = sizeMeta[startCycle];
-
- // A single 32 string cycle
-
- if (startCycle == endCycle)
- {
- for (unsigned int blockElem = startElem; blockElem <= endElem; ++blockElem)
- {
- unsigned int bitMask = 1 << (blockElem % 32);
-
- if ((cycleNAs & bitMask) != 0) // set string to NA
- {
- SET_STRING_ELT(strVec, vecOffset + blockElem - startElem, NA_STRING);
- pos = strSizes[blockElem]; // update to new string offset
- continue;
- }
-
- // Get string from data stream
-
- unsigned int newPos = strSizes[blockElem];
- SEXP curStr = Rf_mkCharLen(buf + pos, newPos - pos);
- SET_STRING_ELT(strVec, vecOffset + blockElem - startElem, curStr);
- pos = newPos; // update to new string offset
- }
-
- return;
- }
-
- // Get possibly partial first cycle
-
- unsigned int firstCylceEnd = startCycle * 32 + 31;
- for (unsigned int blockElem = startElem; blockElem <= firstCylceEnd; ++blockElem)
- {
- unsigned int bitMask = 1 << (blockElem % 32);
-
- if ((cycleNAs & bitMask) != 0) // set string to NA
- {
- SET_STRING_ELT(strVec, vecOffset + blockElem - startElem, NA_STRING);
- pos = strSizes[blockElem]; // update to new string offset
- continue;
- }
-
- // Get string from data stream
-
- unsigned int newPos = strSizes[blockElem];
- SEXP curStr = Rf_mkCharLen(buf + pos, newPos - pos);
- SET_STRING_ELT(strVec, vecOffset + blockElem - startElem, curStr);
- pos = newPos; // update to new string offset
- }
-
- // Get all but last cycle with fast NA test
-
- for (unsigned int cycle = startCycle + 1; cycle != endCycle; ++cycle)
- {
- unsigned int cycleNAs = sizeMeta[cycle];
- unsigned int middleCycleEnd = cycle * 32 + 32;
-
- if (cycleNAs == 0) // no NA's
- {
- for (unsigned int blockElem = cycle * 32; blockElem != middleCycleEnd; ++blockElem)
- {
- unsigned int newPos = strSizes[blockElem];
- SEXP curStr = Rf_mkCharLen(buf + pos, newPos - pos);
- SET_STRING_ELT(strVec, vecOffset + blockElem - startElem, curStr);
- pos = newPos; // update to new string offset
- }
- continue;
- }
-
- // Cycle contains one or more NA's
-
- for (unsigned int blockElem = cycle * 32; blockElem != middleCycleEnd; ++blockElem)
- {
- unsigned int bitMask = 1 << (blockElem % 32);
- unsigned int newPos = strSizes[blockElem];
-
- if ((cycleNAs & bitMask) != 0) // set string to NA
- {
- SET_STRING_ELT(strVec, vecOffset + blockElem - startElem, NA_STRING);
- pos = newPos; // update to new string offset
- continue;
- }
-
- // Get string from data stream
-
- SEXP curStr = Rf_mkCharLen(buf + pos, newPos - pos);
- SET_STRING_ELT(strVec, vecOffset + blockElem - startElem, curStr);
- pos = newPos; // update to new string offset
- }
- }
-
- // Last cycle
-
- cycleNAs = sizeMeta[endCycle];
-
- ++endElem;
- for (unsigned int blockElem = endCycle * 32; blockElem != endElem; ++blockElem)
- {
- unsigned int bitMask = 1 << (blockElem % 32);
- unsigned int newPos = strSizes[blockElem];
-
- if ((cycleNAs & bitMask) != 0) // set string to NA
- {
- SET_STRING_ELT(strVec, vecOffset + blockElem - startElem, NA_STRING);
- pos = newPos; // update to new string offset
- continue;
- }
-
- // Get string from data stream
-
- SEXP curStr = Rf_mkCharLen(buf + pos, newPos - pos);
- SET_STRING_ELT(strVec, vecOffset + blockElem - startElem, curStr);
- pos = newPos; // update to new string offset
- }
-}
-
-
-inline void ReadDataBlock(ifstream &myfile, SEXP &strVec, unsigned long long blockSize, unsigned int nrOfElements,
- unsigned int startElem, unsigned int endElem, unsigned int vecOffset)
-{
- unsigned int nrOfNAInts = 1 + nrOfElements / 32; // last bit is NA flag
- unsigned int totElements = nrOfElements + nrOfNAInts;
- unsigned int *sizeMeta = new unsigned int[totElements];
- myfile.read((char*) sizeMeta, totElements * 4); // read cumulative string lengths
-
- unsigned int charDataSize = blockSize - totElements * 4;
-
- char* buf = new char[charDataSize];
- myfile.read(buf, charDataSize); // read string lengths
-
- ReadDataBlockInfo(strVec, blockSize, nrOfElements, startElem, endElem, vecOffset, sizeMeta, buf, nrOfNAInts);
-
- delete[] sizeMeta;
-}
-
-
-inline SEXP ReadDataBlockCompressed(ifstream &myfile, SEXP &strVec, unsigned long long blockSize, unsigned int nrOfElements,
- unsigned int startElem, unsigned int endElem, unsigned int vecOffset,
- unsigned int intBlockSize, Decompressor &decompressor, unsigned short int &algoInt, unsigned short int &algoChar)
-{
- unsigned int nrOfNAInts = 1 + nrOfElements / 32; // NA metadata including overall NA bit
- unsigned int totElements = nrOfElements + nrOfNAInts;
- unsigned int *sizeMeta = new unsigned int[totElements];
-
- // Read and uncompress str sizes data
- if (algoInt == 0) // uncompressed
- {
- myfile.read((char*) sizeMeta, totElements * 4); // read cumulative string lengths
- }
- else
- {
- myfile.read((char*) sizeMeta, nrOfNAInts * 4); // read cumulative string lengths
- unsigned int intBufSize = intBlockSize;
- char *strSizeBuf = new char[intBufSize];
- myfile.read(strSizeBuf, intBufSize);
-
- // Decompress size but not NA metadata (which is currently uncompressed)
-
- decompressor.Decompress(algoInt, (char*)(&sizeMeta[nrOfNAInts]), nrOfElements * 4,
- strSizeBuf, intBlockSize);
-
- delete[] strSizeBuf;
- }
-
- unsigned int charDataSizeUncompressed = sizeMeta[nrOfNAInts + nrOfElements - 1];
-
- // Read and uncompress string vector data, use stack if possible here !!!!!
- unsigned int charDataSize = blockSize - intBlockSize - nrOfNAInts * 4;
- char* buf = new char[charDataSizeUncompressed];
-
- if (algoChar == 0)
- {
- myfile.read(buf, charDataSize); // read string lengths
- }
- else
- {
- char* bufCompressed = new char[charDataSize];
- myfile.read(bufCompressed, charDataSize); // read string lengths
- decompressor.Decompress(algoChar, buf, charDataSizeUncompressed, bufCompressed, charDataSize);
- delete[] bufCompressed;
- }
-
- ReadDataBlockInfo(strVec, blockSize, nrOfElements, startElem, endElem, vecOffset, sizeMeta, buf, nrOfNAInts);
-
- delete[] buf; // character vector buffer
- delete[] sizeMeta;
-
- return List::create(
- _["startElem"] = startElem,
- _["endElem"] = endElem,
- _["algoInt"] = algoInt,
- _["charDataSize"] = charDataSize,
- _["charDataSizeUncompressed"] = charDataSizeUncompressed,
- _["algoChar"] = algoChar,
- _["intBlockSize"] = intBlockSize,
- _["nrOfElements"] = nrOfElements);
-}
-
-
-List fdsReadCharVec_v1(ifstream &myfile, SEXP &strVec, unsigned long long blockPos, unsigned int startRow, unsigned int vecLength, unsigned int size)
-{
- // Jump to startRow size
- myfile.seekg(blockPos);
-
- // Read algorithm type and block size
- unsigned int meta[3];
- myfile.read((char*) meta, CHAR_HEADER_SIZE);
-
- unsigned int blockSizeChar = meta[1];
- unsigned int totNrOfBlocks = (size - 1) / blockSizeChar; // total number of blocks minus 1
- unsigned int startBlock = startRow / blockSizeChar;
- unsigned int startOffset = startRow - (startBlock * blockSizeChar);
- unsigned int endBlock = (startRow + vecLength - 1) / blockSizeChar;
- unsigned int endOffset = (startRow + vecLength - 1) - endBlock *blockSizeChar;
- unsigned int nrOfBlocks = 1 + endBlock - startBlock; // total number of blocks to read
-
- // Vector data is uncompressed
-
- if (meta[0] == 0)
- {
- unsigned long long *blockOffset = new unsigned long long[1 + nrOfBlocks]; // block positions
-
- if (startBlock > 0) // include previous block offset
- {
- myfile.seekg(blockPos + CHAR_HEADER_SIZE + (startBlock - 1) * 8); // jump to correct block index
- myfile.read((char*) blockOffset, (1 + nrOfBlocks) * 8);
- }
- else
- {
- blockOffset[0] = CHAR_HEADER_SIZE + (totNrOfBlocks + 1) * 8;
- myfile.read((char*) &blockOffset[1], nrOfBlocks * 8);
- }
-
-
- // Navigate to first selected data block
- unsigned long long offset = blockOffset[0];
- myfile.seekg(blockPos + offset);
-
- unsigned int endElem = blockSizeChar - 1;
- unsigned int nrOfElements = blockSizeChar;
-
- if (startBlock == endBlock) // subset start and end of block
- {
- endElem = endOffset;
- if (endBlock == totNrOfBlocks)
- {
- nrOfElements = size - totNrOfBlocks * blockSizeChar; // last block can have less elements
- }
- }
-
- // Read first block with offset
- unsigned long long blockSize = blockOffset[1] - offset; // size of data block
-
- ReadDataBlock(myfile, strVec, blockSize, nrOfElements, startOffset, endElem, 0);
-
- if (startBlock == endBlock) // subset start and end of block
- {
- delete[] blockOffset;
-
- return List::create(
- _["res"] = "uncompressed",
- _["vecLength"] = vecLength,
- _["meta[0]"] = meta[0],
- _["meta[1]"] = meta[1],
- _["totNrOfBlocks"] = totNrOfBlocks,
- _["startBlock"] = startBlock,
- _["endBlock"] = endBlock,
- _["nrOfBlocks"] = nrOfBlocks,
- _["nrOfElements"] = (int) nrOfElements,
- _["endElem"] = (int) endElem,
- _["startOffset"] = startOffset,
- _["blockSize"] = blockSize,
- _["blockPos"] = (int) blockPos,
- _["blockOffset[0]"] = (int) blockOffset[0],
- _["blockOffset[1]"] = (int) blockOffset[1],
- _["blockOffset[2]"] = (int) blockOffset[2]);
- }
-
- offset = blockOffset[1];
- unsigned int vecPos = blockSizeChar - startOffset;
-
- if (endBlock == totNrOfBlocks)
- {
- nrOfElements = size - totNrOfBlocks * blockSizeChar; // last block can have less elements
- }
-
- --nrOfBlocks; // iterate full blocks
- for (unsigned int block = 1; block < nrOfBlocks; ++block)
- {
- unsigned long long newPos = blockOffset[block + 1];
- ReadDataBlock(myfile, strVec, newPos - offset, blockSizeChar, 0, blockSizeChar - 1, vecPos);
- vecPos += blockSizeChar;
- offset = newPos;
- }
-
- unsigned long long newPos = blockOffset[nrOfBlocks + 1];
- ReadDataBlock(myfile, strVec, newPos - offset, nrOfElements, 0, endOffset, vecPos);
-
- delete[] blockOffset;
-
- return List::create(
- _["res"] = "uncompressed",
- _["vecLength"] = vecLength,
- _["meta[0]"] = meta[0],
- _["meta[1]"] = meta[1],
- _["totNrOfBlocks"] = totNrOfBlocks,
- _["startBlock"] = startBlock,
- _["endBlock"] = endBlock,
- _["nrOfBlocks"] = nrOfBlocks,
- _["nrOfElements"] = (int) nrOfElements,
- _["endElem"] = (int) endElem,
- _["startOffset"] = startOffset,
- _["blockSize"] = blockSize,
- _["blockPos"] = (int) blockPos,
- _["blockOffset[0]"] = (int) blockOffset[0],
- _["blockOffset[1]"] = (int) blockOffset[1],
- _["blockOffset[2]"] = (int) blockOffset[2]);
- }
-
-
- // Vector data is compressed
-
- unsigned int bufLength = (nrOfBlocks + 1) * CHAR_INDEX_SIZE; // 1 long and 2 unsigned int per block
- char *blockInfo = new char[bufLength + CHAR_INDEX_SIZE]; // add extra first element for convenience
-
- // unsigned long long blockOffset[1 + nrOfBlocks]; // block positions, algorithm and size information
-
- if (startBlock > 0) // include previous block offset
- {
- myfile.seekg(blockPos + CHAR_HEADER_SIZE + (startBlock - 1) * CHAR_INDEX_SIZE); // jump to correct block index
- myfile.read(blockInfo, (nrOfBlocks + 1) * CHAR_INDEX_SIZE);
- }
- else
- {
- unsigned long long* firstBlock = (unsigned long long*) blockInfo;
- *firstBlock = CHAR_HEADER_SIZE + (totNrOfBlocks + 1) * CHAR_INDEX_SIZE; // offset of first data block
- myfile.read(&blockInfo[CHAR_INDEX_SIZE], nrOfBlocks * CHAR_INDEX_SIZE);
- }
-
- // Get block meta data
- unsigned long long* offset = (unsigned long long*) blockInfo;
- char* blockP = &blockInfo[CHAR_INDEX_SIZE];
- unsigned long long* curBlockPos = (unsigned long long*) blockP;
- unsigned short int* algoInt = (unsigned short int*) (blockP + 8);
- unsigned short int* algoChar = (unsigned short int*) (blockP + 10);
- int* intBufSize = (int*) (blockP + 12);
-
- // move to first data block
-
- myfile.seekg(blockPos + *offset);
-
- unsigned int endElem = blockSizeChar - 1;
- unsigned int nrOfElements = blockSizeChar;
-
- if (startBlock == endBlock) // subset start and end of block
- {
- endElem = endOffset;
- if (endBlock == totNrOfBlocks)
- {
- nrOfElements = size - totNrOfBlocks * blockSizeChar; // last block can have less elements
- }
- }
-
- // Read first block with offset
- unsigned long long blockSize = *curBlockPos - *offset; // size of data block
-
- Decompressor decompressor; // uncompress all availble algorithms
-
- SEXP res = ReadDataBlockCompressed(myfile, strVec, blockSize, nrOfElements, startOffset, endElem, 0, *intBufSize,
- decompressor, *algoInt, *algoChar);
-
- if (startBlock == endBlock) // subset start and end of block
- {
- delete[] blockInfo;
-
- return List::create(
- _["res"] = res,
- _["vecLength"] = vecLength,
- _["meta[0]"] = meta[0],
- _["meta[1]"] = meta[1],
- _["totNrOfBlocks"] = totNrOfBlocks,
- _["startBlock"] = startBlock,
- _["endBlock"] = endBlock,
- _["nrOfBlocks"] = nrOfBlocks,
- _["nrOfElements"] = (int) nrOfElements,
- _["endElem"] = (int) endElem,
- _["startOffset"] = startOffset,
- _["blockSize"] = blockSize,
- _["blockPos"] = (int) blockPos,
- _["*intBufSize"] = *intBufSize,
- _["*algoInt"] = *algoInt,
- _["*algoChar"] = *algoChar,
- _["*offset"] = *offset,
- _["*curBlockPos"] = *curBlockPos);
- }
-
- offset = curBlockPos;
-
- unsigned int vecPos = blockSizeChar - startOffset;
-
- if (endBlock == totNrOfBlocks)
- {
- nrOfElements = size - totNrOfBlocks * blockSizeChar; // last block can have less elements
- }
-
- --nrOfBlocks; // iterate all but last block
- blockP += CHAR_INDEX_SIZE; // move to next index element
- for (unsigned int block = 1; block < nrOfBlocks; ++block)
- {
- unsigned long long* curBlockPos = (unsigned long long*) blockP;
- unsigned short int* algoInt = (unsigned short int*) (blockP + 8);
- unsigned short int* algoChar = (unsigned short int*) (blockP + 10);
- int* intBufSize = (int*) (blockP + 12);
-
- ReadDataBlockCompressed(myfile, strVec, *curBlockPos - *offset, blockSizeChar, 0, blockSizeChar - 1, vecPos, *intBufSize,
- decompressor, *algoInt, *algoChar);
- vecPos += blockSizeChar;
- offset = curBlockPos;
- blockP += CHAR_INDEX_SIZE; // move to next index element
- }
-
- curBlockPos = (unsigned long long*) blockP;
- algoInt = (unsigned short int*) (blockP + 8);
- algoChar = (unsigned short int*) (blockP + 10);
- intBufSize = (int*) (blockP + 12);
-
- ReadDataBlockCompressed(myfile, strVec, *curBlockPos - *offset, nrOfElements, 0, endOffset, vecPos, *intBufSize,
- decompressor, *algoInt, *algoChar);
-
- delete[] blockInfo;
-
- return List::create(
- _["vecLength"] = vecLength,
- _["meta[0]"] = meta[0],
- _["meta[1]"] = meta[1],
- _["meta[2]"] = meta[2]);
-}
diff --git a/src/fstcore_v1/character/character_v1.h b/src/fstcore_v1/character/character_v1.h
deleted file mode 100644
index 70a9af1..0000000
--- a/src/fstcore_v1/character/character_v1.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- fstlib - A C++ library for ultra fast storage and retrieval of datasets
-
- Copyright (C) 2017-present, Mark AJ Klik
-
- This file is part of fstlib.
-
- fstlib is free software: you can redistribute it and/or modify it under the
- terms of the GNU Affero General Public License version 3 as published by the
- Free Software Foundation.
-
- fstlib is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
- A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- details.
-
- You should have received a copy of the GNU Affero General Public License
- along with fstlib. If not, see .
-
- You can contact the author at:
- - fstlib source repository : https://github.com/fstpackage/fstlib
-*/
-
-#ifndef CHARACTER_V1_H
-#define CHARACTER_V1_H
-
-
-#include
-#include
-
-#include
-#include
-
-
-Rcpp::List fdsReadCharVec_v1(std::ifstream &myfile, SEXP &strVec, unsigned long long blockPos, unsigned int startRow, unsigned int vecLength, unsigned int size);
-
-
-#endif // CHARACTER_V1_H
diff --git a/src/fstcore_v1/double/double_v3.cpp b/src/fstcore_v1/double/double_v3.cpp
deleted file mode 100644
index 942918b..0000000
--- a/src/fstcore_v1/double/double_v3.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- fstlib - A C++ library for ultra fast storage and retrieval of datasets
-
- Copyright (C) 2017-present, Mark AJ Klik
-
- This file is part of fstlib.
-
- fstlib is free software: you can redistribute it and/or modify it under the
- terms of the GNU Affero General Public License version 3 as published by the
- Free Software Foundation.
-
- fstlib is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
- A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- details.
-
- You should have received a copy of the GNU Affero General Public License
- along with fstlib. If not, see .
-
- You can contact the author at:
- - fstlib source repository : https://github.com/fstpackage/fstlib
-*/
-
-
-// Standard libraries
-#include
-#include
-
-#include
-
-#include
-#include
-#include
-
-
-using namespace std;
-using namespace Rcpp;
-
-SEXP fdsReadRealVec_v3(ifstream &myfile, SEXP &realVec, unsigned long long blockPos, unsigned startRow, unsigned length, unsigned size)
-{
- char* values = (char*) REAL(realVec); // output vector
-
- return fdsReadColumn_v1(myfile, values, blockPos, startRow, length, size, 8);
-}
diff --git a/src/fstcore_v1/double/double_v3.h b/src/fstcore_v1/double/double_v3.h
deleted file mode 100644
index 4d1d999..0000000
--- a/src/fstcore_v1/double/double_v3.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- fstlib - A C++ library for ultra fast storage and retrieval of datasets
-
- Copyright (C) 2017-present, Mark AJ Klik
-
- This file is part of fstlib.
-
- fstlib is free software: you can redistribute it and/or modify it under the
- terms of the GNU Affero General Public License version 3 as published by the
- Free Software Foundation.
-
- fstlib is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
- A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- details.
-
- You should have received a copy of the GNU Affero General Public License
- along with fstlib. If not, see .
-
- You can contact the author at:
- - fstlib source repository : https://github.com/fstpackage/fstlib
-*/
-
-#ifndef DOUBLE_V3_H
-#define DOUBLE_V3_H
-
-#include
-#include
-
-#include
-
-// fst framework
-#include
-#include
-
-
-SEXP fdsReadRealVec_v3(std::ifstream &myfile, SEXP &realVec, unsigned long long blockPos, unsigned startRow, unsigned length, unsigned size);
-
-#endif // DOUBLE_V3_H
diff --git a/src/fstcore_v1/factor/factor_v5.cpp b/src/fstcore_v1/factor/factor_v5.cpp
deleted file mode 100644
index fc017d2..0000000
--- a/src/fstcore_v1/factor/factor_v5.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- fstlib - A C++ library for ultra fast storage and retrieval of datasets
-
- Copyright (C) 2017-present, Mark AJ Klik
-
- This file is part of fstlib.
-
- fstlib is free software: you can redistribute it and/or modify it under the
- terms of the GNU Affero General Public License version 3 as published by the
- Free Software Foundation.
-
- fstlib is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
- A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- details.
-
- You should have received a copy of the GNU Affero General Public License
- along with fstlib. If not, see .
-
- You can contact the author at:
- - fstlib source repository : https://github.com/fstpackage/fstlib
-*/
-
-
-// Standard headers
-#include
-#include
-
-#include
-
-// Framework headers
-#include
-#include
-#include
-#include
-#include
-#include
-
-using namespace std;
-using namespace Rcpp;
-
-
-// Parameter 'startRow' is zero based.
-SEXP fdsReadFactorVec_v5(ifstream &myfile, SEXP &intVec, unsigned long long blockPos, unsigned int startRow,
- unsigned int length, unsigned int size)
-{
- // Jump to factor level
- myfile.seekg(blockPos);
-
- // Get vector meta data
- char meta[12];
- myfile.read(meta, 12);
- unsigned int *nrOfLevels = (unsigned int*) meta;
- unsigned long long* levelVecPos = (unsigned long long*) &meta[4];
-
- // Read level strings
- SEXP strVec;
- PROTECT(strVec = Rf_allocVector(STRSXP, *nrOfLevels));
- SEXP singleColInfo = fdsReadCharVec_v1(myfile, strVec, blockPos + 12, 0, *nrOfLevels, *nrOfLevels); // get level strings
-
- // Read level values
- char* values = (char*) INTEGER(intVec); // output vector
- SEXP intVecInfo = fdsReadColumn_v1(myfile, values, *levelVecPos, startRow, length, size, 4);
-
- Rf_setAttrib(intVec, Rf_mkString("levels"), strVec);
- Rf_setAttrib(intVec, Rf_mkString("class"), Rf_mkString("factor"));
-
- return List::create(
- _["singleColInfo"] = singleColInfo,
- _["intVecInfo"] = intVecInfo,
- _["strVec"] = strVec);
-}
diff --git a/src/fstcore_v1/factor/factor_v5.h b/src/fstcore_v1/factor/factor_v5.h
deleted file mode 100644
index 8987f44..0000000
--- a/src/fstcore_v1/factor/factor_v5.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- fstlib - A C++ library for ultra fast storage and retrieval of datasets
-
- Copyright (C) 2017-present, Mark AJ Klik
-
- This file is part of fstlib.
-
- fstlib is free software: you can redistribute it and/or modify it under the
- terms of the GNU Affero General Public License version 3 as published by the
- Free Software Foundation.
-
- fstlib is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
- A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- details.
-
- You should have received a copy of the GNU Affero General Public License
- along with fstlib. If not, see .
-
- You can contact the author at:
- - fstlib source repository : https://github.com/fstpackage/fstlib
-*/
-
-#ifndef FACTOR_V5_H
-#define FACTOR_V5_H
-
-
-#include
-#include
-
-#include
-
-
-// Parameter 'startRow' is zero based.
-SEXP fdsReadFactorVec_v5(std::ifstream &myfile, SEXP &intVec, unsigned long long blockPos, unsigned int startRow,
- unsigned int length, unsigned int size);
-
-
-#endif // FACTOR_V5_H
diff --git a/src/fstcore_v1/integer/integer_v2.cpp b/src/fstcore_v1/integer/integer_v2.cpp
deleted file mode 100644
index ad011c2..0000000
--- a/src/fstcore_v1/integer/integer_v2.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- fstlib - A C++ library for ultra fast storage and retrieval of datasets
-
- Copyright (C) 2017-present, Mark AJ Klik
-
- This file is part of fstlib.
-
- fstlib is free software: you can redistribute it and/or modify it under the
- terms of the GNU Affero General Public License version 3 as published by the
- Free Software Foundation.
-
- fstlib is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
- A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- details.
-
- You should have received a copy of the GNU Affero General Public License
- along with fstlib. If not, see .
-
- You can contact the author at:
- - fstlib source repository : https://github.com/fstpackage/fstlib
-*/
-
-
-// System libraries
-#include
-#include
-#include
-#include
-
-#include
-
-#include
-#include
-#include
-#include
-
-using namespace std;
-using namespace Rcpp;
-
-// SEXP fdsReadIntVec(ifstream &myfile, SEXP &intVec, unsigned long long blockPos, unsigned startRow, unsigned length, unsigned size, unsigned int attrBlockSize)
-SEXP fdsReadIntVec_v2(ifstream &myfile, SEXP &intVec, unsigned long long blockPos, unsigned startRow, unsigned length, unsigned size)
-{
- char* values = (char*) INTEGER(intVec); // output vector
-
- return fdsReadColumn_v1(myfile, values, blockPos, startRow, length, size, 4);
-}
diff --git a/src/fstcore_v1/integer/integer_v2.h b/src/fstcore_v1/integer/integer_v2.h
deleted file mode 100644
index 0bfa4a3..0000000
--- a/src/fstcore_v1/integer/integer_v2.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- fstlib - A C++ library for ultra fast storage and retrieval of datasets
-
- Copyright (C) 2017-present, Mark AJ Klik
-
- This file is part of fstlib.
-
- fstlib is free software: you can redistribute it and/or modify it under the
- terms of the GNU Affero General Public License version 3 as published by the
- Free Software Foundation.
-
- fstlib is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
- A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- details.
-
- You should have received a copy of the GNU Affero General Public License
- along with fstlib. If not, see .
-
- You can contact the author at:
- - fstlib source repository : https://github.com/fstpackage/fstlib
-*/
-
-#ifndef INTEGER_V2_H
-#define INTEGER_V2_H
-
-#include
-#include
-
-#include // Rcpp header
-
-SEXP fdsReadIntVec_v2(std::ifstream &myfile, SEXP &intVec, unsigned long long blockPos, unsigned startRow, unsigned length, unsigned size);
-
-#endif // INTEGER_V2_H
diff --git a/src/fstcore_v1/logical/logical_v4.cpp b/src/fstcore_v1/logical/logical_v4.cpp
deleted file mode 100644
index f47c805..0000000
--- a/src/fstcore_v1/logical/logical_v4.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- fstlib - A C++ library for ultra fast storage and retrieval of datasets
-
- Copyright (C) 2017-present, Mark AJ Klik
-
- This file is part of fstlib.
-
- fstlib is free software: you can redistribute it and/or modify it under the
- terms of the GNU Affero General Public License version 3 as published by the
- Free Software Foundation.
-
- fstlib is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
- A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- details.
-
- You should have received a copy of the GNU Affero General Public License
- along with fstlib. If not, see .
-
- You can contact the author at:
- - fstlib source repository : https://github.com/fstpackage/fstlib
-*/
-
-// System libraries
-#include
-#include
-
-#include
-
-#include
-#include
-#include
-#include
-
-using namespace std;
-using namespace Rcpp;
-
-#define BLOCKSIZE_LOGICAL 4096 // number of logicals in default compression block
-
-
-SEXP fdsReadLogicalVec_v4(ifstream &myfile, SEXP &boolVec, unsigned long long blockPos, unsigned int startRow,
- unsigned int length, unsigned int size)
-{
- char* values = (char*) LOGICAL(boolVec); // output vector
- return fdsReadColumn_v1(myfile, values, blockPos, startRow, length, size, 4);
-}
-
-
diff --git a/src/fstcore_v1/logical/logical_v4.h b/src/fstcore_v1/logical/logical_v4.h
deleted file mode 100644
index e71abfe..0000000
--- a/src/fstcore_v1/logical/logical_v4.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- fstlib - A C++ library for ultra fast storage and retrieval of datasets
-
- Copyright (C) 2017-present, Mark AJ Klik
-
- This file is part of fstlib.
-
- fstlib is free software: you can redistribute it and/or modify it under the
- terms of the GNU Affero General Public License version 3 as published by the
- Free Software Foundation.
-
- fstlib is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
- A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
- details.
-
- You should have received a copy of the GNU Affero General Public License
- along with fstlib. If not, see .
-
- You can contact the author at:
- - fstlib source repository : https://github.com/fstpackage/fstlib
-*/
-
-#ifndef LOGICAL_V4_H
-#define LOGICAL_V4_H
-
-#include
-#include
-
-#include
-
-SEXP fdsReadLogicalVec_v4(std::ifstream &myfile, SEXP &boolVec, unsigned long long blockPos, unsigned int startRow,
- unsigned int length, unsigned int size);
-
-#endif // LOGICAL_V4_H
diff --git a/src/init.c b/src/init.c
index 805f042..4ff83da 100644
--- a/src/init.c
+++ b/src/init.c
@@ -12,8 +12,8 @@ Check these declarations against the C/Fortran source code.
extern SEXP _fst_fstcomp(SEXP, SEXP, SEXP, SEXP);
extern SEXP _fst_fstdecomp(SEXP);
extern SEXP _fst_fsthasher(SEXP, SEXP, SEXP);
-extern SEXP _fst_fstmetadata(SEXP, SEXP);
-extern SEXP _fst_fstretrieve(SEXP, SEXP, SEXP, SEXP, SEXP);
+extern SEXP _fst_fstmetadata(SEXP);
+extern SEXP _fst_fstretrieve(SEXP, SEXP, SEXP, SEXP);
extern SEXP _fst_fststore(SEXP, SEXP, SEXP, SEXP);
extern SEXP _fst_getnrofthreads();
extern SEXP _fst_hasopenmp();
@@ -27,8 +27,8 @@ static const R_CallMethodDef CallEntries[] = {
{"_fst_fstcomp", (DL_FUNC) &_fst_fstcomp, 4},
{"_fst_fstdecomp", (DL_FUNC) &_fst_fstdecomp, 1},
{"_fst_fsthasher", (DL_FUNC) &_fst_fsthasher, 3},
- {"_fst_fstmetadata", (DL_FUNC) &_fst_fstmetadata, 2},
- {"_fst_fstretrieve", (DL_FUNC) &_fst_fstretrieve, 5},
+ {"_fst_fstmetadata", (DL_FUNC) &_fst_fstmetadata, 1},
+ {"_fst_fstretrieve", (DL_FUNC) &_fst_fstretrieve, 4},
{"_fst_fststore", (DL_FUNC) &_fst_fststore, 4},
{"_fst_getnrofthreads", (DL_FUNC) &_fst_getnrofthreads, 0},
{"_fst_hasopenmp", (DL_FUNC) &_fst_hasopenmp, 0},
diff --git a/tests/testthat/test_fst_table.R b/tests/testthat/test_fst_table.R
index 60c2dae..abe32e7 100644
--- a/tests/testthat/test_fst_table.R
+++ b/tests/testthat/test_fst_table.R
@@ -16,6 +16,12 @@ test_file <- "testdata/fst_table.fst"
write_fst(df, test_file)
x <- fst(test_file)
+# single column table
+df2 <- df["X"]
+test_file2 <- "testdata/fst_table2.fst"
+write_fst(df2, test_file2)
+y <- fst(test_file2)
+
# see issues #175 and #136
test_that("fst_table does throw normalizePath error on non-existing file", {
@@ -102,10 +108,6 @@ test_that("fst_table [ generic", {
expect_equal(x[,2:3], df[, 2:3])
- expect_equal(x[j = 2:3, drop = FALSE], df[, 2:3])
-
- expect_equal(x[i = 2:1, drop = FALSE], df[, 2:1])
-
expect_equal(as.list(x[2,]), as.list(df[2,]))
expect_equal(as.list(x[2:10,]), as.list(df[2:10,]))
@@ -124,6 +126,29 @@ test_that("fst_table [ generic", {
})
+test_that("fst_table allows for drop argument", {
+
+  # fst drops dimensions in the same cases as data.frame. Fewer warnings are given.
+
+  # `[` called with 3 arguments (x, i, j):
+
+  expect_equal(x[, "X"], df[, "X"])
+  expect_equal(x[2, "X"], df[2, "X"])
+  expect_equal(x[2:4, "X"], df[2:4, "X"])
+
+  # `[` called with 4 arguments (explicit drop or a trailing empty argument):
+
+  expect_equal(x[, "X", drop = TRUE], df[, "X", drop = TRUE])
+  expect_equal(x[, "X", ], df[, "X", ])
+  expect_equal(x[2, "X", drop = TRUE], df[2, "X", drop = TRUE])
+  expect_equal(x[2, "X", ], df[2, "X", ])
+  expect_equal(x[2:4, "X", drop = TRUE], df[2:4, "X", drop = TRUE])
+  expect_equal(x[2:4, "X", ], df[2:4, "X", ])
+  expect_equal(x[2:4, 2, drop = TRUE], df[2:4, 2, drop = TRUE])
+  expect_equal(x[2:4, 2, ], df[2:4, 2, ])
+})
+
+
test_that("fst_table throws errors on incorrect use of interface", {
expect_error(x[[c("X", 3)]], "Subscript out of bounds")
diff --git a/tests/testthat/test_keys.R b/tests/testthat/test_keys.R
index 960ed5d..135c332 100644
--- a/tests/testthat/test_keys.R
+++ b/tests/testthat/test_keys.R
@@ -37,7 +37,7 @@ test_that("Missing middle key", {
test_that("Missing first key", {
fstwriteproxy(x, "testdata/keys.fst")
- res <- fst:::fstretrieve("testdata/keys.fst", c("B", "C", "D", "E"), 1L, NULL, FALSE)
+ res <- fst:::fstretrieve("testdata/keys.fst", c("B", "C", "D", "E"), 1L, NULL)
y <- fstreadproxy("testdata/keys.fst", columns = c("B", "C", "D", "E"), as.data.table = TRUE)
expect_null(key(y))
})
diff --git a/tests/testthat/test_legacy.R b/tests/testthat/test_legacy.R
index cd9f781..542261c 100644
--- a/tests/testthat/test_legacy.R
+++ b/tests/testthat/test_legacy.R
@@ -1,128 +1,15 @@
context("legacy format")
-# nolint start
-#
-# # Clean testdata directory
-# if (!file.exists("testdata")) {
-# dir.create("testdata")
-# } else {
-# file.remove(list.files("testdata", full.names = TRUE))
-# }
-#
-#
-# # Create a pool of strings
-# nroflevels <- 8
-#
-# char_vec <- function(nrofrows) {
-# sapply(1:nrofrows, function(x) {
-# paste(sample(LETTERS, sample(1:4)), collapse = "")
-# }
-# ) }
-#
-# char_veclong <- function(nrofrows) {
-# sapply(1:nrofrows,
-# function(x) {
-# paste(sample(LETTERS, sample(20:25)), collapse = "")
-# }
-# ) }
-#
-# date_vec <- function(nrofrows) {
-# date_vec <- sample(1:nrofrows, replace = TRUE)
-# class(date_vec) <- c("IDate", "Date")
-# date_vec
-# }
-#
-# difftime_vec <- function(nrOfrows, mode = "double") {
-# vec <- (Sys.time() + 1:nrofrows) - Sys.time()
-# mode(vec) <- mode
-# vec
-# }
-#
-# # Sample data
-# nrofrows <- 1000L
-# char_na <- char_vec(nrofrows)
-# char_na[sample(1:nrofrows, 10)] <- NA
-# datatable <- data.frame(
-# Xint = 1:nrofrows,
-# Ylog = sample(c(TRUE, FALSE, NA), nrofrows, replace = TRUE),
-# Zdoub = rnorm(nrofrows),
-# Qchar = char_vec(nrofrows),
-# WFact = factor(sample(char_vec(nroflevels), nrofrows, replace = TRUE)),
-# Ordered = ordered(sample(char_vec(nroflevels), nrofrows, replace = TRUE)),
-# char_na = char_na,
-# CharLong = char_veclong(nrofrows),
-# Date = date_vec(nrofrows),
-# DateDouble = as.Date("2015-01-01") + 1:nrofrows,
-# Difftime = difftime_vec(nrOfrows),
-# DiffTime_int = difftime_vec(nrOfrows, "integer"),
-# stringsAsFactors = FALSE)
-#
-# require(fst)
-#
-# write.fst(datatable, "datasets/legacy.fst")
-# x <- read.fst("datasets/legacy.fst")
-# saveRDS(x, "datasets/legacy.rds")
-#
-# nolint end
-
-
test_that("Read legacy format", {
- # only test on little endian platforms
- if (.Platform$endian == "little" && .Platform$OS.type == "windows") {
- expect_error(
- read_fst("datasets/legacy.fst"),
- "File header information does not contain the fst format marker"
- )
-
- expect_warning(
+ expect_error(
dt <- read_fst("datasets/legacy.fst", old_format = TRUE),
- "This fst file was created with a beta version of the fst package. Please"
- )
-
- expect_warning(
- dt_legacy <- read.fst("datasets/legacy.fst", old_format = TRUE),
- "This fst file was created with a beta version of the fst package. Please"
- )
-
- expect_equal(dt, dt_legacy)
-
- dt_old <- readRDS("datasets/legacy.rds")
-
- expect_equal(dt, dt_old)
-
- expect_error(
- metadata_fst("datasets/legacy.fst"),
- "File header information does not contain the fst format marker"
- )
-
- expect_warning(
- res <- capture_output(print(metadata_fst("datasets/legacy.fst", old_format = TRUE))),
- "This fst file was created with a beta version of the fst package. Please"
- )
-
- expect_warning(
- res_legacy <- capture_output(print(fst.metadata("datasets/legacy.fst", old_format = TRUE))),
- "This fst file was created with a beta version of the fst package. Please"
- )
-
- expect_equal(res, res_legacy)
+ "Parameter old_format is depricated"
+ )
- expect_equal(res, paste(
- "\n1000 rows, 12 columns (legacy.fst)\n",
- "* 'Xint' : integer",
- "* 'Ylog' : logical",
- "* 'Zdoub' : double",
- "* 'Qchar' : character",
- "* 'WFact' : factor",
- "* 'Ordered' : factor",
- "* 'char_na' : character",
- "* 'CharLong' : character",
- "* 'Date' : integer",
- "* 'DateDouble' : double",
- "* 'Difftime' : double",
- "* 'DiffTime_int': integer",
- sep = "\n"))
- }
+ expect_error(
+ metadata_fst("datasets/legacy.fst"),
+ "File header information does not contain the fst format marker"
+ )
})