More types in read_parquet()

r-lib · Aug 13, 2024 · e00c562 · e00c562
1 parent 25d3ae8
commit e00c562
Show file tree

Hide file tree

Showing 4 changed files with 83 additions and 30 deletions.
diff --git a/R/read.R b/R/read.R
@@ -13,5 +13,23 @@ read_parquet2 <- function(file, options = parquet_options()) {
 		res <- apply_arrow_schema2(res, file, dicts, types)
 	}
 
+	# convert hms columns from milliseconds to seconds; the division also turns integer storage into double
+	hmss <- which(vapply(res, "inherits", "hms", FUN.VALUE = logical(1)))
+	for (idx in hmss) {
+		res[[idx]] <- structure(
+			unclass(res[[idx]]) / 1000,
+			class = class(res[[idx]])
+		)
+	}
+
+	# convert POSIXct from milliseconds to seconds
+	posixcts <- which(vapply(res, "inherits", "POSIXct", FUN.VALUE = logical(1)))
+	for (idx in posixcts) {
+		res[[idx]][] <- structure(
+			unclass(res[[idx]]) / 1000,
+			class = class(res[[idx]])
+		)
+	}
+
   res
 }
diff --git a/src/RParquetReader.cpp b/src/RParquetReader.cpp
@@ -46,6 +46,7 @@ RParquetReader::RParquetReader(std::string filename)
     if (rt.type != rt.tmptype && rt.tmptype != NILSXP) {
       tmpdata[i].resize(metadata.num_rows * rt.elsize);
     }
+    INTEGER(types)[idx] = file_meta_data_.schema[i].type;
     idx++;
   }
 }
@@ -1039,22 +1040,6 @@ void convert_column_to_r_int96(postprocess *pp, uint32_t cl) {
   } else if (hasdict0 && hasmiss0) {
     convert_column_to_r_int96_dict_miss(pp, cl);
   }
-
-  // TODO: make this conversion configurable
-  SEXP x = VECTOR_ELT(pp->columns, pp->leaf_cols[cl]);
-  SEXP cls = PROTECT(Rf_allocVector(STRSXP, 2));
-  SET_STRING_ELT(cls, 0, Rf_mkChar("POSIXct"));
-  SET_STRING_ELT(cls, 1, Rf_mkChar("POSIXt"));
-  Rf_setAttrib(x, Rf_install("tzone"), Rf_mkString("UTC"));
-  SET_CLASS(x, cls);
-  UNPROTECT(1);
-
-  R_xlen_t len = Rf_xlength(x);
-  double *ptr = REAL(x);
-  double *end = ptr + len;
-  for (; ptr < end; ptr++) {
-    *ptr = *ptr / 1000;
-  }
 }
 
 // ------------------------------------------------------------------------
@@ -1544,6 +1529,56 @@ void convert_columns_to_r_(postprocess *pp) {
     default:
       break;
     }
+
+    // add classes, if any
+    size_t nc = rt.classes.size();
+    if (nc > 0) {
+      SEXP x = VECTOR_ELT(pp->columns, pp->leaf_cols[cl]);
+      SEXP cls = PROTECT(Rf_allocVector(STRSXP, nc));
+      for (size_t i = 0; i < nc; i++) {
+        SET_STRING_ELT(cls, i, Rf_mkCharCE(rt.classes[i].c_str(), CE_UTF8));
+      }
+      SET_CLASS(x, cls);
+      UNPROTECT(1);
+    }
+
+    // add time zone attribute, if any
+    if (rt.tzone != "") {
+      SEXP x = VECTOR_ELT(pp->columns, pp->leaf_cols[cl]);
+      Rf_setAttrib(x, Rf_install("tzone"), Rf_mkString(rt.tzone.c_str()));
+    }
+
+    // add unit
+    size_t nu = rt.units.size();
+    if (nu > 0) {
+      SEXP x = VECTOR_ELT(pp->columns, pp->leaf_cols[cl]);
+      SEXP units = PROTECT(Rf_allocVector(STRSXP, nu));
+      for (size_t i = 0; i < nu; i++) {
+        SET_STRING_ELT(units, i, Rf_mkCharCE(rt.units[i].c_str(), CE_UTF8));
+      }
+      Rf_setAttrib(x, Rf_install("units"), units);
+      UNPROTECT(1);
+    }
+
+    // rescale by the time factor, if any (values are divided by time_fct)
+    if (rt.time_fct != 1.0) {
+      SEXP x = VECTOR_ELT(pp->columns, pp->leaf_cols[cl]);
+      if (TYPEOF(x) == INTSXP) {
+        int32_t *ptr = INTEGER(x);
+        int32_t *end = ptr + Rf_xlength(x);
+        for (; ptr < end; ptr++) {
+          *ptr /= rt.time_fct;
+        }
+      } else if (TYPEOF(x) == REALSXP) {
+        double *ptr = REAL(x);
+        double *end = ptr + Rf_xlength(x);
+        for (; ptr < end; ptr++) {
+          *ptr /= rt.time_fct;
+        }
+      } else {
+        Rf_error("Internal nanoparquet error, cannot multiply non-numeric");
+      }
+    }
   }
 }
 

diff --git a/tests/testthat/_snaps/read-parquet.md b/tests/testthat/_snaps/read-parquet.md
@@ -38,7 +38,7 @@
 # read hms in MICROS
 
     Code
-      as.data.frame(read_parquet(pf))
+      as.data.frame(read_parquet2(pf))
     Output
               tt
       1 14:30:00
@@ -56,7 +56,7 @@
 # read GZIP compressed files
 
     Code
-      as.data.frame(read_parquet(pf))
+      as.data.frame(read_parquet2(pf))
     Output
                          nam  mpg cyl  disp  hp drat    wt  qsec vs am gear carb
       1                 <NA> 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
@@ -128,7 +128,7 @@
 # V2 data pages
 
     Code
-      as.data.frame(read_parquet(pf))
+      as.data.frame(read_parquet2(pf))
     Output
         FirstName                                       Data
       1      John 48, 65, 6c, 6c, 6f, 20, 57, 6f, 72, 6c, 64

diff --git a/tests/testthat/test-read-parquet.R b/tests/testthat/test-read-parquet.R
@@ -161,7 +161,7 @@ test_that("read Date", {
   )
   write_parquet(d, tmp)
 
-  d2 <- read_parquet(tmp)
+  d2 <- read_parquet2(tmp)
   expect_s3_class(d2$d, "Date")
   expect_equal(d$d, d2$d)
 })
@@ -175,15 +175,15 @@ test_that("read hms", {
   )
   write_parquet(d, tmp)
 
-  d2 <- read_parquet(tmp)
+  d2 <- read_parquet2(tmp)
   expect_s3_class(d2$h, "hms")
   expect_equal(d$h, d2$h)
 })
 
 test_that("read hms in MICROS", {
   pf <- test_path("data/timetz.parquet")
   expect_snapshot({
-    as.data.frame(read_parquet(pf))
+    as.data.frame(read_parquet2(pf))
   })
 })
 
@@ -196,7 +196,7 @@ test_that("read POSIXct", {
   )
   write_parquet(d, tmp)
 
-  d2 <- read_parquet(tmp)
+  d2 <- read_parquet2(tmp)
   expect_s3_class(d$h, "POSIXct")
   expect_equal(d$h, d2$h)
 })
@@ -206,7 +206,7 @@ test_that("read POSIXct in MILLIS", {
   # This file has UTC = FALSE, so the exact result depends on the current
   # time zone. But it should match Arrow.
   pf <- test_path("data/timestamp-ms.parquet")
-  d1 <- read_parquet(pf)
+  d1 <- read_parquet2(pf)
   d2 <- arrow::read_parquet(pf)
   expect_equal(
     as.data.frame(d1),
@@ -224,7 +224,7 @@ test_that("read difftime", {
   )
   write_parquet(d, tmp)
 
-  d2 <- read_parquet(tmp)
+  d2 <- read_parquet2(tmp)
   expect_s3_class(d2$h, "difftime")
   expect_equal(d$h, d2$h)
 
@@ -233,7 +233,7 @@ test_that("read difftime", {
     h = as.difftime(10, units = "mins")
   )
   write_parquet(d, tmp)
-  d2 <- read_parquet(tmp)
+  d2 <- read_parquet2(tmp)
   expect_snapshot({
     as.data.frame(d2)
   })
@@ -283,22 +283,22 @@ test_that("RLE BOOLEAN", {
 test_that("read GZIP compressed files", {
   pf <- test_path("data/gzip.parquet")
   expect_snapshot({
-    as.data.frame(read_parquet(pf))
+    as.data.frame(read_parquet2(pf))
   })
 })
 
 test_that("V2 data pages", {
   pf <- test_path("data/parquet_go.parquet")
   expect_snapshot({
-    as.data.frame(read_parquet(pf))
+    as.data.frame(read_parquet2(pf))
   })
 })
 
 test_that("V2 data page with missing values", {
   skip_on_cran()
   pf <- test_path("data/duckdb-bug1589.parquet")
   expect_equal(
-    as.data.frame(read_parquet(pf)),
+    as.data.frame(read_parquet2(pf)),
     as.data.frame(arrow::read_parquet(pf))
   )
 })
@@ -316,7 +316,7 @@ test_that("zstd", {
   pf <- test_path("data/zstd.parquet")
   expect_true(all(read_parquet_metadata(pf)$column_chunks$codec == "ZSTD"))
   pf2 <- test_path("data/gzip.parquet")
-  expect_equal(read_parquet(pf), read_parquet(pf2))
+  expect_equal(read_parquet2(pf), read_parquet2(pf2))
 })
 
 test_that("zstd with data page v2", {