Skip to content

Commit

Permalink
Support writing list(raw()) to FIXED_LEN_BYTE_ARRAY
Browse files Browse the repository at this point in the history
  • Loading branch information
gaborcsardi committed Jul 23, 2024
1 parent f2077ee commit ab6ce75
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 21 deletions.
3 changes: 2 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@
- `double` to `INT(*, *)`,
- `character` to `UUID`,
- `double` to `FLOAT16`,
- `list` of `raw` vectors to `BYTE_ARRAY`.
- `list` of `raw` vectors to `BYTE_ARRAY`,
- `list` of `raw` vectors to `FIXED_LEN_BYTE_ARRAY`.

* `write_parquet(file = ":raw:")` now works correctly for larger data
frames (#77).
Expand Down
3 changes: 2 additions & 1 deletion man/nanoparquet-types.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

56 changes: 41 additions & 15 deletions src/write.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -913,23 +913,49 @@ void RParquetOutFile::write_fixed_len_byte_array(
}

} else {
if (TYPEOF(col) != STRSXP) {
Rf_error(
"Cannot write %s as a Parquet FIXED_LEN_BYTE_ARRAY type.",
type_names[TYPEOF(col)]
);
switch (TYPEOF(col)) {
case STRSXP: {
// Write every non-NA string in [from, until) as one fixed-width value.
// NA strings produce no bytes in this loop.
for (uint64_t i = from; i < until; i++) {
  SEXP s = STRING_ELT(col, i);
  if (s == NA_STRING) {
    continue;
  }
  const char *c = CHAR(s);
  // The width is checked in bytes (strlen), not in characters.
  uint32_t len1 = strlen(c);
  if (len1 != type_length) {
    Rf_error("Invalid string length: %d, expecting %d for "
             "FIXED_LEN_BYTE_ARRAY",
             len1, type_length);
  }
  file.write(c, type_length);
}
break;
}
for (uint64_t i = from; i < until; i++) {
SEXP s = STRING_ELT(col, i);
if (s == NA_STRING) continue;
const char *c = CHAR(s);
uint32_t len1 = strlen(c);
if (len1 != type_length) {
Rf_error(
"Invalid string length: %d, expenting %d for FIXED_LEN_TYPE_ARRAY",
len1, type_length);
case VECSXP: {
// Write every non-NULL element of the list column in [from, until) as one
// fixed-width value. NULL elements produce no bytes in this loop.
for (uint64_t i = from; i < until; i++) {
  SEXP el = VECTOR_ELT(col, i);
  if (Rf_isNull(el)) {
    continue;
  }
  // Only raw vectors are accepted as list elements.
  if (TYPEOF(el) != RAWSXP) {
    Rf_error(
      "Cannot write %s as a Parquet FIXED_LEN_BYTE_ARRAY element when "
      "writing a list column of RAW vectors.",
      type_names[TYPEOF(el)]);
  }
  // Compare at full R_xlen_t width: narrowing to uint32_t first would let a
  // long raw vector whose length wraps around to type_length pass the check.
  R_xlen_t len1 = Rf_xlength(el);
  if (len1 != (R_xlen_t) type_length) {
    // R_xlen_t has no portable printf conversion; print it as a double.
    Rf_error("Invalid raw vector length: %.0f, expecting %d for "
             "FIXED_LEN_BYTE_ARRAY",
             (double) len1, type_length);
  }
  // len1 == type_length here, so exactly type_length bytes are written.
  file.write((const char *)RAW(el), type_length);
}
file.write(c, type_length);
break;
}
default:
Rf_error("Cannot write %s as a Parquet FIXED_LEN_BYTE_ARRAY type.",
type_names[TYPEOF(col)]);
}
}
}
Expand Down
40 changes: 40 additions & 0 deletions tests/testthat/_snaps/write-parquet-3.md
Original file line number Diff line number Diff line change
Expand Up @@ -1774,3 +1774,43 @@
4 66, 6f, 6f, 62, 61, 72
5 NULL

# list of RAW to FIXED_LEN_BYTE_ARRAY

Code
as.data.frame(read_parquet_schema(tmp)[, -1])
Output
name r_type type type_length repetition_type converted_type
1 schema <NA> <NA> NA <NA> <NA>
2 d raw FIXED_LEN_BYTE_ARRAY 3 REQUIRED <NA>
logical_type num_children scale precision field_id
1 1 NA NA NA
2 NA NA NA NA
Code
as.data.frame(read_parquet(tmp))
Output
d
1 66, 6f, 6f
2 62, 61, 72
3 61, 61, 61

---

Code
as.data.frame(read_parquet_schema(tmp)[, -1])
Output
name r_type type type_length repetition_type converted_type
1 schema <NA> <NA> NA <NA> <NA>
2 d raw FIXED_LEN_BYTE_ARRAY 3 OPTIONAL <NA>
logical_type num_children scale precision field_id
1 1 NA NA NA
2 NA NA NA NA
Code
as.data.frame(read_parquet(tmp))
Output
d
1 66, 6f, 6f
2 NULL
3 62, 61, 72
4 61, 61, 61
5 NULL

18 changes: 15 additions & 3 deletions tests/testthat/test-write-parquet-3.R
Original file line number Diff line number Diff line change
Expand Up @@ -673,15 +673,27 @@ test_that("list of RAW to BYTE_ARRAY", {
})

test_that("list of RAW to FIXED_LEN_BYTE_ARRAY", {
skip("soon")
tmp <- tempfile(fileext = ".parquet")
on.exit(unlink(tmp), add = TRUE)
schema <- parquet_schema("FIXED_LEN_BYTE_ARRAY")
schema <- parquet_schema(list("FIXED_LEN_BYTE_ARRAY", type_length = 3))

d <- data.frame(d = I(list(
charToRaw("foo"),
charToRaw("bar"),
charToRaw("foobar")
charToRaw("aaa")
)))
write_parquet(d, tmp, schema = schema)
expect_snapshot({
as.data.frame(read_parquet_schema(tmp)[, -1])
as.data.frame(read_parquet(tmp))
})

d <- data.frame(d = I(list(
charToRaw("foo"),
NULL,
charToRaw("bar"),
charToRaw("aaa"),
NULL
)))
write_parquet(d, tmp, schema = schema)
expect_snapshot({
Expand Down
3 changes: 2 additions & 1 deletion tools/types.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ non-default mappings are:
- `double` to `INT(*, *)`,
- `character` to `UUID`,
- `double` to `FLOAT16`,
- `list` of `raw` vectors to `BYTE_ARRAY`.
- `list` of `raw` vectors to `BYTE_ARRAY`,
- `list` of `raw` vectors to `FIXED_LEN_BYTE_ARRAY`.

# Parquet's data types

Expand Down

0 comments on commit ab6ce75

Please sign in to comment.