Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for deserializing list-encoded JSON structs [#6558] #6643

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 96 additions & 0 deletions arrow-json/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,34 @@ pub use self::writer::{ArrayWriter, LineDelimitedWriter, Writer, WriterBuilder};
use half::f16;
use serde_json::{Number, Value};

/// Specifies what is considered valid JSON when reading or writing
/// RecordBatches or StructArrays.
///
/// This enum controls which form(s) the Reader will accept and which form the
/// Writer will produce. For example, if the RecordBatch Schema is
/// `[("a", Int32), ("r", Struct([("b", Boolean), ("c", Utf8)]))]`
/// then a Reader with [`StructMode::ObjectOnly`] would read rows of the form
/// `{"a": 1, "r": {"b": true, "c": "cat"}}` while with ['StructMode::ListOnly']
/// would read rows of the form `[1, [true, "cat"]]`. A Writer would produce
/// rows formatted similarly.
///
/// The list encoding is more compact if the schema is known, and is used by
/// tools such as
/// [Presto](https://prestodb.io/docs/current/develop/client-protocol.html#important-queryresults-attributes)
/// and [Trino](https://trino.io/docs/current/develop/client-protocol.html#important-queryresults-attributes).
///
/// When reading objects, the order of the key does not matter. When reading
/// lists, the entries must be the same number and in the same order as the
/// struct fields. Map columns are not affected by this option.
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
pub enum StructMode {
#[default]
/// Encode/decode structs as objects (e.g., {"a": 1, "b": "c"})
ObjectOnly,
/// Encode/decode structs as lists (e.g., [1, "c"])
ListOnly,
}

/// Trait declaring any type that is serializable to JSON. This includes all primitive types (bool, i32, etc.).
pub trait JsonSerializable: 'static {
/// Converts self into json value if its possible
Expand Down Expand Up @@ -156,4 +184,72 @@ mod tests {
);
assert_eq!(None, f32::NAN.into_json_value());
}

#[test]
fn test_json_roundtrip_structs() {
use crate::writer::LineDelimited;
use arrow_schema::DataType;
use arrow_schema::Field;
use arrow_schema::Fields;
use arrow_schema::Schema;
use std::sync::Arc;

let schema = Arc::new(Schema::new(vec![
Field::new(
"c1",
DataType::Struct(Fields::from(vec![
Field::new("c11", DataType::Int32, true),
Field::new(
"c12",
DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)].into()),
false,
),
])),
false,
),
Field::new("c2", DataType::Utf8, false),
]));

{
let object_input = r#"{"c1":{"c11":1,"c12":{"c121":"e"}},"c2":"a"}
{"c1":{"c12":{"c121":"f"}},"c2":"b"}
{"c1":{"c11":5,"c12":{"c121":"g"}},"c2":"c"}
"#
.as_bytes();
let object_reader = ReaderBuilder::new(schema.clone())
.with_struct_mode(StructMode::ObjectOnly)
.build(object_input)
.unwrap();

let mut object_output: Vec<u8> = Vec::new();
let mut object_writer = WriterBuilder::new()
.with_struct_mode(StructMode::ObjectOnly)
.build::<_, LineDelimited>(&mut object_output);
for batch_res in object_reader {
object_writer.write(&batch_res.unwrap()).unwrap();
}
assert_eq!(object_input, &object_output);
}

{
let list_input = r#"[[1,["e"]],"a"]
[[null,["f"]],"b"]
[[5,["g"]],"c"]
"#
.as_bytes();
let list_reader = ReaderBuilder::new(schema.clone())
.with_struct_mode(StructMode::ListOnly)
.build(list_input)
.unwrap();

let mut list_output: Vec<u8> = Vec::new();
let mut list_writer = WriterBuilder::new()
.with_struct_mode(StructMode::ListOnly)
.build::<_, LineDelimited>(&mut list_output);
for batch_res in list_reader {
list_writer.write(&batch_res.unwrap()).unwrap();
}
assert_eq!(list_input, &list_output);
}
}
}
3 changes: 3 additions & 0 deletions arrow-json/src/reader/list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

use crate::reader::tape::{Tape, TapeElement};
use crate::reader::{make_decoder, ArrayDecoder};
use crate::StructMode;
use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
use arrow_array::OffsetSizeTrait;
use arrow_buffer::buffer::NullBuffer;
Expand All @@ -37,6 +38,7 @@ impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
coerce_primitive: bool,
strict_mode: bool,
is_nullable: bool,
struct_mode: StructMode,
) -> Result<Self, ArrowError> {
let field = match &data_type {
DataType::List(f) if !O::IS_LARGE => f,
Expand All @@ -48,6 +50,7 @@ impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
coerce_primitive,
strict_mode,
field.is_nullable(),
struct_mode,
)?;

Ok(Self {
Expand Down
4 changes: 4 additions & 0 deletions arrow-json/src/reader/map_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

use crate::reader::tape::{Tape, TapeElement};
use crate::reader::{make_decoder, ArrayDecoder};
use crate::StructMode;
use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
use arrow_buffer::buffer::NullBuffer;
use arrow_buffer::ArrowNativeType;
Expand All @@ -36,6 +37,7 @@ impl MapArrayDecoder {
coerce_primitive: bool,
strict_mode: bool,
is_nullable: bool,
struct_mode: StructMode,
) -> Result<Self, ArrowError> {
let fields = match &data_type {
DataType::Map(_, true) => {
Expand All @@ -59,12 +61,14 @@ impl MapArrayDecoder {
coerce_primitive,
strict_mode,
fields[0].is_nullable(),
struct_mode,
)?;
let values = make_decoder(
fields[1].data_type().clone(),
coerce_primitive,
strict_mode,
fields[1].is_nullable(),
struct_mode,
)?;

Ok(Self {
Expand Down
Loading
Loading