From 045b7f68d9158626ef4eee3441413c1882c6d301 Mon Sep 17 00:00:00 2001 From: Nikhil Reddy Ramolla <12396684+nikhilramolla@users.noreply.github.com> Date: Thu, 25 Jul 2024 18:04:48 +0530 Subject: [PATCH] feat: add `ignore_nulls` option to `struct.json_encode` --- crates/polars-json/src/json/write/mod.rs | 2 +- .../polars-json/src/json/write/serialize.rs | 24 ++++++++++----- crates/polars-json/src/json/write/utf8.rs | 4 +-- crates/polars-json/src/ndjson/write.rs | 2 +- .../src/dsl/function_expr/struct_.rs | 14 +++++---- crates/polars-plan/src/dsl/struct_.rs | 6 ++-- py-polars/polars/expr/struct.py | 29 +++++++++++++++++-- py-polars/src/expr/struct.rs | 8 +++-- 8 files changed, 66 insertions(+), 23 deletions(-) diff --git a/crates/polars-json/src/json/write/mod.rs b/crates/polars-json/src/json/write/mod.rs index 4d80f603aceb..c596defdc681 100644 --- a/crates/polars-json/src/json/write/mod.rs +++ b/crates/polars-json/src/json/write/mod.rs @@ -88,7 +88,7 @@ impl<'a> RecordSerializer<'a> { let iterators = chunk .arrays() .iter() - .map(|arr| new_serializer(arr.as_ref(), 0, usize::MAX)) + .map(|arr| new_serializer(arr.as_ref(), false, 0, usize::MAX)) .collect(); Self { diff --git a/crates/polars-json/src/json/write/serialize.rs b/crates/polars-json/src/json/write/serialize.rs index 872e13970814..3cc105a3965a 100644 --- a/crates/polars-json/src/json/write/serialize.rs +++ b/crates/polars-json/src/json/write/serialize.rs @@ -181,6 +181,7 @@ fn utf8view_serializer<'a>( fn struct_serializer<'a>( array: &'a StructArray, + ignore_nulls: bool, offset: usize, take: usize, ) -> Box + 'a + Send + Sync> { @@ -195,7 +196,7 @@ fn struct_serializer<'a>( .values() .iter() .map(|x| x.as_ref()) - .map(|arr| new_serializer(arr, offset, take)) + .map(|arr| new_serializer(arr, ignore_nulls, offset, take)) .collect::>(); Box::new(BufStreamingIterator::new( @@ -211,6 +212,7 @@ fn struct_serializer<'a>( .map(|serializer| serializer.next().unwrap()), ), true, + ignore_nulls, ); } else { serializers.iter_mut().for_each(|iter| { @@ -237,7 +239,7 @@ fn list_serializer<'a, O: Offset>( let offsets = array.offsets().as_slice(); let start = offsets[0].to_usize(); let end = offsets.last().unwrap().to_usize(); - let mut serializer = new_serializer(array.values().as_ref(), start, end - start); + let mut serializer = new_serializer(array.values().as_ref(), false, start, end - start); let f = move |offset: Option<&[O]>, buf: &mut Vec| { if let Some(offset) = offset { @@ -267,7 +269,7 @@ fn fixed_size_list_serializer<'a>( offset: usize, take: usize, ) -> Box + 'a + Send + Sync> { - let mut serializer = new_serializer(array.values().as_ref(), offset, take); + let mut serializer = new_serializer(array.values().as_ref(), false, offset, take); Box::new(BufStreamingIterator::new( ZipValidity::new(0..array.len(), array.validity().map(|x| x.iter())), @@ -403,6 +405,7 @@ fn timestamp_tz_serializer<'a>( pub(crate) fn new_serializer<'a>( array: &'a dyn Array, + ignore_nulls: bool, offset: usize, take: usize, ) -> Box + 'a + Send + Sync> { @@ -450,9 +453,12 @@ pub(crate) fn new_serializer<'a>( ArrowDataType::Utf8View => { utf8view_serializer(array.as_any().downcast_ref().unwrap(), offset, take) }, - ArrowDataType::Struct(_) => { - struct_serializer(array.as_any().downcast_ref().unwrap(), offset, take) - }, + ArrowDataType::Struct(_) => struct_serializer( + array.as_any().downcast_ref().unwrap(), + ignore_nulls, + offset, + take, + ), ArrowDataType::FixedSizeList(_, _) => { fixed_size_list_serializer(array.as_any().downcast_ref().unwrap(), offset, take) }, @@ -522,6 +528,7 @@ fn serialize_item<'a>( buffer: &mut Vec, record: impl Iterator, is_first_row: bool, + ignore_nulls: bool, ) { if !is_first_row { buffer.push(b','); @@ -529,6 +536,9 @@ fn serialize_item<'a>( buffer.push(b'{'); let mut first_item = true; for (key, value) in record { + if ignore_nulls && value == b"null" { + continue; + } if !first_item { buffer.push(b','); } @@ -544,7 +554,7 @@ fn serialize_item<'a>( /// # Implementation /// This operation is CPU-bounded pub(crate) fn serialize(array: &dyn Array, buffer: &mut Vec) { - let mut serializer = new_serializer(array, 0, usize::MAX); + let mut serializer = new_serializer(array, false, 0, usize::MAX); (0..array.len()).for_each(|i| { if i != 0 { diff --git a/crates/polars-json/src/json/write/utf8.rs b/crates/polars-json/src/json/write/utf8.rs index f967853bc1e1..1952d1760055 100644 --- a/crates/polars-json/src/json/write/utf8.rs +++ b/crates/polars-json/src/json/write/utf8.rs @@ -141,9 +141,9 @@ where writer.write_all(s) } -pub fn serialize_to_utf8(array: &dyn Array) -> Utf8ViewArray { +pub fn serialize_to_utf8(array: &dyn Array, ignore_nulls: bool) -> Utf8ViewArray { let mut values = MutableBinaryViewArray::with_capacity(array.len()); - let mut serializer = new_serializer(array, 0, usize::MAX); + let mut serializer = new_serializer(array, ignore_nulls, 0, usize::MAX); while let Some(v) = serializer.next() { unsafe { values.push_value(std::str::from_utf8_unchecked(v)) } diff --git a/crates/polars-json/src/ndjson/write.rs b/crates/polars-json/src/ndjson/write.rs index 90f202b02360..8516f4295612 100644 --- a/crates/polars-json/src/ndjson/write.rs +++ b/crates/polars-json/src/ndjson/write.rs @@ -8,7 +8,7 @@ use polars_error::{PolarsError, PolarsResult}; use super::super::json::write::new_serializer; fn serialize(array: &dyn Array, buffer: &mut Vec) { - let mut serializer = new_serializer(array, 0, usize::MAX); + let mut serializer = new_serializer(array, false, 0, usize::MAX); (0..array.len()).for_each(|_| { buffer.extend_from_slice(serializer.next().unwrap()); buffer.push(b'\n'); diff --git a/crates/polars-plan/src/dsl/function_expr/struct_.rs b/crates/polars-plan/src/dsl/function_expr/struct_.rs index 2d2fa772d160..cf637ce471f3 100644 --- a/crates/polars-plan/src/dsl/function_expr/struct_.rs +++ b/crates/polars-plan/src/dsl/function_expr/struct_.rs @@ -12,7 +12,9 @@ pub enum StructFunction { PrefixFields(Arc), SuffixFields(Arc), #[cfg(feature = "json")] - JsonEncode, + JsonEncode { + ignore_nulls: bool, + }, WithFields, MultipleFields(Arc<[ColumnName]>), } @@ -91,7 +93,7 @@ impl StructFunction { _ => polars_bail!(op = "suffix_fields", got = dt, expected = "Struct"), }), #[cfg(feature = "json")] - JsonEncode => mapper.with_dtype(DataType::String), + JsonEncode { .. } => mapper.with_dtype(DataType::String), WithFields => { let args = mapper.args(); let struct_ = &args[0]; @@ -134,7 +136,7 @@ impl Display for StructFunction { PrefixFields(_) => write!(f, "name.prefix_fields"), SuffixFields(_) => write!(f, "name.suffixFields"), #[cfg(feature = "json")] - JsonEncode => write!(f, "struct.to_json"), + JsonEncode { .. } => write!(f, "struct.to_json"), WithFields => write!(f, "with_fields"), MultipleFields(_) => write!(f, "multiple_fields"), } @@ -151,7 +153,7 @@ impl From for SpecialEq> { PrefixFields(prefix) => map!(prefix_fields, prefix.clone()), SuffixFields(suffix) => map!(suffix_fields, suffix.clone()), #[cfg(feature = "json")] - JsonEncode => map!(to_json), + JsonEncode { ignore_nulls } => map!(to_json, ignore_nulls), WithFields => map_as_slice!(with_fields), MultipleFields(_) => unimplemented!(), } @@ -215,13 +217,13 @@ pub(super) fn suffix_fields(s: &Series, suffix: Arc) -> PolarsResult PolarsResult { +pub(super) fn to_json(s: &Series, ignore_nulls: bool) -> PolarsResult { let ca = s.struct_()?; let dtype = ca.dtype().to_arrow(CompatLevel::newest()); let iter = ca.chunks().iter().map(|arr| { let arr = arrow::compute::cast::cast_unchecked(arr.as_ref(), &dtype).unwrap(); - polars_json::json::write::serialize_to_utf8(arr.as_ref()) + polars_json::json::write::serialize_to_utf8(arr.as_ref(), ignore_nulls) }); Ok(StringChunked::from_chunk_iter(ca.name(), iter).into_series()) diff --git a/crates/polars-plan/src/dsl/struct_.rs b/crates/polars-plan/src/dsl/struct_.rs index b5a1afafa698..be9a7071a1f6 100644 --- a/crates/polars-plan/src/dsl/struct_.rs +++ b/crates/polars-plan/src/dsl/struct_.rs @@ -63,9 +63,11 @@ impl StructNameSpace { } #[cfg(feature = "json")] - pub fn json_encode(self) -> Expr { + pub fn json_encode(self, ignore_nulls: bool) -> Expr { self.0 - .map_private(FunctionExpr::StructExpr(StructFunction::JsonEncode)) + .map_private(FunctionExpr::StructExpr(StructFunction::JsonEncode { + ignore_nulls, + })) } pub fn with_fields(self, fields: Vec) -> PolarsResult { diff --git a/py-polars/polars/expr/struct.py b/py-polars/polars/expr/struct.py index 225f82b67199..6b56da3ff5c0 100644 --- a/py-polars/polars/expr/struct.py +++ b/py-polars/polars/expr/struct.py @@ -214,10 +214,21 @@ def rename_fields(self, names: Sequence[str]) -> Expr: """ return wrap_expr(self._pyexpr.struct_rename_fields(names)) - def json_encode(self) -> Expr: + def json_encode(self, *, ignore_nulls: bool = False) -> Expr: """ Convert this struct to a string column with json values. + Parameters + ---------- + ignore_nulls + Ignore missing values in the struct when serializing. + + - When `ignore_nulls=False`, the values in the struct are included even + if they are null + + - When `ignore_nulls=True`, the values in the struct are skipped if they + are null + Examples -------- >>> pl.DataFrame( @@ -232,8 +243,22 @@ def json_encode(self) -> Expr: │ {[1, 2],[45]} ┆ {"a":[1,2],"b":[45]} │ │ {[9, 1, 3],null} ┆ {"a":[9,1,3],"b":null} │ └──────────────────┴────────────────────────┘ + >>> pl.DataFrame( + ... {"a": [{"a": [1, 2], "b": [45]}, {"a": [9, 1, 3], "b": None}]} + ... ).with_columns( + ... pl.col("a").struct.json_encode(ignore_nulls=True).alias("encoded") + ... ) + shape: (2, 2) + ┌──────────────────┬────────────────────────┐ + │ a ┆ encoded │ + │ --- ┆ --- │ + │ struct[2] ┆ str │ + ╞══════════════════╪════════════════════════╡ + │ {[1, 2],[45]} ┆ {"a":[1,2],"b":[45]} │ + │ {[9, 1, 3],null} ┆ {"a":[9,1,3]} │ + └──────────────────┴────────────────────────┘ """ - return wrap_expr(self._pyexpr.struct_json_encode()) + return wrap_expr(self._pyexpr.struct_json_encode(ignore_nulls)) def with_fields( self, diff --git a/py-polars/src/expr/struct.rs b/py-polars/src/expr/struct.rs index 0188c43b01bd..21c6dacb0849 100644 --- a/py-polars/src/expr/struct.rs +++ b/py-polars/src/expr/struct.rs @@ -23,8 +23,12 @@ impl PyExpr { } #[cfg(feature = "json")] - fn struct_json_encode(&self) -> Self { - self.inner.clone().struct_().json_encode().into() + fn struct_json_encode(&self, ignore_nulls: bool) -> Self { + self.inner + .clone() + .struct_() + .json_encode(ignore_nulls) + .into() } fn struct_with_fields(&self, fields: Vec) -> PyResult {