From e1b8dc954c95189be1c95e2f695d184ef1a51a0d Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 12 Dec 2024 23:58:46 +0200 Subject: [PATCH] support null in mapped value --- .../generic_bytes_dictionary_builder.rs | 60 +++++++++++++++-- .../builder/primitive_dictionary_builder.rs | 65 +++++++++++++++++-- 2 files changed, 116 insertions(+), 9 deletions(-) diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 1a86e0405231..8d90339642ad 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -330,8 +330,10 @@ where // Orphan values will be carried over to the new dictionary let mapped_values = values.iter() - // Dictionary values should not be null as the keys are null if the value is null - .map(|dict_value| self.get_or_insert_key(dict_value.expect("Dictionary value should not be null"))) + // Dictionary values can technically be null, so we need to handle that + .map(|dict_value| dict_value + .map(|dict_value| self.get_or_insert_key(dict_value)) + .transpose()) .collect::, _>>()?; // Just insert the keys without additional lookups @@ -343,7 +345,10 @@ where None => self.append_null(), Some(original_dict_index) => { let index = original_dict_index.as_usize().min(v_len - 1); - self.keys_builder.append_value(mapped_values[index]); + match mapped_values[index] { + None => self.append_null(), + Some(mapped_value) => self.keys_builder.append_value(mapped_value), + } } } }); @@ -491,8 +496,9 @@ mod tests { use super::*; use crate::array::Int8Array; + use crate::cast::AsArray; use crate::types::{Int16Type, Int32Type, Int8Type, Utf8Type}; - use crate::{BinaryArray, StringArray}; + use crate::{ArrowPrimitiveType, BinaryArray, StringArray}; fn test_bytes_dictionary_builder(values: Vec<&T::Native>) where @@ -740,6 +746,52 @@ mod tests { [Some("e"), Some("e"), Some("f"), Some("e"), Some("d"), Some("a"), Some("b"), Some("c"), Some("a"), Some("b"), Some("c"), None, Some("c"), Some("d"), Some("a"), None] ); } + #[test] + fn test_extend_dictionary_with_null_in_mapped_value() { + let some_dict = { + let mut values_builder = GenericByteBuilder::::new(); + let mut keys_builder = PrimitiveBuilder::::new(); + + // Manually build a dictionary values that the mapped values have null + values_builder.append_null(); + keys_builder.append_value(0); + values_builder.append_value("I like worm hugs"); + keys_builder.append_value(1); + + let values = values_builder.finish(); + let keys = keys_builder.finish(); + + let data_type = DataType::Dictionary(Box::new(Int32Type::DATA_TYPE), Box::new(Utf8Type::DATA_TYPE)); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + }; + + let some_dict_values = some_dict.values().as_string::(); + assert_eq!(some_dict_values.into_iter().collect::>(), &[None, Some("I like worm hugs")]); + + let mut builder = GenericByteDictionaryBuilder::::new(); + builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 1); + + let values = dict + .downcast_dict::>() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!( + values, + [None, Some("I like worm hugs")] + ); + } #[test] fn test_extend_all_null_dictionary() { diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index 1f1701aa8cd6..ac9fa478bfe7 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -305,7 +305,9 @@ where /// Extends builder with dictionary /// - /// This is the same as `extends` but avoid lookup for each item in the iterator + /// This is the same as `extends` but avoid lookup for each item in the iterator + /// + /// when dictionary values are null (the actual mapped values) the keys are null /// pub fn extend_dictionary(&mut self, dictionary: &TypedDictionaryArray>) -> Result<(), ArrowError> { let values = dictionary.values(); @@ -328,8 +330,10 @@ where // Orphan values will be carried over to the new dictionary let mapped_values = values.iter() - // Dictionary values should not be null as the keys are null if the value is null - .map(|dict_value| self.get_or_insert_key(dict_value.expect("Dictionary value should not be null"))) + // Dictionary values can technically be null, so we need to handle that + .map(|dict_value| dict_value + .map(|dict_value| self.get_or_insert_key(dict_value)) + .transpose()) .collect::, _>>()?; // Just insert the keys without additional lookups @@ -341,7 +345,10 @@ where None => self.append_null(), Some(original_dict_index) => { let index = original_dict_index.as_usize().min(v_len - 1); - self.keys_builder.append_value(mapped_values[index]); + match mapped_values[index] { + None => self.append_null(), + Some(mapped_value) => self.keys_builder.append_value(mapped_value), + } } } }); @@ -416,6 +423,7 @@ mod tests { use crate::array::{UInt32Array, UInt8Array, Int32Array}; use crate::builder::Decimal128Builder; + use crate::cast::AsArray; use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type}; #[test] @@ -489,7 +497,6 @@ mod tests { ); } - #[test] fn test_extend_dictionary() { let some_dict = { @@ -520,6 +527,54 @@ mod tests { ); } + #[test] + fn test_extend_dictionary_with_null_in_mapped_value() { + let some_dict = { + let mut values_builder = PrimitiveBuilder::::new(); + let mut keys_builder = PrimitiveBuilder::::new(); + + // Manually build a dictionary values that the mapped values have null + values_builder.append_null(); + keys_builder.append_value(0); + values_builder.append_value(42); + keys_builder.append_value(1); + + let values = values_builder.finish(); + let keys = keys_builder.finish(); + + let data_type = + DataType::Dictionary(Box::new(Int32Type::DATA_TYPE), Box::new(values.data_type().clone())); + + let builder = keys + .into_data() + .into_builder() + .data_type(data_type) + .child_data(vec![values.into_data()]); + + DictionaryArray::from(unsafe { builder.build_unchecked() }) + }; + + let some_dict_values = some_dict.values().as_primitive::(); + assert_eq!(some_dict_values.into_iter().collect::>(), &[None, Some(42)]); + + let mut builder = PrimitiveDictionaryBuilder::::new(); + builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + let dict = builder.finish(); + + assert_eq!(dict.values().len(), 1); + + let values = dict + .downcast_dict::() + .unwrap() + .into_iter() + .collect::>(); + + assert_eq!( + values, + [None, Some(42)] + ); + } + #[test] fn test_extend_all_null_dictionary() { let some_dict = {