diff --git a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs index 6bc4c0a48d35..54034c847c7c 100644 --- a/arrow-array/src/builder/generic_bytes_dictionary_builder.rs +++ b/arrow-array/src/builder/generic_bytes_dictionary_builder.rs @@ -311,9 +311,12 @@ where /// /// when dictionary values are null (the actual mapped values) the keys are null /// - pub fn extend_dictionary(&mut self, dictionary: &TypedDictionaryArray>) -> Result<(), ArrowError> { + pub fn extend_dictionary( + &mut self, + dictionary: &TypedDictionaryArray>, + ) -> Result<(), ArrowError> { let values = dictionary.values(); - + let v_len = values.len(); let k_len = dictionary.keys().len(); if v_len == 0 && k_len == 0 { @@ -327,33 +330,33 @@ where } if k_len == 0 { - return Err(ArrowError::InvalidArgumentError("Dictionary keys should not be empty when values are not empty".to_string())); + return Err(ArrowError::InvalidArgumentError( + "Dictionary keys should not be empty when values are not empty".to_string(), + )); } // Orphan values will be carried over to the new dictionary - let mapped_values = values.iter() + let mapped_values = values + .iter() // Dictionary values can technically be null, so we need to handle that - .map(|dict_value| dict_value - .map(|dict_value| self.get_or_insert_key(dict_value)) - .transpose()) + .map(|dict_value| { + dict_value + .map(|dict_value| self.get_or_insert_key(dict_value)) + .transpose() + }) .collect::, _>>()?; // Just insert the keys without additional lookups - dictionary - .keys() - .iter() - .for_each(|key| { - match key { + dictionary.keys().iter().for_each(|key| match key { + None => self.append_null(), + Some(original_dict_index) => { + let index = original_dict_index.as_usize().min(v_len - 1); + match mapped_values[index] { None => self.append_null(), - Some(original_dict_index) => { - let index = original_dict_index.as_usize().min(v_len - 1); - match mapped_values[index] { - None => self.append_null(), - Some(mapped_value) => self.keys_builder.append_value(mapped_value), - } - } + Some(mapped_value) => self.keys_builder.append_value(mapped_value), } - }); + } + }); Ok(()) } @@ -732,20 +735,39 @@ mod tests { let mut builder = GenericByteDictionaryBuilder::::new(); builder.extend(["e", "e", "f", "e", "d"].into_iter().map(Some)); - builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); let dict = builder.finish(); - + assert_eq!(dict.values().len(), 6); - + let values = dict .downcast_dict::>() .unwrap() .into_iter() .collect::>(); - + assert_eq!( values, - [Some("e"), Some("e"), Some("f"), Some("e"), Some("d"), Some("a"), Some("b"), Some("c"), Some("a"), Some("b"), Some("c"), None, Some("c"), Some("d"), Some("a"), None] + [ + Some("e"), + Some("e"), + Some("f"), + Some("e"), + Some("d"), + Some("a"), + Some("b"), + Some("c"), + Some("a"), + Some("b"), + Some("c"), + None, + Some("c"), + Some("d"), + Some("a"), + None + ] ); } #[test] @@ -763,7 +785,10 @@ mod tests { let values = values_builder.finish(); let keys = keys_builder.finish(); - let data_type = DataType::Dictionary(Box::new(Int32Type::DATA_TYPE), Box::new(Utf8Type::DATA_TYPE)); + let data_type = DataType::Dictionary( + Box::new(Int32Type::DATA_TYPE), + Box::new(Utf8Type::DATA_TYPE), + ); let builder = keys .into_data() @@ -775,10 +800,15 @@ mod tests { }; let some_dict_values = some_dict.values().as_string::(); - assert_eq!(some_dict_values.into_iter().collect::>(), &[None, Some("I like worm hugs")]); + assert_eq!( + some_dict_values.into_iter().collect::>(), + &[None, Some("I like worm hugs")] + ); let mut builder = GenericByteDictionaryBuilder::::new(); - builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); let dict = builder.finish(); assert_eq!(dict.values().len(), 1); @@ -789,10 +819,7 @@ mod tests { .into_iter() .collect::>(); - assert_eq!( - values, - [None, Some("I like worm hugs")] - ); + assert_eq!(values, [None, Some("I like worm hugs")]); } #[test] @@ -804,20 +831,19 @@ mod tests { }; let mut builder = GenericByteDictionaryBuilder::::new(); - builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); let dict = builder.finish(); - + assert_eq!(dict.values().len(), 0); - + let values = dict .downcast_dict::>() .unwrap() .into_iter() .collect::>(); - - assert_eq!( - values, - [None, None] - ); + + assert_eq!(values, [None, None]); } } diff --git a/arrow-array/src/builder/primitive_dictionary_builder.rs b/arrow-array/src/builder/primitive_dictionary_builder.rs index ac9fa478bfe7..90f6fcfdc866 100644 --- a/arrow-array/src/builder/primitive_dictionary_builder.rs +++ b/arrow-array/src/builder/primitive_dictionary_builder.rs @@ -17,7 +17,9 @@ use crate::builder::{ArrayBuilder, PrimitiveBuilder}; use crate::types::ArrowDictionaryKeyType; -use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, PrimitiveArray, TypedDictionaryArray}; +use crate::{ + Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, PrimitiveArray, TypedDictionaryArray, +}; use arrow_buffer::{ArrowNativeType, ToByteSlice}; use arrow_schema::{ArrowError, DataType}; use std::any::Any; @@ -309,7 +311,10 @@ where /// /// when dictionary values are null (the actual mapped values) the keys are null /// - pub fn extend_dictionary(&mut self, dictionary: &TypedDictionaryArray>) -> Result<(), ArrowError> { + pub fn extend_dictionary( + &mut self, + dictionary: &TypedDictionaryArray>, + ) -> Result<(), ArrowError> { let values = dictionary.values(); let v_len = values.len(); @@ -325,33 +330,33 @@ where } if k_len == 0 { - return Err(ArrowError::InvalidArgumentError("Dictionary keys should not be empty when values are not empty".to_string())); + return Err(ArrowError::InvalidArgumentError( + "Dictionary keys should not be empty when values are not empty".to_string(), + )); } // Orphan values will be carried over to the new dictionary - let mapped_values = values.iter() + let mapped_values = values + .iter() // Dictionary values can technically be null, so we need to handle that - .map(|dict_value| dict_value - .map(|dict_value| self.get_or_insert_key(dict_value)) - .transpose()) + .map(|dict_value| { + dict_value + .map(|dict_value| self.get_or_insert_key(dict_value)) + .transpose() + }) .collect::, _>>()?; // Just insert the keys without additional lookups - dictionary - .keys() - .iter() - .for_each(|key| { - match key { + dictionary.keys().iter().for_each(|key| match key { + None => self.append_null(), + Some(original_dict_index) => { + let index = original_dict_index.as_usize().min(v_len - 1); + match mapped_values[index] { None => self.append_null(), - Some(original_dict_index) => { - let index = original_dict_index.as_usize().min(v_len - 1); - match mapped_values[index] { - None => self.append_null(), - Some(mapped_value) => self.keys_builder.append_value(mapped_value), - } - } + Some(mapped_value) => self.keys_builder.append_value(mapped_value), } - }); + } + }); Ok(()) } @@ -421,7 +426,7 @@ impl Extend> mod tests { use super::*; - use crate::array::{UInt32Array, UInt8Array, Int32Array}; + use crate::array::{Int32Array, UInt32Array, UInt8Array}; use crate::builder::Decimal128Builder; use crate::cast::AsArray; use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type}; @@ -510,7 +515,9 @@ mod tests { let mut builder = PrimitiveDictionaryBuilder::::new(); builder.extend([6, 6, 7, 6, 5].into_iter().map(Some)); - builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); let dict = builder.finish(); assert_eq!(dict.values().len(), 7); @@ -523,7 +530,29 @@ mod tests { assert_eq!( values, - [Some(6), Some(6), Some(7), Some(6), Some(5), Some(1), Some(2), Some(3), Some(1), Some(2), Some(3), Some(1), Some(2), Some(3), None, Some(4), Some(5), Some(1), Some(3), Some(1), None] + [ + Some(6), + Some(6), + Some(7), + Some(6), + Some(5), + Some(1), + Some(2), + Some(3), + Some(1), + Some(2), + Some(3), + Some(1), + Some(2), + Some(3), + None, + Some(4), + Some(5), + Some(1), + Some(3), + Some(1), + None + ] ); } @@ -542,8 +571,10 @@ mod tests { let values = values_builder.finish(); let keys = keys_builder.finish(); - let data_type = - DataType::Dictionary(Box::new(Int32Type::DATA_TYPE), Box::new(values.data_type().clone())); + let data_type = DataType::Dictionary( + Box::new(Int32Type::DATA_TYPE), + Box::new(values.data_type().clone()), + ); let builder = keys .into_data() @@ -555,10 +586,15 @@ mod tests { }; let some_dict_values = some_dict.values().as_primitive::(); - assert_eq!(some_dict_values.into_iter().collect::>(), &[None, Some(42)]); + assert_eq!( + some_dict_values.into_iter().collect::>(), + &[None, Some(42)] + ); let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); let dict = builder.finish(); assert_eq!(dict.values().len(), 1); @@ -569,10 +605,7 @@ mod tests { .into_iter() .collect::>(); - assert_eq!( - values, - [None, Some(42)] - ); + assert_eq!(values, [None, Some(42)]); } #[test] @@ -584,7 +617,9 @@ mod tests { }; let mut builder = PrimitiveDictionaryBuilder::::new(); - builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap(); + builder + .extend_dictionary(&some_dict.downcast_dict().unwrap()) + .unwrap(); let dict = builder.finish(); assert_eq!(dict.values().len(), 0); @@ -595,9 +630,6 @@ mod tests { .into_iter() .collect::>(); - assert_eq!( - values, - [None, None] - ); + assert_eq!(values, [None, None]); } }