Skip to content

Commit

Permalink
support null in mapped value
Browse files Browse the repository at this point in the history
  • Loading branch information
rluvaton committed Dec 12, 2024
1 parent 5512218 commit e1b8dc9
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 9 deletions.
60 changes: 56 additions & 4 deletions arrow-array/src/builder/generic_bytes_dictionary_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -330,8 +330,10 @@ where

// Orphan values will be carried over to the new dictionary
let mapped_values = values.iter()
// Dictionary values should not be null as the keys are null if the value is null
.map(|dict_value| self.get_or_insert_key(dict_value.expect("Dictionary value should not be null")))
// Dictionary values can technically be null, so we need to handle that
.map(|dict_value| dict_value
.map(|dict_value| self.get_or_insert_key(dict_value))
.transpose())
.collect::<Result<Vec<_>, _>>()?;

// Just insert the keys without additional lookups
Expand All @@ -343,7 +345,10 @@ where
None => self.append_null(),
Some(original_dict_index) => {
let index = original_dict_index.as_usize().min(v_len - 1);
self.keys_builder.append_value(mapped_values[index]);
match mapped_values[index] {
None => self.append_null(),
Some(mapped_value) => self.keys_builder.append_value(mapped_value),
}
}
}
});
Expand Down Expand Up @@ -491,8 +496,9 @@ mod tests {
use super::*;

use crate::array::Int8Array;
use crate::cast::AsArray;
use crate::types::{Int16Type, Int32Type, Int8Type, Utf8Type};
use crate::{BinaryArray, StringArray};
use crate::{ArrowPrimitiveType, BinaryArray, StringArray};

fn test_bytes_dictionary_builder<T>(values: Vec<&T::Native>)
where
Expand Down Expand Up @@ -740,6 +746,52 @@ mod tests {
[Some("e"), Some("e"), Some("f"), Some("e"), Some("d"), Some("a"), Some("b"), Some("c"), Some("a"), Some("b"), Some("c"), None, Some("c"), Some("d"), Some("a"), None]
);
}
#[test]
fn test_extend_dictionary_with_null_in_mapped_value() {
let some_dict = {
let mut values_builder = GenericByteBuilder::<Utf8Type>::new();
let mut keys_builder = PrimitiveBuilder::<Int32Type>::new();

// Manually build a dictionary values that the mapped values have null
values_builder.append_null();
keys_builder.append_value(0);
values_builder.append_value("I like worm hugs");
keys_builder.append_value(1);

let values = values_builder.finish();
let keys = keys_builder.finish();

let data_type = DataType::Dictionary(Box::new(Int32Type::DATA_TYPE), Box::new(Utf8Type::DATA_TYPE));

let builder = keys
.into_data()
.into_builder()
.data_type(data_type)
.child_data(vec![values.into_data()]);

DictionaryArray::from(unsafe { builder.build_unchecked() })
};

let some_dict_values = some_dict.values().as_string::<i32>();
assert_eq!(some_dict_values.into_iter().collect::<Vec<_>>(), &[None, Some("I like worm hugs")]);

let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap();
let dict = builder.finish();

assert_eq!(dict.values().len(), 1);

let values = dict
.downcast_dict::<GenericByteArray<Utf8Type>>()
.unwrap()
.into_iter()
.collect::<Vec<_>>();

assert_eq!(
values,
[None, Some("I like worm hugs")]
);
}

#[test]
fn test_extend_all_null_dictionary() {
Expand Down
65 changes: 60 additions & 5 deletions arrow-array/src/builder/primitive_dictionary_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,9 @@ where

/// Extends builder with dictionary
///
/// This is the same as `extends` but avoid lookup for each item in the iterator
/// This is the same as `extends` but avoid lookup for each item in the iterator
///
/// when dictionary values are null (the actual mapped values) the keys are null
///
pub fn extend_dictionary(&mut self, dictionary: &TypedDictionaryArray<K, PrimitiveArray<V>>) -> Result<(), ArrowError> {
let values = dictionary.values();
Expand All @@ -328,8 +330,10 @@ where

// Orphan values will be carried over to the new dictionary
let mapped_values = values.iter()
// Dictionary values should not be null as the keys are null if the value is null
.map(|dict_value| self.get_or_insert_key(dict_value.expect("Dictionary value should not be null")))
// Dictionary values can technically be null, so we need to handle that
.map(|dict_value| dict_value
.map(|dict_value| self.get_or_insert_key(dict_value))
.transpose())
.collect::<Result<Vec<_>, _>>()?;

// Just insert the keys without additional lookups
Expand All @@ -341,7 +345,10 @@ where
None => self.append_null(),
Some(original_dict_index) => {
let index = original_dict_index.as_usize().min(v_len - 1);
self.keys_builder.append_value(mapped_values[index]);
match mapped_values[index] {
None => self.append_null(),
Some(mapped_value) => self.keys_builder.append_value(mapped_value),
}
}
}
});
Expand Down Expand Up @@ -416,6 +423,7 @@ mod tests {

use crate::array::{UInt32Array, UInt8Array, Int32Array};
use crate::builder::Decimal128Builder;
use crate::cast::AsArray;
use crate::types::{Decimal128Type, Int32Type, UInt32Type, UInt8Type};

#[test]
Expand Down Expand Up @@ -489,7 +497,6 @@ mod tests {
);
}


#[test]
fn test_extend_dictionary() {
let some_dict = {
Expand Down Expand Up @@ -520,6 +527,54 @@ mod tests {
);
}

#[test]
fn test_extend_dictionary_with_null_in_mapped_value() {
let some_dict = {
let mut values_builder = PrimitiveBuilder::<Int32Type>::new();
let mut keys_builder = PrimitiveBuilder::<Int32Type>::new();

// Manually build a dictionary values that the mapped values have null
values_builder.append_null();
keys_builder.append_value(0);
values_builder.append_value(42);
keys_builder.append_value(1);

let values = values_builder.finish();
let keys = keys_builder.finish();

let data_type =
DataType::Dictionary(Box::new(Int32Type::DATA_TYPE), Box::new(values.data_type().clone()));

let builder = keys
.into_data()
.into_builder()
.data_type(data_type)
.child_data(vec![values.into_data()]);

DictionaryArray::from(unsafe { builder.build_unchecked() })
};

let some_dict_values = some_dict.values().as_primitive::<Int32Type>();
assert_eq!(some_dict_values.into_iter().collect::<Vec<_>>(), &[None, Some(42)]);

let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int32Type>::new();
builder.extend_dictionary(&some_dict.downcast_dict().unwrap()).unwrap();
let dict = builder.finish();

assert_eq!(dict.values().len(), 1);

let values = dict
.downcast_dict::<Int32Array>()
.unwrap()
.into_iter()
.collect::<Vec<_>>();

assert_eq!(
values,
[None, Some(42)]
);
}

#[test]
fn test_extend_all_null_dictionary() {
let some_dict = {
Expand Down

0 comments on commit e1b8dc9

Please sign in to comment.