-
Notifications
You must be signed in to change notification settings - Fork 837
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[arrow-cast] Support cast from Numeric (`Int`, `UInt`, etc) to `Utf8View`
#6719
Changes from 3 commits
205e40d
2a937fe
2f8f9c3
74de9bc
cdc16c9
d65e228
2ea0b51
b907827
92732ea
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -484,6 +484,13 @@ impl<T: ByteViewType + ?Sized, V: AsRef<T::Native>> Extend<Option<V>> | |
/// ``` | ||
pub type StringViewBuilder = GenericByteViewBuilder<StringViewType>; | ||
|
||
/// Lets a `StringViewBuilder` be used as a `std::fmt::Write` sink.
///
/// Each `write_str` call forwards its argument to `append_value`, so every call
/// appends one value to the builder instead of extending a value in progress.
/// NOTE(review): a `Display` implementation that emits its output through several
/// `write_str` calls would therefore yield several values — confirm before relying
/// on `write!` with this builder.
impl std::fmt::Write for StringViewBuilder { | ||
/// Appends `s` via `append_value`; always returns `Ok(())`.
fn write_str(&mut self, s: &str) -> std::fmt::Result { | ||
self.append_value(s); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was writing some tests for this, and it turns out this is different behavior than Specifically, calling I made a PR showing the problem: tlm365#1 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am working on a potential solution so we can unblock this PR There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
Ok(()) | ||
} | ||
} | ||
|
||
/// Array builder for [`BinaryViewArray`][crate::BinaryViewArray] | ||
/// | ||
/// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -182,8 +182,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | |
(Decimal128(_, _) | Decimal256(_, _), UInt8 | UInt16 | UInt32 | UInt64) | | ||
// decimal to signed numeric | ||
(Decimal128(_, _) | Decimal256(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) => true, | ||
// decimal to Utf8 | ||
(Decimal128(_, _) | Decimal256(_, _), Utf8 | LargeUtf8) => true, | ||
// decimal to string | ||
(Decimal128(_, _) | Decimal256(_, _), Utf8View | Utf8 | LargeUtf8) => true, | ||
// Utf8 to decimal | ||
(Utf8 | LargeUtf8, Decimal128(_, _) | Decimal256(_, _)) => true, | ||
(Struct(from_fields), Struct(to_fields)) => { | ||
|
@@ -231,7 +231,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | |
(Utf8 | LargeUtf8, Utf8View) => true, | ||
(BinaryView, Binary | LargeBinary | Utf8 | LargeUtf8 | Utf8View ) => true, | ||
(Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, | ||
(_, Utf8 | LargeUtf8) => from_type.is_primitive(), | ||
(_, Utf8View | Utf8 | LargeUtf8) => from_type.is_primitive(), | ||
|
||
(_, Binary | LargeBinary) => from_type.is_integer(), | ||
|
||
|
@@ -917,6 +917,7 @@ pub fn cast_with_options( | |
Float64 => cast_decimal_to_float::<Decimal128Type, Float64Type, _>(array, |x| { | ||
x as f64 / 10_f64.powi(*scale as i32) | ||
}), | ||
Utf8View => value_to_string_view(array, cast_options), | ||
Utf8 => value_to_string::<i32>(array, cast_options), | ||
LargeUtf8 => value_to_string::<i64>(array, cast_options), | ||
Null => Ok(new_null_array(to_type, array.len())), | ||
|
@@ -982,6 +983,7 @@ pub fn cast_with_options( | |
Float64 => cast_decimal_to_float::<Decimal256Type, Float64Type, _>(array, |x| { | ||
x.to_f64().unwrap() / 10_f64.powi(*scale as i32) | ||
}), | ||
Utf8View => value_to_string_view(array, cast_options), | ||
Utf8 => value_to_string::<i32>(array, cast_options), | ||
LargeUtf8 => value_to_string::<i64>(array, cast_options), | ||
Null => Ok(new_null_array(to_type, array.len())), | ||
|
@@ -1462,6 +1464,9 @@ pub fn cast_with_options( | |
(BinaryView, _) => Err(ArrowError::CastError(format!( | ||
"Casting from {from_type:?} to {to_type:?} not supported", | ||
))), | ||
(from_type, Utf8View) if from_type.is_primitive() => { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe this also fixes the Timestamp -> Utf8View issue. It would be good to have tests for temporal -> Utf8View added to cover this case. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After reviewing the code, I realized that the Timestamp -> Utf8View cast is not supported yet. The main issue comes from the current implementation of I think this issue deserves a separate PR to handle the temporal -> string view casting. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll file another PR today to cover the temporal -> Utf8View case unless someone beats me to it. |
||
value_to_string_view(array, cast_options) | ||
} | ||
(from_type, LargeUtf8) if from_type.is_primitive() => { | ||
value_to_string::<i64>(array, cast_options) | ||
} | ||
|
@@ -2485,12 +2490,11 @@ where | |
|
||
#[cfg(test)] | ||
mod tests { | ||
use super::*; | ||
use arrow_buffer::{Buffer, IntervalDayTime, NullBuffer}; | ||
use chrono::NaiveDate; | ||
use half::f16; | ||
|
||
use super::*; | ||
|
||
macro_rules! generate_cast_test_case { | ||
($INPUT_ARRAY: expr, $OUTPUT_TYPE_ARRAY: ident, $OUTPUT_TYPE: expr, $OUTPUT_VALUES: expr) => { | ||
let output = | ||
|
@@ -3708,6 +3712,40 @@ mod tests { | |
assert_eq!(10.0, c.value(3)); | ||
} | ||
|
||
#[test]
fn test_cast_int_to_utf8view() {
    // Each signed integer width must be reported as castable to Utf8View.
    let signed_ints = [
        DataType::Int8,
        DataType::Int16,
        DataType::Int32,
        DataType::Int64,
    ];
    for from_type in &signed_ints {
        assert!(can_cast_types(from_type, &DataType::Utf8View));
    }

    // One leading null followed by three values.
    let input = Int32Array::from(vec![None, Some(8), Some(9), Some(10)]);
    let casted = cast(&input, &DataType::Utf8View).unwrap();
    assert_eq!(4, casted.len());
    assert_eq!(1, casted.null_count());

    let views = casted.as_string_view();
    assert!(views.is_null(0));
    // Slots 1..=3 hold the decimal rendering of the non-null inputs.
    for (offset, want) in ["8", "9", "10"].iter().enumerate() {
        assert_eq!(*want, views.value(offset + 1));
    }
}
|
||
#[test]
fn test_cast_float_to_utf8view() {
    // Each floating-point width must be reported as castable to Utf8View.
    let float_types = [DataType::Float16, DataType::Float32, DataType::Float64];
    for from_type in &float_types {
        assert!(can_cast_types(from_type, &DataType::Utf8View));
    }

    // Two values followed by a trailing null.
    let input = Float32Array::from(vec![Some(8.64), Some(9.81), None]);
    let casted = cast(&input, &DataType::Utf8View).unwrap();
    assert_eq!(3, casted.len());
    assert_eq!(1, casted.null_count());

    let views = casted.as_string_view();
    for (slot, want) in ["8.64", "9.81"].iter().enumerate() {
        assert_eq!(*want, views.value(slot));
    }
    assert!(views.is_null(2));
}
|
||
#[test] | ||
fn test_cast_utf8_to_i32() { | ||
let array = StringArray::from(vec!["5", "6", "seven", "8", "9.1"]); | ||
|
@@ -9114,26 +9152,51 @@ mod tests { | |
} | ||
|
||
#[test] | ||
fn test_cast_decimal_to_utf8() { | ||
fn test_cast_decimal_to_string() { | ||
assert!(can_cast_types( | ||
&DataType::Decimal128(10, 4), | ||
&DataType::Utf8View | ||
)); | ||
assert!(can_cast_types( | ||
&DataType::Decimal256(38, 10), | ||
&DataType::Utf8View | ||
)); | ||
|
||
macro_rules! assert_decimal_values { | ||
($array:expr) => { | ||
let c = $array; | ||
assert_eq!("1123.454", c.value(0)); | ||
assert_eq!("2123.456", c.value(1)); | ||
assert_eq!("-3123.453", c.value(2)); | ||
assert_eq!("-3123.456", c.value(3)); | ||
assert_eq!("0.000", c.value(4)); | ||
assert_eq!("0.123", c.value(5)); | ||
assert_eq!("1234.567", c.value(6)); | ||
assert_eq!("-1234.567", c.value(7)); | ||
assert!(c.is_null(8)); | ||
}; | ||
} | ||
|
||
fn test_decimal_to_string<IN: ArrowPrimitiveType, OffsetSize: OffsetSizeTrait>( | ||
output_type: DataType, | ||
array: PrimitiveArray<IN>, | ||
) { | ||
let b = cast(&array, &output_type).unwrap(); | ||
|
||
assert_eq!(b.data_type(), &output_type); | ||
let c = b.as_string::<OffsetSize>(); | ||
|
||
assert_eq!("1123.454", c.value(0)); | ||
assert_eq!("2123.456", c.value(1)); | ||
assert_eq!("-3123.453", c.value(2)); | ||
assert_eq!("-3123.456", c.value(3)); | ||
assert_eq!("0.000", c.value(4)); | ||
assert_eq!("0.123", c.value(5)); | ||
assert_eq!("1234.567", c.value(6)); | ||
assert_eq!("-1234.567", c.value(7)); | ||
assert!(c.is_null(8)); | ||
match b.data_type() { | ||
DataType::Utf8View => { | ||
let c = b.as_string_view(); | ||
assert_decimal_values!(c); | ||
} | ||
DataType::Utf8 | DataType::LargeUtf8 => { | ||
let c = b.as_string::<OffsetSize>(); | ||
assert_decimal_values!(c); | ||
} | ||
_ => (), | ||
} | ||
} | ||
|
||
let array128: Vec<Option<i128>> = vec![ | ||
Some(1123454), | ||
Some(2123456), | ||
|
@@ -9145,22 +9208,33 @@ mod tests { | |
Some(-123456789), | ||
None, | ||
]; | ||
let array256: Vec<Option<i256>> = array128 | ||
.iter() | ||
.map(|num| num.map(i256::from_i128)) | ||
.collect(); | ||
|
||
let array256: Vec<Option<i256>> = array128.iter().map(|v| v.map(i256::from_i128)).collect(); | ||
|
||
test_decimal_to_string::<arrow_array::types::Decimal128Type, i32>( | ||
test_decimal_to_string::<Decimal128Type, i32>( | ||
DataType::Utf8View, | ||
create_decimal_array(array128.clone(), 7, 3).unwrap(), | ||
); | ||
test_decimal_to_string::<Decimal128Type, i32>( | ||
DataType::Utf8, | ||
create_decimal_array(array128.clone(), 7, 3).unwrap(), | ||
); | ||
test_decimal_to_string::<arrow_array::types::Decimal128Type, i64>( | ||
test_decimal_to_string::<Decimal128Type, i64>( | ||
DataType::LargeUtf8, | ||
create_decimal_array(array128, 7, 3).unwrap(), | ||
); | ||
test_decimal_to_string::<arrow_array::types::Decimal256Type, i32>( | ||
|
||
test_decimal_to_string::<Decimal256Type, i32>( | ||
DataType::Utf8View, | ||
create_decimal256_array(array256.clone(), 7, 3).unwrap(), | ||
); | ||
test_decimal_to_string::<Decimal256Type, i32>( | ||
DataType::Utf8, | ||
create_decimal256_array(array256.clone(), 7, 3).unwrap(), | ||
); | ||
test_decimal_to_string::<arrow_array::types::Decimal256Type, i64>( | ||
test_decimal_to_string::<Decimal256Type, i64>( | ||
DataType::LargeUtf8, | ||
create_decimal256_array(array256, 7, 3).unwrap(), | ||
); | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -38,6 +38,22 @@ pub(crate) fn value_to_string<O: OffsetSizeTrait>( | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Ok(Arc::new(builder.finish())) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
pub(crate) fn value_to_string_view( | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
array: &dyn Array, | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
options: &CastOptions, | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
) -> Result<ArrayRef, ArrowError> { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
let mut builder = StringViewBuilder::with_capacity(array.len()); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
let formatter = ArrayFormatter::try_new(array, &options.format_options)?; | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
let nulls = array.nulls(); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
for i in 0..array.len() { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
match nulls.map(|x| x.is_null(i)).unwrap_or_default() { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
true => builder.append_null(), | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
false => formatter.value(i).write(&mut builder)?, | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Ok(Arc::new(builder.finish())) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Comment on lines
+41
to
+55
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have been dreaming about this PR and how to unblock it. I think it is currently stalled by trying to figure out how to handle Here is an alternate proposal:
So the first point might look something like this:
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will work nicely (well, once it compiles) and would unblock downstream issues. 👍🏻 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
/// Parse UTF-8 | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
pub(crate) fn parse_string<P: Parser, O: OffsetSizeTrait>( | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
array: &dyn Array, | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is also what is contemplated by #6373 (aka I think this PR fixes that ticket as well)