diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index d12c2b7db468..3487727d71f9 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -466,24 +466,59 @@ impl> Extend> /// Array builder for [`StringViewArray`][crate::StringViewArray] /// /// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with -/// [`GenericByteViewBuilder::append_null`] as normal. +/// [`GenericByteViewBuilder::append_null`]. /// -/// # Example +/// This builder also implements [`std::fmt::Write`] with any written data +/// included in the next appended value. This allows using [`std::fmt::Display`] +/// with standard Rust idioms like `write!` and `writeln!` to write data +/// directly to the builder without intermediate allocations. +/// +/// # Example writing strings with `append_value` /// ``` /// # use arrow_array::builder::StringViewBuilder; /// # use arrow_array::StringViewArray; /// let mut builder = StringViewBuilder::new(); -/// builder.append_value("hello"); -/// builder.append_null(); -/// builder.append_value("world"); +/// builder.append_value("hello"); // row 0 is a value +/// builder.append_null(); // row 1 is null +/// builder.append_value("world"); // row 2 is a value /// let array = builder.finish(); /// /// let expected = vec![Some("hello"), None, Some("world")]; /// let actual: Vec<_> = array.iter().collect(); /// assert_eq!(expected, actual); /// ``` +/// +/// /// # Example incrementally writing strings with `std::fmt::Write` +/// ``` +/// # use std::fmt::Write; +/// # use arrow_array::builder::StringViewBuilder; +/// let mut builder = StringViewBuilder::new(); +/// +/// // Write data in multiple `write!` calls +/// write!(builder, "foo").unwrap(); +/// write!(builder, "bar").unwrap(); +/// // The next call to append_value finishes the current string +/// // including all previously written strings. +/// builder.append_value("baz"); +/// +/// // Write second value with a single write call +/// write!(builder, "v2").unwrap(); +/// // finish the value by calling append_value with an empty string +/// builder.append_value(""); +/// +/// let array = builder.finish(); +/// assert_eq!(array.value(0), "foobarbaz"); +/// assert_eq!(array.value(1), "v2"); +/// ``` pub type StringViewBuilder = GenericByteViewBuilder; +impl std::fmt::Write for StringViewBuilder { + fn write_str(&mut self, s: &str) -> std::fmt::Result { + self.append_value(s); + Ok(()) + } +} + /// Array builder for [`BinaryViewArray`][crate::BinaryViewArray] /// /// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 78a702e8c174..d1e313794f11 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -182,10 +182,11 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Decimal128(_, _) | Decimal256(_, _), UInt8 | UInt16 | UInt32 | UInt64) | // decimal to signed numeric (Decimal128(_, _) | Decimal256(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) => true, - // decimal to Utf8 - (Decimal128(_, _) | Decimal256(_, _), Utf8 | LargeUtf8) => true, + // decimal to string + (Decimal128(_, _) | Decimal256(_, _), Utf8View | Utf8 | LargeUtf8) => true, // string to decimal (Utf8View | Utf8 | LargeUtf8, Decimal128(_, _) | Decimal256(_, _)) => true, + (Struct(from_fields), Struct(to_fields)) => { from_fields.len() == to_fields.len() && from_fields.iter().zip(to_fields.iter()).all(|(f1, f2)| { @@ -232,6 +233,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (BinaryView, Binary | LargeBinary | Utf8 | LargeUtf8 | Utf8View ) => true, (Utf8View | Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, (_, Utf8 | LargeUtf8) => from_type.is_primitive(), + (_, Utf8View) => from_type.is_numeric(), (_, Binary | LargeBinary) => from_type.is_integer(), @@ -917,6 +919,7 @@ pub fn cast_with_options( Float64 => cast_decimal_to_float::(array, |x| { x as f64 / 10_f64.powi(*scale as i32) }), + Utf8View => value_to_string_view(array, cast_options), Utf8 => value_to_string::(array, cast_options), LargeUtf8 => value_to_string::(array, cast_options), Null => Ok(new_null_array(to_type, array.len())), @@ -982,6 +985,7 @@ pub fn cast_with_options( Float64 => cast_decimal_to_float::(array, |x| { x.to_f64().unwrap() / 10_f64.powi(*scale as i32) }), + Utf8View => value_to_string_view(array, cast_options), Utf8 => value_to_string::(array, cast_options), LargeUtf8 => value_to_string::(array, cast_options), Null => Ok(new_null_array(to_type, array.len())), @@ -1462,6 +1466,9 @@ pub fn cast_with_options( (BinaryView, _) => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), + (from_type, Utf8View) if from_type.is_numeric() => { + value_to_string_view(array, cast_options) + } (from_type, LargeUtf8) if from_type.is_primitive() => { value_to_string::(array, cast_options) } @@ -3707,6 +3714,40 @@ mod tests { assert_eq!(10.0, c.value(3)); } + #[test] + fn test_cast_int_to_utf8view() { + assert!(can_cast_types(&DataType::Int8, &DataType::Utf8View)); + assert!(can_cast_types(&DataType::Int16, &DataType::Utf8View)); + assert!(can_cast_types(&DataType::Int32, &DataType::Utf8View)); + assert!(can_cast_types(&DataType::Int64, &DataType::Utf8View)); + + let array = Int32Array::from(vec![None, Some(8), Some(9), Some(10)]); + let arr = cast(&array, &DataType::Utf8View).unwrap(); + assert_eq!(4, arr.len()); + assert_eq!(1, arr.null_count()); + let c = arr.as_string_view(); + assert!(c.is_null(0)); + assert_eq!("8", c.value(1)); + assert_eq!("9", c.value(2)); + assert_eq!("10", c.value(3)); + } + + #[test] + fn test_cast_float_to_utf8view() { + assert!(can_cast_types(&DataType::Float16, &DataType::Utf8View)); + assert!(can_cast_types(&DataType::Float32, &DataType::Utf8View)); + assert!(can_cast_types(&DataType::Float64, &DataType::Utf8View)); + + let array = Float32Array::from(vec![Some(8.64), Some(9.81), None]); + let arr = cast(&array, &DataType::Utf8View).unwrap(); + assert_eq!(3, arr.len()); + assert_eq!(1, arr.null_count()); + let c = arr.as_string_view(); + assert_eq!("8.64", c.value(0)); + assert_eq!("9.81", c.value(1)); + assert!(c.is_null(2)); + } + #[test] fn test_cast_utf8_to_i32() { let array = StringArray::from(vec!["5", "6", "seven", "8", "9.1"]); @@ -5180,41 +5221,46 @@ mod tests { assert_eq!("2018-12-25T00:00:00", c.value(1)); } + // Cast Timestamp to Utf8View is not supported yet + // TODO: Implement casting from Timestamp to Utf8View + // https://github.com/apache/arrow-rs/issues/6734 + macro_rules! assert_cast_timestamp_to_string { + ($array:expr, $datatype:expr, $output_array_type: ty, $expected:expr) => {{ + let out = cast(&$array, &$datatype).unwrap(); + let actual = out + .as_any() + .downcast_ref::<$output_array_type>() + .unwrap() + .into_iter() + .collect::>(); + assert_eq!(actual, $expected); + }}; + ($array:expr, $datatype:expr, $output_array_type: ty, $options:expr, $expected:expr) => {{ + let out = cast_with_options(&$array, &$datatype, &$options).unwrap(); + let actual = out + .as_any() + .downcast_ref::<$output_array_type>() + .unwrap() + .into_iter() + .collect::>(); + assert_eq!(actual, $expected); + }}; + } + #[test] fn test_cast_timestamp_to_strings() { // "2018-12-25T00:00:02.001", "1997-05-19T00:00:03.005", None let array = TimestampMillisecondArray::from(vec![Some(864000003005), Some(1545696002001), None]); - let out = cast(&array, &DataType::Utf8).unwrap(); - let out = out - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - .collect::>(); - assert_eq!( - out, - vec![ - Some("1997-05-19T00:00:03.005"), - Some("2018-12-25T00:00:02.001"), - None - ] - ); - let out = cast(&array, &DataType::LargeUtf8).unwrap(); - let out = out - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - .collect::>(); - assert_eq!( - out, - vec![ - Some("1997-05-19T00:00:03.005"), - Some("2018-12-25T00:00:02.001"), - None - ] - ); + let expected = vec![ + Some("1997-05-19T00:00:03.005"), + Some("2018-12-25T00:00:02.001"), + None, + ]; + + // assert_cast_timestamp_to_string!(array, DataType::Utf8View, StringViewArray, expected); + assert_cast_timestamp_to_string!(array, DataType::Utf8, StringArray, expected); + assert_cast_timestamp_to_string!(array, DataType::LargeUtf8, LargeStringArray, expected); } #[test] @@ -5227,73 +5273,53 @@ mod tests { .with_timestamp_format(Some(ts_format)) .with_timestamp_tz_format(Some(ts_format)), }; + // "2018-12-25T00:00:02.001", "1997-05-19T00:00:03.005", None let array_without_tz = TimestampMillisecondArray::from(vec![Some(864000003005), Some(1545696002001), None]); - let out = cast_with_options(&array_without_tz, &DataType::Utf8, &cast_options).unwrap(); - let out = out - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - .collect::>(); - assert_eq!( - out, - vec![ - Some("1997-05-19 00:00:03.005000"), - Some("2018-12-25 00:00:02.001000"), - None - ] + let expected = vec![ + Some("1997-05-19 00:00:03.005000"), + Some("2018-12-25 00:00:02.001000"), + None, + ]; + // assert_cast_timestamp_to_string!(array_without_tz, DataType::Utf8View, StringViewArray, cast_options, expected); + assert_cast_timestamp_to_string!( + array_without_tz, + DataType::Utf8, + StringArray, + cast_options, + expected ); - let out = - cast_with_options(&array_without_tz, &DataType::LargeUtf8, &cast_options).unwrap(); - let out = out - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - .collect::>(); - assert_eq!( - out, - vec![ - Some("1997-05-19 00:00:03.005000"), - Some("2018-12-25 00:00:02.001000"), - None - ] + assert_cast_timestamp_to_string!( + array_without_tz, + DataType::LargeUtf8, + LargeStringArray, + cast_options, + expected ); let array_with_tz = TimestampMillisecondArray::from(vec![Some(864000003005), Some(1545696002001), None]) .with_timezone(tz.to_string()); - let out = cast_with_options(&array_with_tz, &DataType::Utf8, &cast_options).unwrap(); - let out = out - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - .collect::>(); - assert_eq!( - out, - vec![ - Some("1997-05-19 05:45:03.005000"), - Some("2018-12-25 05:45:02.001000"), - None - ] + let expected = vec![ + Some("1997-05-19 05:45:03.005000"), + Some("2018-12-25 05:45:02.001000"), + None, + ]; + // assert_cast_timestamp_to_string!(array_with_tz, DataType::Utf8View, StringViewArray, cast_options, expected); + assert_cast_timestamp_to_string!( + array_with_tz, + DataType::Utf8, + StringArray, + cast_options, + expected ); - let out = cast_with_options(&array_with_tz, &DataType::LargeUtf8, &cast_options).unwrap(); - let out = out - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - .collect::>(); - assert_eq!( - out, - vec![ - Some("1997-05-19 05:45:03.005000"), - Some("2018-12-25 05:45:02.001000"), - None - ] + assert_cast_timestamp_to_string!( + array_with_tz, + DataType::LargeUtf8, + LargeStringArray, + cast_options, + expected ); } @@ -9148,7 +9174,31 @@ mod tests { } #[test] - fn test_cast_decimal_to_utf8() { + fn test_cast_decimal_to_string() { + assert!(can_cast_types( + &DataType::Decimal128(10, 4), + &DataType::Utf8View + )); + assert!(can_cast_types( + &DataType::Decimal256(38, 10), + &DataType::Utf8View + )); + + macro_rules! assert_decimal_values { + ($array:expr) => { + let c = $array; + assert_eq!("1123.454", c.value(0)); + assert_eq!("2123.456", c.value(1)); + assert_eq!("-3123.453", c.value(2)); + assert_eq!("-3123.456", c.value(3)); + assert_eq!("0.000", c.value(4)); + assert_eq!("0.123", c.value(5)); + assert_eq!("1234.567", c.value(6)); + assert_eq!("-1234.567", c.value(7)); + assert!(c.is_null(8)); + }; + } + fn test_decimal_to_string( output_type: DataType, array: PrimitiveArray, @@ -9156,18 +9206,19 @@ mod tests { let b = cast(&array, &output_type).unwrap(); assert_eq!(b.data_type(), &output_type); - let c = b.as_string::(); - - assert_eq!("1123.454", c.value(0)); - assert_eq!("2123.456", c.value(1)); - assert_eq!("-3123.453", c.value(2)); - assert_eq!("-3123.456", c.value(3)); - assert_eq!("0.000", c.value(4)); - assert_eq!("0.123", c.value(5)); - assert_eq!("1234.567", c.value(6)); - assert_eq!("-1234.567", c.value(7)); - assert!(c.is_null(8)); + match b.data_type() { + DataType::Utf8View => { + let c = b.as_string_view(); + assert_decimal_values!(c); + } + DataType::Utf8 | DataType::LargeUtf8 => { + let c = b.as_string::(); + assert_decimal_values!(c); + } + _ => (), + } } + let array128: Vec> = vec![ Some(1123454), Some(2123456), @@ -9179,22 +9230,33 @@ mod tests { Some(-123456789), None, ]; + let array256: Vec> = array128 + .iter() + .map(|num| num.map(i256::from_i128)) + .collect(); - let array256: Vec> = array128.iter().map(|v| v.map(i256::from_i128)).collect(); - - test_decimal_to_string::( + test_decimal_to_string::( + DataType::Utf8View, + create_decimal_array(array128.clone(), 7, 3).unwrap(), + ); + test_decimal_to_string::( DataType::Utf8, create_decimal_array(array128.clone(), 7, 3).unwrap(), ); - test_decimal_to_string::( + test_decimal_to_string::( DataType::LargeUtf8, create_decimal_array(array128, 7, 3).unwrap(), ); - test_decimal_to_string::( + + test_decimal_to_string::( + DataType::Utf8View, + create_decimal256_array(array256.clone(), 7, 3).unwrap(), + ); + test_decimal_to_string::( DataType::Utf8, create_decimal256_array(array256.clone(), 7, 3).unwrap(), ); - test_decimal_to_string::( + test_decimal_to_string::( DataType::LargeUtf8, create_decimal256_array(array256, 7, 3).unwrap(), ); diff --git a/arrow-cast/src/cast/string.rs b/arrow-cast/src/cast/string.rs index 7d0e7e21c859..44ba6460f9e7 100644 --- a/arrow-cast/src/cast/string.rs +++ b/arrow-cast/src/cast/string.rs @@ -38,6 +38,22 @@ pub(crate) fn value_to_string( Ok(Arc::new(builder.finish())) } +pub(crate) fn value_to_string_view( + array: &dyn Array, + options: &CastOptions, +) -> Result { + let mut builder = StringViewBuilder::with_capacity(array.len()); + let formatter = ArrayFormatter::try_new(array, &options.format_options)?; + let nulls = array.nulls(); + for i in 0..array.len() { + match nulls.map(|x| x.is_null(i)).unwrap_or_default() { + true => builder.append_null(), + false => formatter.value(i).write(&mut builder)?, + } + } + Ok(Arc::new(builder.finish())) +} + /// Parse UTF-8 pub(crate) fn parse_string( array: &dyn Array,