From 6a8f20845e43ee5e636a1071475d95969983174f Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Thu, 25 Jul 2024 18:10:16 -0400 Subject: [PATCH 1/3] fix bug in return type inference --- datafusion/functions/src/utils.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/src/utils.rs b/datafusion/functions/src/utils.rs index 6fcb9c6f0840..45662dfabf36 100644 --- a/datafusion/functions/src/utils.rs +++ b/datafusion/functions/src/utils.rs @@ -32,7 +32,7 @@ use datafusion_expr::{ColumnarValue, ScalarFunctionImplementation}; /// /// If the input type is `Utf8` or `Binary` the return type is `$utf8Type`, /// -/// If the input type is `Utf8View` the return type is `Utf8View`, +/// If the input type is `Utf8View` the return type is $utf8Type, macro_rules! get_optimal_return_type { ($FUNC:ident, $largeUtf8Type:expr, $utf8Type:expr) => { pub(crate) fn $FUNC(arg_type: &DataType, name: &str) -> Result<DataType> { @@ -41,8 +41,8 @@ macro_rules! get_optimal_return_type { DataType::LargeUtf8 | DataType::LargeBinary => $largeUtf8Type, // Binary inputs are automatically coerced to Utf8 DataType::Utf8 | DataType::Binary => $utf8Type, - // Utf8View inputs will yield Utf8View outputs - DataType::Utf8View => DataType::Utf8View, + // Utf8View max offset size is u32::MAX, the same as UTF8 + DataType::Utf8View | DataType::BinaryView => $utf8Type, DataType::Null => DataType::Null, DataType::Dictionary(_, value_type) => match **value_type { DataType::LargeUtf8 | DataType::LargeBinary => $largeUtf8Type, From ba2a2f1ba49c8db1ef5cb9a82d38790000ab65a9 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Fri, 26 Jul 2024 09:12:01 -0400 Subject: [PATCH 2/3] update doc --- datafusion/physical-plan/src/coalesce_batches.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs index 8169401874e6..1150d059e7d5 100644 --- 
a/datafusion/physical-plan/src/coalesce_batches.rs +++ b/datafusion/physical-plan/src/coalesce_batches.rs @@ -270,14 +270,14 @@ impl RecordBatchStream for CoalesceBatchesStream { } } -/// Heuristically compact [`StringViewArray`]s to reduce memory usage, if needed +/// Heuristically compact `StringViewArray`s to reduce memory usage, if needed /// /// This function decides when to consolidate the StringView into a new buffer /// to reduce memory usage and improve string locality for better performance. /// -/// This differs from [`StringViewArray::gc`] because: +/// This differs from `StringViewArray::gc` because: /// 1. It may not compact the array depending on a heuristic. -/// 2. It uses a larger default block size (2MB) to reduce the number of buffers to track. +/// 2. It uses a precise block size to reduce the number of buffers to track. /// /// # Heuristic /// From 6bdbb72b447dcba75acb22f85e85b1b172eeab00 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Fri, 26 Jul 2024 09:21:42 -0400 Subject: [PATCH 3/3] add tests --- datafusion/functions/src/utils.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/datafusion/functions/src/utils.rs b/datafusion/functions/src/utils.rs index 45662dfabf36..7b367174006d 100644 --- a/datafusion/functions/src/utils.rs +++ b/datafusion/functions/src/utils.rs @@ -183,6 +183,21 @@ pub mod test { }; } + use arrow::datatypes::DataType; #[allow(unused_imports)] pub(crate) use test_function; + + use super::*; + + #[test] + fn string_to_int_type() { + let v = utf8_to_int_type(&DataType::Utf8, "test").unwrap(); + assert_eq!(v, DataType::Int32); + + let v = utf8_to_int_type(&DataType::Utf8View, "test").unwrap(); + assert_eq!(v, DataType::Int32); + + let v = utf8_to_int_type(&DataType::LargeUtf8, "test").unwrap(); + assert_eq!(v, DataType::Int64); + } }