From 90c90dd64cafe011c81ec26e200e01a6ad8a3493 Mon Sep 17 00:00:00 2001 From: osipovartem Date: Mon, 10 Feb 2025 23:23:17 +0300 Subject: [PATCH 1/4] Add e flag support --- .../functions/src/regex/regexpsubstr.rs | 131 ++++++++++-------- 1 file changed, 74 insertions(+), 57 deletions(-) diff --git a/datafusion/functions/src/regex/regexpsubstr.rs b/datafusion/functions/src/regex/regexpsubstr.rs index 42bcd412378f..c6dd964b91f7 100644 --- a/datafusion/functions/src/regex/regexpsubstr.rs +++ b/datafusion/functions/src/regex/regexpsubstr.rs @@ -227,6 +227,14 @@ fn regexp_substr_inner( } Some(regex) => regex, }; + + // Check for 'e' flag and set group_num to 1 if not provided + let group_num = if flags.map_or(false, |f| f.contains('e')) { + group_num.or(Some(1)) + } else { + group_num + }; + let regex = compile_regex(regex, flags)?; let mut builder = GenericStringBuilder::::new(); @@ -247,7 +255,6 @@ fn regexp_substr_inner( let matches = get_matches(cleaned_value.as_str(), ®ex, occurrence, group_num); - if matches.is_empty() { builder.append_null(); } else { @@ -274,6 +281,7 @@ fn get_matches( let occurrence = occurrence.unwrap_or(1) as usize; for caps in regex.captures_iter(value) { + println!("{value} caps {:?}", caps); match group_num { Some(group_num) => { if let Some(m) = caps.get(group_num as usize) { @@ -307,8 +315,12 @@ fn compile_regex(regex: &str, flags: Option<&str>) -> Result )); } // Case-sensitive enabled by default - let flags = flags.replace("c", ""); - format!("(?{}){}", flags, regex) + let flags = flags.replace("c", "").replace("e", ""); + if flags.is_empty() { + regex.to_string() + } else { + format!("(?{}){}", flags, regex) + } } }; @@ -469,66 +481,71 @@ mod tests { fn test_regexp_substr_with_params() { let values = [ "", - "aabca aabca", - "abc abc", - "Abcab abc", - "abCab cabc", - "ab", + "aabc aabca vff ddf", + "abc abca abcD vff", + "Abcab abcD caddd", + "abCab cabcd dasaaabc VfFddd", + "ab dasacabd caBcv dasaaabcdv", + ]; + let regex = ["abc", "(abc\\S)|(bca)", "(abc)|(bca)", "(abc)|(vff)|(d)"]; + let flags = ["i", "ie", "e", "i"]; + let group_num = [0, 1, 0, 2]; + let expected = [ + ["", "abc", "abc", "Abc", "abC", "aBc"], + ["", "abca", "abca", "Abca", "abCa", "aBcv"], + ["", "abc", "abc", "bca", "abc", "abc"], + ["", "vff", "vff", "", "VfF", ""] ]; - let regex = "abc"; - let position = 1; - let occurrence = 1; - let flags = "i"; - let group_num = 0; - let expected = ["", "abc", "abc", "Abc", "abC", ""]; // Scalar - values.iter().enumerate().for_each(|(pos, &value)| { - let expected = expected.get(pos).cloned().unwrap(); - // Utf8, LargeUtf8 - for (data_type, scalar) in &[ - ( - DataType::Utf8, - ScalarValue::Utf8 as fn(Option) -> ScalarValue, - ), - ( - DataType::LargeUtf8, - ScalarValue::LargeUtf8 as fn(Option) -> ScalarValue, - ), - ] { - let result = - RegexpSubstrFunc::new().invoke_with_args(ScalarFunctionArgs { - args: vec![ - ColumnarValue::Scalar(scalar(Some(value.to_string()))), - ColumnarValue::Scalar(scalar(Some(regex.to_string()))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(position))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(occurrence))), - ColumnarValue::Scalar(scalar(Some(flags.to_string()))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(group_num))), - ], - number_rows: 1, - return_type: data_type, - }); - match result { - Ok(ColumnarValue::Scalar( - ScalarValue::Utf8(ref res) | ScalarValue::LargeUtf8(ref res), - )) => { - if res.is_some() { - assert_eq!( - res.as_ref().unwrap(), - &expected.to_string(), - "regexp_substr scalar test failed" - ); - } else { - assert_eq!( - "", expected, - "regexp_substr scalar utf8 test failed" - ) + regex.iter().enumerate().for_each(|(spos, ®ex)| { + values.iter().enumerate().for_each(|(pos, &value)| { + let expected = expected.get(spos).unwrap().get(pos).cloned().unwrap(); + // Utf8, LargeUtf8 + for (data_type, scalar) in &[ + ( + DataType::Utf8, + ScalarValue::Utf8 as fn(Option) -> ScalarValue, + ), + ( + DataType::LargeUtf8, + ScalarValue::LargeUtf8 as fn(Option) -> ScalarValue, + ), + ] { + let result = + RegexpSubstrFunc::new().invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(scalar(Some(value.to_string()))), + ColumnarValue::Scalar(scalar(Some(regex.to_string()))), + ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), + ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), + ColumnarValue::Scalar(scalar(Some(flags[spos].to_string()))), + ColumnarValue::Scalar(ScalarValue::Int64(Some(group_num[spos]))), + ], + number_rows: 1, + return_type: data_type, + }); + match result { + Ok(ColumnarValue::Scalar( + ScalarValue::Utf8(ref res) | ScalarValue::LargeUtf8(ref res), + )) => { + if res.is_some() { + assert_eq!( + res.as_ref().unwrap(), + &expected.to_string(), + "regexp_substr scalar test failed" + ); + } else { + assert_eq!( + "", expected, + "regexp_substr scalar utf8 test failed" + ) + } } + _ => panic!("Unexpected result"), } - _ => panic!("Unexpected result"), } - } + }) }); } From b13864a2226b433c23294842e8af39b304a23ede Mon Sep 17 00:00:00 2001 From: osipovartem Date: Mon, 10 Feb 2025 23:25:57 +0300 Subject: [PATCH 2/4] Add e flag support --- datafusion/functions/src/regex/regexpsubstr.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/functions/src/regex/regexpsubstr.rs b/datafusion/functions/src/regex/regexpsubstr.rs index c6dd964b91f7..f4ec6f38d65d 100644 --- a/datafusion/functions/src/regex/regexpsubstr.rs +++ b/datafusion/functions/src/regex/regexpsubstr.rs @@ -281,7 +281,6 @@ fn get_matches( let occurrence = occurrence.unwrap_or(1) as usize; for caps in regex.captures_iter(value) { - println!("{value} caps {:?}", caps); match group_num { Some(group_num) => { if let Some(m) = caps.get(group_num as usize) { From 2391a93b22cd99f6416ac994fb77bbd8ea651903 Mon Sep 17 00:00:00 2001 From: osipovartem Date: Mon, 10 Feb 2025 23:31:22 +0300 Subject: [PATCH 3/4] fix fmt --- datafusion/functions/src/regex/regexpsubstr.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/datafusion/functions/src/regex/regexpsubstr.rs b/datafusion/functions/src/regex/regexpsubstr.rs index f4ec6f38d65d..57b899f73937 100644 --- a/datafusion/functions/src/regex/regexpsubstr.rs +++ b/datafusion/functions/src/regex/regexpsubstr.rs @@ -493,7 +493,7 @@ mod tests { ["", "abc", "abc", "Abc", "abC", "aBc"], ["", "abca", "abca", "Abca", "abCa", "aBcv"], ["", "abc", "abc", "bca", "abc", "abc"], - ["", "vff", "vff", "", "VfF", ""] + ["", "vff", "vff", "", "VfF", ""], ]; // Scalar @@ -518,16 +518,20 @@ mod tests { ColumnarValue::Scalar(scalar(Some(regex.to_string()))), ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), - ColumnarValue::Scalar(scalar(Some(flags[spos].to_string()))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(group_num[spos]))), + ColumnarValue::Scalar(scalar(Some( + flags[spos].to_string(), + ))), + ColumnarValue::Scalar(ScalarValue::Int64(Some( + group_num[spos], + ))), ], number_rows: 1, return_type: data_type, }); match result { Ok(ColumnarValue::Scalar( - ScalarValue::Utf8(ref res) | ScalarValue::LargeUtf8(ref res), - )) => { + ScalarValue::Utf8(ref res) | ScalarValue::LargeUtf8(ref res), + )) => { if res.is_some() { assert_eq!( res.as_ref().unwrap(), From cb55a9d245353bff0bff51d8abb8e660bf1ef4ae Mon Sep 17 00:00:00 2001 From: osipovartem Date: Mon, 10 Feb 2025 23:48:48 +0300 Subject: [PATCH 4/4] clippy --- datafusion/functions/src/regex/regexpsubstr.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/src/regex/regexpsubstr.rs b/datafusion/functions/src/regex/regexpsubstr.rs index 57b899f73937..5c6d9900ce19 100644 --- a/datafusion/functions/src/regex/regexpsubstr.rs +++ b/datafusion/functions/src/regex/regexpsubstr.rs @@ -229,7 +229,7 @@ fn regexp_substr_inner( }; // Check for 'e' flag and set group_num to 1 if not provided - let group_num = if flags.map_or(false, |f| f.contains('e')) { + let group_num = if flags.is_some_and(|f| f.contains('e')) { group_num.or(Some(1)) } else { group_num