diff --git a/crates/embucket-functions/src/datetime/date_part_extract.rs b/crates/embucket-functions/src/datetime/date_part_extract.rs index f75c2a52b..722b5940f 100644 --- a/crates/embucket-functions/src/datetime/date_part_extract.rs +++ b/crates/embucket-functions/src/datetime/date_part_extract.rs @@ -451,6 +451,7 @@ fn calculate_year_of_week(date: NaiveDate, week_start: usize, week_of_year_polic #[cfg(test)] mod tests { use super::*; + use datafusion::prelude::SessionContext; use datafusion_common::assert_batches_eq; use datafusion_expr::ScalarUDF; diff --git a/crates/embucket-functions/src/regexp/mod.rs b/crates/embucket-functions/src/regexp/mod.rs index 5109d4b52..baeff0c82 100644 --- a/crates/embucket-functions/src/regexp/mod.rs +++ b/crates/embucket-functions/src/regexp/mod.rs @@ -1,14 +1,19 @@ pub mod errors; pub mod regexp_instr; +mod regexp_substr; use crate::regexp::regexp_instr::RegexpInstrFunc; +use crate::regexp::regexp_substr::RegexpSubstrFunc; use datafusion_expr::ScalarUDF; use datafusion_expr::registry::FunctionRegistry; pub use errors::Error; use std::sync::Arc; pub fn register_udfs(registry: &mut dyn FunctionRegistry) -> datafusion_common::Result<()> { - let functions: Vec> = vec![Arc::new(ScalarUDF::from(RegexpInstrFunc::new()))]; + let functions: Vec> = vec![ + Arc::new(ScalarUDF::from(RegexpInstrFunc::new())), + Arc::new(ScalarUDF::from(RegexpSubstrFunc::new())), + ]; for func in functions { registry.register_udf(func)?; } diff --git a/crates/embucket-functions/src/regexp/regexp_instr.rs b/crates/embucket-functions/src/regexp/regexp_instr.rs index e9d19a3a3..fa65daad3 100644 --- a/crates/embucket-functions/src/regexp/regexp_instr.rs +++ b/crates/embucket-functions/src/regexp/regexp_instr.rs @@ -239,7 +239,7 @@ impl RegexpInstrFunc { } other => regexp_errors::UnsupportedInputTypeWithPositionSnafu { data_type: other.data_type(), - position: 6usize, + position: 7usize, } .fail(), }, diff --git 
a/crates/embucket-functions/src/regexp/regexp_substr.rs b/crates/embucket-functions/src/regexp/regexp_substr.rs
new file mode 100644
index 000000000..4b25fc370
--- /dev/null
+++ b/crates/embucket-functions/src/regexp/regexp_substr.rs
@@ -0,0 +1,336 @@
+use super::errors as regexp_errors;
+use crate::utils::{pattern_to_regex, regexp};
+use datafusion::arrow::array::{StringArray, StringBuilder};
+use datafusion::arrow::datatypes::DataType;
+use datafusion::error::Result as DFResult;
+use datafusion::logical_expr::{
+    ColumnarValue, Signature, TypeSignature, TypeSignatureClass, Volatility,
+};
+use datafusion_common::ScalarValue;
+use datafusion_common::arrow::array::Array;
+use datafusion_common::cast::as_generic_string_array;
+use datafusion_common::types::logical_string;
+use datafusion_expr::{Coercion, ScalarFunctionArgs, ScalarUDFImpl};
+use snafu::ResultExt;
+use std::any::Any;
+use std::fmt::Debug;
+use std::sync::Arc;
+
+/// `REGEXP_SUBSTR` function implementation
+///
+/// Returns the substring of the subject that matches the regular expression pattern.
+/// If no match is found, returns `NULL`.
+///
+/// Syntax: `REGEXP_SUBSTR( <subject> , <pattern> [ , <position> [ , <occurrence> [ , <regex_params> [ , <group_num> ] ] ] ] )`
+///
+/// Arguments:
+///
+/// `Required`:
+/// - `<subject>` the string to search for matches.
+/// - `<pattern>` pattern to match.
+///
+/// `Optional`:
+/// - `<position>` number of characters from the beginning of the string where the function starts searching for matches.
+///   Default: `1` (the search for a match starts at the first character on the left)
+/// - `<occurrence>` specifies which occurrence of the pattern to return.
+///   The function skips the first `occurrence - 1` matches. For example, if there are 5 matches and you specify 3 for the occurrence argument,
+///   the function ignores the first two matches and returns the third one.
+///   Default: `1`
+/// - `<regex_params>` String of one or more characters that specifies the parameters used for searching for matches.
+///   Supported values:
+///   ---------------------------------------------------------------------------
+///   | Parameter       | Description                               |
+///   |-----------------|-------------------------------------------|
+///   | c               | Case-sensitive matching                   |
+///   | i               | Case-insensitive matching                 |
+///   | m               | Multi-line mode                           |
+///   | e               | Extract submatches                        |
+///   | s               | POSIX wildcard character `.` matches `\n` |
+///   ---------------------------------------------------------------------------
+///   Default: `c`
+/// - `<group_num>` the `group_num` parameter specifies which group to extract.
+///   Groups are specified by using parentheses in the regular expression.
+///   If a `group_num` is specified, it allows extraction even if the e option was not also specified.
+///   The e option is implied.
+///
+/// Example: `REGEXP_SUBSTR('nevermore1, nevermore2, nevermore3.', 'nevermore')`
+#[derive(Debug)]
+pub struct RegexpSubstrFunc {
+    signature: Signature,
+}
+
+impl Default for RegexpSubstrFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RegexpSubstrFunc {
+    /// Creates the function with one signature per supported argument count (two to six).
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                    ]),
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Integer),
+                    ]),
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Integer),
+                        Coercion::new_exact(TypeSignatureClass::Integer),
+                    ]),
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Integer),
+                        Coercion::new_exact(TypeSignatureClass::Integer),
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                    ]),
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Integer),
+                        Coercion::new_exact(TypeSignatureClass::Integer),
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Integer),
+                    ]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+
+    /// Reads the optional `position`, `occurrence`, `regex_parameters` and `group_num`
+    /// arguments (`args[2..=5]`), falling back to the defaults for the omitted ones.
+    ///
+    /// Returns `(position, occurrence, regex_parameters, group_num)`; `position` and
+    /// `occurrence` are converted from the one-based SQL values to zero-based offsets.
+    ///
+    /// # Errors
+    ///
+    /// Fails when an argument is not a non-null scalar of the expected type, when `position`
+    /// or `occurrence` is less than `1`, when `group_num` is negative, or when
+    /// `regex_parameters` is non-empty and contains none of the supported characters.
+    // The four arguments are validated inline, which makes the function long but linear.
+    #[allow(clippy::too_many_lines)]
+    fn take_args_values(args: &[ColumnarValue]) -> DFResult<(usize, usize, &str, usize)> {
+        let position = args.get(2).map_or_else(
+            || Ok(0),
+            |value| match value {
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) if 1 <= *value => {
+                    usize::try_from(*value - 1)
+                        .context(regexp_errors::InvalidIntegerConversionSnafu)
+                }
+                // `position` is one-based, so zero is rejected together with negative values.
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) => {
+                    regexp_errors::WrongArgValueSnafu {
+                        got: value.to_string(),
+                        reason: "Position must be positive".to_string(),
+                    }
+                    .fail()
+                }
+                other => regexp_errors::UnsupportedInputTypeWithPositionSnafu {
+                    data_type: other.data_type(),
+                    position: 3usize,
+                }
+                .fail(),
+            },
+        )?;
+
+        let occurrence = args.get(3).map_or_else(
+            || Ok(0),
+            |value| match value {
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) if 1 <= *value => {
+                    usize::try_from(*value - 1)
+                        .context(regexp_errors::InvalidIntegerConversionSnafu)
+                }
+                // `occurrence` is one-based, so zero is rejected together with negative values.
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) => {
+                    regexp_errors::WrongArgValueSnafu {
+                        got: value.to_string(),
+                        reason: "Occurrence must be positive".to_string(),
+                    }
+                    .fail()
+                }
+                other => regexp_errors::UnsupportedInputTypeWithPositionSnafu {
+                    data_type: other.data_type(),
+                    position: 4usize,
+                }
+                .fail(),
+            },
+        )?;
+
+        let regex_parameters = args.get(4).map_or_else(
+            || Ok("c"),
+            |value| match value {
+                ColumnarValue::Scalar(
+                    ScalarValue::Utf8(Some(value))
+                    | ScalarValue::Utf8View(Some(value))
+                    | ScalarValue::LargeUtf8(Some(value)),
+                ) if value.contains(['c', 'i', 'm', 'e', 's']) => Ok(value.as_str()),
+                ColumnarValue::Scalar(
+                    ScalarValue::Utf8(Some(value))
+                    | ScalarValue::Utf8View(Some(value))
+                    | ScalarValue::LargeUtf8(Some(value)),
+                ) if value.is_empty() => Ok("c"),
+                ColumnarValue::Scalar(
+                    ScalarValue::Utf8(Some(value))
+                    | ScalarValue::Utf8View(Some(value))
+                    | ScalarValue::LargeUtf8(Some(value)),
+                ) => regexp_errors::WrongArgValueSnafu {
+                    got: value.to_string(),
+                    // The empty value is handled above, so there is a first character to report.
+                    // It is read as a `char`: slicing the first byte panics on multi-byte input.
+                    reason: format!(
+                        "Unknown parameter: '{}'",
+                        value.chars().next().map(String::from).unwrap_or_default()
+                    ),
+                }
+                .fail(),
+                other => regexp_errors::UnsupportedInputTypeWithPositionSnafu {
+                    data_type: other.data_type(),
+                    position: 5usize,
+                }
+                .fail(),
+            },
+        )?;
+
+        let group_num = args.get(5).map_or_else(
+            || {
+                // Without an explicit `group_num` the `e` parameter selects the first group.
+                if regex_parameters.contains('e') {
+                    Ok(1)
+                } else {
+                    Ok(0)
+                }
+            },
+            |value| match value {
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) if 0 <= *value => {
+                    usize::try_from(*value)
+                        .context(regexp_errors::InvalidIntegerConversionSnafu)
+                }
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) => {
+                    regexp_errors::WrongArgValueSnafu {
+                        got: value.to_string(),
+                        reason: "Capture group must be non-negative".to_string(),
+                    }
+                    .fail()
+                }
+                other => regexp_errors::UnsupportedInputTypeWithPositionSnafu {
+                    data_type: other.data_type(),
+                    position: 6usize,
+                }
+                .fail(),
+            },
+        )?;
+
+        Ok((position, occurrence, regex_parameters, group_num))
+    }
+}
+
+impl ScalarUDFImpl for RegexpSubstrFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &'static str {
+        "regexp_substr"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// Returns `Utf8` for two to six arguments: the result is the matched substring.
+    ///
+    /// # Errors
+    ///
+    /// Fails when fewer than two or more than six arguments are supplied.
+    fn return_type(&self, arg_types: &[DataType]) -> DFResult<DataType> {
+        match arg_types.len() {
+            // Both `<subject>` and `<pattern>` are required.
+            n @ (0 | 1) => regexp_errors::NotEnoughArgumentsSnafu {
+                got: n,
+                at_least: 2usize,
+            }
+            .fail()?,
+            2..=6 => Ok(DataType::Utf8),
+            n => regexp_errors::TooManyArgumentsSnafu {
+                got: n,
+                at_maximum: 6usize,
+            }
+            .fail()?,
+        }
+    }
+
+    /// Evaluates `REGEXP_SUBSTR` for every value of `<subject>`.
+    ///
+    /// `<pattern>` and the optional arguments must be non-null scalars. A row yields `NULL`
+    /// when `regexp` produces no captures for it, when the requested occurrence does not
+    /// exist, or when the requested capture group is absent from the match.
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> DFResult<ColumnarValue> {
+        // The signature guarantees that `<subject>` and `<pattern>` are present.
+        let array = match &args.args[0] {
+            ColumnarValue::Array(array) => Arc::clone(array),
+            ColumnarValue::Scalar(scalar) => scalar.to_array()?,
+        };
+
+        let pattern = match &args.args[1] {
+            ColumnarValue::Scalar(
+                ScalarValue::Utf8(Some(pattern))
+                | ScalarValue::LargeUtf8(Some(pattern))
+                | ScalarValue::Utf8View(Some(pattern)),
+            ) => pattern,
+            other => {
+                return regexp_errors::UnsupportedInputTypeWithPositionSnafu {
+                    data_type: other.data_type(),
+                    position: 2usize,
+                }
+                .fail()?;
+            }
+        };
+
+        let (position, occurrence, regex_parameters, group_num) =
+            Self::take_args_values(&args.args)?;
+
+        // `regexp` takes a `StringArray`, so the other string layouts are cast to `Utf8`
+        // first; downcasting them directly would fail.
+        let array = match array.data_type() {
+            DataType::Utf8 => Arc::clone(&array),
+            DataType::LargeUtf8 | DataType::Utf8View => {
+                datafusion::arrow::compute::cast(&array, &DataType::Utf8)?
+            }
+            other => {
+                return regexp_errors::UnsupportedInputTypeWithPositionSnafu {
+                    position: 1usize,
+                    data_type: other.clone(),
+                }
+                .fail()?;
+            }
+        };
+        let string_array: &StringArray = as_generic_string_array(&array)?;
+        let regex = pattern_to_regex(pattern, regex_parameters)
+            .context(regexp_errors::UnsupportedRegexSnafu)?;
+
+        //TODO: Or data_capacity: 1024
+        let mut result_array =
+            StringBuilder::with_capacity(string_array.len(), string_array.len() * 10);
+        regexp(string_array, &regex, position).for_each(|opt_iter| {
+            result_array.append_option(opt_iter.and_then(|mut cap_iter| {
+                // `group_num == 0` selects the whole match.
+                cap_iter
+                    .nth(occurrence)
+                    .and_then(|cap| cap.get(group_num).map(|mat| mat.as_str()))
+            }));
+        });
+
+        Ok(ColumnarValue::Array(Arc::new(result_array.finish())))
+    }
+}
diff --git a/crates/embucket-functions/src/tests/regexp/mod.rs b/crates/embucket-functions/src/tests/regexp/mod.rs index 72c9db3cc..7dd4d6172 100644 --- a/crates/embucket-functions/src/tests/regexp/mod.rs +++ b/crates/embucket-functions/src/tests/regexp/mod.rs @@ -1 +1,2 @@ mod regexp_instr; +mod regexp_substr; diff --git a/crates/embucket-functions/src/tests/regexp/regexp_substr.rs b/crates/embucket-functions/src/tests/regexp/regexp_substr.rs new file mode 100644 index 000000000..3ee83a7b2 --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/regexp_substr.rs @@ -0,0 +1,85 @@ +use crate::test_query; + +test_query!( + regexp_substr_basic_scalar, + "SELECT REGEXP_SUBSTR('nevermore1, nevermore2, nevermore3.', 'nevermore')", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_basic_column, + "SELECT REGEXP_SUBSTR(column1, 'the\\W+\\w+') + FROM VALUES ('It was the best of times, it was the worst of times.'), + ('In the string the extra spaces are redundant.'), + ('A thespian theater is nearby.')", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_occurrence, + "SELECT REGEXP_SUBSTR(column1, 'the\\W+\\w+', 1, 2) + FROM VALUES ('It was the best of times, it was the worst of times.'), + ('In the string the extra spaces are redundant.'), + ('A thespian theater is nearby.')", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_group_num, + "SELECT REGEXP_SUBSTR(column1, 'the\\W+(\\w+)', 1, 2, 'e', 1) + FROM VALUES ('It was the best of times, it was the worst of times.'), + ('In the string the extra spaces are redundant.'), + ('A thespian theater is nearby.')", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_word_groups, + "SELECT REGEXP_SUBSTR(column1,
'A\\W+(\\w+)', 1, 1, 'e', 1), + REGEXP_SUBSTR(column1, 'A\\W+(\\w+)', 1, 2, 'e', 1), + REGEXP_SUBSTR(column1, 'A\\W+(\\w+)', 1, 3, 'e', 1), + REGEXP_SUBSTR(column1, 'A\\W+(\\w+)', 1, 4, 'e', 1) + FROM VALUES ('A MAN A PLAN A CANAL')", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_letter_groups, + "SELECT REGEXP_SUBSTR(column1, 'A\\W+(\\w)(\\w)(\\w)', 1, 1, 'e', 1), + REGEXP_SUBSTR(column1, 'A\\W+(\\w)(\\w)(\\w)', 1, 1, 'e', 2), + REGEXP_SUBSTR(column1, 'A\\W+(\\w)(\\w)(\\w)', 1, 1, 'e', 3) + FROM VALUES ('A MAN A PLAN A CANAL')", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_word_boundary, + "SELECT REGEXP_SUBSTR('It was the best of times, it was the worst of times','\\bwas\\b', 1, 1)", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_regex_patterns_1, + "SELECT REGEXP_SUBSTR('It was the best of times, it was the worst of times', '[[:alpha:]]{2,}st', 15, 1)", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_regex_patterns_2, + "SELECT REGEXP_SUBSTR(column1, '\\b\\S*o\\S*\\b') + FROM VALUES ('Hellooo World'), + ('How are you doing today?'), + ('the quick brown fox jumps over the lazy dog'), + ('PACK MY BOX WITH FIVE DOZEN LIQUOR JUGS')", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_regex_patterns_3, + "SELECT REGEXP_SUBSTR(column1, '\\b\\S*o\\S*\\b', 3, 3, 'i') + FROM VALUES ('Hellooo World'), + ('How are you doing today?'), + ('the quick brown fox jumps over the lazy dog'), + ('PACK MY BOX WITH FIVE DOZEN LIQUOR JUGS')", + snapshot_path = "regexp_substr" +); diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_column.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_column.snap new file mode 100644 index 000000000..c4d3ac70f --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_column.snap @@ 
-0,0 +1,15 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR(column1, 'the\\\\W+\\\\w+')\n FROM VALUES ('It was the best of times, it was the worst of times.'),\n ('In the string the extra spaces are redundant.'),\n ('A thespian theater is nearby.')\"" +--- +Ok( + [ + "+------------------------------------------+", + "| regexp_substr(column1,Utf8(\"the\\W+\\w+\")) |", + "+------------------------------------------+", + "| the best |", + "| the string |", + "| |", + "+------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap new file mode 100644 index 000000000..310a51b42 --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap @@ -0,0 +1,13 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR('nevermore1, nevermore2, nevermore3.', 'nevermore')\"" +--- +Ok( + [ + "+------------------------------------------------------------------------------+", + "| regexp_substr(Utf8(\"nevermore1, nevermore2, nevermore3.\"),Utf8(\"nevermore\")) |", + "+------------------------------------------------------------------------------+", + "| nevermore |", + "+------------------------------------------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_group_num.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_group_num.snap new file mode 100644 index 000000000..f4a4269f5 --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_group_num.snap @@ -0,0 +1,15 @@ +--- +source: 
crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR(column1, 'the\\\\W+(\\\\w+)', 1, 2, 'e', 1)\n FROM VALUES ('It was the best of times, it was the worst of times.'),\n ('In the string the extra spaces are redundant.'),\n ('A thespian theater is nearby.')\"" +--- +Ok( + [ + "+---------------------------------------------------------------------------------+", + "| regexp_substr(column1,Utf8(\"the\\W+(\\w+)\"),Int64(1),Int64(2),Utf8(\"e\"),Int64(1)) |", + "+---------------------------------------------------------------------------------+", + "| worst |", + "| extra |", + "| |", + "+---------------------------------------------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_letter_groups.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_letter_groups.snap new file mode 100644 index 000000000..d0fd5469c --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_letter_groups.snap @@ -0,0 +1,13 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w)(\\\\w)(\\\\w)', 1, 1, 'e', 1),\n REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w)(\\\\w)(\\\\w)', 1, 1, 'e', 2),\n REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w)(\\\\w)(\\\\w)', 1, 1, 'e', 3)\n FROM VALUES ('A MAN A PLAN A CANAL')\"" +--- +Ok( + [ + "+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+", + "| regexp_substr(column1,Utf8(\"A\\W+(\\w)(\\w)(\\w)\"),Int64(1),Int64(1),Utf8(\"e\"),Int64(1)) | regexp_substr(column1,Utf8(\"A\\W+(\\w)(\\w)(\\w)\"),Int64(1),Int64(1),Utf8(\"e\"),Int64(2)) | 
regexp_substr(column1,Utf8(\"A\\W+(\\w)(\\w)(\\w)\"),Int64(1),Int64(1),Utf8(\"e\"),Int64(3)) |", + "+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+", + "| M | A | N |", + "+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_occurrence.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_occurrence.snap new file mode 100644 index 000000000..7d184b765 --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_occurrence.snap @@ -0,0 +1,15 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR(column1, 'the\\\\W+\\\\w+', 1, 2)\n FROM VALUES ('It was the best of times, it was the worst of times.'),\n ('In the string the extra spaces are redundant.'),\n ('A thespian theater is nearby.')\"" +--- +Ok( + [ + "+------------------------------------------------------------+", + "| regexp_substr(column1,Utf8(\"the\\W+\\w+\"),Int64(1),Int64(2)) |", + "+------------------------------------------------------------+", + "| the worst |", + "| the extra |", + "| |", + "+------------------------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_1.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_1.snap new file mode 100644 index 
000000000..7433e5e65 --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_1.snap @@ -0,0 +1,13 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR('It was the best of times, it was the worst of times', '[[:alpha:]]{2,}st', 15, 1)\"" +--- +Ok( + [ + "+-------------------------------------------------------------------------------------------------------------------------+", + "| regexp_substr(Utf8(\"It was the best of times, it was the worst of times\"),Utf8(\"[[:alpha:]]{2,}st\"),Int64(15),Int64(1)) |", + "+-------------------------------------------------------------------------------------------------------------------------+", + "| worst |", + "+-------------------------------------------------------------------------------------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_2.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_2.snap new file mode 100644 index 000000000..0a2d587d9 --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_2.snap @@ -0,0 +1,16 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR(column1, '\\\\b\\\\S*o\\\\S*\\\\b')\n FROM VALUES ('Hellooo World'),\n ('How are you doing today?'),\n ('the quick brown fox jumps over the lazy dog'),\n ('PACK MY BOX WITH FIVE DOZEN LIQUOR JUGS')\"" +--- +Ok( + [ + "+--------------------------------------------+", + "| regexp_substr(column1,Utf8(\"\\b\\S*o\\S*\\b\")) |", + "+--------------------------------------------+", + "| Hellooo |", + "| How |", + "| brown |", + "| |", + "+--------------------------------------------+", + ], +) diff --git 
a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_3.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_3.snap new file mode 100644 index 000000000..d60f3722f --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_3.snap @@ -0,0 +1,16 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR(column1, '\\\\b\\\\S*o\\\\S*\\\\b', 3, 3, 'i')\n FROM VALUES ('Hellooo World'),\n ('How are you doing today?'),\n ('the quick brown fox jumps over the lazy dog'),\n ('PACK MY BOX WITH FIVE DOZEN LIQUOR JUGS')\"" +--- +Ok( + [ + "+------------------------------------------------------------------------+", + "| regexp_substr(column1,Utf8(\"\\b\\S*o\\S*\\b\"),Int64(3),Int64(3),Utf8(\"i\")) |", + "+------------------------------------------------------------------------+", + "| |", + "| today |", + "| over |", + "| LIQUOR |", + "+------------------------------------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_boundary.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_boundary.snap new file mode 100644 index 000000000..4a1cc93ab --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_boundary.snap @@ -0,0 +1,13 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR('It was the best of times, it was the worst of times','\\\\bwas\\\\b', 1, 1)\"" +--- +Ok( + [ + "+--------------------------------------------------------------------------------------------------------------+", + "| regexp_substr(Utf8(\"It was the best of times, it was the worst of 
times\"),Utf8(\"\\bwas\\b\"),Int64(1),Int64(1)) |", + "+--------------------------------------------------------------------------------------------------------------+", + "| was |", + "+--------------------------------------------------------------------------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_groups.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_groups.snap new file mode 100644 index 000000000..0c697e589 --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_groups.snap @@ -0,0 +1,13 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w+)', 1, 1, 'e', 1),\n REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w+)', 1, 2, 'e', 1),\n REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w+)', 1, 3, 'e', 1),\n REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w+)', 1, 4, 'e', 1)\n FROM VALUES ('A MAN A PLAN A CANAL')\"" +--- +Ok( + [ + "+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+", + "| regexp_substr(column1,Utf8(\"A\\W+(\\w+)\"),Int64(1),Int64(1),Utf8(\"e\"),Int64(1)) | regexp_substr(column1,Utf8(\"A\\W+(\\w+)\"),Int64(1),Int64(2),Utf8(\"e\"),Int64(1)) | regexp_substr(column1,Utf8(\"A\\W+(\\w+)\"),Int64(1),Int64(3),Utf8(\"e\"),Int64(1)) | regexp_substr(column1,Utf8(\"A\\W+(\\w+)\"),Int64(1),Int64(4),Utf8(\"e\"),Int64(1)) |", + 
"+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+", + "| MAN | PLAN | CANAL | |", + "+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv b/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv index bb3876eee..cbf495a07 100644 --- a/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv +++ b/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv @@ -441,4 +441,4 @@ variant_element version week year -zeroifnull \ No newline at end of file +zeroifnull