From 676deff274125c823d1ba498bb598ea751557737 Mon Sep 17 00:00:00 2001 From: DanCodedThis Date: Wed, 6 Aug 2025 19:45:28 +0300 Subject: [PATCH 1/3] regexp_substr impl --- .../src/datetime/date_part_extract.rs | 1 + crates/embucket-functions/src/regexp/mod.rs | 7 +- .../src/regexp/regexp_instr.rs | 2 +- .../src/regexp/regexp_substr.rs | 265 ++++++++++++++++++ .../src/tests/regexp/mod.rs | 1 + .../src/tests/regexp/regexp_substr.rs | 85 ++++++ .../query_regexp_substr_basic_column.snap | 15 + .../query_regexp_substr_basic_scalar.snap | 13 + .../query_regexp_substr_group_num.snap | 15 + .../query_regexp_substr_letter_groups.snap | 13 + .../query_regexp_substr_occurrence.snap | 15 + .../query_regexp_substr_regex_patterns_1.snap | 13 + .../query_regexp_substr_regex_patterns_2.snap | 16 ++ .../query_regexp_substr_regex_patterns_3.snap | 16 ++ .../query_regexp_substr_word_boundary.snap | 13 + .../query_regexp_substr_word_groups.snap | 13 + .../generated_snowflake_functions.rs | 106 +++++++ .../helper/implemented_functions.csv | 21 +- 18 files changed, 608 insertions(+), 22 deletions(-) create mode 100644 crates/embucket-functions/src/regexp/regexp_substr.rs create mode 100644 crates/embucket-functions/src/tests/regexp/regexp_substr.rs create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_column.snap create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_group_num.snap create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_letter_groups.snap create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_occurrence.snap create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_1.snap create 
mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_2.snap create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_3.snap create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_boundary.snap create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_groups.snap diff --git a/crates/embucket-functions/src/datetime/date_part_extract.rs b/crates/embucket-functions/src/datetime/date_part_extract.rs index f75c2a52b..722b5940f 100644 --- a/crates/embucket-functions/src/datetime/date_part_extract.rs +++ b/crates/embucket-functions/src/datetime/date_part_extract.rs @@ -451,6 +451,7 @@ fn calculate_year_of_week(date: NaiveDate, week_start: usize, week_of_year_polic #[cfg(test)] mod tests { use super::*; + use datafusion::prelude::SessionContext; use datafusion_common::assert_batches_eq; use datafusion_expr::ScalarUDF; diff --git a/crates/embucket-functions/src/regexp/mod.rs b/crates/embucket-functions/src/regexp/mod.rs index 5109d4b52..baeff0c82 100644 --- a/crates/embucket-functions/src/regexp/mod.rs +++ b/crates/embucket-functions/src/regexp/mod.rs @@ -1,14 +1,19 @@ pub mod errors; pub mod regexp_instr; +mod regexp_substr; use crate::regexp::regexp_instr::RegexpInstrFunc; +use crate::regexp::regexp_substr::RegexpSubstrFunc; use datafusion_expr::ScalarUDF; use datafusion_expr::registry::FunctionRegistry; pub use errors::Error; use std::sync::Arc; pub fn register_udfs(registry: &mut dyn FunctionRegistry) -> datafusion_common::Result<()> { - let functions: Vec<Arc<ScalarUDF>> = vec![Arc::new(ScalarUDF::from(RegexpInstrFunc::new()))]; + let functions: Vec<Arc<ScalarUDF>> = vec![ + Arc::new(ScalarUDF::from(RegexpInstrFunc::new())), + Arc::new(ScalarUDF::from(RegexpSubstrFunc::new())), + ]; for func in functions { registry.register_udf(func)?; } diff --git 
a/crates/embucket-functions/src/regexp/regexp_instr.rs b/crates/embucket-functions/src/regexp/regexp_instr.rs index e9d19a3a3..fa65daad3 100644 --- a/crates/embucket-functions/src/regexp/regexp_instr.rs +++ b/crates/embucket-functions/src/regexp/regexp_instr.rs @@ -239,7 +239,7 @@ impl RegexpInstrFunc { } other => regexp_errors::UnsupportedInputTypeWithPositionSnafu { data_type: other.data_type(), - position: 6usize, + position: 7usize, } .fail(), }, diff --git a/crates/embucket-functions/src/regexp/regexp_substr.rs b/crates/embucket-functions/src/regexp/regexp_substr.rs new file mode 100644 index 000000000..6c91415ba --- /dev/null +++ b/crates/embucket-functions/src/regexp/regexp_substr.rs @@ -0,0 +1,265 @@ +use super::errors as regexp_errors; +use crate::utils::{pattern_to_regex, regexp}; +use datafusion::arrow::array::{StringArray, StringBuilder}; +use datafusion::arrow::datatypes::DataType; +use datafusion::error::Result as DFResult; +use datafusion::logical_expr::{ + ColumnarValue, Signature, TypeSignature, TypeSignatureClass, Volatility, +}; +use datafusion_common::ScalarValue; +use datafusion_common::arrow::array::Array; +use datafusion_common::cast::as_generic_string_array; +use datafusion_common::types::logical_string; +use datafusion_expr::{Coercion, ScalarFunctionArgs, ScalarUDFImpl}; +use snafu::ResultExt; +use std::any::Any; +use std::fmt::Debug; +use std::sync::Arc; + +//TODO: Docs +#[derive(Debug)] +pub struct RegexpSubstrFunc { + signature: Signature, +} + +impl Default for RegexpSubstrFunc { + fn default() -> Self { + Self::new() + } +} + +impl RegexpSubstrFunc { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + ]), + TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + 
Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Integer), + ]), + TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Integer), + Coercion::new_exact(TypeSignatureClass::Integer), + ]), + TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Integer), + Coercion::new_exact(TypeSignatureClass::Integer), + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + ]), + TypeSignature::Coercible(vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Integer), + Coercion::new_exact(TypeSignatureClass::Integer), + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Integer), + ]), + ], + Volatility::Immutable, + ), + } + } + #[allow(clippy::too_many_lines, clippy::unwrap_used)] + fn take_args_values(args: &[ColumnarValue]) -> DFResult<(usize, usize, &str, usize)> { + let position = args.get(2).map_or_else( + || Ok(0), + |value| match value { + ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) if 0 <= *value => { + usize::try_from(*value - 1) + .context(regexp_errors::InvalidIntegerConversionSnafu) + } + ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) if 0 > *value => { + regexp_errors::WrongArgValueSnafu { + got: value.to_string(), + reason: "Position must be positive".to_string(), + } + .fail() + } + other => regexp_errors::UnsupportedInputTypeWithPositionSnafu { + data_type: other.data_type(), + position: 3usize, + } + .fail(), + }, + )?; + + let occurrence = args.get(3).map_or_else( + || 
Ok(0), + |value| match value { + ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) if 0 <= *value => { + usize::try_from(*value - 1) + .context(crate::regexp::errors::InvalidIntegerConversionSnafu) + } + ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) if 0 > *value => { + regexp_errors::WrongArgValueSnafu { + got: value.to_string(), + reason: "Occurrence must be positive".to_string(), + } + .fail() + } + other => regexp_errors::UnsupportedInputTypeWithPositionSnafu { + data_type: other.data_type(), + position: 4usize, + } + .fail(), + }, + )?; + + let regexp_parameters = args.get(4).map_or_else( + || Ok("c"), + |value| match value { + ColumnarValue::Scalar( + ScalarValue::Utf8(Some(value)) + | ScalarValue::Utf8View(Some(value)) + | ScalarValue::LargeUtf8(Some(value)), + ) if value.contains(['c', 'i', 'm', 'e', 's']) => Ok(value), + ColumnarValue::Scalar( + ScalarValue::Utf8(Some(value)) + | ScalarValue::Utf8View(Some(value)) + | ScalarValue::LargeUtf8(Some(value)), + ) if value.is_empty() => Ok("c"), + ColumnarValue::Scalar( + ScalarValue::Utf8(Some(value)) + | ScalarValue::Utf8View(Some(value)) + | ScalarValue::LargeUtf8(Some(value)), + ) => regexp_errors::WrongArgValueSnafu { + got: value.to_string(), + //We just checked if value is empty, if not - this is valid, since we are getting here the excluded range so just the zeroes character + reason: format!("Unknown parameter: '{}'", value.get(0..1).unwrap()), + } + .fail(), + other => regexp_errors::UnsupportedInputTypeWithPositionSnafu { + data_type: other.data_type(), + position: 5usize, + } + .fail(), + }, + )?; + + let group_num = args.get(5).map_or_else( + || { + if regexp_parameters.contains('e') { + Ok(1) + } else { + Ok(0) + } + }, + |value| match value { + ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) if 0 <= *value => { + usize::try_from(*value) + .context(crate::regexp::errors::InvalidIntegerConversionSnafu) + } + ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) if 0 > *value => 
{ + crate::regexp::errors::WrongArgValueSnafu { + got: value.to_string(), + reason: "Capture group mustbe non-negative".to_string(), + } + .fail() + } + other => crate::regexp::errors::UnsupportedInputTypeWithPositionSnafu { + data_type: other.data_type(), + position: 6usize, + } + .fail(), + }, + )?; + + Ok((position, occurrence, regexp_parameters, group_num)) + } +} + +impl ScalarUDFImpl for RegexpSubstrFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &'static str { + "regexp_substr" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> DFResult<DataType> { + match arg_types.len() { + 0 => regexp_errors::NotEnoughArgumentsSnafu { + got: 0usize, + at_least: 2usize, + } + .fail()?, + //Return type specified as Number, probably an `Integer` which is an alias to `Number(38, 0)`, + // we return `Int64` for better internal DF compatibility + n if 7 > n && 1 < n => Ok(DataType::Utf8), + n => regexp_errors::TooManyArgumentsSnafu { + got: n, + at_maximum: 6usize, + } + .fail()?, + } + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> DFResult<ColumnarValue> { + //Already checked that it's at least > 1 + let subject = &args.args[0]; + let array = match subject { + ColumnarValue::Array(array) => array, + //Can't fail (shouldn't) + ColumnarValue::Scalar(scalar) => &scalar.to_array()?, + }; + + //Already checked that it's at least > 1 + let pattern = match &args.args[1] { + ColumnarValue::Scalar( + ScalarValue::Utf8(Some(pattern)) + | ScalarValue::LargeUtf8(Some(pattern)) + | ScalarValue::Utf8View(Some(pattern)), + ) => pattern, + other => { + return regexp_errors::UnsupportedInputTypeWithPositionSnafu { + data_type: other.data_type(), + position: 2usize, + } + .fail()?; + } + }; + + let (position, occurrence, regexp_parameters, group_num) = + Self::take_args_values(&args.args)?; + + //TODO: Or data_capacity: 1024 + let mut result_array = StringBuilder::with_capacity(array.len(), array.len() * 10); + 
+ match array.data_type() { + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => { + let string_array: &StringArray = as_generic_string_array(array)?; + let regex = pattern_to_regex(pattern, regexp_parameters) + .context(regexp_errors::UnsupportedRegexSnafu)?; + regexp(string_array, &regex, position).for_each(|opt_iter| { + result_array.append_option(opt_iter.and_then(|mut cap_iter| { + cap_iter.nth(occurrence).and_then(|cap| { + //group_num == 0, means get the whole match (seems docs in regex are incorrect) + cap.get(group_num).map(|mat| mat.as_str()) + }) + })); + }); + } + other => regexp_errors::UnsupportedInputTypeWithPositionSnafu { + position: 1usize, + data_type: other.clone(), + } + .fail()?, + } + + Ok(ColumnarValue::Array(Arc::new(result_array.finish()))) + } +} diff --git a/crates/embucket-functions/src/tests/regexp/mod.rs b/crates/embucket-functions/src/tests/regexp/mod.rs index 72c9db3cc..7dd4d6172 100644 --- a/crates/embucket-functions/src/tests/regexp/mod.rs +++ b/crates/embucket-functions/src/tests/regexp/mod.rs @@ -1 +1,2 @@ mod regexp_instr; +mod regexp_substr; diff --git a/crates/embucket-functions/src/tests/regexp/regexp_substr.rs b/crates/embucket-functions/src/tests/regexp/regexp_substr.rs new file mode 100644 index 000000000..4b2ca212b --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/regexp_substr.rs @@ -0,0 +1,85 @@ +use crate::test_query; + +test_query!( + regexp_substr_basic_scalar, + "SELECT REGEXP_SUBSTR('nevermore1, nevermore2, nevermore3.', 'nevermore\\d')", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_basic_column, + "SELECT REGEXP_SUBSTR(column1, 'the\\W+\\w+') + FROM VALUES ('It was the best of times, it was the worst of times.'), + ('In the string the extra spaces are redundant.'), + ('A thespian theater is nearby.')", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_occurrence, + "SELECT REGEXP_SUBSTR(column1, 'the\\W+\\w+', 1, 2) + FROM VALUES ('It was the best 
of times, it was the worst of times.'), + ('In the string the extra spaces are redundant.'), + ('A thespian theater is nearby.')", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_group_num, + "SELECT REGEXP_SUBSTR(column1, 'the\\W+(\\w+)', 1, 2, 'e', 1) + FROM VALUES ('It was the best of times, it was the worst of times.'), + ('In the string the extra spaces are redundant.'), + ('A thespian theater is nearby.')", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_word_groups, + "SELECT REGEXP_SUBSTR(column1, 'A\\W+(\\w+)', 1, 1, 'e', 1), + REGEXP_SUBSTR(column1, 'A\\W+(\\w+)', 1, 2, 'e', 1), + REGEXP_SUBSTR(column1, 'A\\W+(\\w+)', 1, 3, 'e', 1), + REGEXP_SUBSTR(column1, 'A\\W+(\\w+)', 1, 4, 'e', 1) + FROM VALUES ('A MAN A PLAN A CANAL')", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_letter_groups, + "SELECT REGEXP_SUBSTR(column1, 'A\\W+(\\w)(\\w)(\\w)', 1, 1, 'e', 1), + REGEXP_SUBSTR(column1, 'A\\W+(\\w)(\\w)(\\w)', 1, 1, 'e', 2), + REGEXP_SUBSTR(column1, 'A\\W+(\\w)(\\w)(\\w)', 1, 1, 'e', 3) + FROM VALUES ('A MAN A PLAN A CANAL')", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_word_boundary, + "SELECT REGEXP_SUBSTR('It was the best of times, it was the worst of times','\\bwas\\b', 1, 1)", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_regex_patterns_1, + "SELECT REGEXP_SUBSTR('It was the best of times, it was the worst of times', '[[:alpha:]]{2,}st', 15, 1)", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_regex_patterns_2, + "SELECT REGEXP_SUBSTR(column1, '\\b\\S*o\\S*\\b') + FROM VALUES ('Hellooo World'), + ('How are you doing today?'), + ('the quick brown fox jumps over the lazy dog'), + ('PACK MY BOX WITH FIVE DOZEN LIQUOR JUGS')", + snapshot_path = "regexp_substr" +); + +test_query!( + regexp_substr_regex_patterns_3, + "SELECT REGEXP_SUBSTR(column1, '\\b\\S*o\\S*\\b', 3, 3, 'i') + FROM VALUES ('Hellooo World'), + ('How are you 
doing today?'), + ('the quick brown fox jumps over the lazy dog'), + ('PACK MY BOX WITH FIVE DOZEN LIQUOR JUGS')", + snapshot_path = "regexp_substr" +); diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_column.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_column.snap new file mode 100644 index 000000000..c4d3ac70f --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_column.snap @@ -0,0 +1,15 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR(column1, 'the\\\\W+\\\\w+')\n FROM VALUES ('It was the best of times, it was the worst of times.'),\n ('In the string the extra spaces are redundant.'),\n ('A thespian theater is nearby.')\"" +--- +Ok( + [ + "+------------------------------------------+", + "| regexp_substr(column1,Utf8(\"the\\W+\\w+\")) |", + "+------------------------------------------+", + "| the best |", + "| the string |", + "| |", + "+------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap new file mode 100644 index 000000000..86f833e73 --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap @@ -0,0 +1,13 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR('nevermore1, nevermore2, nevermore3.', 'nevermore\\\\d')\"" +--- +Ok( + [ + "+--------------------------------------------------------------------------------+", + "| regexp_substr(Utf8(\"nevermore1, nevermore2, nevermore3.\"),Utf8(\"nevermore\\d\")) |", + 
"+--------------------------------------------------------------------------------+", + "| nevermore1 |", + "+--------------------------------------------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_group_num.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_group_num.snap new file mode 100644 index 000000000..f4a4269f5 --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_group_num.snap @@ -0,0 +1,15 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR(column1, 'the\\\\W+(\\\\w+)', 1, 2, 'e', 1)\n FROM VALUES ('It was the best of times, it was the worst of times.'),\n ('In the string the extra spaces are redundant.'),\n ('A thespian theater is nearby.')\"" +--- +Ok( + [ + "+---------------------------------------------------------------------------------+", + "| regexp_substr(column1,Utf8(\"the\\W+(\\w+)\"),Int64(1),Int64(2),Utf8(\"e\"),Int64(1)) |", + "+---------------------------------------------------------------------------------+", + "| worst |", + "| extra |", + "| |", + "+---------------------------------------------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_letter_groups.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_letter_groups.snap new file mode 100644 index 000000000..d0fd5469c --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_letter_groups.snap @@ -0,0 +1,13 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w)(\\\\w)(\\\\w)', 1, 1, 'e', 1),\n REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w)(\\\\w)(\\\\w)', 1, 1, 
'e', 2),\n REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w)(\\\\w)(\\\\w)', 1, 1, 'e', 3)\n FROM VALUES ('A MAN A PLAN A CANAL')\"" +--- +Ok( + [ + "+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+", + "| regexp_substr(column1,Utf8(\"A\\W+(\\w)(\\w)(\\w)\"),Int64(1),Int64(1),Utf8(\"e\"),Int64(1)) | regexp_substr(column1,Utf8(\"A\\W+(\\w)(\\w)(\\w)\"),Int64(1),Int64(1),Utf8(\"e\"),Int64(2)) | regexp_substr(column1,Utf8(\"A\\W+(\\w)(\\w)(\\w)\"),Int64(1),Int64(1),Utf8(\"e\"),Int64(3)) |", + "+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+", + "| M | A | N |", + "+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_occurrence.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_occurrence.snap new file mode 100644 index 000000000..7d184b765 --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_occurrence.snap @@ -0,0 +1,15 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR(column1, 'the\\\\W+\\\\w+', 1, 2)\n FROM VALUES ('It was the best of times, it was the worst of times.'),\n ('In the string the extra spaces are redundant.'),\n ('A thespian theater is nearby.')\"" +--- +Ok( + [ + 
"+------------------------------------------------------------+", + "| regexp_substr(column1,Utf8(\"the\\W+\\w+\"),Int64(1),Int64(2)) |", + "+------------------------------------------------------------+", + "| the worst |", + "| the extra |", + "| |", + "+------------------------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_1.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_1.snap new file mode 100644 index 000000000..7433e5e65 --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_1.snap @@ -0,0 +1,13 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR('It was the best of times, it was the worst of times', '[[:alpha:]]{2,}st', 15, 1)\"" +--- +Ok( + [ + "+-------------------------------------------------------------------------------------------------------------------------+", + "| regexp_substr(Utf8(\"It was the best of times, it was the worst of times\"),Utf8(\"[[:alpha:]]{2,}st\"),Int64(15),Int64(1)) |", + "+-------------------------------------------------------------------------------------------------------------------------+", + "| worst |", + "+-------------------------------------------------------------------------------------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_2.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_2.snap new file mode 100644 index 000000000..0a2d587d9 --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_2.snap @@ -0,0 +1,16 @@ +--- +source: 
crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR(column1, '\\\\b\\\\S*o\\\\S*\\\\b')\n FROM VALUES ('Hellooo World'),\n ('How are you doing today?'),\n ('the quick brown fox jumps over the lazy dog'),\n ('PACK MY BOX WITH FIVE DOZEN LIQUOR JUGS')\"" +--- +Ok( + [ + "+--------------------------------------------+", + "| regexp_substr(column1,Utf8(\"\\b\\S*o\\S*\\b\")) |", + "+--------------------------------------------+", + "| Hellooo |", + "| How |", + "| brown |", + "| |", + "+--------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_3.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_3.snap new file mode 100644 index 000000000..d60f3722f --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_3.snap @@ -0,0 +1,16 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR(column1, '\\\\b\\\\S*o\\\\S*\\\\b', 3, 3, 'i')\n FROM VALUES ('Hellooo World'),\n ('How are you doing today?'),\n ('the quick brown fox jumps over the lazy dog'),\n ('PACK MY BOX WITH FIVE DOZEN LIQUOR JUGS')\"" +--- +Ok( + [ + "+------------------------------------------------------------------------+", + "| regexp_substr(column1,Utf8(\"\\b\\S*o\\S*\\b\"),Int64(3),Int64(3),Utf8(\"i\")) |", + "+------------------------------------------------------------------------+", + "| |", + "| today |", + "| over |", + "| LIQUOR |", + "+------------------------------------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_boundary.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_boundary.snap new file mode 100644 index 
000000000..4a1cc93ab --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_boundary.snap @@ -0,0 +1,13 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR('It was the best of times, it was the worst of times','\\\\bwas\\\\b', 1, 1)\"" +--- +Ok( + [ + "+--------------------------------------------------------------------------------------------------------------+", + "| regexp_substr(Utf8(\"It was the best of times, it was the worst of times\"),Utf8(\"\\bwas\\b\"),Int64(1),Int64(1)) |", + "+--------------------------------------------------------------------------------------------------------------+", + "| was |", + "+--------------------------------------------------------------------------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_groups.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_groups.snap new file mode 100644 index 000000000..0c697e589 --- /dev/null +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_groups.snap @@ -0,0 +1,13 @@ +--- +source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs +description: "\"SELECT REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w+)', 1, 1, 'e', 1),\n REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w+)', 1, 2, 'e', 1),\n REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w+)', 1, 3, 'e', 1),\n REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w+)', 1, 4, 'e', 1)\n FROM VALUES ('A MAN A PLAN A CANAL')\"" +--- +Ok( + [ + "+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+", + "| 
regexp_substr(column1,Utf8(\"A\\W+(\\w+)\"),Int64(1),Int64(1),Utf8(\"e\"),Int64(1)) | regexp_substr(column1,Utf8(\"A\\W+(\\w+)\"),Int64(1),Int64(2),Utf8(\"e\"),Int64(1)) | regexp_substr(column1,Utf8(\"A\\W+(\\w+)\"),Int64(1),Int64(3),Utf8(\"e\"),Int64(1)) | regexp_substr(column1,Utf8(\"A\\W+(\\w+)\"),Int64(1),Int64(4),Utf8(\"e\"),Int64(1)) |", + "+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+", + "| MAN | PLAN | CANAL | |", + "+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/visitors/unimplemented/generated_snowflake_functions.rs b/crates/embucket-functions/src/visitors/unimplemented/generated_snowflake_functions.rs index 624651910..1b820a03d 100644 --- a/crates/embucket-functions/src/visitors/unimplemented/generated_snowflake_functions.rs +++ b/crates/embucket-functions/src/visitors/unimplemented/generated_snowflake_functions.rs @@ -643,6 +643,40 @@ pub const CONVERSION_FUNCTIONS: &[(&str, FunctionInfo)] = &[ .with_docs("https://docs.snowflake.com/en/sql-reference/functions/try_to_geometry") .with_subcategory("geospatial") ), + ("TRY_TO_TIMESTAMP", FunctionInfo::new( + "TRY_TO_TIMESTAMP", + "Converts an input expression into the corresponding timestamp." 
+ ) + .with_docs("https://docs.snowflake.com/en/sql-reference/functions/to_timestamp") + .with_subcategory("datetime") + ), + ("TRY_TO_TIMESTAMP", FunctionInfo::new( + "TRY_TO_TIMESTAMP", + "A special version of TO_TIMESTAMP / TO_TIMESTAMP_* that performs the same operation (i.e. converts an input expression into a timestamp), but with error-handling support (i.e. if the conversion cannot be performed, it returns a NULL value instead of raising an error)." + ) + .with_docs("https://docs.snowflake.com/en/sql-reference/functions/try_to_timestamp") + ), + ("TRY_TO_TIMESTAMP_LTZ", FunctionInfo::new( + "TRY_TO_TIMESTAMP_LTZ", + "Converts an input expression into the corresponding timestamp." + ) + .with_docs("https://docs.snowflake.com/en/sql-reference/functions/to_timestamp") + .with_subcategory("datetime") + ), + ("TRY_TO_TIMESTAMP_NTZ", FunctionInfo::new( + "TRY_TO_TIMESTAMP_NTZ", + "Converts an input expression into the corresponding timestamp." + ) + .with_docs("https://docs.snowflake.com/en/sql-reference/functions/to_timestamp") + .with_subcategory("datetime") + ), + ("TRY_TO_TIMESTAMP_TZ", FunctionInfo::new( + "TRY_TO_TIMESTAMP_TZ", + "Converts an input expression into the corresponding timestamp." + ) + .with_docs("https://docs.snowflake.com/en/sql-reference/functions/to_timestamp") + .with_subcategory("datetime") + ), ]; pub const DATA_METRIC_FUNCTIONS: &[(&str, FunctionInfo)] = &[ @@ -712,36 +746,102 @@ pub const DATA_QUALITY_FUNCTIONS: &[(&str, FunctionInfo)] = &[ ]; pub const DATETIME_FUNCTIONS: &[(&str, FunctionInfo)] = &[ + ("DAY", FunctionInfo::new( + "DAY", + "Extracts the corresponding date part from a date or timestamp." + ) + .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") + ), + ("DAYOFMONTH", FunctionInfo::new( + "DAYOFMONTH", + "Extracts the corresponding date part from a date or timestamp." 
+ ) + .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") + ), + ("DAYOFWEEK", FunctionInfo::new( + "DAYOFWEEK", + "Extracts the corresponding date part from a date or timestamp." + ) + .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") + ), + ("DAYOFWEEKISO", FunctionInfo::new( + "DAYOFWEEKISO", + "Extracts the corresponding date part from a date or timestamp." + ) + .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") + ), + ("DAYOFYEAR", FunctionInfo::new( + "DAYOFYEAR", + "Extracts the corresponding date part from a date or timestamp." + ) + .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") + ), ("EXTRACT", FunctionInfo::new( "EXTRACT", "Extracts the specified date or time part from a date, time, or timestamp." ) .with_docs("https://docs.snowflake.com/en/sql-reference/functions/extract") ), + ("HOUR", FunctionInfo::new( + "HOUR", + "Extracts the corresponding time part from a time or timestamp value." + ) + .with_docs("https://docs.snowflake.com/en/sql-reference/functions/hour-minute-second") + ), ("LAST_SUCCESSFUL_SCHEDULED_TIME", FunctionInfo::new( "LAST_SUCCESSFUL_SCHEDULED_TIME", "Returns the timestamp representing the scheduled time for the most recent successful evaluation of the alert condition, where no errors occurred when executing the action." ) .with_docs("https://docs.snowflake.com/en/sql-reference/functions/last_successful_scheduled_time") ), + ("MINUTE", FunctionInfo::new( + "MINUTE", + "Extracts the corresponding time part from a time or timestamp value." + ) + .with_docs("https://docs.snowflake.com/en/sql-reference/functions/hour-minute-second") + ), + ("MONTH", FunctionInfo::new( + "MONTH", + "Extracts the corresponding date part from a date or timestamp." 
+ ) + .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") + ), ("MONTHS_BETWEEN", FunctionInfo::new( "MONTHS_BETWEEN", "Returns the number of months between two DATE or TIMESTAMP values." ) .with_docs("https://docs.snowflake.com/en/sql-reference/functions/months_between") ), + ("QUARTER", FunctionInfo::new( + "QUARTER", + "Extracts the corresponding date part from a date or timestamp." + ) + .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") + ), ("SCHEDULED_TIME", FunctionInfo::new( "SCHEDULED_TIME", "Returns the timestamp representing the scheduled time of the current alert." ) .with_docs("https://docs.snowflake.com/en/sql-reference/functions/scheduled_time") ), + ("SECOND", FunctionInfo::new( + "SECOND", + "Extracts the corresponding time part from a time or timestamp value." + ) + .with_docs("https://docs.snowflake.com/en/sql-reference/functions/hour-minute-second") + ), ("TIME_SLICE", FunctionInfo::new( "TIME_SLICE", "Calculates the beginning or end of a “slice” of time, where the length of the slice is a multiple of a standard unit of time (minute, hour, day, etc.)." ) .with_docs("https://docs.snowflake.com/en/sql-reference/functions/time_slice") ), + ("WEEK", FunctionInfo::new( + "WEEK", + "Extracts the corresponding date part from a date or timestamp." + ) + .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") + ), ("WEEKISO", FunctionInfo::new( "WEEKISO", "Extracts the corresponding date part from a date or timestamp." @@ -754,6 +854,12 @@ pub const DATETIME_FUNCTIONS: &[(&str, FunctionInfo)] = &[ ) .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") ), + ("YEAR", FunctionInfo::new( + "YEAR", + "Extracts the corresponding date part from a date or timestamp." + ) + .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") + ), ("YEAROFWEEK", FunctionInfo::new( "YEAROFWEEK", "Extracts the corresponding date part from a date or timestamp." 
diff --git a/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv b/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv index bb3876eee..b8325ef59 100644 --- a/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv +++ b/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv @@ -134,12 +134,7 @@ datediff datefromparts datepart datetrunc -day dayname -dayofmonth -dayofweek -dayofweekiso -dayofyear decode decrypt_raw degrees @@ -172,7 +167,6 @@ grouping hex_decode_binary hex_decode_string hex_encode -hour iff ifnull initcap @@ -274,8 +268,6 @@ md5 mean median min -minute -month monthname named_struct nanvl @@ -304,7 +296,6 @@ position pow power previous_day -quarter radians random range @@ -336,7 +327,6 @@ row_number rpad rtrim rtrimmed_length -second sha2 sha224 sha256 @@ -398,13 +388,10 @@ to_number to_numeric to_time to_timestamp -to_timestamp_ltz to_timestamp_micros to_timestamp_millis to_timestamp_nanos -to_timestamp_ntz to_timestamp_seconds -to_timestamp_tz to_unixtime to_varchar to_variant @@ -423,10 +410,6 @@ try_to_decimal try_to_number try_to_numeric try_to_time -try_to_timestamp -try_to_timestamp_ltz -try_to_timestamp_ntz -try_to_timestamp_tz try_to_varchar typeof union_extract @@ -439,6 +422,4 @@ var_samp var_sample variant_element version -week -year -zeroifnull \ No newline at end of file +zeroifnull From a96e07e4b82fbf4d1e1e27769ded4c733f83a14c Mon Sep 17 00:00:00 2001 From: DanCodedThis Date: Wed, 6 Aug 2025 19:54:33 +0300 Subject: [PATCH 2/3] docs + minor tweaks --- .../src/regexp/regexp_substr.rs | 49 ++++++++++++++++--- .../src/tests/regexp/regexp_substr.rs | 2 +- .../query_regexp_substr_basic_scalar.snap | 12 ++--- 3 files changed, 50 insertions(+), 13 deletions(-) diff --git a/crates/embucket-functions/src/regexp/regexp_substr.rs b/crates/embucket-functions/src/regexp/regexp_substr.rs index 6c91415ba..4b25fc370 100644 
--- a/crates/embucket-functions/src/regexp/regexp_substr.rs +++ b/crates/embucket-functions/src/regexp/regexp_substr.rs @@ -16,7 +16,44 @@ use std::any::Any; use std::fmt::Debug; use std::sync::Arc; -//TODO: Docs +/// `REGEXP_SUBSTR` function implementation +/// +/// Returns the substring that matches the specified occurrence of the regular expression pattern in the string subject. +/// If no match is found, returns NULL. +/// +/// Syntax: `REGEXP_SUBSTR( <subject> , <pattern> [ , <position> [ , <occurrence> [ , <regex_params> [ , <group_num> ] ] ] ] )` +/// +/// Arguments: +/// +/// `Required`: +/// - `<subject>` the string to search for matches. +/// - `<pattern>` pattern to match. +/// +/// `Optional`: +/// - `<position>` number of characters from the beginning of the string where the function starts searching for matches. +/// Default: `1` (the search for a match starts at the first character on the left) +/// - `<occurrence>` specifies the first occurrence of the pattern from which to start returning matches. +/// The function skips the first occurrence - 1 matches. For example, if there are 5 matches and you specify 3 for the occurrence argument, +/// the function ignores the first two matches and returns the third, fourth, and fifth matches. +/// Default: `1` +/// - `<regex_params>` String of one or more characters that specifies the parameters used for searching for matches. +/// Supported values: +/// --------------------------------------------------------------------------- +/// | Parameter | Description | +/// |-----------------|-------------------------------------------| +/// | c | Case-sensitive matching | +/// | i | Case-insensitive matching | +/// | m | Multi-line mode | +/// | e | Extract submatches | +/// | s | POSIX wildcard character `.` matches `\n` | +/// --------------------------------------------------------------------------- +/// Default: `c` +/// - `<group_num>` the `group_num` parameter specifies which group to extract. +/// Groups are specified by using parentheses in the regular expression.
+/// If a `group_num` is specified, it allows extraction even if the e option was not also specified. +/// The e option is implied. +/// +/// Example: `REGEXP_SUBSTR('nevermore1, nevermore2, nevermore3.', 'nevermore')` #[derive(Debug)] pub struct RegexpSubstrFunc { signature: Signature, @@ -114,7 +151,7 @@ impl RegexpSubstrFunc { }, )?; - let regexp_parameters = args.get(4).map_or_else( + let regex_parameters = args.get(4).map_or_else( || Ok("c"), |value| match value { ColumnarValue::Scalar( @@ -147,7 +184,7 @@ impl RegexpSubstrFunc { let group_num = args.get(5).map_or_else( || { - if regexp_parameters.contains('e') { + if regex_parameters.contains('e') { Ok(1) } else { Ok(0) @@ -173,7 +210,7 @@ impl RegexpSubstrFunc { }, )?; - Ok((position, occurrence, regexp_parameters, group_num)) + Ok((position, occurrence, regex_parameters, group_num)) } } @@ -233,7 +270,7 @@ impl ScalarUDFImpl for RegexpSubstrFunc { } }; - let (position, occurrence, regexp_parameters, group_num) = + let (position, occurrence, regex_parameters, group_num) = Self::take_args_values(&args.args)?; //TODO: Or data_capacity: 1024 @@ -242,7 +279,7 @@ impl ScalarUDFImpl for RegexpSubstrFunc { match array.data_type() { DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => { let string_array: &StringArray = as_generic_string_array(array)?; - let regex = pattern_to_regex(pattern, regexp_parameters) + let regex = pattern_to_regex(pattern, regex_parameters) .context(regexp_errors::UnsupportedRegexSnafu)?; regexp(string_array, &regex, position).for_each(|opt_iter| { result_array.append_option(opt_iter.and_then(|mut cap_iter| { diff --git a/crates/embucket-functions/src/tests/regexp/regexp_substr.rs b/crates/embucket-functions/src/tests/regexp/regexp_substr.rs index 4b2ca212b..3ee83a7b2 100644 --- a/crates/embucket-functions/src/tests/regexp/regexp_substr.rs +++ b/crates/embucket-functions/src/tests/regexp/regexp_substr.rs @@ -2,7 +2,7 @@ use crate::test_query; test_query!( regexp_substr_basic_scalar, -
"SELECT REGEXP_SUBSTR('nevermore1, nevermore2, nevermore3.', 'nevermore\\d')", + "SELECT REGEXP_SUBSTR('nevermore1, nevermore2, nevermore3.', 'nevermore')", snapshot_path = "regexp_substr" ); diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap index 86f833e73..310a51b42 100644 --- a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap +++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap @@ -1,13 +1,13 @@ --- source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs -description: "\"SELECT REGEXP_SUBSTR('nevermore1, nevermore2, nevermore3.', 'nevermore\\\\d')\"" +description: "\"SELECT REGEXP_SUBSTR('nevermore1, nevermore2, nevermore3.', 'nevermore')\"" --- Ok( [ - "+--------------------------------------------------------------------------------+", - "| regexp_substr(Utf8(\"nevermore1, nevermore2, nevermore3.\"),Utf8(\"nevermore\\d\")) |", - "+--------------------------------------------------------------------------------+", - "| nevermore1 |", - "+--------------------------------------------------------------------------------+", + "+------------------------------------------------------------------------------+", + "| regexp_substr(Utf8(\"nevermore1, nevermore2, nevermore3.\"),Utf8(\"nevermore\")) |", + "+------------------------------------------------------------------------------+", + "| nevermore |", + "+------------------------------------------------------------------------------+", ], ) From c799b58d48256065664831f9338222ec9aa2179c Mon Sep 17 00:00:00 2001 From: DanCodedThis Date: Wed, 6 Aug 2025 20:12:42 +0300 Subject: [PATCH 3/3] fix deleting registered functions --- .../generated_snowflake_functions.rs | 106 ------------------ .../helper/implemented_functions.csv | 
19 ++++ 2 files changed, 19 insertions(+), 106 deletions(-) diff --git a/crates/embucket-functions/src/visitors/unimplemented/generated_snowflake_functions.rs b/crates/embucket-functions/src/visitors/unimplemented/generated_snowflake_functions.rs index 1b820a03d..624651910 100644 --- a/crates/embucket-functions/src/visitors/unimplemented/generated_snowflake_functions.rs +++ b/crates/embucket-functions/src/visitors/unimplemented/generated_snowflake_functions.rs @@ -643,40 +643,6 @@ pub const CONVERSION_FUNCTIONS: &[(&str, FunctionInfo)] = &[ .with_docs("https://docs.snowflake.com/en/sql-reference/functions/try_to_geometry") .with_subcategory("geospatial") ), - ("TRY_TO_TIMESTAMP", FunctionInfo::new( - "TRY_TO_TIMESTAMP", - "Converts an input expression into the corresponding timestamp." - ) - .with_docs("https://docs.snowflake.com/en/sql-reference/functions/to_timestamp") - .with_subcategory("datetime") - ), - ("TRY_TO_TIMESTAMP", FunctionInfo::new( - "TRY_TO_TIMESTAMP", - "A special version of TO_TIMESTAMP / TO_TIMESTAMP_* that performs the same operation (i.e. converts an input expression into a timestamp), but with error-handling support (i.e. if the conversion cannot be performed, it returns a NULL value instead of raising an error)." - ) - .with_docs("https://docs.snowflake.com/en/sql-reference/functions/try_to_timestamp") - ), - ("TRY_TO_TIMESTAMP_LTZ", FunctionInfo::new( - "TRY_TO_TIMESTAMP_LTZ", - "Converts an input expression into the corresponding timestamp." - ) - .with_docs("https://docs.snowflake.com/en/sql-reference/functions/to_timestamp") - .with_subcategory("datetime") - ), - ("TRY_TO_TIMESTAMP_NTZ", FunctionInfo::new( - "TRY_TO_TIMESTAMP_NTZ", - "Converts an input expression into the corresponding timestamp." 
- ) - .with_docs("https://docs.snowflake.com/en/sql-reference/functions/to_timestamp") - .with_subcategory("datetime") - ), - ("TRY_TO_TIMESTAMP_TZ", FunctionInfo::new( - "TRY_TO_TIMESTAMP_TZ", - "Converts an input expression into the corresponding timestamp." - ) - .with_docs("https://docs.snowflake.com/en/sql-reference/functions/to_timestamp") - .with_subcategory("datetime") - ), ]; pub const DATA_METRIC_FUNCTIONS: &[(&str, FunctionInfo)] = &[ @@ -746,102 +712,36 @@ pub const DATA_QUALITY_FUNCTIONS: &[(&str, FunctionInfo)] = &[ ]; pub const DATETIME_FUNCTIONS: &[(&str, FunctionInfo)] = &[ - ("DAY", FunctionInfo::new( - "DAY", - "Extracts the corresponding date part from a date or timestamp." - ) - .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") - ), - ("DAYOFMONTH", FunctionInfo::new( - "DAYOFMONTH", - "Extracts the corresponding date part from a date or timestamp." - ) - .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") - ), - ("DAYOFWEEK", FunctionInfo::new( - "DAYOFWEEK", - "Extracts the corresponding date part from a date or timestamp." - ) - .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") - ), - ("DAYOFWEEKISO", FunctionInfo::new( - "DAYOFWEEKISO", - "Extracts the corresponding date part from a date or timestamp." - ) - .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") - ), - ("DAYOFYEAR", FunctionInfo::new( - "DAYOFYEAR", - "Extracts the corresponding date part from a date or timestamp." - ) - .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") - ), ("EXTRACT", FunctionInfo::new( "EXTRACT", "Extracts the specified date or time part from a date, time, or timestamp." ) .with_docs("https://docs.snowflake.com/en/sql-reference/functions/extract") ), - ("HOUR", FunctionInfo::new( - "HOUR", - "Extracts the corresponding time part from a time or timestamp value." 
- ) - .with_docs("https://docs.snowflake.com/en/sql-reference/functions/hour-minute-second") - ), ("LAST_SUCCESSFUL_SCHEDULED_TIME", FunctionInfo::new( "LAST_SUCCESSFUL_SCHEDULED_TIME", "Returns the timestamp representing the scheduled time for the most recent successful evaluation of the alert condition, where no errors occurred when executing the action." ) .with_docs("https://docs.snowflake.com/en/sql-reference/functions/last_successful_scheduled_time") ), - ("MINUTE", FunctionInfo::new( - "MINUTE", - "Extracts the corresponding time part from a time or timestamp value." - ) - .with_docs("https://docs.snowflake.com/en/sql-reference/functions/hour-minute-second") - ), - ("MONTH", FunctionInfo::new( - "MONTH", - "Extracts the corresponding date part from a date or timestamp." - ) - .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") - ), ("MONTHS_BETWEEN", FunctionInfo::new( "MONTHS_BETWEEN", "Returns the number of months between two DATE or TIMESTAMP values." ) .with_docs("https://docs.snowflake.com/en/sql-reference/functions/months_between") ), - ("QUARTER", FunctionInfo::new( - "QUARTER", - "Extracts the corresponding date part from a date or timestamp." - ) - .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") - ), ("SCHEDULED_TIME", FunctionInfo::new( "SCHEDULED_TIME", "Returns the timestamp representing the scheduled time of the current alert." ) .with_docs("https://docs.snowflake.com/en/sql-reference/functions/scheduled_time") ), - ("SECOND", FunctionInfo::new( - "SECOND", - "Extracts the corresponding time part from a time or timestamp value." - ) - .with_docs("https://docs.snowflake.com/en/sql-reference/functions/hour-minute-second") - ), ("TIME_SLICE", FunctionInfo::new( "TIME_SLICE", "Calculates the beginning or end of a “slice” of time, where the length of the slice is a multiple of a standard unit of time (minute, hour, day, etc.)." 
) .with_docs("https://docs.snowflake.com/en/sql-reference/functions/time_slice") ), - ("WEEK", FunctionInfo::new( - "WEEK", - "Extracts the corresponding date part from a date or timestamp." - ) - .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") - ), ("WEEKISO", FunctionInfo::new( "WEEKISO", "Extracts the corresponding date part from a date or timestamp." @@ -854,12 +754,6 @@ pub const DATETIME_FUNCTIONS: &[(&str, FunctionInfo)] = &[ ) .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") ), - ("YEAR", FunctionInfo::new( - "YEAR", - "Extracts the corresponding date part from a date or timestamp." - ) - .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year") - ), ("YEAROFWEEK", FunctionInfo::new( "YEAROFWEEK", "Extracts the corresponding date part from a date or timestamp." diff --git a/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv b/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv index b8325ef59..cbf495a07 100644 --- a/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv +++ b/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv @@ -134,7 +134,12 @@ datediff datefromparts datepart datetrunc +day dayname +dayofmonth +dayofweek +dayofweekiso +dayofyear decode decrypt_raw degrees @@ -167,6 +172,7 @@ grouping hex_decode_binary hex_decode_string hex_encode +hour iff ifnull initcap @@ -268,6 +274,8 @@ md5 mean median min +minute +month monthname named_struct nanvl @@ -296,6 +304,7 @@ position pow power previous_day +quarter radians random range @@ -327,6 +336,7 @@ row_number rpad rtrim rtrimmed_length +second sha2 sha224 sha256 @@ -388,10 +398,13 @@ to_number to_numeric to_time to_timestamp +to_timestamp_ltz to_timestamp_micros to_timestamp_millis to_timestamp_nanos +to_timestamp_ntz to_timestamp_seconds +to_timestamp_tz to_unixtime to_varchar to_variant @@ -410,6 
+423,10 @@ try_to_decimal try_to_number try_to_numeric try_to_time +try_to_timestamp +try_to_timestamp_ltz +try_to_timestamp_ntz +try_to_timestamp_tz try_to_varchar typeof union_extract @@ -422,4 +439,6 @@ var_samp var_sample variant_element version +week +year zeroifnull