From 676deff274125c823d1ba498bb598ea751557737 Mon Sep 17 00:00:00 2001
From: DanCodedThis <daniil.vysotskyi@gmail.com>
Date: Wed, 6 Aug 2025 19:45:28 +0300
Subject: [PATCH 1/3] regexp_substr impl

---
 .../src/datetime/date_part_extract.rs         |   1 +
 crates/embucket-functions/src/regexp/mod.rs   |   7 +-
 .../src/regexp/regexp_instr.rs                |   2 +-
 .../src/regexp/regexp_substr.rs               | 265 ++++++++++++++++++
 .../src/tests/regexp/mod.rs                   |   1 +
 .../src/tests/regexp/regexp_substr.rs         |  85 ++++++
 .../query_regexp_substr_basic_column.snap     |  15 +
 .../query_regexp_substr_basic_scalar.snap     |  13 +
 .../query_regexp_substr_group_num.snap        |  15 +
 .../query_regexp_substr_letter_groups.snap    |  13 +
 .../query_regexp_substr_occurrence.snap       |  15 +
 .../query_regexp_substr_regex_patterns_1.snap |  13 +
 .../query_regexp_substr_regex_patterns_2.snap |  16 ++
 .../query_regexp_substr_regex_patterns_3.snap |  16 ++
 .../query_regexp_substr_word_boundary.snap    |  13 +
 .../query_regexp_substr_word_groups.snap      |  13 +
 .../generated_snowflake_functions.rs          | 106 +++++++
 .../helper/implemented_functions.csv          |  21 +-
 18 files changed, 608 insertions(+), 22 deletions(-)
 create mode 100644 crates/embucket-functions/src/regexp/regexp_substr.rs
 create mode 100644 crates/embucket-functions/src/tests/regexp/regexp_substr.rs
 create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_column.snap
 create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap
 create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_group_num.snap
 create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_letter_groups.snap
 create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_occurrence.snap
 create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_1.snap
 create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_2.snap
 create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_3.snap
 create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_boundary.snap
 create mode 100644 crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_groups.snap
diff --git a/crates/embucket-functions/src/datetime/date_part_extract.rs b/crates/embucket-functions/src/datetime/date_part_extract.rs
index f75c2a52b..722b5940f 100644
--- a/crates/embucket-functions/src/datetime/date_part_extract.rs
+++ b/crates/embucket-functions/src/datetime/date_part_extract.rs
@@ -451,6 +451,7 @@ fn calculate_year_of_week(date: NaiveDate, week_start: usize, week_of_year_polic
 #[cfg(test)]
 mod tests {
     use super::*;
+
     use datafusion::prelude::SessionContext;
     use datafusion_common::assert_batches_eq;
     use datafusion_expr::ScalarUDF;
diff --git a/crates/embucket-functions/src/regexp/mod.rs b/crates/embucket-functions/src/regexp/mod.rs
index 5109d4b52..baeff0c82 100644
--- a/crates/embucket-functions/src/regexp/mod.rs
+++ b/crates/embucket-functions/src/regexp/mod.rs
@@ -1,14 +1,19 @@
 pub mod errors;
 pub mod regexp_instr;
+mod regexp_substr;
 
 use crate::regexp::regexp_instr::RegexpInstrFunc;
+use crate::regexp::regexp_substr::RegexpSubstrFunc;
 use datafusion_expr::ScalarUDF;
 use datafusion_expr::registry::FunctionRegistry;
 pub use errors::Error;
 use std::sync::Arc;
 
 pub fn register_udfs(registry: &mut dyn FunctionRegistry) -> datafusion_common::Result<()> {
-    let functions: Vec<Arc<ScalarUDF>> = vec![Arc::new(ScalarUDF::from(RegexpInstrFunc::new()))];
+    let functions: Vec<Arc<ScalarUDF>> = vec![
+        Arc::new(ScalarUDF::from(RegexpInstrFunc::new())),
+        Arc::new(ScalarUDF::from(RegexpSubstrFunc::new())),
+    ];
     for func in functions {
         registry.register_udf(func)?;
     }
diff --git a/crates/embucket-functions/src/regexp/regexp_instr.rs b/crates/embucket-functions/src/regexp/regexp_instr.rs
index e9d19a3a3..fa65daad3 100644
--- a/crates/embucket-functions/src/regexp/regexp_instr.rs
+++ b/crates/embucket-functions/src/regexp/regexp_instr.rs
@@ -239,7 +239,7 @@ impl RegexpInstrFunc {
                 }
                 other => regexp_errors::UnsupportedInputTypeWithPositionSnafu {
                     data_type: other.data_type(),
-                    position: 6usize,
+                    position: 7usize,
                 }
                 .fail(),
             },
diff --git a/crates/embucket-functions/src/regexp/regexp_substr.rs b/crates/embucket-functions/src/regexp/regexp_substr.rs
new file mode 100644
index 000000000..6c91415ba
--- /dev/null
+++ b/crates/embucket-functions/src/regexp/regexp_substr.rs
@@ -0,0 +1,265 @@
+use super::errors as regexp_errors;
+use crate::utils::{pattern_to_regex, regexp};
+use datafusion::arrow::array::{StringArray, StringBuilder};
+use datafusion::arrow::datatypes::DataType;
+use datafusion::error::Result as DFResult;
+use datafusion::logical_expr::{
+    ColumnarValue, Signature, TypeSignature, TypeSignatureClass, Volatility,
+};
+use datafusion_common::ScalarValue;
+use datafusion_common::arrow::array::Array;
+use datafusion_common::cast::as_generic_string_array;
+use datafusion_common::types::logical_string;
+use datafusion_expr::{Coercion, ScalarFunctionArgs, ScalarUDFImpl};
+use snafu::ResultExt;
+use std::any::Any;
+use std::fmt::Debug;
+use std::sync::Arc;
+
+/// Scalar UDF `regexp_substr`: returns the substring of a string that matches a regex pattern.
+#[derive(Debug)]
+pub struct RegexpSubstrFunc {
+    signature: Signature,
+}
+
+impl Default for RegexpSubstrFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RegexpSubstrFunc {
+    /// Creates the UDF with one coercible signature per supported arity (2 to 6 arguments):
+    /// `(subject, pattern [, position [, occurrence [, regexp_parameters [, group_num]]]])`.
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                    ]),
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Integer),
+                    ]),
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Integer),
+                        Coercion::new_exact(TypeSignatureClass::Integer),
+                    ]),
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Integer),
+                        Coercion::new_exact(TypeSignatureClass::Integer),
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                    ]),
+                    TypeSignature::Coercible(vec![
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Integer),
+                        Coercion::new_exact(TypeSignatureClass::Integer),
+                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
+                        Coercion::new_exact(TypeSignatureClass::Integer),
+                    ]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+    /// Reads the optional arguments 3 to 6: `position`, `occurrence`, `regexp_parameters`
+    /// and `group_num`.
+    ///
+    /// `position` and `occurrence` are 1-based in SQL and are returned zero-based. Omitted
+    /// arguments default to position 1, occurrence 1, parameters `"c"` and group 0 (group 1
+    /// when the parameters contain `e`).
+    ///
+    /// # Errors
+    /// Fails if an argument is not a non-NULL scalar of the expected type or has an invalid value.
+    #[allow(clippy::too_many_lines)]
+    fn take_args_values(args: &[ColumnarValue]) -> DFResult<(usize, usize, &str, usize)> {
+        let position = args.get(2).map_or_else(
+            || Ok(0),
+            |value| match value {
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) if 0 < *value => {
+                    usize::try_from(*value - 1)
+                        .context(regexp_errors::InvalidIntegerConversionSnafu)
+                }
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) => {
+                    regexp_errors::WrongArgValueSnafu {
+                        got: value.to_string(),
+                        reason: "Position must be positive".to_string(),
+                    }
+                    .fail()
+                }
+                other => regexp_errors::UnsupportedInputTypeWithPositionSnafu {
+                    data_type: other.data_type(),
+                    position: 3usize,
+                }
+                .fail(),
+            },
+        )?;
+
+        let occurrence = args.get(3).map_or_else(
+            || Ok(0),
+            |value| match value {
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) if 0 < *value => {
+                    usize::try_from(*value - 1)
+                        .context(regexp_errors::InvalidIntegerConversionSnafu)
+                }
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) => {
+                    regexp_errors::WrongArgValueSnafu {
+                        got: value.to_string(),
+                        reason: "Occurrence must be positive".to_string(),
+                    }
+                    .fail()
+                }
+                other => regexp_errors::UnsupportedInputTypeWithPositionSnafu {
+                    data_type: other.data_type(),
+                    position: 4usize,
+                }
+                .fail(),
+            },
+        )?;
+
+        let regexp_parameters = args.get(4).map_or_else(
+            || Ok("c"),
+            |value| match value {
+                ColumnarValue::Scalar(
+                    ScalarValue::Utf8(Some(value))
+                    | ScalarValue::Utf8View(Some(value))
+                    | ScalarValue::LargeUtf8(Some(value)),
+                ) => match value.chars().next() {
+                    // An empty parameter string falls back to the default.
+                    None => Ok("c"),
+                    // NOTE(review): a single known flag accepts the whole string; unknown
+                    // characters mixed in are not rejected here - verify downstream handling.
+                    Some(_) if value.contains(['c', 'i', 'm', 'e', 's']) => Ok(value.as_str()),
+                    Some(first) => regexp_errors::WrongArgValueSnafu {
+                        got: value.to_string(),
+                        reason: format!("Unknown parameter: '{first}'"),
+                    }
+                    .fail(),
+                },
+                other => regexp_errors::UnsupportedInputTypeWithPositionSnafu {
+                    data_type: other.data_type(),
+                    position: 5usize,
+                }
+                .fail(),
+            },
+        )?;
+
+        let group_num = args.get(5).map_or_else(
+            || Ok(usize::from(regexp_parameters.contains('e'))),
+            |value| match value {
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) if 0 <= *value => {
+                    usize::try_from(*value).context(regexp_errors::InvalidIntegerConversionSnafu)
+                }
+                ColumnarValue::Scalar(ScalarValue::Int64(Some(value))) => {
+                    regexp_errors::WrongArgValueSnafu {
+                        got: value.to_string(),
+                        reason: "Capture group must be non-negative".to_string(),
+                    }
+                    .fail()
+                }
+                other => regexp_errors::UnsupportedInputTypeWithPositionSnafu {
+                    data_type: other.data_type(),
+                    position: 6usize,
+                }
+                .fail(),
+            },
+        )?;
+
+        Ok((position, occurrence, regexp_parameters, group_num))
+    }
+}
+
+impl ScalarUDFImpl for RegexpSubstrFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &'static str {
+        "regexp_substr"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> DFResult<DataType> {
+        match arg_types.len() {
+            n if 2 > n => regexp_errors::NotEnoughArgumentsSnafu {
+                got: n,
+                at_least: 2usize,
+            }
+            .fail()?,
+            // The matched substring is always returned as `Utf8`.
+            n if 7 > n => Ok(DataType::Utf8),
+            n => regexp_errors::TooManyArgumentsSnafu {
+                got: n,
+                at_maximum: 6usize,
+            }
+            .fail()?,
+        }
+    }
+
+    /// Matches every subject value against the pattern; values without a match produce NULL.
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> DFResult<ColumnarValue> {
+        // The signature admits at least two arguments, so indexes 0 and 1 are in bounds.
+        let subject = &args.args[0];
+        let array = match subject {
+            ColumnarValue::Array(array) => array,
+            //Can't fail (shouldn't)
+            ColumnarValue::Scalar(scalar) => &scalar.to_array()?,
+        };
+
+        let pattern = match &args.args[1] {
+            ColumnarValue::Scalar(
+                ScalarValue::Utf8(Some(pattern))
+                | ScalarValue::LargeUtf8(Some(pattern))
+                | ScalarValue::Utf8View(Some(pattern)),
+            ) => pattern,
+            other => {
+                return regexp_errors::UnsupportedInputTypeWithPositionSnafu {
+                    data_type: other.data_type(),
+                    position: 2usize,
+                }
+                .fail()?;
+            }
+        };
+
+        let (position, occurrence, regexp_parameters, group_num) =
+            Self::take_args_values(&args.args)?;
+
+        //TODO: Or data_capacity: 1024
+        let mut result_array = StringBuilder::with_capacity(array.len(), array.len() * 10);
+
+        match array.data_type() {
+            // Only `Utf8` downcasts to `StringArray`; other types hit the error arm below.
+            DataType::Utf8 => {
+                let string_array: &StringArray = as_generic_string_array(array)?;
+                let regex = pattern_to_regex(pattern, regexp_parameters)
+                    .context(regexp_errors::UnsupportedRegexSnafu)?;
+                regexp(string_array, &regex, position).for_each(|opt_iter| {
+                    result_array.append_option(opt_iter.and_then(|mut cap_iter| {
+                        cap_iter.nth(occurrence).and_then(|cap| {
+                            // Group 0 is the whole match; group n is the n-th capture group.
+                            cap.get(group_num).map(|mat| mat.as_str())
+                        })
+                    }));
+                });
+            }
+            other => regexp_errors::UnsupportedInputTypeWithPositionSnafu {
+                position: 1usize,
+                data_type: other.clone(),
+            }
+            .fail()?,
+        }
+
+        Ok(ColumnarValue::Array(Arc::new(result_array.finish())))
+    }
+}
diff --git a/crates/embucket-functions/src/tests/regexp/mod.rs b/crates/embucket-functions/src/tests/regexp/mod.rs
index 72c9db3cc..7dd4d6172 100644
--- a/crates/embucket-functions/src/tests/regexp/mod.rs
+++ b/crates/embucket-functions/src/tests/regexp/mod.rs
@@ -1 +1,2 @@
 mod regexp_instr;
+mod regexp_substr;
diff --git a/crates/embucket-functions/src/tests/regexp/regexp_substr.rs b/crates/embucket-functions/src/tests/regexp/regexp_substr.rs
new file mode 100644
index 000000000..4b2ca212b
--- /dev/null
+++ b/crates/embucket-functions/src/tests/regexp/regexp_substr.rs
@@ -0,0 +1,85 @@
+use crate::test_query;
+
+test_query!(
+    regexp_substr_basic_scalar,
+    "SELECT REGEXP_SUBSTR('nevermore1, nevermore2, nevermore3.', 'nevermore\\d')",
+    snapshot_path = "regexp_substr"
+);
+
+test_query!(
+    regexp_substr_basic_column,
+    "SELECT REGEXP_SUBSTR(column1, 'the\\W+\\w+')
+    FROM VALUES ('It was the best of times, it was the worst of times.'),
+    ('In    the   string   the   extra   spaces  are   redundant.'),
+    ('A thespian theater is nearby.')",
+    snapshot_path = "regexp_substr"
+);
+
+test_query!(
+    regexp_substr_occurrence,
+    "SELECT REGEXP_SUBSTR(column1, 'the\\W+\\w+', 1, 2)
+    FROM VALUES ('It was the best of times, it was the worst of times.'),
+    ('In    the   string   the   extra   spaces  are   redundant.'),
+    ('A thespian theater is nearby.')",
+    snapshot_path = "regexp_substr"
+);
+
+test_query!(
+    regexp_substr_group_num,
+    "SELECT REGEXP_SUBSTR(column1, 'the\\W+(\\w+)', 1, 2, 'e', 1)
+    FROM VALUES ('It was the best of times, it was the worst of times.'),
+    ('In    the   string   the   extra   spaces  are   redundant.'),
+    ('A thespian theater is nearby.')",
+    snapshot_path = "regexp_substr"
+);
+
+test_query!(
+    regexp_substr_word_groups,
+    "SELECT REGEXP_SUBSTR(column1, 'A\\W+(\\w+)', 1, 1, 'e', 1),
+    REGEXP_SUBSTR(column1, 'A\\W+(\\w+)', 1, 2, 'e', 1),
+    REGEXP_SUBSTR(column1, 'A\\W+(\\w+)', 1, 3, 'e', 1),
+    REGEXP_SUBSTR(column1, 'A\\W+(\\w+)', 1, 4, 'e', 1)
+    FROM VALUES ('A MAN A PLAN A CANAL')",
+    snapshot_path = "regexp_substr"
+);
+
+test_query!(
+    regexp_substr_letter_groups,
+    "SELECT REGEXP_SUBSTR(column1, 'A\\W+(\\w)(\\w)(\\w)', 1, 1, 'e', 1),
+    REGEXP_SUBSTR(column1, 'A\\W+(\\w)(\\w)(\\w)', 1, 1, 'e', 2),
+    REGEXP_SUBSTR(column1, 'A\\W+(\\w)(\\w)(\\w)', 1, 1, 'e', 3)
+    FROM VALUES ('A MAN A PLAN A CANAL')",
+    snapshot_path = "regexp_substr"
+);
+
+test_query!(
+    regexp_substr_word_boundary,
+    "SELECT REGEXP_SUBSTR('It was the best of times, it was the worst of times','\\bwas\\b', 1, 1)",
+    snapshot_path = "regexp_substr"
+);
+
+test_query!(
+    regexp_substr_regex_patterns_1,
+    "SELECT REGEXP_SUBSTR('It was the best of times, it was the worst of times', '[[:alpha:]]{2,}st', 15, 1)",
+    snapshot_path = "regexp_substr"
+);
+
+test_query!(
+    regexp_substr_regex_patterns_2,
+    "SELECT REGEXP_SUBSTR(column1, '\\b\\S*o\\S*\\b')
+    FROM VALUES ('Hellooo World'),
+    ('How are you doing today?'),
+    ('the quick brown fox jumps over the lazy dog'),
+    ('PACK MY BOX WITH FIVE DOZEN LIQUOR JUGS')",
+    snapshot_path = "regexp_substr"
+);
+
+test_query!(
+    regexp_substr_regex_patterns_3,
+    "SELECT REGEXP_SUBSTR(column1, '\\b\\S*o\\S*\\b', 3, 3, 'i')
+    FROM VALUES ('Hellooo World'),
+    ('How are you doing today?'),
+    ('the quick brown fox jumps over the lazy dog'),
+    ('PACK MY BOX WITH FIVE DOZEN LIQUOR JUGS')",
+    snapshot_path = "regexp_substr"
+);
diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_column.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_column.snap
new file mode 100644
index 000000000..c4d3ac70f
--- /dev/null
+++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_column.snap
@@ -0,0 +1,15 @@
+---
+source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs
+description: "\"SELECT REGEXP_SUBSTR(column1, 'the\\\\W+\\\\w+')\n    FROM VALUES ('It was the best of times, it was the worst of times.'),\n    ('In    the   string   the   extra   spaces  are   redundant.'),\n    ('A thespian theater is nearby.')\""
+---
+Ok(
+    [
+        "+------------------------------------------+",
+        "| regexp_substr(column1,Utf8(\"the\\W+\\w+\")) |",
+        "+------------------------------------------+",
+        "| the best                                 |",
+        "| the   string                             |",
+        "|                                          |",
+        "+------------------------------------------+",
+    ],
+)
diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap
new file mode 100644
index 000000000..86f833e73
--- /dev/null
+++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap
@@ -0,0 +1,13 @@
+---
+source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs
+description: "\"SELECT REGEXP_SUBSTR('nevermore1, nevermore2, nevermore3.', 'nevermore\\\\d')\""
+---
+Ok(
+    [
+        "+--------------------------------------------------------------------------------+",
+        "| regexp_substr(Utf8(\"nevermore1, nevermore2, nevermore3.\"),Utf8(\"nevermore\\d\")) |",
+        "+--------------------------------------------------------------------------------+",
+        "| nevermore1                                                                     |",
+        "+--------------------------------------------------------------------------------+",
+    ],
+)
diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_group_num.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_group_num.snap
new file mode 100644
index 000000000..f4a4269f5
--- /dev/null
+++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_group_num.snap
@@ -0,0 +1,15 @@
+---
+source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs
+description: "\"SELECT REGEXP_SUBSTR(column1, 'the\\\\W+(\\\\w+)', 1, 2, 'e', 1)\n    FROM VALUES ('It was the best of times, it was the worst of times.'),\n    ('In    the   string   the   extra   spaces  are   redundant.'),\n    ('A thespian theater is nearby.')\""
+---
+Ok(
+    [
+        "+---------------------------------------------------------------------------------+",
+        "| regexp_substr(column1,Utf8(\"the\\W+(\\w+)\"),Int64(1),Int64(2),Utf8(\"e\"),Int64(1)) |",
+        "+---------------------------------------------------------------------------------+",
+        "| worst                                                                           |",
+        "| extra                                                                           |",
+        "|                                                                                 |",
+        "+---------------------------------------------------------------------------------+",
+    ],
+)
diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_letter_groups.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_letter_groups.snap
new file mode 100644
index 000000000..d0fd5469c
--- /dev/null
+++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_letter_groups.snap
@@ -0,0 +1,13 @@
+---
+source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs
+description: "\"SELECT REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w)(\\\\w)(\\\\w)', 1, 1, 'e', 1),\n    REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w)(\\\\w)(\\\\w)', 1, 1, 'e', 2),\n    REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w)(\\\\w)(\\\\w)', 1, 1, 'e', 3)\n    FROM VALUES ('A MAN A PLAN A CANAL')\""
+---
+Ok(
+    [
+        "+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+",
+        "| regexp_substr(column1,Utf8(\"A\\W+(\\w)(\\w)(\\w)\"),Int64(1),Int64(1),Utf8(\"e\"),Int64(1)) | regexp_substr(column1,Utf8(\"A\\W+(\\w)(\\w)(\\w)\"),Int64(1),Int64(1),Utf8(\"e\"),Int64(2)) | regexp_substr(column1,Utf8(\"A\\W+(\\w)(\\w)(\\w)\"),Int64(1),Int64(1),Utf8(\"e\"),Int64(3)) |",
+        "+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+",
+        "| M                                                                                    | A                                                                                    | N                                                                                    |",
+        "+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+",
+    ],
+)
diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_occurrence.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_occurrence.snap
new file mode 100644
index 000000000..7d184b765
--- /dev/null
+++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_occurrence.snap
@@ -0,0 +1,15 @@
+---
+source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs
+description: "\"SELECT REGEXP_SUBSTR(column1, 'the\\\\W+\\\\w+', 1, 2)\n    FROM VALUES ('It was the best of times, it was the worst of times.'),\n    ('In    the   string   the   extra   spaces  are   redundant.'),\n    ('A thespian theater is nearby.')\""
+---
+Ok(
+    [
+        "+------------------------------------------------------------+",
+        "| regexp_substr(column1,Utf8(\"the\\W+\\w+\"),Int64(1),Int64(2)) |",
+        "+------------------------------------------------------------+",
+        "| the worst                                                  |",
+        "| the   extra                                                |",
+        "|                                                            |",
+        "+------------------------------------------------------------+",
+    ],
+)
diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_1.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_1.snap
new file mode 100644
index 000000000..7433e5e65
--- /dev/null
+++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_1.snap
@@ -0,0 +1,13 @@
+---
+source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs
+description: "\"SELECT REGEXP_SUBSTR('It was the best of times, it was the worst of times', '[[:alpha:]]{2,}st', 15, 1)\""
+---
+Ok(
+    [
+        "+-------------------------------------------------------------------------------------------------------------------------+",
+        "| regexp_substr(Utf8(\"It was the best of times, it was the worst of times\"),Utf8(\"[[:alpha:]]{2,}st\"),Int64(15),Int64(1)) |",
+        "+-------------------------------------------------------------------------------------------------------------------------+",
+        "| worst                                                                                                                   |",
+        "+-------------------------------------------------------------------------------------------------------------------------+",
+    ],
+)
diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_2.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_2.snap
new file mode 100644
index 000000000..0a2d587d9
--- /dev/null
+++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_2.snap
@@ -0,0 +1,16 @@
+---
+source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs
+description: "\"SELECT REGEXP_SUBSTR(column1, '\\\\b\\\\S*o\\\\S*\\\\b')\n    FROM VALUES ('Hellooo World'),\n    ('How are you doing today?'),\n    ('the quick brown fox jumps over the lazy dog'),\n    ('PACK MY BOX WITH FIVE DOZEN LIQUOR JUGS')\""
+---
+Ok(
+    [
+        "+--------------------------------------------+",
+        "| regexp_substr(column1,Utf8(\"\\b\\S*o\\S*\\b\")) |",
+        "+--------------------------------------------+",
+        "| Hellooo                                    |",
+        "| How                                        |",
+        "| brown                                      |",
+        "|                                            |",
+        "+--------------------------------------------+",
+    ],
+)
diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_3.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_3.snap
new file mode 100644
index 000000000..d60f3722f
--- /dev/null
+++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_regex_patterns_3.snap
@@ -0,0 +1,16 @@
+---
+source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs
+description: "\"SELECT REGEXP_SUBSTR(column1, '\\\\b\\\\S*o\\\\S*\\\\b', 3, 3, 'i')\n    FROM VALUES ('Hellooo World'),\n    ('How are you doing today?'),\n    ('the quick brown fox jumps over the lazy dog'),\n    ('PACK MY BOX WITH FIVE DOZEN LIQUOR JUGS')\""
+---
+Ok(
+    [
+        "+------------------------------------------------------------------------+",
+        "| regexp_substr(column1,Utf8(\"\\b\\S*o\\S*\\b\"),Int64(3),Int64(3),Utf8(\"i\")) |",
+        "+------------------------------------------------------------------------+",
+        "|                                                                        |",
+        "| today                                                                  |",
+        "| over                                                                   |",
+        "| LIQUOR                                                                 |",
+        "+------------------------------------------------------------------------+",
+    ],
+)
diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_boundary.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_boundary.snap
new file mode 100644
index 000000000..4a1cc93ab
--- /dev/null
+++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_boundary.snap
@@ -0,0 +1,13 @@
+---
+source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs
+description: "\"SELECT REGEXP_SUBSTR('It was the best of times, it was the worst of times','\\\\bwas\\\\b', 1, 1)\""
+---
+Ok(
+    [
+        "+--------------------------------------------------------------------------------------------------------------+",
+        "| regexp_substr(Utf8(\"It was the best of times, it was the worst of times\"),Utf8(\"\\bwas\\b\"),Int64(1),Int64(1)) |",
+        "+--------------------------------------------------------------------------------------------------------------+",
+        "| was                                                                                                          |",
+        "+--------------------------------------------------------------------------------------------------------------+",
+    ],
+)
diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_groups.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_groups.snap
new file mode 100644
index 000000000..0c697e589
--- /dev/null
+++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_word_groups.snap
@@ -0,0 +1,13 @@
+---
+source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs
+description: "\"SELECT REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w+)', 1, 1, 'e', 1),\n    REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w+)', 1, 2, 'e', 1),\n    REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w+)', 1, 3, 'e', 1),\n    REGEXP_SUBSTR(column1, 'A\\\\W+(\\\\w+)', 1, 4, 'e', 1)\n    FROM VALUES ('A MAN A PLAN A CANAL')\""
+---
+Ok(
+    [
+        "+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+",
+        "| regexp_substr(column1,Utf8(\"A\\W+(\\w+)\"),Int64(1),Int64(1),Utf8(\"e\"),Int64(1)) | regexp_substr(column1,Utf8(\"A\\W+(\\w+)\"),Int64(1),Int64(2),Utf8(\"e\"),Int64(1)) | regexp_substr(column1,Utf8(\"A\\W+(\\w+)\"),Int64(1),Int64(3),Utf8(\"e\"),Int64(1)) | regexp_substr(column1,Utf8(\"A\\W+(\\w+)\"),Int64(1),Int64(4),Utf8(\"e\"),Int64(1)) |",
+        "+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+",
+        "| MAN                                                                           | PLAN                                                                          | CANAL                                                                         |                                                                               |",
+        "+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+",
+    ],
+)
diff --git a/crates/embucket-functions/src/visitors/unimplemented/generated_snowflake_functions.rs b/crates/embucket-functions/src/visitors/unimplemented/generated_snowflake_functions.rs
index 624651910..1b820a03d 100644
--- a/crates/embucket-functions/src/visitors/unimplemented/generated_snowflake_functions.rs
+++ b/crates/embucket-functions/src/visitors/unimplemented/generated_snowflake_functions.rs
@@ -643,6 +643,40 @@ pub const CONVERSION_FUNCTIONS: &[(&str, FunctionInfo)] = &[
     .with_docs("https://docs.snowflake.com/en/sql-reference/functions/try_to_geometry")
     .with_subcategory("geospatial")
     ),
+    ("TRY_TO_TIMESTAMP", FunctionInfo::new(
+        "TRY_TO_TIMESTAMP",
+        "Converts an input expression into the corresponding timestamp."
+    )
+    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/to_timestamp")
+    .with_subcategory("datetime")
+    ),
+    ("TRY_TO_TIMESTAMP", FunctionInfo::new(
+        "TRY_TO_TIMESTAMP",
+        "A special version of TO_TIMESTAMP / TO_TIMESTAMP_* that performs the same operation (i.e. converts an input expression into a timestamp), but with error-handling support (i.e. if the conversion cannot be performed, it returns a NULL value instead of raising an error)."
+    )
+    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/try_to_timestamp")
+    ),
+    ("TRY_TO_TIMESTAMP_LTZ", FunctionInfo::new(
+        "TRY_TO_TIMESTAMP_LTZ",
+        "Converts an input expression into the corresponding timestamp."
+    )
+    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/to_timestamp")
+    .with_subcategory("datetime")
+    ),
+    ("TRY_TO_TIMESTAMP_NTZ", FunctionInfo::new(
+        "TRY_TO_TIMESTAMP_NTZ",
+        "Converts an input expression into the corresponding timestamp."
+    )
+    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/to_timestamp")
+    .with_subcategory("datetime")
+    ),
+    ("TRY_TO_TIMESTAMP_TZ", FunctionInfo::new(
+        "TRY_TO_TIMESTAMP_TZ",
+        "Converts an input expression into the corresponding timestamp."
+    )
+    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/to_timestamp")
+    .with_subcategory("datetime")
+    ),
 ];
 
 pub const DATA_METRIC_FUNCTIONS: &[(&str, FunctionInfo)] = &[
@@ -712,36 +746,102 @@ pub const DATA_QUALITY_FUNCTIONS: &[(&str, FunctionInfo)] = &[
 ];
 
 pub const DATETIME_FUNCTIONS: &[(&str, FunctionInfo)] = &[
+    ("DAY", FunctionInfo::new(
+        "DAY",
+        "Extracts the corresponding date part from a date or timestamp."
+    )
+    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
+    ),
+    ("DAYOFMONTH", FunctionInfo::new(
+        "DAYOFMONTH",
+        "Extracts the corresponding date part from a date or timestamp."
+    )
+    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
+    ),
+    ("DAYOFWEEK", FunctionInfo::new(
+        "DAYOFWEEK",
+        "Extracts the corresponding date part from a date or timestamp."
+    )
+    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
+    ),
+    ("DAYOFWEEKISO", FunctionInfo::new(
+        "DAYOFWEEKISO",
+        "Extracts the corresponding date part from a date or timestamp."
+    )
+    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
+    ),
+    ("DAYOFYEAR", FunctionInfo::new(
+        "DAYOFYEAR",
+        "Extracts the corresponding date part from a date or timestamp."
+    )
+    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
+    ),
     ("EXTRACT", FunctionInfo::new(
         "EXTRACT",
         "Extracts the specified date or time part from a date, time, or timestamp."
     )
     .with_docs("https://docs.snowflake.com/en/sql-reference/functions/extract")
     ),
+    ("HOUR", FunctionInfo::new(
+        "HOUR",
+        "Extracts the corresponding time part from a time or timestamp value."
+    )
+    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/hour-minute-second")
+    ),
     ("LAST_SUCCESSFUL_SCHEDULED_TIME", FunctionInfo::new(
         "LAST_SUCCESSFUL_SCHEDULED_TIME",
         "Returns the timestamp representing the scheduled time for the most recent successful evaluation of the alert condition, where no errors occurred when executing the action."
     )
     .with_docs("https://docs.snowflake.com/en/sql-reference/functions/last_successful_scheduled_time")
     ),
+    ("MINUTE", FunctionInfo::new(
+        "MINUTE",
+        "Extracts the corresponding time part from a time or timestamp value."
+    )
+    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/hour-minute-second")
+    ),
+    ("MONTH", FunctionInfo::new(
+        "MONTH",
+        "Extracts the corresponding date part from a date or timestamp."
+    )
+    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
+    ),
     ("MONTHS_BETWEEN", FunctionInfo::new(
         "MONTHS_BETWEEN",
         "Returns the number of months between two DATE or TIMESTAMP values."
     )
     .with_docs("https://docs.snowflake.com/en/sql-reference/functions/months_between")
     ),
+    ("QUARTER", FunctionInfo::new(
+        "QUARTER",
+        "Extracts the corresponding date part from a date or timestamp."
+    )
+    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
+    ),
     ("SCHEDULED_TIME", FunctionInfo::new(
         "SCHEDULED_TIME",
         "Returns the timestamp representing the scheduled time of the current alert."
     )
     .with_docs("https://docs.snowflake.com/en/sql-reference/functions/scheduled_time")
     ),
+    ("SECOND", FunctionInfo::new(
+        "SECOND",
+        "Extracts the corresponding time part from a time or timestamp value."
+    )
+    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/hour-minute-second")
+    ),
     ("TIME_SLICE", FunctionInfo::new(
         "TIME_SLICE",
         "Calculates the beginning or end of a “slice” of time, where the length of the slice is a multiple of a standard unit of time (minute, hour, day, etc.)."
     )
     .with_docs("https://docs.snowflake.com/en/sql-reference/functions/time_slice")
     ),
+    ("WEEK", FunctionInfo::new(
+        "WEEK",
+        "Extracts the corresponding date part from a date or timestamp."
+    )
+    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
+    ),
     ("WEEKISO", FunctionInfo::new(
         "WEEKISO",
         "Extracts the corresponding date part from a date or timestamp."
@@ -754,6 +854,12 @@ pub const DATETIME_FUNCTIONS: &[(&str, FunctionInfo)] = &[
     )
     .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
     ),
+    ("YEAR", FunctionInfo::new(
+        "YEAR",
+        "Extracts the corresponding date part from a date or timestamp."
+    )
+    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
+    ),
     ("YEAROFWEEK", FunctionInfo::new(
         "YEAROFWEEK",
         "Extracts the corresponding date part from a date or timestamp."
diff --git a/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv b/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv
index bb3876eee..b8325ef59 100644
--- a/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv
+++ b/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv
@@ -134,12 +134,7 @@ datediff
 datefromparts
 datepart
 datetrunc
-day
 dayname
-dayofmonth
-dayofweek
-dayofweekiso
-dayofyear
 decode
 decrypt_raw
 degrees
@@ -172,7 +167,6 @@ grouping
 hex_decode_binary
 hex_decode_string
 hex_encode
-hour
 iff
 ifnull
 initcap
@@ -274,8 +268,6 @@ md5
 mean
 median
 min
-minute
-month
 monthname
 named_struct
 nanvl
@@ -304,7 +296,6 @@ position
 pow
 power
 previous_day
-quarter
 radians
 random
 range
@@ -336,7 +327,6 @@ row_number
 rpad
 rtrim
 rtrimmed_length
-second
 sha2
 sha224
 sha256
@@ -398,13 +388,10 @@ to_number
 to_numeric
 to_time
 to_timestamp
-to_timestamp_ltz
 to_timestamp_micros
 to_timestamp_millis
 to_timestamp_nanos
-to_timestamp_ntz
 to_timestamp_seconds
-to_timestamp_tz
 to_unixtime
 to_varchar
 to_variant
@@ -423,10 +410,6 @@ try_to_decimal
 try_to_number
 try_to_numeric
 try_to_time
-try_to_timestamp
-try_to_timestamp_ltz
-try_to_timestamp_ntz
-try_to_timestamp_tz
 try_to_varchar
 typeof
 union_extract
@@ -439,6 +422,4 @@ var_samp
 var_sample
 variant_element
 version
-week
-year
-zeroifnull
\ No newline at end of file
+zeroifnull

From a96e07e4b82fbf4d1e1e27769ded4c733f83a14c Mon Sep 17 00:00:00 2001
From: DanCodedThis <daniil.vysotskyi@gmail.com>
Date: Wed, 6 Aug 2025 19:54:33 +0300
Subject: [PATCH 2/3] docs + minor tweaks

---
 .../src/regexp/regexp_substr.rs               | 49 ++++++++++++++++---
 .../src/tests/regexp/regexp_substr.rs         |  2 +-
 .../query_regexp_substr_basic_scalar.snap     | 12 ++---
 3 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/crates/embucket-functions/src/regexp/regexp_substr.rs b/crates/embucket-functions/src/regexp/regexp_substr.rs
index 6c91415ba..4b25fc370 100644
--- a/crates/embucket-functions/src/regexp/regexp_substr.rs
+++ b/crates/embucket-functions/src/regexp/regexp_substr.rs
@@ -16,7 +16,44 @@ use std::any::Any;
 use std::fmt::Debug;
 use std::sync::Arc;
 
-//TODO: Docs
+/// `REGEXP_SUBSTR` function implementation
+///
+/// Returns the substring that matches the specified occurrence of the regular expression pattern in the string subject.
+/// If no match is found, returns NULL.
+///
+/// Syntax: `REGEXP_SUBSTR( <subject> , <pattern> [ , <position> [ , <occurrence> [ , <regex_parameters> [ , <group_num> ] ] ] ] )`
+///
+/// Arguments:
+///
+/// `Required`:
+/// - `<subject>` the string to search for matches.
+/// - `<pattern>` pattern to match.
+///
+/// `Optional`:
+/// - `<position>` number of characters from the beginning of the string where the function starts searching for matches.
+///   Default: `1` (the search for a match starts at the first character on the left)
+/// - `<occurrence>` specifies which occurrence of the pattern to return.
+///   The function skips the first `occurrence - 1` matches. For example, if there are 5 matches and you specify 3 for the occurrence argument,
+///   the function ignores the first two matches and returns only the third match.
+///   Default: `1`
+/// - `<regex_parameters>` String of one or more characters that specifies the parameters used for searching for matches.
+///   Supported values:
+///   ---------------------------------------------------------------------------
+///   | Parameter       | Description                               |
+///   |-----------------|-------------------------------------------|
+///   | c               | Case-sensitive matching                   |
+///   | i               | Case-insensitive matching                 |
+///   | m               | Multi-line mode                           |
+///   | e               | Extract submatches                        |
+///   | s               | POSIX wildcard character `.` matches `\n` |
+///   ---------------------------------------------------------------------------
+///   Default: `c`
+/// - `<group_num>` the `group_num` parameter specifies which group to extract.
+///   Groups are specified by using parentheses in the regular expression.
+///   If a `group_num` is specified, it allows extraction even if the e option was not also specified.
+///   The e option is implied.
+///
+/// Example: `REGEXP_SUBSTR('nevermore1, nevermore2, nevermore3.', 'nevermore')`
 #[derive(Debug)]
 pub struct RegexpSubstrFunc {
     signature: Signature,
@@ -114,7 +151,7 @@ impl RegexpSubstrFunc {
             },
         )?;
 
-        let regexp_parameters = args.get(4).map_or_else(
+        let regex_parameters = args.get(4).map_or_else(
             || Ok("c"),
             |value| match value {
                 ColumnarValue::Scalar(
@@ -147,7 +184,7 @@ impl RegexpSubstrFunc {
 
         let group_num = args.get(5).map_or_else(
             || {
-                if regexp_parameters.contains('e') {
+                if regex_parameters.contains('e') {
                     Ok(1)
                 } else {
                     Ok(0)
@@ -173,7 +210,7 @@ impl RegexpSubstrFunc {
             },
         )?;
 
-        Ok((position, occurrence, regexp_parameters, group_num))
+        Ok((position, occurrence, regex_parameters, group_num))
     }
 }
 
@@ -233,7 +270,7 @@ impl ScalarUDFImpl for RegexpSubstrFunc {
             }
         };
 
-        let (position, occurrence, regexp_parameters, group_num) =
+        let (position, occurrence, regex_parameters, group_num) =
             Self::take_args_values(&args.args)?;
 
         //TODO: Or data_capacity: 1024
@@ -242,7 +279,7 @@ impl ScalarUDFImpl for RegexpSubstrFunc {
         match array.data_type() {
             DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => {
                 let string_array: &StringArray = as_generic_string_array(array)?;
-                let regex = pattern_to_regex(pattern, regexp_parameters)
+                let regex = pattern_to_regex(pattern, regex_parameters)
                     .context(regexp_errors::UnsupportedRegexSnafu)?;
                 regexp(string_array, &regex, position).for_each(|opt_iter| {
                     result_array.append_option(opt_iter.and_then(|mut cap_iter| {
diff --git a/crates/embucket-functions/src/tests/regexp/regexp_substr.rs b/crates/embucket-functions/src/tests/regexp/regexp_substr.rs
index 4b2ca212b..3ee83a7b2 100644
--- a/crates/embucket-functions/src/tests/regexp/regexp_substr.rs
+++ b/crates/embucket-functions/src/tests/regexp/regexp_substr.rs
@@ -2,7 +2,7 @@ use crate::test_query;
 
 test_query!(
     regexp_substr_basic_scalar,
-    "SELECT REGEXP_SUBSTR('nevermore1, nevermore2, nevermore3.', 'nevermore\\d')",
+    "SELECT REGEXP_SUBSTR('nevermore1, nevermore2, nevermore3.', 'nevermore')",
     snapshot_path = "regexp_substr"
 );
 
diff --git a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap
index 86f833e73..310a51b42 100644
--- a/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap
+++ b/crates/embucket-functions/src/tests/regexp/snapshots/regexp_substr/query_regexp_substr_basic_scalar.snap
@@ -1,13 +1,13 @@
 ---
 source: crates/embucket-functions/src/tests/regexp/regexp_substr.rs
-description: "\"SELECT REGEXP_SUBSTR('nevermore1, nevermore2, nevermore3.', 'nevermore\\\\d')\""
+description: "\"SELECT REGEXP_SUBSTR('nevermore1, nevermore2, nevermore3.', 'nevermore')\""
 ---
 Ok(
     [
-        "+--------------------------------------------------------------------------------+",
-        "| regexp_substr(Utf8(\"nevermore1, nevermore2, nevermore3.\"),Utf8(\"nevermore\\d\")) |",
-        "+--------------------------------------------------------------------------------+",
-        "| nevermore1                                                                     |",
-        "+--------------------------------------------------------------------------------+",
+        "+------------------------------------------------------------------------------+",
+        "| regexp_substr(Utf8(\"nevermore1, nevermore2, nevermore3.\"),Utf8(\"nevermore\")) |",
+        "+------------------------------------------------------------------------------+",
+        "| nevermore                                                                    |",
+        "+------------------------------------------------------------------------------+",
     ],
 )

From c799b58d48256065664831f9338222ec9aa2179c Mon Sep 17 00:00:00 2001
From: DanCodedThis <daniil.vysotskyi@gmail.com>
Date: Wed, 6 Aug 2025 20:12:42 +0300
Subject: [PATCH 3/3] fix deleting registered functions

---
 .../generated_snowflake_functions.rs          | 106 ------------------
 .../helper/implemented_functions.csv          |  19 ++++
 2 files changed, 19 insertions(+), 106 deletions(-)

diff --git a/crates/embucket-functions/src/visitors/unimplemented/generated_snowflake_functions.rs b/crates/embucket-functions/src/visitors/unimplemented/generated_snowflake_functions.rs
index 1b820a03d..624651910 100644
--- a/crates/embucket-functions/src/visitors/unimplemented/generated_snowflake_functions.rs
+++ b/crates/embucket-functions/src/visitors/unimplemented/generated_snowflake_functions.rs
@@ -643,40 +643,6 @@ pub const CONVERSION_FUNCTIONS: &[(&str, FunctionInfo)] = &[
     .with_docs("https://docs.snowflake.com/en/sql-reference/functions/try_to_geometry")
     .with_subcategory("geospatial")
     ),
-    ("TRY_TO_TIMESTAMP", FunctionInfo::new(
-        "TRY_TO_TIMESTAMP",
-        "Converts an input expression into the corresponding timestamp."
-    )
-    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/to_timestamp")
-    .with_subcategory("datetime")
-    ),
-    ("TRY_TO_TIMESTAMP", FunctionInfo::new(
-        "TRY_TO_TIMESTAMP",
-        "A special version of TO_TIMESTAMP / TO_TIMESTAMP_* that performs the same operation (i.e. converts an input expression into a timestamp), but with error-handling support (i.e. if the conversion cannot be performed, it returns a NULL value instead of raising an error)."
-    )
-    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/try_to_timestamp")
-    ),
-    ("TRY_TO_TIMESTAMP_LTZ", FunctionInfo::new(
-        "TRY_TO_TIMESTAMP_LTZ",
-        "Converts an input expression into the corresponding timestamp."
-    )
-    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/to_timestamp")
-    .with_subcategory("datetime")
-    ),
-    ("TRY_TO_TIMESTAMP_NTZ", FunctionInfo::new(
-        "TRY_TO_TIMESTAMP_NTZ",
-        "Converts an input expression into the corresponding timestamp."
-    )
-    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/to_timestamp")
-    .with_subcategory("datetime")
-    ),
-    ("TRY_TO_TIMESTAMP_TZ", FunctionInfo::new(
-        "TRY_TO_TIMESTAMP_TZ",
-        "Converts an input expression into the corresponding timestamp."
-    )
-    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/to_timestamp")
-    .with_subcategory("datetime")
-    ),
 ];
 
 pub const DATA_METRIC_FUNCTIONS: &[(&str, FunctionInfo)] = &[
@@ -746,102 +712,36 @@ pub const DATA_QUALITY_FUNCTIONS: &[(&str, FunctionInfo)] = &[
 ];
 
 pub const DATETIME_FUNCTIONS: &[(&str, FunctionInfo)] = &[
-    ("DAY", FunctionInfo::new(
-        "DAY",
-        "Extracts the corresponding date part from a date or timestamp."
-    )
-    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
-    ),
-    ("DAYOFMONTH", FunctionInfo::new(
-        "DAYOFMONTH",
-        "Extracts the corresponding date part from a date or timestamp."
-    )
-    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
-    ),
-    ("DAYOFWEEK", FunctionInfo::new(
-        "DAYOFWEEK",
-        "Extracts the corresponding date part from a date or timestamp."
-    )
-    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
-    ),
-    ("DAYOFWEEKISO", FunctionInfo::new(
-        "DAYOFWEEKISO",
-        "Extracts the corresponding date part from a date or timestamp."
-    )
-    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
-    ),
-    ("DAYOFYEAR", FunctionInfo::new(
-        "DAYOFYEAR",
-        "Extracts the corresponding date part from a date or timestamp."
-    )
-    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
-    ),
     ("EXTRACT", FunctionInfo::new(
         "EXTRACT",
         "Extracts the specified date or time part from a date, time, or timestamp."
     )
     .with_docs("https://docs.snowflake.com/en/sql-reference/functions/extract")
     ),
-    ("HOUR", FunctionInfo::new(
-        "HOUR",
-        "Extracts the corresponding time part from a time or timestamp value."
-    )
-    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/hour-minute-second")
-    ),
     ("LAST_SUCCESSFUL_SCHEDULED_TIME", FunctionInfo::new(
         "LAST_SUCCESSFUL_SCHEDULED_TIME",
         "Returns the timestamp representing the scheduled time for the most recent successful evaluation of the alert condition, where no errors occurred when executing the action."
     )
     .with_docs("https://docs.snowflake.com/en/sql-reference/functions/last_successful_scheduled_time")
     ),
-    ("MINUTE", FunctionInfo::new(
-        "MINUTE",
-        "Extracts the corresponding time part from a time or timestamp value."
-    )
-    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/hour-minute-second")
-    ),
-    ("MONTH", FunctionInfo::new(
-        "MONTH",
-        "Extracts the corresponding date part from a date or timestamp."
-    )
-    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
-    ),
     ("MONTHS_BETWEEN", FunctionInfo::new(
         "MONTHS_BETWEEN",
         "Returns the number of months between two DATE or TIMESTAMP values."
     )
     .with_docs("https://docs.snowflake.com/en/sql-reference/functions/months_between")
     ),
-    ("QUARTER", FunctionInfo::new(
-        "QUARTER",
-        "Extracts the corresponding date part from a date or timestamp."
-    )
-    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
-    ),
     ("SCHEDULED_TIME", FunctionInfo::new(
         "SCHEDULED_TIME",
         "Returns the timestamp representing the scheduled time of the current alert."
     )
     .with_docs("https://docs.snowflake.com/en/sql-reference/functions/scheduled_time")
     ),
-    ("SECOND", FunctionInfo::new(
-        "SECOND",
-        "Extracts the corresponding time part from a time or timestamp value."
-    )
-    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/hour-minute-second")
-    ),
     ("TIME_SLICE", FunctionInfo::new(
         "TIME_SLICE",
         "Calculates the beginning or end of a “slice” of time, where the length of the slice is a multiple of a standard unit of time (minute, hour, day, etc.)."
     )
     .with_docs("https://docs.snowflake.com/en/sql-reference/functions/time_slice")
     ),
-    ("WEEK", FunctionInfo::new(
-        "WEEK",
-        "Extracts the corresponding date part from a date or timestamp."
-    )
-    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
-    ),
     ("WEEKISO", FunctionInfo::new(
         "WEEKISO",
         "Extracts the corresponding date part from a date or timestamp."
@@ -854,12 +754,6 @@ pub const DATETIME_FUNCTIONS: &[(&str, FunctionInfo)] = &[
     )
     .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
     ),
-    ("YEAR", FunctionInfo::new(
-        "YEAR",
-        "Extracts the corresponding date part from a date or timestamp."
-    )
-    .with_docs("https://docs.snowflake.com/en/sql-reference/functions/year")
-    ),
     ("YEAROFWEEK", FunctionInfo::new(
         "YEAROFWEEK",
         "Extracts the corresponding date part from a date or timestamp."
diff --git a/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv b/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv
index b8325ef59..cbf495a07 100644
--- a/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv
+++ b/crates/embucket-functions/src/visitors/unimplemented/helper/implemented_functions.csv
@@ -134,7 +134,12 @@ datediff
 datefromparts
 datepart
 datetrunc
+day
 dayname
+dayofmonth
+dayofweek
+dayofweekiso
+dayofyear
 decode
 decrypt_raw
 degrees
@@ -167,6 +172,7 @@ grouping
 hex_decode_binary
 hex_decode_string
 hex_encode
+hour
 iff
 ifnull
 initcap
@@ -268,6 +274,8 @@ md5
 mean
 median
 min
+minute
+month
 monthname
 named_struct
 nanvl
@@ -296,6 +304,7 @@ position
 pow
 power
 previous_day
+quarter
 radians
 random
 range
@@ -327,6 +336,7 @@ row_number
 rpad
 rtrim
 rtrimmed_length
+second
 sha2
 sha224
 sha256
@@ -388,10 +398,13 @@ to_number
 to_numeric
 to_time
 to_timestamp
+to_timestamp_ltz
 to_timestamp_micros
 to_timestamp_millis
 to_timestamp_nanos
+to_timestamp_ntz
 to_timestamp_seconds
+to_timestamp_tz
 to_unixtime
 to_varchar
 to_variant
@@ -410,6 +423,10 @@ try_to_decimal
 try_to_number
 try_to_numeric
 try_to_time
+try_to_timestamp
+try_to_timestamp_ltz
+try_to_timestamp_ntz
+try_to_timestamp_tz
 try_to_varchar
 typeof
 union_extract
@@ -422,4 +439,6 @@ var_samp
 var_sample
 variant_element
 version
+week
+year
 zeroifnull