Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,22 @@ impl CastAnalyzer {
data_type @ (DataType::Decimal128(_, _) | DataType::Int32 | DataType::Int64) => {
Self::rewrite_numeric_cast(expr, data_type, try_mode)
}
DataType::Utf8 | DataType::LargeUtf8 => {
// Rewrite json_get::VARCHAR to json_get_str to get unwrapped string values
// This matches Snowflake behavior where casting VARIANT accessor to VARCHAR strips quotes
if let Expr::ScalarFunction(ScalarFunction { func, args }) = expr {
let func_name = func.name().to_lowercase();
if func_name == "json_get" || func_name.ends_with(".json_get") {
// Replace json_get with json_get_str - it returns unwrapped strings
let json_get_str_udf = datafusion_functions_json::udfs::json_get_str_udf();
return Ok(Transformed::yes(Expr::ScalarFunction(ScalarFunction {
func: json_get_str_udf,
args: args.clone(),
})));
}
}
Ok(Transformed::no(original_expr.clone()))
}
_ => Ok(Transformed::no(original_expr.clone())),
}
}
Expand Down
12 changes: 12 additions & 0 deletions crates/core-executor/src/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3510,6 +3510,18 @@ fn create_file_format(
csv_format
};

// Handle field_optionally_enclosed_by parameter (quote character)
let csv_format = if let Some(quote) = get_kv_option(file_format, "field_optionally_enclosed_by")
{
if quote.len() == 1 {
csv_format.with_quote(quote.as_bytes()[0])
} else {
csv_format
}
} else {
csv_format
};

Ok(Some((Arc::new(csv_format), infer_schema)))
} else if format_type.eq_ignore_ascii_case("json") {
Ok(Some((Arc::new(JsonFormat::default()), true)))
Expand Down
3 changes: 3 additions & 0 deletions crates/core-executor/src/tests/data/variant_double_quotes.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
schema,version,uuid,contexts
jsonschema,1-0-0,5f56bf12-3959-404e-986f-92f0ec0edbbc,"[{""id"": ""1d110fd7-0f54-4452-8dc6-c4a3eac02024""}]"
other_schema,2-0-0,6g67cg23-4060-505f-097g-03g1fd1feeccd,"[{""id"": ""2e221ge8-1g65-5563-9ed7-d5b4fbd03135""}]"
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
schema,version,uuid,contexts
jsonschema,1-0-0,5f56bf12-3959-404e-986f-92f0ec0edbbc,[{""deviceFamily"": ""Desktop"", ""osFamily"": ""Windows"", ""useragentFamily"": ""Safari""}]
other_schema,2-0-0,6g67cg23-4060-505f-097g-03g1fd1feeccd,[{""deviceFamily"": ""Desktop"", ""osFamily"": ""Windows"", ""useragentFamily"": ""Safari""}]
81 changes: 81 additions & 0 deletions crates/core-executor/src/tests/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -714,6 +714,87 @@ test_query!(
]
);

// Baseline: populate the VARIANT column through INSERT ... VALUES (no CSV parsing involved)
// and read it back, including `contexts[0]:id::VARCHAR` - the accessor-plus-cast expression
// that the COPY INTO tests in this file also select. This path is expected to work correctly.
test_query!(
variant_insert_baseline,
"SELECT schema, version, uuid, contexts, contexts[0]:id::VARCHAR as extracted_id FROM embucket.public.variant_test;",
setup_queries = [
"CREATE TABLE embucket.public.variant_test (
schema VARCHAR,
version VARCHAR,
uuid VARCHAR,
contexts VARIANT );",
"INSERT INTO embucket.public.variant_test VALUES ('jsonschema', '1-0-0', '5f56bf12-3959-404e-986f-92f0ec0edbbc', '[{\"id\": \"1d110fd7-0f54-4452-8dc6-c4a3eac02024\"}]');"
]
);

// COPY INTO a VARIANT column from a CSV file whose JSON field is wrapped in double quotes,
// with FIELD_OPTIONALLY_ENCLOSED_BY = '"' set explicitly in the file format.
// In the CSV the field is written as: "[{""id"": ""value""}]"
// Each doubled quote ("") is the CSV escape sequence for ONE literal double-quote character,
// so the value loaded into the column should be the JSON text [{"id": "value"}].
test_query!(
copy_into_variant_double_quotes_with_enclosed,
"SELECT schema, version, uuid, contexts, contexts[0]:id::VARCHAR as extracted_id FROM embucket.public.variant_csv_test;",
setup_queries = [
"CREATE TABLE embucket.public.variant_csv_test (
schema VARCHAR,
version VARCHAR,
uuid VARCHAR,
contexts VARIANT );",
"COPY INTO embucket.public.variant_csv_test FROM 's3://embucket-testdata/variant_double_quotes.csv' FILE_FORMAT = ( TYPE = CSV SKIP_HEADER = 1 FIELD_OPTIONALLY_ENCLOSED_BY = '\"' );"
],
snowflake_error = true
);

// Loads the same variant_double_quotes.csv file as
// copy_into_variant_double_quotes_with_enclosed, but WITHOUT FIELD_OPTIONALLY_ENCLOSED_BY,
// i.e. with no quote character configured explicitly in the file format.
// This might reproduce the quote-handling bug (doubled quotes "" surviving in the stored
// JSON instead of being unescaped to a single ").
test_query!(
copy_into_variant_double_quotes_no_enclosed,
"SELECT schema, version, uuid, contexts, contexts[0]:id::VARCHAR as extracted_id FROM embucket.public.variant_csv_test2;",
setup_queries = [
"CREATE TABLE embucket.public.variant_csv_test2 (
schema VARCHAR,
version VARCHAR,
uuid VARCHAR,
contexts VARIANT );",
"COPY INTO embucket.public.variant_csv_test2 FROM 's3://embucket-testdata/variant_double_quotes.csv' FILE_FORMAT = ( TYPE = CSV SKIP_HEADER = 1 );"
],
snowflake_error = true
);

// COPY INTO from variant_no_enclosing_quotes.csv, where the JSON field is NOT wrapped in
// enclosing quotes but still contains doubled double quotes ("") inside it.
// No FIELD_OPTIONALLY_ENCLOSED_BY is given in the file format.
// This should reproduce the bug where "" is kept as a literal "" in the JSON instead of
// being unescaped to a single ".
test_query!(
copy_into_variant_literal_double_quotes,
"SELECT schema, version, uuid, contexts, contexts[0]:id::VARCHAR as extracted_id FROM embucket.public.variant_csv_test3;",
setup_queries = [
"CREATE TABLE embucket.public.variant_csv_test3 (
schema VARCHAR,
version VARCHAR,
uuid VARCHAR,
contexts VARIANT );",
"COPY INTO embucket.public.variant_csv_test3 FROM 's3://embucket-testdata/variant_no_enclosing_quotes.csv' FILE_FORMAT = ( TYPE = CSV SKIP_HEADER = 1 );"
],
snowflake_error = true
);

// Investigation: select the same VARIANT path with and without a ::VARCHAR cast to see
// whether the extracted string keeps its surrounding JSON double quotes.
// Output columns:
//   with_cast     - contexts[0]:id cast to VARCHAR
//   without_cast  - the raw accessor result, no cast
//   value_length  - length of the cast value (shows whether the quotes are counted)
//   first_element - contexts[0], the first array element
// NOTE(review): this uses the same table name (embucket.public.variant_test) as
// variant_insert_baseline - confirm each test_query! runs against isolated state.
test_query!(
variant_extract_quote_investigation,
"SELECT
contexts[0]:id::VARCHAR as with_cast,
contexts[0]:id as without_cast,
length(contexts[0]:id::VARCHAR) as value_length,
contexts[0] as first_element
FROM embucket.public.variant_test;",
setup_queries = [
"CREATE TABLE embucket.public.variant_test (
schema VARCHAR,
version VARCHAR,
uuid VARCHAR,
contexts VARIANT );",
"INSERT INTO embucket.public.variant_test VALUES ('jsonschema', '1-0-0', '5f56bf12-3959-404e-986f-92f0ec0edbbc', '[{\"id\": \"1d110fd7-0f54-4452-8dc6-c4a3eac02024\"}]');"
]
);

test_query!(
timestamp_str_format,
"SELECT
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
---
source: crates/core-executor/src/tests/query.rs
assertion_line: 734
description: "\"SELECT schema, version, uuid, contexts, contexts[0]:id::VARCHAR as extracted_id FROM embucket.public.variant_csv_test;\""
info: "Tests Snowflake Error; Setup queries: CREATE TABLE embucket.public.variant_csv_test (\n schema VARCHAR,\n version VARCHAR,\n uuid VARCHAR,\n contexts VARIANT );; COPY INTO embucket.public.variant_csv_test FROM 's3://embucket-testdata/variant_double_quotes.csv' FILE_FORMAT = ( TYPE = CSV SKIP_HEADER = 1 FIELD_OPTIONALLY_ENCLOSED_BY = '\"' );"
---
Ok(
[
"+--------------+---------+---------------------------------------+--------------------------------------------------+----------------------------------------+",
"| schema | version | uuid | contexts | extracted_id |",
"+--------------+---------+---------------------------------------+--------------------------------------------------+----------------------------------------+",
"| jsonschema | 1-0-0 | 5f56bf12-3959-404e-986f-92f0ec0edbbc | [{\"id\": \"1d110fd7-0f54-4452-8dc6-c4a3eac02024\"}] | \"1d110fd7-0f54-4452-8dc6-c4a3eac02024\" |",
"| other_schema | 2-0-0 | 6g67cg23-4060-505f-097g-03g1fd1feeccd | [{\"id\": \"2e221ge8-1g65-5563-9ed7-d5b4fbd03135\"}] | \"2e221ge8-1g65-5563-9ed7-d5b4fbd03135\" |",
"+--------------+---------+---------------------------------------+--------------------------------------------------+----------------------------------------+",
],
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
source: crates/core-executor/src/tests/query.rs
assertion_line: 718
description: "\"SELECT schema, version, uuid, contexts, contexts[0]:id::VARCHAR as extracted_id FROM embucket.public.variant_test;\""
info: "Setup queries: CREATE TABLE embucket.public.variant_test (\n schema VARCHAR,\n version VARCHAR,\n uuid VARCHAR,\n contexts VARIANT );; INSERT INTO embucket.public.variant_test VALUES ('jsonschema', '1-0-0', '5f56bf12-3959-404e-986f-92f0ec0edbbc', '[{\"id\": \"1d110fd7-0f54-4452-8dc6-c4a3eac02024\"}]');"
---
Ok(
[
"+------------+---------+--------------------------------------+--------------------------------------------------+----------------------------------------+",
"| schema | version | uuid | contexts | extracted_id |",
"+------------+---------+--------------------------------------+--------------------------------------------------+----------------------------------------+",
"| jsonschema | 1-0-0 | 5f56bf12-3959-404e-986f-92f0ec0edbbc | [{\"id\": \"1d110fd7-0f54-4452-8dc6-c4a3eac02024\"}] | \"1d110fd7-0f54-4452-8dc6-c4a3eac02024\" |",
"+------------+---------+--------------------------------------+--------------------------------------------------+----------------------------------------+",
],
)
11 changes: 11 additions & 0 deletions crates/embucket-functions/src/conversion/to_varchar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -572,6 +572,17 @@ fn convert_string_to_string(
_try_mode: bool,
) -> DFResult<Option<String>> {
let value = extract_string_from_array(array, index)?;

// Check if this is a JSON-encoded string value (from VARIANT/json_get)
// JSON strings start and end with double quotes
// When casting to VARCHAR, we need to unwrap the JSON string to match Snowflake behavior
if value.len() >= 2 && value.starts_with('"') && value.ends_with('"') {
// Try to parse as JSON to properly unescape the string
if let Ok(serde_json::Value::String(unquoted)) = serde_json::from_str(&value) {
return Ok(Some(unquoted));
}
}

Ok(Some(value))
}

Expand Down
Loading