From b8177751e033f367795680dca6694d4a4e0cf0a3 Mon Sep 17 00:00:00 2001 From: Sergei Turukin Date: Wed, 5 Nov 2025 15:08:29 -0800 Subject: [PATCH 1/2] Reproduction and fix attempt #1 --- crates/core-executor/src/query.rs | 12 +++ .../src/tests/data/variant_double_quotes.csv | 3 + .../data/variant_no_enclosing_quotes.csv | 3 + crates/core-executor/src/tests/query.rs | 81 +++++++++++++++++++ ...o_variant_double_quotes_with_enclosed.snap | 16 ++++ .../query_variant_insert_baseline.snap | 15 ++++ .../src/conversion/to_varchar.rs | 11 +++ 7 files changed, 141 insertions(+) create mode 100644 crates/core-executor/src/tests/data/variant_double_quotes.csv create mode 100644 crates/core-executor/src/tests/data/variant_no_enclosing_quotes.csv create mode 100644 crates/core-executor/src/tests/snapshots/query_copy_into_variant_double_quotes_with_enclosed.snap create mode 100644 crates/core-executor/src/tests/snapshots/query_variant_insert_baseline.snap diff --git a/crates/core-executor/src/query.rs b/crates/core-executor/src/query.rs index c0947ab55..352d4906a 100644 --- a/crates/core-executor/src/query.rs +++ b/crates/core-executor/src/query.rs @@ -3510,6 +3510,18 @@ fn create_file_format( csv_format }; + // Handle field_optionally_enclosed_by parameter (quote character) + let csv_format = if let Some(quote) = get_kv_option(file_format, "field_optionally_enclosed_by") + { + if quote.len() == 1 { + csv_format.with_quote(quote.as_bytes()[0]) + } else { + csv_format + } + } else { + csv_format + }; + Ok(Some((Arc::new(csv_format), infer_schema))) } else if format_type.eq_ignore_ascii_case("json") { Ok(Some((Arc::new(JsonFormat::default()), true))) diff --git a/crates/core-executor/src/tests/data/variant_double_quotes.csv b/crates/core-executor/src/tests/data/variant_double_quotes.csv new file mode 100644 index 000000000..1fbf9953d --- /dev/null +++ b/crates/core-executor/src/tests/data/variant_double_quotes.csv @@ -0,0 +1,3 @@ +schema,version,uuid,contexts +jsonschema,1-0-0,5f56bf12-3959-404e-986f-92f0ec0edbbc,"[{""id"": ""1d110fd7-0f54-4452-8dc6-c4a3eac02024""}]" +other_schema,2-0-0,6g67cg23-4060-505f-097g-03g1fd1feeccd,"[{""id"": ""2e221ge8-1g65-5563-9ed7-d5b4fbd03135""}]" diff --git a/crates/core-executor/src/tests/data/variant_no_enclosing_quotes.csv b/crates/core-executor/src/tests/data/variant_no_enclosing_quotes.csv new file mode 100644 index 000000000..8e443ee4f --- /dev/null +++ b/crates/core-executor/src/tests/data/variant_no_enclosing_quotes.csv @@ -0,0 +1,3 @@ +schema,version,uuid,contexts +jsonschema,1-0-0,5f56bf12-3959-404e-986f-92f0ec0edbbc,[{""id"": ""1d110fd7-0f54-4452-8dc6-c4a3eac02024""}] +other_schema,2-0-0,6g67cg23-4060-505f-097g-03g1fd1feeccd,[{""id"": ""2e221ge8-1g65-5563-9ed7-d5b4fbd03135""}] diff --git a/crates/core-executor/src/tests/query.rs b/crates/core-executor/src/tests/query.rs index c6221b067..fa9f952ed 100644 --- a/crates/core-executor/src/tests/query.rs +++ b/crates/core-executor/src/tests/query.rs @@ -714,6 +714,87 @@ test_query!( ] ); +// Test baseline: INSERT INTO with VALUES - should work correctly +test_query!( + variant_insert_baseline, + "SELECT schema, version, uuid, contexts, contexts[0]:id::VARCHAR as extracted_id FROM embucket.public.variant_test;", + setup_queries = [ + "CREATE TABLE embucket.public.variant_test ( + schema VARCHAR, + version VARCHAR, + uuid VARCHAR, + contexts VARIANT );", + "INSERT INTO embucket.public.variant_test VALUES ('jsonschema', '1-0-0', '5f56bf12-3959-404e-986f-92f0ec0edbbc', '[{\"id\": \"1d110fd7-0f54-4452-8dc6-c4a3eac02024\"}]');" + ] +); + +// Test COPY INTO with VARIANT field and double quotes in CSV WITH FIELD_OPTIONALLY_ENCLOSED_BY +// CSV format: field with JSON containing quotes should be: "[{""id"": ""value""}]" +// The double quotes are CSV escape sequences and should be interpreted as single quotes +test_query!( + copy_into_variant_double_quotes_with_enclosed, + "SELECT schema, version, uuid, contexts, contexts[0]:id::VARCHAR as extracted_id FROM embucket.public.variant_csv_test;", + setup_queries = [ + "CREATE TABLE embucket.public.variant_csv_test ( + schema VARCHAR, + version VARCHAR, + uuid VARCHAR, + contexts VARIANT );", + "COPY INTO embucket.public.variant_csv_test FROM 's3://embucket-testdata/variant_double_quotes.csv' FILE_FORMAT = ( TYPE = CSV SKIP_HEADER = 1 FIELD_OPTIONALLY_ENCLOSED_BY = '\"' );" + ], + snowflake_error = true +); + +// Test COPY INTO WITHOUT FIELD_OPTIONALLY_ENCLOSED_BY - this might reproduce the bug +test_query!( + copy_into_variant_double_quotes_no_enclosed, + "SELECT schema, version, uuid, contexts, contexts[0]:id::VARCHAR as extracted_id FROM embucket.public.variant_csv_test2;", + setup_queries = [ + "CREATE TABLE embucket.public.variant_csv_test2 ( + schema VARCHAR, + version VARCHAR, + uuid VARCHAR, + contexts VARIANT );", + "COPY INTO embucket.public.variant_csv_test2 FROM 's3://embucket-testdata/variant_double_quotes.csv' FILE_FORMAT = ( TYPE = CSV SKIP_HEADER = 1 );" + ], + snowflake_error = true +); + +// Test COPY INTO with CSV field NOT enclosed in quotes (literal double quotes in the field) +// This should reproduce the bug where "" becomes literal "" in the JSON instead of being unescaped to " +test_query!( + copy_into_variant_literal_double_quotes, + "SELECT schema, version, uuid, contexts, contexts[0]:id::VARCHAR as extracted_id FROM embucket.public.variant_csv_test3;", + setup_queries = [ + "CREATE TABLE embucket.public.variant_csv_test3 ( + schema VARCHAR, + version VARCHAR, + uuid VARCHAR, + contexts VARIANT );", + "COPY INTO embucket.public.variant_csv_test3 FROM 's3://embucket-testdata/variant_no_enclosing_quotes.csv' FILE_FORMAT = ( TYPE = CSV SKIP_HEADER = 1 );" + ], + snowflake_error = true +); + +// Test to check if extracted values should have quotes or not +test_query!( + variant_extract_quote_investigation, + "SELECT + contexts[0]:id::VARCHAR as with_cast, + contexts[0]:id as without_cast, + length(contexts[0]:id::VARCHAR) as value_length, + contexts[0] as first_element + FROM embucket.public.variant_test;", + setup_queries = [ + "CREATE TABLE embucket.public.variant_test ( + schema VARCHAR, + version VARCHAR, + uuid VARCHAR, + contexts VARIANT );", + "INSERT INTO embucket.public.variant_test VALUES ('jsonschema', '1-0-0', '5f56bf12-3959-404e-986f-92f0ec0edbbc', '[{\"id\": \"1d110fd7-0f54-4452-8dc6-c4a3eac02024\"}]');" + ] +); + test_query!( timestamp_str_format, "SELECT diff --git a/crates/core-executor/src/tests/snapshots/query_copy_into_variant_double_quotes_with_enclosed.snap b/crates/core-executor/src/tests/snapshots/query_copy_into_variant_double_quotes_with_enclosed.snap new file mode 100644 index 000000000..4115f52bd --- /dev/null +++ b/crates/core-executor/src/tests/snapshots/query_copy_into_variant_double_quotes_with_enclosed.snap @@ -0,0 +1,16 @@ +--- +source: crates/core-executor/src/tests/query.rs +assertion_line: 734 +description: "\"SELECT schema, version, uuid, contexts, contexts[0]:id::VARCHAR as extracted_id FROM embucket.public.variant_csv_test;\"" +info: "Tests Snowflake Error; Setup queries: CREATE TABLE embucket.public.variant_csv_test (\n schema VARCHAR,\n version VARCHAR,\n uuid VARCHAR,\n contexts VARIANT );; COPY INTO embucket.public.variant_csv_test FROM 's3://embucket-testdata/variant_double_quotes.csv' FILE_FORMAT = ( TYPE = CSV SKIP_HEADER = 1 FIELD_OPTIONALLY_ENCLOSED_BY = '\"' );" +--- +Ok( + [ + "+--------------+---------+---------------------------------------+--------------------------------------------------+----------------------------------------+", + "| schema | version | uuid | contexts | extracted_id |", + "+--------------+---------+---------------------------------------+--------------------------------------------------+----------------------------------------+", + "| jsonschema | 1-0-0 | 5f56bf12-3959-404e-986f-92f0ec0edbbc | [{\"id\": \"1d110fd7-0f54-4452-8dc6-c4a3eac02024\"}] | \"1d110fd7-0f54-4452-8dc6-c4a3eac02024\" |", + "| other_schema | 2-0-0 | 6g67cg23-4060-505f-097g-03g1fd1feeccd | [{\"id\": \"2e221ge8-1g65-5563-9ed7-d5b4fbd03135\"}] | \"2e221ge8-1g65-5563-9ed7-d5b4fbd03135\" |", + "+--------------+---------+---------------------------------------+--------------------------------------------------+----------------------------------------+", + ], +) diff --git a/crates/core-executor/src/tests/snapshots/query_variant_insert_baseline.snap b/crates/core-executor/src/tests/snapshots/query_variant_insert_baseline.snap new file mode 100644 index 000000000..91170e016 --- /dev/null +++ b/crates/core-executor/src/tests/snapshots/query_variant_insert_baseline.snap @@ -0,0 +1,15 @@ +--- +source: crates/core-executor/src/tests/query.rs +assertion_line: 718 +description: "\"SELECT schema, version, uuid, contexts, contexts[0]:id::VARCHAR as extracted_id FROM embucket.public.variant_test;\"" +info: "Setup queries: CREATE TABLE embucket.public.variant_test (\n schema VARCHAR,\n version VARCHAR,\n uuid VARCHAR,\n contexts VARIANT );; INSERT INTO embucket.public.variant_test VALUES ('jsonschema', '1-0-0', '5f56bf12-3959-404e-986f-92f0ec0edbbc', '[{\"id\": \"1d110fd7-0f54-4452-8dc6-c4a3eac02024\"}]');" +--- +Ok( + [ + "+------------+---------+--------------------------------------+--------------------------------------------------+----------------------------------------+", + "| schema | version | uuid | contexts | extracted_id |", + "+------------+---------+--------------------------------------+--------------------------------------------------+----------------------------------------+", + "| jsonschema | 1-0-0 | 5f56bf12-3959-404e-986f-92f0ec0edbbc | [{\"id\": \"1d110fd7-0f54-4452-8dc6-c4a3eac02024\"}] | \"1d110fd7-0f54-4452-8dc6-c4a3eac02024\" |", + "+------------+---------+--------------------------------------+--------------------------------------------------+----------------------------------------+", + ], +) diff --git a/crates/embucket-functions/src/conversion/to_varchar.rs b/crates/embucket-functions/src/conversion/to_varchar.rs index cc8f1c956..1237c95da 100644 --- a/crates/embucket-functions/src/conversion/to_varchar.rs +++ b/crates/embucket-functions/src/conversion/to_varchar.rs @@ -572,6 +572,17 @@ fn convert_string_to_string( _try_mode: bool, ) -> DFResult> { let value = extract_string_from_array(array, index)?; + + // Check if this is a JSON-encoded string value (from VARIANT/json_get) + // JSON strings start and end with double quotes + // When casting to VARCHAR, we need to unwrap the JSON string to match Snowflake behavior + if value.len() >= 2 && value.starts_with('"') && value.ends_with('"') { + // Try to parse as JSON to properly unescape the string + if let Ok(serde_json::Value::String(unquoted)) = serde_json::from_str(&value) { + return Ok(Some(unquoted)); + } + } + Ok(Some(value)) } From f2c3d352732d71f7f405fcf8a69f844e3d1124d8 Mon Sep 17 00:00:00 2001 From: Sergei Turukin Date: Thu, 6 Nov 2025 04:28:40 -0800 Subject: [PATCH 2/2] Reproduction and fix attempt #2 --- .../datafusion/logical_analyzer/cast_analyzer.rs | 16 ++++++++++++++++ .../tests/data/variant_no_enclosing_quotes.csv | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/crates/core-executor/src/datafusion/logical_analyzer/cast_analyzer.rs b/crates/core-executor/src/datafusion/logical_analyzer/cast_analyzer.rs index a39df252a..8aac48ddf 100644 --- a/crates/core-executor/src/datafusion/logical_analyzer/cast_analyzer.rs +++ b/crates/core-executor/src/datafusion/logical_analyzer/cast_analyzer.rs @@ -101,6 +101,22 @@ impl CastAnalyzer { data_type @ (DataType::Decimal128(_, _) | DataType::Int32 | DataType::Int64) => { Self::rewrite_numeric_cast(expr, data_type, try_mode) } + DataType::Utf8 | DataType::LargeUtf8 => { + // Rewrite json_get::VARCHAR to json_get_str to get unwrapped string values + // This matches Snowflake behavior where casting VARIANT accessor to VARCHAR strips quotes + if let Expr::ScalarFunction(ScalarFunction { func, args }) = expr { + let func_name = func.name().to_lowercase(); + if func_name == "json_get" || func_name.ends_with(".json_get") { + // Replace json_get with json_get_str - it returns unwrapped strings + let json_get_str_udf = datafusion_functions_json::udfs::json_get_str_udf(); + return Ok(Transformed::yes(Expr::ScalarFunction(ScalarFunction { + func: json_get_str_udf, + args: args.clone(), + }))); + } + } + Ok(Transformed::no(original_expr.clone())) + } _ => Ok(Transformed::no(original_expr.clone())), } } diff --git a/crates/core-executor/src/tests/data/variant_no_enclosing_quotes.csv b/crates/core-executor/src/tests/data/variant_no_enclosing_quotes.csv index 8e443ee4f..28c2b5ec0 100644 --- a/crates/core-executor/src/tests/data/variant_no_enclosing_quotes.csv +++ b/crates/core-executor/src/tests/data/variant_no_enclosing_quotes.csv @@ -1,3 +1,3 @@ schema,version,uuid,contexts -jsonschema,1-0-0,5f56bf12-3959-404e-986f-92f0ec0edbbc,[{""id"": ""1d110fd7-0f54-4452-8dc6-c4a3eac02024""}] -other_schema,2-0-0,6g67cg23-4060-505f-097g-03g1fd1feeccd,[{""id"": ""2e221ge8-1g65-5563-9ed7-d5b4fbd03135""}] +jsonschema,1-0-0,5f56bf12-3959-404e-986f-92f0ec0edbbc,[{""deviceFamily"": ""Desktop"", ""osFamily"": ""Windows"", ""useragentFamily"": ""Safari""}] +other_schema,2-0-0,6g67cg23-4060-505f-097g-03g1fd1feeccd,[{""deviceFamily"": ""Desktop"", ""osFamily"": ""Windows"", ""useragentFamily"": ""Safari""}]