diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 8c093a9db899..b0f17630c910 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -252,6 +252,11 @@ config_namespace! { /// string length and thus DataFusion can not enforce such limits. pub support_varchar_with_length: bool, default = true + /// If true, `VARCHAR` is mapped to `Utf8View` during SQL planning. + /// If false, `VARCHAR` is mapped to `Utf8` during SQL planning. + /// Default is false. + pub map_varchar_to_utf8view: bool, default = false + /// When set to true, the source locations relative to the original SQL /// query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected /// and recorded in the logical plan nodes. diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 0e83156ab53f..f4b0fd0c125f 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -489,6 +489,7 @@ impl SessionState { enable_options_value_normalization: sql_parser_options .enable_options_value_normalization, support_varchar_with_length: sql_parser_options.support_varchar_with_length, + map_varchar_to_utf8view: sql_parser_options.map_varchar_to_utf8view, collect_spans: sql_parser_options.collect_spans, } } diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index bc7c2b7f4377..daaf70f95304 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -54,6 +54,8 @@ pub struct ParserOptions { pub enable_options_value_normalization: bool, /// Whether to collect spans pub collect_spans: bool, + /// Whether `VARCHAR` is mapped to `Utf8View` during SQL planning. + pub map_varchar_to_utf8view: bool, } impl ParserOptions { @@ -72,6 +74,7 @@ impl ParserOptions { parse_float_as_decimal: false, enable_ident_normalization: true, support_varchar_with_length: true, + map_varchar_to_utf8view: false, enable_options_value_normalization: false, collect_spans: false, } @@ -111,6 +114,12 @@ impl ParserOptions { self } + /// Sets the `map_varchar_to_utf8view` option. + pub fn with_map_varchar_to_utf8view(mut self, value: bool) -> Self { + self.map_varchar_to_utf8view = value; + self + } + /// Sets the `enable_options_value_normalization` option. pub fn with_enable_options_value_normalization(mut self, value: bool) -> Self { self.enable_options_value_normalization = value; @@ -136,6 +145,7 @@ impl From<&SqlParserOptions> for ParserOptions { parse_float_as_decimal: options.parse_float_as_decimal, enable_ident_normalization: options.enable_ident_normalization, support_varchar_with_length: options.support_varchar_with_length, + map_varchar_to_utf8view: options.map_varchar_to_utf8view, enable_options_value_normalization: options .enable_options_value_normalization, collect_spans: options.collect_spans, @@ -558,7 +568,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { SQLDataType::Varchar(length) => { match (length, self.options.support_varchar_with_length) { (Some(_), false) => plan_err!("does not support Varchar with length, please set `support_varchar_with_length` to be true"), - _ => Ok(DataType::Utf8), + _ => { + if self.options.map_varchar_to_utf8view { + Ok(DataType::Utf8View) + } else { + Ok(DataType::Utf8) + } + } } } SQLDataType::UnsignedBigInt(_) | SQLDataType::UnsignedInt8(_) => Ok(DataType::UInt64), diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 1df18302687e..b98763a9d6b5 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -92,6 +92,7 @@ fn parse_decimals() { parse_float_as_decimal: true, enable_ident_normalization: false, support_varchar_with_length: false, + map_varchar_to_utf8view: false, enable_options_value_normalization: false, collect_spans: false, }, @@ -148,6 +149,7 @@ fn parse_ident_normalization() { parse_float_as_decimal: false, enable_ident_normalization, support_varchar_with_length: false, + map_varchar_to_utf8view: false, enable_options_value_normalization: false, collect_spans: false, }, diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt index 6f75a7d7f8fd..bc15f2210380 100644 --- a/datafusion/sqllogictest/test_files/ddl.slt +++ b/datafusion/sqllogictest/test_files/ddl.slt @@ -827,3 +827,31 @@ drop table table_with_pk; statement ok set datafusion.catalog.information_schema = false; + +# Test VARCHAR is mapped to Utf8View during SQL planning when setting map_varchar_to_utf8view to true +statement ok +CREATE TABLE t1(c1 VARCHAR(10) NOT NULL, c2 VARCHAR); + +query TTT +DESCRIBE t1; +---- +c1 Utf8 NO +c2 Utf8 YES + +statement ok +set datafusion.sql_parser.map_varchar_to_utf8view = true; + +statement ok +CREATE TABLE t2(c1 VARCHAR(10) NOT NULL, c2 VARCHAR); + +query TTT +DESCRIBE t2; +---- +c1 Utf8View NO +c2 Utf8View YES + +statement ok +DROP TABLE t1; + +statement ok +DROP TABLE t2; diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 454055b53930..496f24abf6ed 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -263,6 +263,7 @@ datafusion.sql_parser.collect_spans false datafusion.sql_parser.dialect generic datafusion.sql_parser.enable_ident_normalization true datafusion.sql_parser.enable_options_value_normalization false +datafusion.sql_parser.map_varchar_to_utf8view false datafusion.sql_parser.parse_float_as_decimal false datafusion.sql_parser.recursion_limit 50 datafusion.sql_parser.support_varchar_with_length true @@ -361,6 +362,7 @@ datafusion.sql_parser.collect_spans false When set to true, the source locations datafusion.sql_parser.dialect generic Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. datafusion.sql_parser.enable_ident_normalization true When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) datafusion.sql_parser.enable_options_value_normalization false When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. +datafusion.sql_parser.map_varchar_to_utf8view false If true, `VARCHAR` is mapped to `Utf8View` during SQL planning. If false, `VARCHAR` is mapped to `Utf8` during SQL planning. Default is false. datafusion.sql_parser.parse_float_as_decimal false When set to true, SQL parser will parse float as decimal type datafusion.sql_parser.recursion_limit 50 Specifies the recursion depth limit when parsing complex SQL Queries datafusion.sql_parser.support_varchar_with_length true If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 635eb2b0a67f..b6b53cfe49b3 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -128,5 +128,6 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.sql_parser.enable_options_value_normalization | false | When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. | | datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. | | datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. | +| datafusion.sql_parser.map_varchar_to_utf8view | false | If true, `VARCHAR` is mapped to `Utf8View` during SQL planning. If false, `VARCHAR` is mapped to `Utf8` during SQL planning. Default is false. | | datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes. | | datafusion.sql_parser.recursion_limit | 50 | Specifies the recursion depth limit when parsing complex SQL Queries |