From 0a2cca725860a7b69c5057a221466fc0dcbe62bb Mon Sep 17 00:00:00 2001 From: jzc Date: Tue, 15 Sep 2020 21:02:16 +0800 Subject: [PATCH 1/3] orc table column name supports special characters. --- .../datasources/orc/OrcFileFormat.scala | 7 +++++- .../sql/hive/execution/SQLQuerySuite.scala | 23 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index 69badb4f7d59..5439e7943367 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -38,6 +38,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.util.{SerializableConfiguration, Utils} @@ -45,7 +46,11 @@ import org.apache.spark.util.{SerializableConfiguration, Utils} private[sql] object OrcFileFormat { private def checkFieldName(name: String): Unit = { try { - TypeDescription.fromString(s"struct<$name:int>") + if (SQLConf.get.getConfString("spark.sql.orc.column.allowSpecialChar", "false").toBoolean) { + TypeDescription.fromString(s"struct<`$name`:int>") + } else { + TypeDescription.fromString(s"struct<$name:int>") + } } catch { case _: IllegalArgumentException => throw new AnalysisException( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 431790e1fbb6..6b8ac2e5b658 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -2242,6 +2242,29 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi } } + test("SPARK-32889 ORC table column name supports special characters like $ eg.") { + Seq("$").foreach { name => + Seq("ORC").foreach { source => + withSQLConf("spark.sql.orc.column.allowSpecialChar" -> "true") { + Seq(s"CREATE TABLE t32889(`col$name` INT) USING $source", + s"CREATE TABLE t32889 STORED AS $source AS SELECT 1 `col$name`", + s"CREATE TABLE t32889 USING $source AS SELECT 1 `col$name`", + s"CREATE TABLE t32889(`col$name` INT) USING hive OPTIONS (fileFormat '$source')") + .foreach { command => + withTable("t32889") { + sql(command) + } + } + + withTable("t32889") { + sql(s"CREATE TABLE t32889(`col` INT) USING $source") + sql(s"ALTER TABLE t32889 ADD COLUMNS(`col$name` INT)") + } + } + } + } + } + Seq("orc", "parquet").foreach { format => test(s"SPARK-18355 Read data from a hive table with a new column - $format") { val client = From c3c7f4cbd7d9b16ae4ebccd73d1f8d03f4446e8a Mon Sep 17 00:00:00 2001 From: jzc Date: Thu, 17 Sep 2020 12:03:50 +0800 Subject: [PATCH 2/3] add orc datasource column name check. 
---
 .../datasources/orc/OrcFileFormat.scala       |  7 +-
 .../spark/sql/FileBasedDataSourceSuite.scala  | 13 +++
 .../sql/hive/execution/SQLQuerySuite.scala    | 87 ++++++++++---------
 3 files changed, 58 insertions(+), 49 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala
index 5439e7943367..8e9a566d4597 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala
@@ -38,7 +38,6 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
 import org.apache.spark.sql.execution.datasources._
-import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
 import org.apache.spark.util.{SerializableConfiguration, Utils}
@@ -46,11 +45,7 @@ import org.apache.spark.util.{SerializableConfiguration, Utils}
 private[sql] object OrcFileFormat {
   private def checkFieldName(name: String): Unit = {
     try {
-      if (SQLConf.get.getConfString("spark.sql.orc.column.allowSpecialChar", "false").toBoolean) {
-        TypeDescription.fromString(s"struct<`$name`:int>")
-      } else {
-        TypeDescription.fromString(s"struct<$name:int>")
-      }
+      TypeDescription.fromString(s"struct<`$name`:int>")
     } catch {
       case _: IllegalArgumentException =>
         throw new AnalysisException(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
index 48b2e22457e3..5a732f500372 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala
@@ -233,6 +233,19 @@ class FileBasedDataSourceSuite extends QueryTest
     }
   }
 
+  test("column name supports special characters using orc") {
+    val format = "orc"
+    Seq("$", " ", ",", ";", "{", "}", "(", ")", "\n", "\t", "=").foreach { name =>
+      withTempDir { dir =>
+        val dataDir = new File(dir, "file").getCanonicalPath
+        Seq(1).toDF(name).write.orc(dataDir)
+        val schema = spark.read.orc(dataDir).schema
+        assert(schema.size == 1)
+        assertResult(name)(schema(0).name)
+      }
+    }
+  }
+
   // Text file format only supports string type
   test("SPARK-24691 error handling for unsupported types - text") {
     withTempDir { dir =>
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 6b8ac2e5b658..44c1833b8d00 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -2206,62 +2206,63 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi
     }
   }
 
-  test("SPARK-21912 ORC/Parquet table should not create invalid column names") {
+  test("SPARK-21912 Parquet table should not create invalid column names") {
     Seq(" ", ",", ";", "{", "}", "(", ")", "\n", "\t", "=").foreach { name =>
-      Seq("ORC", "PARQUET").foreach { source =>
-        withTable("t21912") {
-          val m = intercept[AnalysisException] {
-            sql(s"CREATE TABLE t21912(`col$name` INT) USING $source")
-          }.getMessage
-          assert(m.contains(s"contains invalid character(s)"))
-
-          val m1 = intercept[AnalysisException] {
-            sql(s"CREATE TABLE t21912 STORED AS $source AS SELECT 1 `col$name`")
-          }.getMessage
-          assert(m1.contains(s"contains invalid character(s)"))
-
-          val m2 = intercept[AnalysisException] {
-            sql(s"CREATE TABLE t21912 USING $source AS SELECT 1 `col$name`")
-          }.getMessage
-          assert(m2.contains(s"contains invalid character(s)"))
-
-          withSQLConf(HiveUtils.CONVERT_METASTORE_PARQUET.key -> "false") {
-            val m3 = intercept[AnalysisException] {
-              sql(s"CREATE TABLE t21912(`col$name` INT) USING hive OPTIONS (fileFormat '$source')")
-            }.getMessage
-            assert(m3.contains(s"contains invalid character(s)"))
-          }
-
-          sql(s"CREATE TABLE t21912(`col` INT) USING $source")
-          val m4 = intercept[AnalysisException] {
-            sql(s"ALTER TABLE t21912 ADD COLUMNS(`col$name` INT)")
-          }.getMessage
-          assert(m4.contains(s"contains invalid character(s)"))
-        }
+      val source = "PARQUET"
+      withTable("t21912") {
+        val m = intercept[AnalysisException] {
+          sql(s"CREATE TABLE t21912(`col$name` INT) USING $source")
+        }.getMessage
+        assert(m.contains(s"contains invalid character(s)"))
+
+        val m1 = intercept[AnalysisException] {
+          sql(s"CREATE TABLE t21912 STORED AS $source AS SELECT 1 `col$name`")
+        }.getMessage
+        assert(m1.contains(s"contains invalid character(s)"))
+
+        val m2 = intercept[AnalysisException] {
+          sql(s"CREATE TABLE t21912 USING $source AS SELECT 1 `col$name`")
+        }.getMessage
+        assert(m2.contains(s"contains invalid character(s)"))
+
+        withSQLConf(HiveUtils.CONVERT_METASTORE_PARQUET.key -> "false") {
+          val m3 = intercept[AnalysisException] {
+            sql(s"CREATE TABLE t21912(`col$name` INT) USING hive OPTIONS (fileFormat '$source')")
+          }.getMessage
+          assert(m3.contains(s"contains invalid character(s)"))
+        }
+
+        sql(s"CREATE TABLE t21912(`col` INT) USING $source")
+        val m4 = intercept[AnalysisException] {
+          sql(s"ALTER TABLE t21912 ADD COLUMNS(`col$name` INT)")
+        }.getMessage
+        assert(m4.contains(s"contains invalid character(s)"))
       }
     }
   }
 
   test("SPARK-32889 ORC table column name supports special characters like $ eg.") {
-    Seq("$").foreach { name =>
-      Seq("ORC").foreach { source =>
-        withSQLConf("spark.sql.orc.column.allowSpecialChar" -> "true") {
-          Seq(s"CREATE TABLE t32889(`col$name` INT) USING $source",
-            s"CREATE TABLE t32889 STORED AS $source AS SELECT 1 `col$name`",
-            s"CREATE TABLE t32889 USING $source AS SELECT 1 `col$name`",
-            s"CREATE TABLE t32889(`col$name` INT) USING hive OPTIONS (fileFormat '$source')")
-            .foreach { command =>
-              withTable("t32889") {
-                sql(command)
-              }
-            }
-
-          withTable("t32889") {
-            sql(s"CREATE TABLE t32889(`col` INT) USING $source")
-            sql(s"ALTER TABLE t32889 ADD COLUMNS(`col$name` INT)")
-          }
-        }
+    // " " "," is not allowed.
+ Seq("$", ";", "{", "}", "(", ")", "\n", "\t", "=").foreach { name => + val source = "ORC" + Seq(s"CREATE TABLE t32889(`$name` INT) USING $source", + s"CREATE TABLE t32889 STORED AS $source AS SELECT 1 `$name`", + s"CREATE TABLE t32889 USING $source AS SELECT 1 `$name`", + s"CREATE TABLE t32889(`$name` INT) USING hive OPTIONS (fileFormat '$source')") + .foreach { command => + withTable("t32889") { + sql(command) + assertResult(name)( + sessionState.catalog.getTableMetadata(TableIdentifier("t32889")).schema.fields(0).name) } } + + withTable("t32889") { + sql(s"CREATE TABLE t32889(`col` INT) USING $source") + sql(s"ALTER TABLE t32889 ADD COLUMNS(`$name` INT)") + assertResult(name)( + sessionState.catalog.getTableMetadata(TableIdentifier("t32889")).schema.fields(1).name) + } } } From 7d63fd96462fda9b820873a679464fe2662e6869 Mon Sep 17 00:00:00 2001 From: jzc Date: Thu, 17 Sep 2020 14:34:23 +0800 Subject: [PATCH 3/3] review fix. --- .../spark/sql/FileBasedDataSourceSuite.scala | 19 ++++++++++--------- .../sql/hive/execution/SQLQuerySuite.scala | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala index 5a732f500372..523ca1e68e78 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala @@ -233,15 +233,16 @@ class FileBasedDataSourceSuite extends QueryTest } } - test("column name supports special characters using orc") { - val format = "orc" - Seq("$", " ", ",", ";", "{", "}", "(", ")", "\n", "\t", "=").foreach { name => - withTempDir { dir => - val dataDir = new File(dir, "file").getCanonicalPath - Seq(1).toDF(name).write.orc(dataDir) - val schema = spark.read.orc(dataDir).schema - assert(schema.size == 1) - assertResult(name)(schema(0).name) + Seq("json", "orc").foreach { format => + test(s"SPARK-32889: column name supports special characters using $format") { + Seq("$", " ", ",", ";", "{", "}", "(", ")", "\n", "\t", "=").foreach { name => + withTempDir { dir => + val dataDir = new File(dir, "file").getCanonicalPath + Seq(1).toDF(name).write.format(format).save(dataDir) + val schema = spark.read.format(format).load(dataDir).schema + assert(schema.size == 1) + assertResult(name)(schema.head.name) + } } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 44c1833b8d00..a69a949e3a3a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -2241,7 +2241,7 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi } } - test("SPARK-32889 ORC table column name supports special characters like $ eg.") { + test("SPARK-32889: ORC table column name supports special characters") { // " " "," is not allowed. Seq("$", ";", "{", "}", "(", ")", "\n", "\t", "=").foreach { name => val source = "ORC"
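
Note for reviewers: the heart of PATCH 2/3 is that OrcFileFormat.checkFieldName
now always probes ORC's schema parser with a back-quoted field name. A minimal
sketch of the difference, using only the orc-core API (the name "col$" is
illustrative; paste into a Scala REPL with orc-core on the classpath):

  import org.apache.orc.TypeDescription

  // Unquoted, as before this series: ORC's parser rejects special characters,
  // so checkFieldName mapped a name like "col$" to an AnalysisException saying
  // the name "contains invalid character(s)".
  //   TypeDescription.fromString("struct<col$:int>")   // IllegalArgumentException
  // Back-quoted, as in the patched checkFieldName: parses fine.
  val ok = TypeDescription.fromString("struct<`col$`:int>")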
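End to end, the same behavior through the public API — a minimal, self-contained
sketch (the object name, master URL, and temp path are illustrative, and it
assumes a build that includes PATCH 2/3):

  import java.nio.file.Files

  import org.apache.spark.sql.SparkSession

  object OrcSpecialCharColumnDemo {
    def main(args: Array[String]): Unit = {
      val spark = SparkSession.builder()
        .appName("orc-special-char-column-demo")
        .master("local[1]")
        .getOrCreate()
      import spark.implicits._

      // Write an ORC file whose single column name contains a special character;
      // before this series the write failed schema validation with
      // "contains invalid character(s)".
      val path = Files.createTempDirectory("orc-demo").resolve("data").toString
      Seq(1, 2, 3).toDF("col$").write.orc(path)

      // The special-character name survives the round trip, mirroring the new
      // FileBasedDataSourceSuite test.
      val df = spark.read.orc(path)
      assert(df.schema.head.name == "col$")
      df.show()

      spark.stop()
    }
  }

As the updated SQLQuerySuite test notes, " " and "," are still not accepted for
ORC table columns created through SQL DDL, while the DataFrame writer path
exercised above (and in the FileBasedDataSourceSuite test) covers them as well.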