From c4fbc9c48b2fbcc028c22b30ac1c9d216993bb43 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 20 Jan 2022 11:13:10 +0800 Subject: [PATCH 1/5] [SPARK-37965][SQL]Remove check field name when reading/writing existing data in Orc --- .../sql/execution/datasources/orc/OrcFileFormat.scala | 9 --------- 1 file changed, 9 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index ce851c58cc4f..39a876316053 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -228,13 +228,4 @@ class OrcFileFormat case _ => false } - - override def supportFieldName(name: String): Boolean = { - try { - TypeDescription.fromString(s"struct<`$name`:int>") - true - } catch { - case _: IllegalArgumentException => false - } - } } From c3d6386521a81afe296afcabd08e4f6c54b7c39a Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Fri, 21 Jan 2022 11:03:04 +0800 Subject: [PATCH 2/5] Revert "[SPARK-37965][SQL]Remove check field name when reading/writing existing data in Orc" This reverts commit c4fbc9c48b2fbcc028c22b30ac1c9d216993bb43. 
--- .../sql/execution/datasources/orc/OrcFileFormat.scala | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index 39a876316053..ce851c58cc4f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -228,4 +228,13 @@ class OrcFileFormat case _ => false } + + override def supportFieldName(name: String): Boolean = { + try { + TypeDescription.fromString(s"struct<`$name`:int>") + true + } catch { + case _: IllegalArgumentException => false + } + } } From d2406e6c472ceefabf97b2fe610a0508b6524f70 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Fri, 21 Jan 2022 11:19:26 +0800 Subject: [PATCH 3/5] Add UT --- .../org/apache/spark/sql/SQLQuerySuite.scala | 19 +++++++++++++++++++ .../sql/hive/execution/SQLQuerySuite.scala | 5 +++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index d7f18ee801d7..9bce02124ddb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -4243,6 +4243,25 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark checkAnswer(df3, df4) } } + + test("SPARK-37965: Spark support read/write orc file with invalid char in field name") { + withTempDir { dir => + Seq((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), (2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22)) + .toDF("max(t)", "max(t", "=", "\n", ";", "a b", "{", ".", "a.b", "a", ",") + .repartition(1) + .write.mode(SaveMode.Overwrite).orc(dir.getAbsolutePath) + val df = spark.read.orc(dir.getAbsolutePath) + checkAnswer(df, 
+ Row(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) :: + Row(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22) :: Nil) + assert(df.schema.names.sameElements( + Array("max(t)", "max(t", "=", "\n", ";", "a b", "{", ".", "a.b", "a", ","))) + checkAnswer(df.select("`max(t)`", "`a b`", "`{`", "`.`", "`a.b`"), + Row(1, 6, 7, 8, 9) :: Row(2, 12, 14, 16, 18) :: Nil) + checkAnswer(df.where("`a.b` > 10"), + Row(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22) :: Nil) + } + } } case class Foo(bar: Option[String]) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 1829f38fe577..3fd5949067a9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -2248,8 +2248,9 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi } test("SPARK-32889: ORC table column name supports special characters") { - // " " "," is not allowed. - Seq("$", ";", "{", "}", "(", ")", "\n", "\t", "=").foreach { name => + // "," is not allowed because a table having a column whose name contains commas + // cannot be created in the Hive metastore. 
+ Seq("$", ";", "{", "}", "(", ")", "\n", "\t", "=", " ", "a b").foreach { name => val source = "ORC" Seq(s"CREATE TABLE t32889(`$name` INT) USING $source", s"CREATE TABLE t32889 STORED AS $source AS SELECT 1 `$name`", From 8e3a409e982aa6820603c532360b45cf54e99332 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Fri, 21 Jan 2022 16:32:37 +0800 Subject: [PATCH 4/5] Update OrcFileFormat.scala --- .../sql/execution/datasources/orc/OrcFileFormat.scala | 9 --------- 1 file changed, 9 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index ce851c58cc4f..39a876316053 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -228,13 +228,4 @@ class OrcFileFormat case _ => false } - - override def supportFieldName(name: String): Boolean = { - try { - TypeDescription.fromString(s"struct<`$name`:int>") - true - } catch { - case _: IllegalArgumentException => false - } - } } From fd5d1d9c938eddaec92d6f17d474e3d2597500a3 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Fri, 21 Jan 2022 22:16:08 +0800 Subject: [PATCH 5/5] re-trigger