[SPARK-21912][SQL] Creating ORC datasource table should check invalid column names

dongjoon-hyun · dongjoon-hyun · commit 808dfe0fcd9d · 2017-09-04T13:46:15.000-07:00
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
@@ -34,12 +34,12 @@ import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
 
 import org.apache.spark.TaskContext
-import org.apache.spark.sql.{Row, SparkSession}
+import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.hive.{HiveInspectors, HiveShim}
-import org.apache.spark.sql.sources.{Filter, _}
+import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.SerializableConfiguration
 
@@ -83,6 +83,8 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
           classOf[MapRedOutputFormat[_, _]])
     }
 
+    dataSchema.map(_.name).foreach(checkFieldName)
+
     new OutputWriterFactory {
       override def newInstance(
           path: String,
@@ -169,6 +171,16 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
       }
     }
   }
+
+  /** Throws an AnalysisException when `name` contains an ORC schema special character. */
+  private def checkFieldName(name: String): Unit = {
+    // Scan chars rather than `matches(".*[...].*")`: regex `.` never matches line terminators.
+    if (name.exists(c => " ,;{}()\n\t=".indexOf(c) >= 0)) {
+      throw new AnalysisException(
+        s"""Attribute name "$name" contains invalid character(s) among " ,;{}()\\n\\t=".
+           |Please use alias to rename it.""".stripMargin.split("\n").mkString(" ").trim)
+    }
+  }
 }
 
 private[orc] class OrcSerializer(dataSchema: StructType, conf: Configuration)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -2000,4 +2000,13 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
       assert(setOfPath.size() == pathSizeToDeleteOnExit)
     }
   }
+
+  test("SPARK-21912 Creating ORC datasource table should check invalid column names") {
+    withTable("orc1") {
+      // The alias `a b` contains a space, so the CTAS must fail with an AnalysisException.
+      val e = intercept[AnalysisException](sql("CREATE TABLE orc1 USING ORC AS SELECT 1 `a b`"))
+      val expected = """Attribute name "a b" contains invalid character(s)"""
+      assert(e.getMessage.contains(expected))
+    }
+  }
 }

Original file line number	Diff line number	Diff line change
`@@ -2000,4 +2000,13 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {`
`2000`	`2000`	`assert(setOfPath.size() == pathSizeToDeleteOnExit)`
`2001`	`2001`	`}`
`2002`	`2002`	`}`
	`2003`	`+`
	`2004`	`+ test("SPARK-21912 Creating ORC datasource table should check invalid column names") {`
	`2005`	`+ withTable("orc1") {`
	`2006`	`+ val m = intercept[AnalysisException] {`
	`2007`	``+ sql("CREATE TABLE orc1 USING ORC AS SELECT 1 `a b`")``
	`2008`	`+ }.getMessage`
	`2009`	`+ assert(m.contains("""Attribute name "a b" contains invalid character(s)"""))`
	`2010`	`+ }`
	`2011`	`+ }`
`2003`	`2012`	`}`