
Commit 08ec4a7

address comments

1 parent: 561b925

File tree

12 files changed: +181 -144 lines changed


docs/sql-programming-guide.md

Lines changed: 9 additions & 9 deletions
```diff
@@ -956,17 +956,17 @@ adds support for finding tables in the MetaStore and writing queries using HiveQ
 When you create a Hive table, you need to define how this table should read/write data from/to file system,
 i.e. the "input format" and "output format". You also need to define how this table should deserialize the data
 to rows, or serialize rows to data, i.e. the "serde". The following options can be used to specify the storage
-format("serde", "input format", "output format"), e.g. `CREATE TABLE src(id int) USING hive OPTIONS(format 'parquet')`.
+format("serde", "input format", "output format"), e.g. `CREATE TABLE src(id int) USING hive OPTIONS(fileFormat 'parquet')`.
 By default, we will read the table files as plain text. Note that, Hive storage handler is not supported yet when
 creating table, you can create a table using storage handler at Hive side, and use Spark SQL to read it.
 
 <table class="table">
   <tr><th>Property Name</th><th>Meaning</th></tr>
   <tr>
-    <td><code>format</code></td>
+    <td><code>fileFormat</code></td>
     <td>
-      A format is kind of a package of storage format specification, including "serde", "input format" and
-      "output format". Currently we supports 6 formats: 'sequencefile', 'rcfile', 'orc', 'parquet', 'textfile' and 'avro'.
+      A fileFormat is kind of a package of storage format specifications, including "serde", "input format" and
+      "output format". Currently we support 6 fileFormats: 'sequencefile', 'rcfile', 'orc', 'parquet', 'textfile' and 'avro'.
     </td>
   </tr>
 
@@ -975,23 +975,23 @@ creating table, you can create a table using storage handler at Hive side, and u
     <td>
       These 2 options specify the name of a corresponding `InputFormat` and `OutputFormat` class as a string literal,
       e.g. `org.apache.hadoop.hive.ql.io.orc.OrcInputFormat`. These 2 options must be appeared in pair, and you can not
-      specify them if you already specified the `format` option.
+      specify them if you already specified the `fileFormat` option.
     </td>
   </tr>
 
   <tr>
     <td><code>serde</code></td>
     <td>
-      This option specifies the name of a serde class. When the `format` option is specified, do not specify this option
-      if the given `format` already include the information of serde. Currently "sequencefile", "textfile" and "rcfile"
-      don't include the serde information and you can use this option with these 3 formats.
+      This option specifies the name of a serde class. When the `fileFormat` option is specified, do not specify this option
+      if the given `fileFormat` already include the information of serde. Currently "sequencefile", "textfile" and "rcfile"
+      don't include the serde information and you can use this option with these 3 fileFormats.
     </td>
   </tr>
 
   <tr>
     <td><code>fieldDelim, escapeDelim, collectionDelim, mapkeyDelim, lineDelim</code></td>
     <td>
-      These options can only be used with "textfile" format. They define how to read delimited files into rows.
+      These options can only be used with "textfile" fileFormat. They define how to read delimited files into rows.
     </td>
   </tr>
 </table>
```
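
The renamed option in use, as a minimal sketch assuming a Hive-enabled `SparkSession` named `spark` (table names are illustrative):

```scala
// `fileFormat` bundles serde, input format and output format in one option.
spark.sql("CREATE TABLE src(id INT) USING hive OPTIONS(fileFormat 'parquet')")

// 'textfile' bundles no serde, so the textfile-only delimiter options still apply.
spark.sql(
  """CREATE TABLE logs(line STRING) USING hive
    |OPTIONS(fileFormat 'textfile', fieldDelim ',')
  """.stripMargin)
```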

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala

Lines changed: 4 additions & 14 deletions
```diff
@@ -377,7 +377,9 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder {
     val storage = DataSource.buildStorageFormatFromOptions(options)
 
     if (location.isDefined && storage.locationUri.isDefined) {
-      throw new ParseException("Cannot specify LOCATION when there is 'path' in OPTIONS.", ctx)
+      throw new ParseException(
+        "LOCATION and 'path' in OPTIONS are both used to indicate the custom table path, " +
+        "you can only specify one of them.", ctx)
     }
     val customLocation = storage.locationUri.orElse(location)
 
@@ -1060,19 +1062,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder {
     val schema = StructType(dataCols ++ partitionCols)
 
     // Storage format
-    val defaultStorage: CatalogStorageFormat = {
-      val defaultStorageType = conf.getConfString("hive.default.fileformat", "textfile")
-      val defaultHiveSerde = HiveSerDe.sourceToSerDe(defaultStorageType)
-      CatalogStorageFormat(
-        locationUri = None,
-        inputFormat = defaultHiveSerde.flatMap(_.inputFormat)
-          .orElse(Some("org.apache.hadoop.mapred.TextInputFormat")),
-        outputFormat = defaultHiveSerde.flatMap(_.outputFormat)
-          .orElse(Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")),
-        serde = defaultHiveSerde.flatMap(_.serde),
-        compressed = false,
-        properties = Map())
-    }
+    val defaultStorage = HiveSerDe.getDefaultStorage(conf)
     validateRowFormatFileFormat(ctx.rowFormat, ctx.createFileFormat, ctx)
     val fileStorage = Option(ctx.createFileFormat).map(visitCreateFileFormat)
       .getOrElse(CatalogStorageFormat.empty)
```
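
Both clauses in the reworded check name a custom table path; a sketch of a hypothetical query that would trip it (paths and table name are illustrative):

```scala
// OPTIONS(path ...) and LOCATION conflict, so parsing fails with the new message.
spark.sql(
  """CREATE TABLE t(i INT) USING parquet
    |OPTIONS(path '/tmp/t') LOCATION '/tmp/other'
  """.stripMargin)
// ParseException: LOCATION and 'path' in OPTIONS are both used to indicate the
// custom table path, you can only specify one of them.
```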

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala

Lines changed: 0 additions & 13 deletions
```diff
@@ -449,19 +449,6 @@ case class PreWriteCheck(conf: SQLConf, catalog: SessionCatalog)
         // The relation in l is not an InsertableRelation.
         failAnalysis(s"$l does not allow insertion.")
 
-      case CreateTable(table, _, query) if DDLUtils.isHiveTable(table) =>
-        if (table.bucketSpec.isDefined) {
-          throw new AnalysisException("Creating bucketed Hive serde table is not supported yet.")
-        }
-        if (table.partitionColumnNames.nonEmpty && query.isDefined) {
-          val errorMessage = "A Create Table As Select (CTAS) statement is not allowed to " +
-            "create a partitioned table using Hive's file formats. " +
-            "Please use the syntax of \"CREATE TABLE tableName USING dataSource " +
-            "OPTIONS (...) PARTITIONED BY ...\" to create a partitioned table through a " +
-            "CTAS statement."
-          throw new AnalysisException(errorMessage)
-        }
-
       case _ => // OK
     }
   }
```

sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala

Lines changed: 49 additions & 35 deletions
```diff
@@ -17,12 +17,49 @@
 
 package org.apache.spark.sql.internal
 
+import org.apache.spark.sql.catalyst.catalog.CatalogStorageFormat
+
 case class HiveSerDe(
     inputFormat: Option[String] = None,
     outputFormat: Option[String] = None,
     serde: Option[String] = None)
 
 object HiveSerDe {
+  val serdeMap = Map(
+    "sequencefile" ->
+      HiveSerDe(
+        inputFormat = Option("org.apache.hadoop.mapred.SequenceFileInputFormat"),
+        outputFormat = Option("org.apache.hadoop.mapred.SequenceFileOutputFormat")),
+
+    "rcfile" ->
+      HiveSerDe(
+        inputFormat = Option("org.apache.hadoop.hive.ql.io.RCFileInputFormat"),
+        outputFormat = Option("org.apache.hadoop.hive.ql.io.RCFileOutputFormat"),
+        serde = Option("org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe")),
+
+    "orc" ->
+      HiveSerDe(
+        inputFormat = Option("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat"),
+        outputFormat = Option("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat"),
+        serde = Option("org.apache.hadoop.hive.ql.io.orc.OrcSerde")),
+
+    "parquet" ->
+      HiveSerDe(
+        inputFormat = Option("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"),
+        outputFormat = Option("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"),
+        serde = Option("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")),
+
+    "textfile" ->
+      HiveSerDe(
+        inputFormat = Option("org.apache.hadoop.mapred.TextInputFormat"),
+        outputFormat = Option("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")),
+
+    "avro" ->
+      HiveSerDe(
+        inputFormat = Option("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat"),
+        outputFormat = Option("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat"),
+        serde = Option("org.apache.hadoop.hive.serde2.avro.AvroSerDe")))
+
   /**
    * Get the Hive SerDe information from the data source abbreviation string or classname.
    *
@@ -31,41 +68,6 @@ object HiveSerDe {
    * @return HiveSerDe associated with the specified source
    */
   def sourceToSerDe(source: String): Option[HiveSerDe] = {
-    val serdeMap = Map(
-      "sequencefile" ->
-        HiveSerDe(
-          inputFormat = Option("org.apache.hadoop.mapred.SequenceFileInputFormat"),
-          outputFormat = Option("org.apache.hadoop.mapred.SequenceFileOutputFormat")),
-
-      "rcfile" ->
-        HiveSerDe(
-          inputFormat = Option("org.apache.hadoop.hive.ql.io.RCFileInputFormat"),
-          outputFormat = Option("org.apache.hadoop.hive.ql.io.RCFileOutputFormat"),
-          serde = Option("org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe")),
-
-      "orc" ->
-        HiveSerDe(
-          inputFormat = Option("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat"),
-          outputFormat = Option("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat"),
-          serde = Option("org.apache.hadoop.hive.ql.io.orc.OrcSerde")),
-
-      "parquet" ->
-        HiveSerDe(
-          inputFormat = Option("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"),
-          outputFormat = Option("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"),
-          serde = Option("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe")),
-
-      "textfile" ->
-        HiveSerDe(
-          inputFormat = Option("org.apache.hadoop.mapred.TextInputFormat"),
-          outputFormat = Option("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")),
-
-      "avro" ->
-        HiveSerDe(
-          inputFormat = Option("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat"),
-          outputFormat = Option("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat"),
-          serde = Option("org.apache.hadoop.hive.serde2.avro.AvroSerDe")))
-
     val key = source.toLowerCase match {
       case s if s.startsWith("org.apache.spark.sql.parquet") => "parquet"
       case s if s.startsWith("org.apache.spark.sql.orc") => "orc"
@@ -77,4 +79,16 @@ object HiveSerDe {
 
     serdeMap.get(key)
   }
+
+  def getDefaultStorage(conf: SQLConf): CatalogStorageFormat = {
+    val defaultStorageType = conf.getConfString("hive.default.fileformat", "textfile")
+    val defaultHiveSerde = sourceToSerDe(defaultStorageType)
+    CatalogStorageFormat.empty.copy(
+      inputFormat = defaultHiveSerde.flatMap(_.inputFormat)
+        .orElse(Some("org.apache.hadoop.mapred.TextInputFormat")),
+      outputFormat = defaultHiveSerde.flatMap(_.outputFormat)
+        .orElse(Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")),
+      serde = defaultHiveSerde.flatMap(_.serde)
+        .orElse(Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")))
+  }
 }
```
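
With `serdeMap` hoisted out of `sourceToSerDe` and the default-storage logic centralized in `getDefaultStorage`, the parser and the planner now share one lookup path. A sketch of the resulting API, assuming an active Hive-enabled `spark` session:

```scala
import org.apache.spark.sql.internal.HiveSerDe

// Abbreviations (and known classnames) resolve through the shared serdeMap.
val orc = HiveSerDe.sourceToSerDe("orc")
// orc.get.serde == Some("org.apache.hadoop.hive.ql.io.orc.OrcSerde")

// Without a "hive.default.fileformat" override, getDefaultStorage falls back to
// TextInputFormat / HiveIgnoreKeyTextOutputFormat / LazySimpleSerDe.
val defaultStorage = HiveSerDe.getDefaultStorage(spark.sessionState.conf)
```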

sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala

Lines changed: 2 additions & 1 deletion
```diff
@@ -121,7 +121,8 @@ class SparkSqlParserSuite extends PlanTest {
       tableType: CatalogTableType = CatalogTableType.MANAGED,
       storage: CatalogStorageFormat = CatalogStorageFormat.empty.copy(
         inputFormat = HiveSerDe.sourceToSerDe("textfile").get.inputFormat,
-        outputFormat = HiveSerDe.sourceToSerDe("textfile").get.outputFormat),
+        outputFormat = HiveSerDe.sourceToSerDe("textfile").get.outputFormat,
+        serde = Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")),
       schema: StructType = new StructType,
       provider: Option[String] = Some("hive"),
       partitionColumnNames: Seq[String] = Seq.empty,
```

sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala

Lines changed: 8 additions & 4 deletions
```diff
@@ -245,7 +245,8 @@ class DDLCommandSuite extends PlanTest {
       val ct = parseAs[CreateTable](query)
       val hiveSerde = HiveSerDe.sourceToSerDe(s)
       assert(hiveSerde.isDefined)
-      assert(ct.tableDesc.storage.serde == hiveSerde.get.serde)
+      assert(ct.tableDesc.storage.serde ==
+        hiveSerde.get.serde.orElse(Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")))
       assert(ct.tableDesc.storage.inputFormat == hiveSerde.get.inputFormat)
       assert(ct.tableDesc.storage.outputFormat == hiveSerde.get.outputFormat)
     }
@@ -262,8 +263,10 @@ class DDLCommandSuite extends PlanTest {
     assert(parsed1.tableDesc.storage.serde == Some("anything"))
     assert(parsed1.tableDesc.storage.inputFormat == Some("inputfmt"))
     assert(parsed1.tableDesc.storage.outputFormat == Some("outputfmt"))
+
     val parsed2 = parseAs[CreateTable](query2)
-    assert(parsed2.tableDesc.storage.serde.isEmpty)
+    assert(parsed2.tableDesc.storage.serde ==
+      Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"))
     assert(parsed2.tableDesc.storage.inputFormat == Some("inputfmt"))
     assert(parsed2.tableDesc.storage.outputFormat == Some("outputfmt"))
   }
@@ -297,7 +300,8 @@ class DDLCommandSuite extends PlanTest {
       val ct = parseAs[CreateTable](query)
       val hiveSerde = HiveSerDe.sourceToSerDe(s)
       assert(hiveSerde.isDefined)
-      assert(ct.tableDesc.storage.serde == hiveSerde.get.serde)
+      assert(ct.tableDesc.storage.serde ==
+        hiveSerde.get.serde.orElse(Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")))
       assert(ct.tableDesc.storage.inputFormat == hiveSerde.get.inputFormat)
       assert(ct.tableDesc.storage.outputFormat == hiveSerde.get.outputFormat)
     } else {
@@ -427,7 +431,7 @@ class DDLCommandSuite extends PlanTest {
     val e = intercept[ParseException] {
       parser.parsePlan(v2)
     }
-    assert(e.message.contains("Cannot specify LOCATION when there is 'path' in OPTIONS"))
+    assert(e.message.contains("you can only specify one of them."))
   }
 
   // ALTER TABLE table_name RENAME TO new_table_name;
```

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala

Lines changed: 15 additions & 21 deletions
```diff
@@ -36,39 +36,33 @@ import org.apache.spark.sql.internal.{HiveSerDe, SQLConf}
  */
 class DetermineHiveSerde(conf: SQLConf) extends Rule[LogicalPlan] {
   override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
-    case c @ CreateTable(t, _, _) if DDLUtils.isHiveTable(t) && t.storage.inputFormat.isEmpty =>
-      if (t.bucketSpec.nonEmpty) {
-        throw new AnalysisException("Cannot create bucketed Hive serde table.")
+    case c @ CreateTable(t, _, query) if DDLUtils.isHiveTable(t) && t.storage.serde.isEmpty =>
+      if (t.bucketSpec.isDefined) {
+        throw new AnalysisException("Creating bucketed Hive serde table is not supported yet.")
       }
-
-      val defaultStorage: CatalogStorageFormat = {
-        val defaultStorageType = conf.getConfString("hive.default.fileformat", "textfile")
-        val defaultHiveSerde = HiveSerDe.sourceToSerDe(defaultStorageType)
-        CatalogStorageFormat(
-          locationUri = None,
-          inputFormat = defaultHiveSerde.flatMap(_.inputFormat)
-            .orElse(Some("org.apache.hadoop.mapred.TextInputFormat")),
-          outputFormat = defaultHiveSerde.flatMap(_.outputFormat)
-            .orElse(Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")),
-          serde = defaultHiveSerde.flatMap(_.serde)
-            .orElse(Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")),
-          compressed = false,
-          properties = Map())
+      if (t.partitionColumnNames.nonEmpty && query.isDefined) {
+        val errorMessage = "A Create Table As Select (CTAS) statement is not allowed to " +
+          "create a partitioned table using Hive's file formats. " +
+          "Please use the syntax of \"CREATE TABLE tableName USING dataSource " +
+          "OPTIONS (...) PARTITIONED BY ...\" to create a partitioned table through a " +
+          "CTAS statement."
+        throw new AnalysisException(errorMessage)
       }
 
+      val defaultStorage = HiveSerDe.getDefaultStorage(conf)
       val options = new HiveOptions(t.storage.properties)
 
-      val fileStorage = if (options.format.isDefined) {
-        HiveSerDe.sourceToSerDe(options.format.get) match {
+      val fileStorage = if (options.fileFormat.isDefined) {
+        HiveSerDe.sourceToSerDe(options.fileFormat.get) match {
           case Some(s) =>
             CatalogStorageFormat.empty.copy(
               inputFormat = s.inputFormat,
               outputFormat = s.outputFormat,
               serde = s.serde)
           case None =>
-            throw new IllegalArgumentException(s"invalid format: '${options.format.get}'")
+            throw new IllegalArgumentException(s"invalid fileFormat: '${options.fileFormat.get}'")
         }
-      } else if (options.inputFormat.isDefined) {
+      } else if (options.hasInputOutputFormat) {
         CatalogStorageFormat.empty.copy(
           inputFormat = options.inputFormat,
           outputFormat = options.outputFormat)
```
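
The bucketing and partitioned-CTAS checks moved here from `PreWriteCheck` and now fire while the serde is being determined. A sketch of a statement the relocated bucketing check would reject (table name and exact DDL shape are assumptions):

```scala
// Bucketed Hive serde tables remain unsupported, so this fails in DetermineHiveSerde.
spark.sql("CREATE TABLE b(id INT) USING hive CLUSTERED BY (id) INTO 4 BUCKETS")
// AnalysisException: Creating bucketed Hive serde table is not supported yet.
```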

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveOptions.scala

Lines changed: 23 additions & 11 deletions
```diff
@@ -20,14 +20,15 @@ package org.apache.spark.sql.hive.execution
 import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
 
 /**
- * Options for the Hive data source.
+ * Options for the Hive data source. Note that rule `DetermineHiveSerde` will extract Hive
+ * serde/format information from these options.
  */
 class HiveOptions(@transient private val parameters: CaseInsensitiveMap) extends Serializable {
   import HiveOptions._
 
   def this(parameters: Map[String, String]) = this(new CaseInsensitiveMap(parameters))
 
-  val format = parameters.get(FORMAT).map(_.toLowerCase)
+  val fileFormat = parameters.get(FILE_FORMAT).map(_.toLowerCase)
   val inputFormat = parameters.get(INPUT_FORMAT)
   val outputFormat = parameters.get(OUTPUT_FORMAT)
 
@@ -36,24 +37,35 @@ class HiveOptions(@transient private val parameters: CaseInsensitiveMap) extends
       "have to specify both of them.")
   }
 
-  if (format.isDefined && inputFormat.isDefined) {
-    throw new IllegalArgumentException("Cannot specify format and inputFormat/outputFormat " +
+  def hasInputOutputFormat: Boolean = inputFormat.isDefined
+
+  if (fileFormat.isDefined && inputFormat.isDefined) {
+    throw new IllegalArgumentException("Cannot specify fileFormat and inputFormat/outputFormat " +
       "together for Hive data source.")
   }
 
   val serde = parameters.get(SERDE)
 
-  for (f <- format if serde.isDefined) {
-    if (!Set("sequencefile", "textfile", "rcfile").contains(f)) {
-      throw new IllegalArgumentException(s"format '$f' already specifies a serde.")
+  if (fileFormat.isDefined && serde.isDefined) {
+    if (!Set("sequencefile", "textfile", "rcfile").contains(fileFormat.get)) {
+      throw new IllegalArgumentException(
+        s"fileFormat '${fileFormat.get}' already specifies a serde.")
     }
   }
 
   val containsDelimiters = delimiterOptions.keys.exists(parameters.contains)
 
-  for (f <- format if f != "textfile" && containsDelimiters) {
-    throw new IllegalArgumentException("Cannot specify delimiters as they are only compatible " +
-      s"with format 'textfile', not $f.")
+  if (containsDelimiters) {
+    if (serde.isDefined) {
+      throw new IllegalArgumentException("Cannot specify delimiters with a custom serde.")
+    }
+    if (fileFormat.isEmpty) {
+      throw new IllegalArgumentException("Cannot specify delimiters without fileFormat.")
+    }
+    if (fileFormat.get != "textfile") {
+      throw new IllegalArgumentException("Cannot specify delimiters as they are only compatible " +
+        s"with fileFormat 'textfile', not ${fileFormat.get}.")
+    }
   }
 
   for (lineDelim <- parameters.get("lineDelim") if lineDelim != "\n") {
@@ -74,7 +86,7 @@ object HiveOptions {
     name
   }
 
-  val FORMAT = newOption("format")
+  val FILE_FORMAT = newOption("fileFormat")
   val INPUT_FORMAT = newOption("inputFormat")
   val OUTPUT_FORMAT = newOption("outputFormat")
   val SERDE = newOption("serde")
```
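
A sketch of how the tightened validation treats option combinations (`HiveOptions` is internal to the hive module; constructed directly here only to illustrate the rules above):

```scala
import org.apache.spark.sql.hive.execution.HiveOptions

new HiveOptions(Map("fileFormat" -> "parquet"))   // ok: bundles serde + formats
new HiveOptions(Map("fileFormat" -> "textfile",
  "fieldDelim" -> ","))                           // ok: delimiters fit textfile
new HiveOptions(Map("fileFormat" -> "parquet",
  "serde" -> "MySerde"))                          // IllegalArgumentException: 'parquet' already specifies a serde
new HiveOptions(Map("fieldDelim" -> ","))         // IllegalArgumentException: delimiters need fileFormat 'textfile'
```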

sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -27,7 +27,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 import org.apache.spark.sql.types.StructType
 
-private[orc] object OrcFileOperator extends Logging {
+private[hive] object OrcFileOperator extends Logging {
   /**
    * Retrieves an ORC file reader from a given path. The path can point to either a directory or a
    * single ORC file. If it points to a directory, it picks any non-empty ORC file within that
```
