
Commit 7d09bad

Use SQLConfEntry in HiveContext
1 parent 49f6213 commit 7d09bad

7 files changed: +103 −35 lines changed


sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala

Lines changed: 18 additions & 0 deletions
@@ -157,6 +157,24 @@ private[spark] object SQLConf {
       }
       _v
     }, _.toString, doc, isPublic)
+
+  def seqConf[T](
+      key: String,
+      valueConverter: String => T,
+      defaultValue: Option[Seq[T]] = None,
+      doc: String = "",
+      isPublic: Boolean = true): SQLConfEntry[Seq[T]] = {
+    SQLConfEntry(
+      key, defaultValue, _.split(",").map(valueConverter), _.mkString(","), doc, isPublic)
+  }
+
+  def stringSeqConf(
+      key: String,
+      defaultValue: Option[Seq[String]] = None,
+      doc: String = "",
+      isPublic: Boolean = true): SQLConfEntry[Seq[String]] = {
+    seqConf(key, s => s, defaultValue, doc, isPublic)
+  }
 }

 import SQLConfEntry._
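
For orientation: the new seqConf/stringSeqConf helpers store a Seq[T] as a comma separated string and parse it back on read. A minimal usage sketch, assuming access to the private[sql] SQLConfEntry factory and a SQLConf instance (as in the test suite below); the key name and default are illustrative only:

    // Hypothetical entry; key, default value, and doc are for illustration.
    val SAMPLE_PREFIXES = SQLConfEntry.stringSeqConf(
      "spark.sql.example.prefixes",
      defaultValue = Some(Seq("com.example.a")),
      doc = "Example of a comma separated list exposed as a Seq[String].")

    val conf = new SQLConf
    conf.setConf(SAMPLE_PREFIXES, Seq("x", "y"))                    // stored internally as "x,y"
    assert(conf.getConfString("spark.sql.example.prefixes") === "x,y")
    assert(conf.getConf(SAMPLE_PREFIXES, Nil) === Seq("x", "y"))    // parsed back into a Seq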

sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala

Lines changed: 1 addition & 2 deletions
@@ -21,8 +21,6 @@ import java.beans.Introspector
 import java.util.Properties
 import java.util.concurrent.atomic.AtomicReference

-import org.apache.spark.sql.SQLConf.SQLConfEntry
-
 import scala.collection.JavaConversions._
 import scala.collection.immutable
 import scala.language.implicitConversions
@@ -33,6 +31,7 @@ import org.apache.spark.SparkContext
 import org.apache.spark.annotation.{DeveloperApi, Experimental}
 import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SQLConf.SQLConfEntry
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.errors.DialectException
 import org.apache.spark.sql.catalyst.expressions._

sql/core/src/test/scala/org/apache/spark/sql/SQLConfEntrySuite.scala

Lines changed: 15 additions & 0 deletions
@@ -132,4 +132,19 @@ class SQLConfEntrySuite extends SparkFunSuite {
     }
     assert(e.getMessage === s"The value of $key should be one of a, b, c, but was d")
   }
+
+  test("stringSeqConf") {
+    val key = "spark.sql.SQLConfEntrySuite.stringSeq"
+    val confEntry = SQLConfEntry.stringSeqConf("spark.sql.SQLConfEntrySuite.stringSeq",
+      defaultValue = Some(Nil))
+    assert(conf.getConf(confEntry, Seq("a", "b", "c")) === Seq("a", "b", "c"))
+
+    conf.setConf(confEntry, Seq("a", "b", "c", "d"))
+    assert(conf.getConf(confEntry, Seq("a", "b", "c")) === Seq("a", "b", "c", "d"))
+
+    conf.setConfString(key, "a,b,c,d,e")
+    assert(conf.getConfString(key, "a,b,c") === "a,b,c,d,e")
+    assert(conf.getConfString(key) === "a,b,c,d,e")
+    assert(conf.getConf(confEntry, Seq("a", "b", "c")) === Seq("a", "b", "c", "d", "e"))
+  }
 }

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala

Lines changed: 58 additions & 22 deletions
@@ -21,16 +21,13 @@ import java.io.File
 import java.net.{URL, URLClassLoader}
 import java.sql.Timestamp

-import org.apache.hadoop.hive.common.StatsSetupConst
-import org.apache.hadoop.hive.common.`type`.HiveDecimal
-import org.apache.spark.sql.SQLConf.SQLConfEntry
-import org.apache.spark.sql.catalyst.ParserDialect
-
 import scala.collection.JavaConversions._
 import scala.collection.mutable.HashMap
 import scala.language.implicitConversions

 import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.hadoop.hive.common.StatsSetupConst
+import org.apache.hadoop.hive.common.`type`.HiveDecimal
 import org.apache.hadoop.hive.conf.HiveConf
 import org.apache.hadoop.hive.ql.metadata.Table
 import org.apache.hadoop.hive.ql.parse.VariableSubstitution
@@ -40,6 +37,9 @@ import org.apache.hadoop.hive.serde2.io.{DateWritable, TimestampWritable}
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql._
+import org.apache.spark.sql.SQLConf.SQLConfEntry
+import org.apache.spark.sql.SQLConf.SQLConfEntry._
+import org.apache.spark.sql.catalyst.ParserDialect
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUdfs, SetCommand}
@@ -70,13 +70,14 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {

   import HiveContext._

+  println("create HiveContext")
+
   /**
    * When true, enables an experimental feature where metastore tables that use the parquet SerDe
    * are automatically converted to use the Spark SQL parquet table scan, instead of the Hive
    * SerDe.
    */
-  protected[sql] def convertMetastoreParquet: Boolean =
-    getConf("spark.sql.hive.convertMetastoreParquet", "true") == "true"
+  protected[sql] def convertMetastoreParquet: Boolean = getConf(CONVERT_METASTORE_PARQUET)

   /**
    * When true, also tries to merge possibly different but compatible Parquet schemas in different
@@ -85,7 +86,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
    * This configuration is only effective when "spark.sql.hive.convertMetastoreParquet" is true.
    */
   protected[sql] def convertMetastoreParquetWithSchemaMerging: Boolean =
-    getConf("spark.sql.hive.convertMetastoreParquet.mergeSchema", "false") == "true"
+    getConf(CONVERT_METASTORE_PARQUET_WITH_SCHEMA_MERGING)

   /**
    * When true, a table created by a Hive CTAS statement (no USING clause) will be
@@ -99,8 +100,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
    * - The CTAS statement specifies SequenceFile (STORED AS SEQUENCEFILE) as the file format
    *   and no SerDe is specified (no ROW FORMAT SERDE clause).
    */
-  protected[sql] def convertCTAS: Boolean =
-    getConf("spark.sql.hive.convertCTAS", "false").toBoolean
+  protected[sql] def convertCTAS: Boolean = getConf(CONVERT_CTAS)

   /**
    * The version of the hive client that will be used to communicate with the metastore. Note that
@@ -118,8 +118,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
    *   option is only valid when using the execution version of Hive.
    * - maven - download the correct version of hive on demand from maven.
    */
-  protected[hive] def hiveMetastoreJars: String =
-    getConf(HIVE_METASTORE_JARS, "builtin")
+  protected[hive] def hiveMetastoreJars: String = getConf(HIVE_METASTORE_JARS)

   /**
    * A comma separated list of class prefixes that should be loaded using the classloader that
@@ -129,26 +128,20 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) {
    *   custom appenders that are used by log4j.
    */
   protected[hive] def hiveMetastoreSharedPrefixes: Seq[String] =
-    getConf("spark.sql.hive.metastore.sharedPrefixes", jdbcPrefixes)
-      .split(",").filterNot(_ == "")
-
-  private def jdbcPrefixes = Seq(
-    "com.mysql.jdbc", "org.postgresql", "com.microsoft.sqlserver", "oracle.jdbc").mkString(",")
+    getConf(HIVE_METASTORE_SHARED_PREFIXES).filterNot(_ == "")

   /**
    * A comma separated list of class prefixes that should explicitly be reloaded for each version
    * of Hive that Spark SQL is communicating with. For example, Hive UDFs that are declared in a
    * prefix that typically would be shared (i.e. org.apache.spark.*)
    */
   protected[hive] def hiveMetastoreBarrierPrefixes: Seq[String] =
-    getConf("spark.sql.hive.metastore.barrierPrefixes", "")
-      .split(",").filterNot(_ == "")
+    getConf(HIVE_METASTORE_BARRIER_PREFIXES).filterNot(_ == "")

   /*
    * hive thrift server use background spark sql thread pool to execute sql queries
    */
-  protected[hive] def hiveThriftServerAsync: Boolean =
-    getConf("spark.sql.hive.thriftServer.async", "true").toBoolean
+  protected[hive] def hiveThriftServerAsync: Boolean = getConf(HIVE_THRIFT_SERVER_ASYNC)

   @transient
   protected[sql] lazy val substitutor = new VariableSubstitution()
@@ -525,7 +518,50 @@ private[hive] object HiveContext {
   val hiveExecutionVersion: String = "0.13.1"

   val HIVE_METASTORE_VERSION: String = "spark.sql.hive.metastore.version"
-  val HIVE_METASTORE_JARS: String = "spark.sql.hive.metastore.jars"
+  val HIVE_METASTORE_JARS = stringConf("spark.sql.hive.metastore.jars",
+    defaultValue = Some("builtin"),
+    doc = "Location of the jars that should be used to instantiate the HiveMetastoreClient. This" +
+      " property can be one of three options: " +
+      "1. \"builtin\" Use Hive 0.13.1, which is bundled with the Spark assembly jar when " +
+      "<code>-Phive</code> is enabled. When this option is chosen, " +
+      "spark.sql.hive.metastore.version must be either <code>0.13.1</code> or not defined. " +
+      "2. \"maven\" Use Hive jars of specified version downloaded from Maven repositories." +
+      "3. A classpath in the standard format for both Hive and Hadoop.")
+
+  val CONVERT_METASTORE_PARQUET = booleanConf("spark.sql.hive.convertMetastoreParquet",
+    defaultValue = Some(true),
+    doc = "When set to false, Spark SQL will use the Hive SerDe for parquet tables instead of " +
+      "the built in support.")
+
+  val CONVERT_METASTORE_PARQUET_WITH_SCHEMA_MERGING = booleanConf(
+    "spark.sql.hive.convertMetastoreParquet.mergeSchema",
+    defaultValue = Some(false),
+    doc = "TODO")
+
+  val CONVERT_CTAS = booleanConf("spark.sql.hive.convertCTAS",
+    defaultValue = Some(false),
+    doc = "TODO")
+
+  val HIVE_METASTORE_SHARED_PREFIXES = stringSeqConf("spark.sql.hive.metastore.sharedPrefixes",
+    defaultValue = Some(jdbcPrefixes),
+    doc = "A comma separated list of class prefixes that should be loaded using the classloader " +
+      "that is shared between Spark SQL and a specific version of Hive. An example of classes " +
+      "that should be shared is JDBC drivers that are needed to talk to the metastore. Other " +
+      "classes that need to be shared are those that interact with classes that are already " +
+      "shared. For example, custom appenders that are used by log4j.")
+
+  private def jdbcPrefixes = Seq(
+    "com.mysql.jdbc", "org.postgresql", "com.microsoft.sqlserver", "oracle.jdbc")
+
+  val HIVE_METASTORE_BARRIER_PREFIXES = stringSeqConf("spark.sql.hive.metastore.barrierPrefixes",
+    defaultValue = Some(Seq()),
+    doc = "A comma separated list of class prefixes that should explicitly be reloaded for each " +
+      "version of Hive that Spark SQL is communicating with. For example, Hive UDFs that are " +
+      "declared in a prefix that typically would be shared (i.e. <code>org.apache.spark.*</code>).")
+
+  val HIVE_THRIFT_SERVER_ASYNC = booleanConf("spark.sql.hive.thriftServer.async",
+    defaultValue = Some(true),
+    doc = "TODO")

   /** Constructs a configuration for hive, where the metastore is located in a temp directory. */
   def newTemporaryConfiguration(): Map[String, String] = {
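
The effect of the HiveContext changes is that each setting is now read through a typed SQLConfEntry instead of a raw string key with inline parsing. A rough before/after sketch using the CONVERT_CTAS entry defined above, assuming code inside the org.apache.spark.sql.hive package (as in the test suites below) and an existing HiveContext named hiveContext:

    // Before this commit: string key, string default, manual conversion.
    val oldStyle: Boolean =
      hiveContext.getConf("spark.sql.hive.convertCTAS", "false").toBoolean

    // After this commit: the entry bundles the key, the Boolean converter, and the default.
    val newStyle: Boolean = hiveContext.getConf(HiveContext.CONVERT_CTAS)

    // The raw key string remains available when a plain string is needed, e.g. SET statements.
    val rawKey: String = HiveContext.CONVERT_CTAS.key  // "spark.sql.hive.convertCTAS"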

sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -563,7 +563,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with BeforeA

   test("scan a parquet table created through a CTAS statement") {
     withSQLConf(
-      "spark.sql.hive.convertMetastoreParquet" -> "true",
+      HiveContext.CONVERT_METASTORE_PARQUET.key -> "true",
       SQLConf.PARQUET_USE_DATA_SOURCE_API.key -> "true") {

       withTempTable("jt") {
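
Tests that go through withSQLConf still pass plain string pairs, which is what the entry's key field provides. A hypothetical variant of the same pattern (the table name is illustrative only):

    withSQLConf(HiveContext.CONVERT_METASTORE_PARQUET.key -> "false") {
      // Within this block, parquet metastore tables are read through the Hive SerDe path.
      sql("SELECT * FROM some_parquet_table").collect()
    }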

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala

Lines changed: 8 additions & 8 deletions
@@ -24,7 +24,7 @@ import org.apache.spark.sql._
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
-import org.apache.spark.sql.hive.{HiveQLDialect, MetastoreRelation}
+import org.apache.spark.sql.hive.{HiveContext, HiveQLDialect, MetastoreRelation}
 import org.apache.spark.sql.parquet.ParquetRelation2
 import org.apache.spark.sql.sources.LogicalRelation
 import org.apache.spark.sql.types._
@@ -191,9 +191,9 @@ class SQLQuerySuite extends QueryTest {
       }
     }

-    val originalConf = getConf("spark.sql.hive.convertCTAS", "false")
+    val originalConf = convertCTAS

-    setConf("spark.sql.hive.convertCTAS", "true")
+    setConf(HiveContext.CONVERT_CTAS, true)

     sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value")
     sql("CREATE TABLE IF NOT EXISTS ctas1 AS SELECT key k, value FROM src ORDER BY k, value")
@@ -235,7 +235,7 @@ class SQLQuerySuite extends QueryTest {
     checkRelation("ctas1", false)
     sql("DROP TABLE ctas1")

-    setConf("spark.sql.hive.convertCTAS", originalConf)
+    setConf(HiveContext.CONVERT_CTAS, originalConf)
   }

   test("SQL Dialect Switching") {
@@ -348,7 +348,7 @@ class SQLQuerySuite extends QueryTest {
       "MANAGED_TABLE"
     )

-    val default = getConf("spark.sql.hive.convertMetastoreParquet", "true")
+    val default = convertMetastoreParquet
     // use the Hive SerDe for parquet tables
     sql("set spark.sql.hive.convertMetastoreParquet = false")
     checkAnswer(
@@ -603,8 +603,8 @@ class SQLQuerySuite extends QueryTest {
     // generates an invalid query plan.
     val rdd = sparkContext.makeRDD((1 to 5).map(i => s"""{"a":[$i, ${i + 1}]}"""))
     read.json(rdd).registerTempTable("data")
-    val originalConf = getConf("spark.sql.hive.convertCTAS", "false")
-    setConf("spark.sql.hive.convertCTAS", "false")
+    val originalConf = convertCTAS
+    setConf(HiveContext.CONVERT_CTAS, false)

     sql("CREATE TABLE explodeTest (key bigInt)")
     table("explodeTest").queryExecution.analyzed match {
@@ -621,7 +621,7 @@ class SQLQuerySuite extends QueryTest {

     sql("DROP TABLE explodeTest")
     dropTempTable("data")
-    setConf("spark.sql.hive.convertCTAS", originalConf)
+    setConf(HiveContext.CONVERT_CTAS, originalConf)
   }

   test("sanity test for SPARK-6618") {

sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala

Lines changed: 2 additions & 2 deletions
@@ -153,7 +153,7 @@ class ParquetMetastoreSuiteBase extends ParquetPartitioningTest {
     val rdd2 = sparkContext.parallelize((1 to 10).map(i => s"""{"a":[$i, null]}"""))
     read.json(rdd2).registerTempTable("jt_array")

-    setConf("spark.sql.hive.convertMetastoreParquet", "true")
+    setConf(HiveContext.CONVERT_METASTORE_PARQUET, true)
   }

   override def afterAll(): Unit = {
@@ -164,7 +164,7 @@ class ParquetMetastoreSuiteBase extends ParquetPartitioningTest {
     sql("DROP TABLE normal_parquet")
     sql("DROP TABLE IF EXISTS jt")
     sql("DROP TABLE IF EXISTS jt_array")
-    setConf("spark.sql.hive.convertMetastoreParquet", "false")
+    setConf(HiveContext.CONVERT_METASTORE_PARQUET, false)
   }

   test(s"conversion is working") {
