Commit e679bc3
[SPARK-16901] Hive settings in hive-site.xml may be overridden by Hive's default values
## What changes were proposed in this pull request?

When we create the HiveConf for the metastore client, we use a Hadoop Conf as the base, which may contain Hive settings from hive-site.xml (https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala#L49). However, HiveConf's initialize function basically ignores the base Hadoop Conf and always uses its default values (i.e. settings with non-null default values) as the base (https://github.com/apache/hive/blob/release-1.2.1/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java#L2687). So, even if a user puts javax.jdo.option.ConnectionURL in hive-site.xml, it is not used and Hive falls back to its default, jdbc:derby:;databaseName=metastore_db;create=true. This issue only shows up when `spark.sql.hive.metastore.jars` is not set to builtin.

## How was this patch tested?

New test in HiveSparkSubmitSuite.

Author: Yin Huai <[email protected]>

Closes #14497 from yhuai/SPARK-16901.
1 parent 6cbde33 commit e679bc3
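To make the fix concrete, here is a minimal, self-contained sketch of the pattern the patch adopts (the helper name `newHiveConf` is illustrative, not from the patch): construct the HiveConf from Hive's defaults only, then copy every Hadoop entry over explicitly so that settings loaded from hive-site.xml survive HiveConf's initialize step.

```scala
import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.ql.session.SessionState

// Illustrative helper, not part of the patch. `hadoopConf` is assumed to
// already hold the user's core-site.xml and hive-site.xml settings, as
// SharedState prepares it.
def newHiveConf(hadoopConf: Configuration): HiveConf = {
  // Passing hadoopConf to the constructor would not stick: HiveConf's
  // initialize step re-applies every Hive default with a non-null value,
  // e.g. javax.jdo.option.ConnectionURL.
  val hiveConf = new HiveConf(classOf[SessionState])
  // Copying the entries after construction lets the user settings win.
  hadoopConf.iterator().asScala.foreach { entry =>
    hiveConf.set(entry.getKey, entry.getValue)
  }
  hiveConf
}
```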

File tree

2 files changed: +101, -3 lines changed

sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala

Lines changed: 21 additions & 3 deletions
```diff
@@ -141,14 +141,32 @@ private[hive] class HiveClientImpl(
       // so we should keep `conf` and reuse the existing instance of `CliSessionState`.
       originalState
     } else {
-      val hiveConf = new HiveConf(hadoopConf, classOf[SessionState])
+      val hiveConf = new HiveConf(classOf[SessionState])
+      // 1: we set all confs in the hadoopConf to this hiveConf.
+      // This hadoopConf contains user settings in Hadoop's core-site.xml file
+      // and Hive's hive-site.xml file. Note, we load hive-site.xml file manually in
+      // SharedState and put settings in this hadoopConf instead of relying on HiveConf
+      // to load user settings. Otherwise, HiveConf's initialize method will override
+      // settings in the hadoopConf. This issue only shows up when spark.sql.hive.metastore.jars
+      // is not set to builtin. When spark.sql.hive.metastore.jars is builtin, the classpath
+      // has hive-site.xml. So, HiveConf will use that to override its default values.
+      hadoopConf.iterator().asScala.foreach { entry =>
+        val key = entry.getKey
+        val value = entry.getValue
+        if (key.toLowerCase.contains("password")) {
+          logDebug(s"Applying Hadoop and Hive config to Hive Conf: $key=xxx")
+        } else {
+          logDebug(s"Applying Hadoop and Hive config to Hive Conf: $key=$value")
+        }
+        hiveConf.set(key, value)
+      }
       // HiveConf is a Hadoop Configuration, which has a field of classLoader and
       // the initial value will be the current thread's context class loader
       // (i.e. initClassLoader at here).
       // We call initialConf.setClassLoader(initClassLoader) at here to make
       // this action explicit.
       hiveConf.setClassLoader(initClassLoader)
-      // First, we set all spark confs to this hiveConf.
+      // 2: we set all spark confs to this hiveConf.
       sparkConf.getAll.foreach { case (k, v) =>
         if (k.toLowerCase.contains("password")) {
           logDebug(s"Applying Spark config to Hive Conf: $k=xxx")
@@ -157,7 +175,7 @@ private[hive] class HiveClientImpl(
         }
         hiveConf.set(k, v)
       }
-      // Second, we set all entries in config to this hiveConf.
+      // 3: we set all entries in config to this hiveConf.
       extraConfig.foreach { case (k, v) =>
         if (k.toLowerCase.contains("password")) {
           logDebug(s"Applying extra config to HiveConf: $k=xxx")
```

sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala

Lines changed: 80 additions & 0 deletions
```diff
@@ -253,6 +253,47 @@ class HiveSparkSubmitSuite
     runSparkSubmit(args)
   }
 
+  test("SPARK-16901: set javax.jdo.option.ConnectionURL") {
+    // In this test, we set javax.jdo.option.ConnectionURL and set metastore version to
+    // 0.13. This test will make sure that javax.jdo.option.ConnectionURL will not be
+    // overridden by Hive's default settings when we create a HiveConf object inside
+    // HiveClientImpl. Please see SPARK-16901 for more details.
+
+    val metastoreLocation = Utils.createTempDir()
+    metastoreLocation.delete()
+    val metastoreURL =
+      s"jdbc:derby:memory:;databaseName=${metastoreLocation.getAbsolutePath};create=true"
+    val hiveSiteXmlContent =
+      s"""
+         |<configuration>
+         |  <property>
+         |    <name>javax.jdo.option.ConnectionURL</name>
+         |    <value>$metastoreURL</value>
+         |  </property>
+         |</configuration>
+       """.stripMargin
+
+    // Write a hive-site.xml containing a setting of javax.jdo.option.ConnectionURL.
+    val hiveSiteDir = Utils.createTempDir()
+    val file = new File(hiveSiteDir.getCanonicalPath, "hive-site.xml")
+    val bw = new BufferedWriter(new FileWriter(file))
+    bw.write(hiveSiteXmlContent)
+    bw.close()
+
+    val unusedJar = TestUtils.createJarWithClasses(Seq.empty)
+    val args = Seq(
+      "--class", SetMetastoreURLTest.getClass.getName.stripSuffix("$"),
+      "--name", "SetMetastoreURLTest",
+      "--master", "local[1]",
+      "--conf", "spark.ui.enabled=false",
+      "--conf", "spark.master.rest.enabled=false",
+      "--conf", s"spark.sql.test.expectedMetastoreURL=$metastoreURL",
+      "--conf", s"spark.driver.extraClassPath=${hiveSiteDir.getCanonicalPath}",
+      "--driver-java-options", "-Dderby.system.durability=test",
+      unusedJar.toString)
+    runSparkSubmit(args)
+  }
+
   // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly.
   // This is copied from org.apache.spark.deploy.SparkSubmitSuite
   private def runSparkSubmit(args: Seq[String]): Unit = {
@@ -313,6 +354,45 @@
   }
 }
 
+object SetMetastoreURLTest extends Logging {
+  def main(args: Array[String]): Unit = {
+    Utils.configTestLog4j("INFO")
+
+    val sparkConf = new SparkConf(loadDefaults = true)
+    val builder = SparkSession.builder()
+      .config(sparkConf)
+      .config("spark.ui.enabled", "false")
+      .config("spark.sql.hive.metastore.version", "0.13.1")
+      // The issue described in SPARK-16901 only appears when
+      // spark.sql.hive.metastore.jars is not set to builtin.
+      .config("spark.sql.hive.metastore.jars", "maven")
+      .enableHiveSupport()
+
+    val spark = builder.getOrCreate()
+    val expectedMetastoreURL =
+      spark.conf.get("spark.sql.test.expectedMetastoreURL")
+    logInfo(s"spark.sql.test.expectedMetastoreURL is $expectedMetastoreURL")
+
+    if (expectedMetastoreURL == null) {
+      throw new Exception(
+        s"spark.sql.test.expectedMetastoreURL should be set.")
+    }
+
+    // HiveSharedState is used when Hive support is enabled.
+    val actualMetastoreURL =
+      spark.sharedState.asInstanceOf[HiveSharedState]
+        .metadataHive
+        .getConf("javax.jdo.option.ConnectionURL", "this_is_a_wrong_URL")
+    logInfo(s"javax.jdo.option.ConnectionURL is $actualMetastoreURL")
+
+    if (actualMetastoreURL != expectedMetastoreURL) {
+      throw new Exception(
+        s"Expected value of javax.jdo.option.ConnectionURL is $expectedMetastoreURL. But, " +
+          s"the actual value is $actualMetastoreURL")
+    }
+  }
+}
+
 object SetWarehouseLocationTest extends Logging {
   def main(args: Array[String]): Unit = {
     Utils.configTestLog4j("INFO")
```
