@@ -34,23 +34,51 @@ import org.apache.spark.sql.hive.HiveContext
 import org.apache.spark.util.{MutableURLClassLoader, Utils}
 
 /** Factory for `IsolatedClientLoader` with specific versions of hive. */
-private[hive] object IsolatedClientLoader {
+private[hive] object IsolatedClientLoader extends Logging {
   /**
    * Creates isolated Hive client loaders by downloading the requested version from maven.
    */
   def forVersion(
-      version: String,
+      hiveMetastoreVersion: String,
+      hadoopVersion: String,
       config: Map[String, String] = Map.empty,
       ivyPath: Option[String] = None,
       sharedPrefixes: Seq[String] = Seq.empty,
       barrierPrefixes: Seq[String] = Seq.empty): IsolatedClientLoader = synchronized {
-    val resolvedVersion = hiveVersion(version)
-    val files = resolvedVersions.getOrElseUpdate(resolvedVersion,
-      downloadVersion(resolvedVersion, ivyPath))
+    val resolvedVersion = hiveVersion(hiveMetastoreVersion)
+    // We will first try to share Hadoop classes. If we cannot resolve the Hadoop artifact
+    // with the given version, we will use Hadoop 2.4.0 and will not share Hadoop classes.
+    var sharesHadoopClasses = true
+    val files = if (resolvedVersions.contains((resolvedVersion, hadoopVersion))) {
+      resolvedVersions((resolvedVersion, hadoopVersion))
+    } else {
+      val (downloadedFiles, actualHadoopVersion) =
+        try {
+          (downloadVersion(resolvedVersion, hadoopVersion, ivyPath), hadoopVersion)
+        } catch {
+          case e: RuntimeException if e.getMessage.contains("hadoop") =>
+            // If the error message contains "hadoop", it is probably because the Hadoop
+            // version cannot be resolved (e.g. it is a vendor-specific version like
+            // 2.0.0-cdh4.1.1). If that is the case, we fall back to
+            // "org.apache.hadoop:hadoop-client:2.4.0", simply because that artifact used
+            // to be hard coded as the Hadoop dependency to download.
+            logWarning(s"Failed to resolve Hadoop artifacts for the version $hadoopVersion. " +
+              s"We will change the Hadoop version from $hadoopVersion to 2.4.0 and try again. " +
+              "Hadoop classes will not be shared between Spark and the Hive metastore client. " +
+              "It is recommended to set the jars used by the Hive metastore client through " +
+              "spark.sql.hive.metastore.jars in production environments.")
+            sharesHadoopClasses = false
+            (downloadVersion(resolvedVersion, "2.4.0", ivyPath), "2.4.0")
+        }
+      resolvedVersions.put((resolvedVersion, actualHadoopVersion), downloadedFiles)
+      resolvedVersions((resolvedVersion, actualHadoopVersion))
+    }
+
     new IsolatedClientLoader(
-      version = hiveVersion(version),
+      version = hiveVersion(hiveMetastoreVersion),
       execJars = files,
       config = config,
+      sharesHadoopClasses = sharesHadoopClasses,
       sharedPrefixes = sharedPrefixes,
       barrierPrefixes = barrierPrefixes)
   }
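
The hunk above is essentially a resolve-then-fall-back memoization: try the requested Hadoop version, and on failure retry with the stock 2.4.0 artifact while disabling class sharing. The following standalone sketch reproduces that control flow under stated assumptions; `resolve`, `jarsFor`, and the jar names are hypothetical stand-ins for the Ivy download, not Spark APIs.

import scala.collection.mutable

object FallbackSketch {
  private val cache = mutable.HashMap.empty[(String, String), Seq[String]]

  // Stand-in resolver: rejects vendor-specific versions such as "2.0.0-cdh4.1.1".
  private def resolve(hadoopVersion: String): Seq[String] =
    if (hadoopVersion.exists(_.isLetter)) {
      throw new RuntimeException(s"unresolved dependency: hadoop-client:$hadoopVersion")
    } else {
      Seq(s"hadoop-client-$hadoopVersion.jar")
    }

  /** Returns the jars plus whether Hadoop classes may be shared. */
  def jarsFor(hiveVersion: String, hadoopVersion: String): (Seq[String], Boolean) = {
    var sharesHadoopClasses = true
    val files = cache.getOrElse((hiveVersion, hadoopVersion), {
      val (downloaded, actualVersion) =
        try {
          (resolve(hadoopVersion), hadoopVersion)
        } catch {
          case e: RuntimeException if e.getMessage.contains("hadoop") =>
            sharesHadoopClasses = false  // fall back and stop sharing, as in the patch
            (resolve("2.4.0"), "2.4.0")
        }
      cache.put((hiveVersion, actualVersion), downloaded)
      downloaded
    })
    (files, sharesHadoopClasses)
  }

  def main(args: Array[String]): Unit = {
    println(jarsFor("0.13.1", "2.0.0-cdh4.1.1"))  // falls back: (List(hadoop-client-2.4.0.jar), false)
    println(jarsFor("0.13.1", "2.4.0"))           // cache hit under the version that resolved
  }
}

Note that, as in the patch, a fallback result is cached under the version that actually resolved, so later requests for the stock version become cache hits while vendor-specific versions are re-attempted.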
@@ -64,12 +92,15 @@ private[hive] object IsolatedClientLoader {
     case "1.2" | "1.2.0" | "1.2.1" => hive.v1_2
   }
 
-  private def downloadVersion(version: HiveVersion, ivyPath: Option[String]): Seq[URL] = {
+  private def downloadVersion(
+      version: HiveVersion,
+      hadoopVersion: String,
+      ivyPath: Option[String]): Seq[URL] = {
     val hiveArtifacts = version.extraDeps ++
       Seq("hive-metastore", "hive-exec", "hive-common", "hive-serde")
         .map(a => s"org.apache.hive:$a:${version.fullVersion}") ++
       Seq("com.google.guava:guava:14.0.1",
-        "org.apache.hadoop:hadoop-client:2.4.0")
+        s"org.apache.hadoop:hadoop-client:$hadoopVersion")
 
     val classpath = quietly {
       SparkSubmitUtils.resolveMavenCoordinates(
@@ -86,7 +117,10 @@ private[hive] object IsolatedClientLoader {
     tempDir.listFiles().map(_.toURI.toURL)
   }
 
-  private def resolvedVersions = new scala.collection.mutable.HashMap[HiveVersion, Seq[URL]]
+  // A map from a (HiveVersion, Hadoop version) pair to the downloaded jar files.
+  // It is only used by forVersion.
+  private val resolvedVersions =
+    new scala.collection.mutable.HashMap[(HiveVersion, String), Seq[URL]]
 }
 
 /**
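
A detail worth calling out in the hunk above: `resolvedVersions` changes from a `def` to a `val`. A `def` re-evaluates its body on every access, so the old code handed `getOrElseUpdate` a brand-new empty map each time and downloads were never actually cached across calls. A minimal, self-contained demonstration of the difference (names are illustrative):

import scala.collection.mutable

object DefVsVal {
  private def freshEachTime = new mutable.HashMap[String, Int]  // old shape: new map per access
  private val shared = new mutable.HashMap[String, Int]         // new shape: one shared map

  def main(args: Array[String]): Unit = {
    freshEachTime.getOrElseUpdate("hive-0.13", 1)
    println(freshEachTime.contains("hive-0.13"))  // false: the update hit a discarded map
    shared.getOrElseUpdate("hive-0.13", 1)
    println(shared.contains("hive-0.13"))         // true: the cache actually persists
  }
}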
@@ -106,6 +140,7 @@ private[hive] object IsolatedClientLoader {
  * @param config A set of options that will be added to the HiveConf of the constructed client.
  * @param isolationOn When true, custom versions of barrier classes will be constructed. Must be
  *                    true unless loading the version of hive that is on Spark's classloader.
+ * @param sharesHadoopClasses When true, we will share Hadoop classes between Spark and Hive.
  * @param rootClassLoader The system root classloader. Must not know about Hive classes.
  * @param baseClassLoader The spark classloader that is used to load shared classes.
  */
@@ -114,6 +149,7 @@ private[hive] class IsolatedClientLoader(
     val execJars: Seq[URL] = Seq.empty,
     val config: Map[String, String] = Map.empty,
     val isolationOn: Boolean = true,
+    val sharesHadoopClasses: Boolean = true,
     val rootClassLoader: ClassLoader = ClassLoader.getSystemClassLoader.getParent.getParent,
     val baseClassLoader: ClassLoader = Thread.currentThread().getContextClassLoader,
     val sharedPrefixes: Seq[String] = Seq.empty,
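
For orientation, a hedged sketch of what a call site could look like with the new parameters; the version literal and config key below are illustrative and not taken from this patch:

// Hypothetical call site: the metastore version would typically come from
// spark.sql.hive.metastore.version, the Hadoop version from the running cluster.
val loader = IsolatedClientLoader.forVersion(
  hiveMetastoreVersion = "0.13.1",
  hadoopVersion = org.apache.hadoop.util.VersionInfo.getVersion,
  config = Map("hive.metastore.warehouse.dir" -> "/tmp/warehouse"))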
@@ -126,16 +162,20 @@ private[hive] class IsolatedClientLoader(
   /** All jars used by the hive specific classloader. */
   protected def allJars = execJars.toArray
 
-  protected def isSharedClass(name: String): Boolean =
+  protected def isSharedClass(name: String): Boolean = {
+    val isHadoopClass =
+      name.startsWith("org.apache.hadoop.") && !name.startsWith("org.apache.hadoop.hive.")
+
     name.contains("slf4j") ||
     name.contains("log4j") ||
     name.startsWith("org.apache.spark.") ||
-    (name.startsWith("org.apache.hadoop.") && !name.startsWith("org.apache.hadoop.hive.")) ||
+    (sharesHadoopClasses && isHadoopClass) ||
     name.startsWith("scala.") ||
     (name.startsWith("com.google") && !name.startsWith("com.google.cloud")) ||
     name.startsWith("java.lang.") ||
     name.startsWith("java.net") ||
     sharedPrefixes.exists(name.startsWith)
+  }
 
   /** True if `name` refers to a Spark class that must see a specific version of Hive. */
   protected def isBarrierClass(name: String): Boolean =
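
To make the new flag's effect concrete, here is a cut-down, standalone version of the predicate above (keeping only three of its branches) with example inputs; the object name and sample class names are arbitrary:

object SharedClassSketch {
  // Cut-down version of isSharedClass, keeping only the branches relevant here.
  def isSharedClass(name: String, sharesHadoopClasses: Boolean): Boolean = {
    val isHadoopClass =
      name.startsWith("org.apache.hadoop.") && !name.startsWith("org.apache.hadoop.hive.")
    name.startsWith("org.apache.spark.") ||
      (sharesHadoopClasses && isHadoopClass) ||
      name.startsWith("scala.")
  }

  def main(args: Array[String]): Unit = {
    // Shared while Hadoop sharing is on; isolated once the 2.4.0 fallback kicked in.
    println(isSharedClass("org.apache.hadoop.fs.FileSystem", sharesHadoopClasses = true))   // true
    println(isSharedClass("org.apache.hadoop.fs.FileSystem", sharesHadoopClasses = false))  // false
    // Hive classes under org.apache.hadoop.hive are always loaded in isolation.
    println(isSharedClass("org.apache.hadoop.hive.ql.Driver", sharesHadoopClasses = true))  // false
  }
}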