From 88b04e0f6ea75b5126f7835fbfaeaa600c059167 Mon Sep 17 00:00:00 2001
From: Sandy Ryza
Date: Tue, 18 Feb 2014 16:30:06 -0800
Subject: [PATCH 1/2] SPARK-1064. Make it possible to run on YARN without
 bundling Hadoop jars in Spark assembly

---
 docs/building-with-maven.md                   |  6 +++
 pom.xml                                       | 45 +++++++++++++++++++
 .../apache/spark/deploy/yarn/ClientBase.scala | 14 +++++-
 3 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/docs/building-with-maven.md b/docs/building-with-maven.md
index d3bc34e68b24..730a6e793256 100644
--- a/docs/building-with-maven.md
+++ b/docs/building-with-maven.md
@@ -88,3 +88,9 @@ Running only java 8 tests and nothing else.
 Java 8 tests are run when -Pjava8-tests profile is enabled, they will run in spite of -DskipTests.
 For these tests to run your system must have a JDK 8 installation.
 If you have JDK 8 installed but it is not the system default, you can set JAVA_HOME to point to JDK 8 before running the tests.
+
+## Packaging without Hadoop dependencies for deployment on YARN ##
+
+The assembly jar produced by "mvn package" will, by default, include all of Spark's dependencies, including Hadoop and some of its ecosystem projects. On YARN deployments, this causes multiple versions of these to appear on executor classpaths: the version packaged in the Spark assembly and the version on each node, included with yarn.application.classpath. The "hadoop-provided" profile builds the assembly without including Hadoop itself or Hadoop-ecosystem projects like ZooKeeper and Avro.
+
+
diff --git a/pom.xml b/pom.xml
index f0c877dcfe7b..99f0d410de8a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -800,5 +800,50 @@
       </build>
     </profile>
+    <profile>
+      <id>hadoop-provided</id>
+      <activation>
+        <activeByDefault>false</activeByDefault>
+      </activation>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-client</artifactId>
+          <scope>provided</scope>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-yarn-api</artifactId>
+          <scope>provided</scope>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-yarn-common</artifactId>
+          <scope>provided</scope>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-yarn-client</artifactId>
+          <scope>provided</scope>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.avro</groupId>
+          <artifactId>avro</artifactId>
+          <scope>provided</scope>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.avro</groupId>
+          <artifactId>avro-ipc</artifactId>
+          <scope>provided</scope>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.zookeeper</groupId>
+          <artifactId>zookeeper</artifactId>
+          <scope>provided</scope>
+        </dependency>
+      </dependencies>
+    </profile>
+
+
   </profiles>
 
 </project>
diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
index 4b6c7db836b0..d0817fc2baef 100644
--- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
+++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
@@ -29,8 +29,10 @@ import org.apache.hadoop.fs._
 import org.apache.hadoop.fs.permission.FsPermission
 import org.apache.hadoop.io.DataOutputBuffer
 import org.apache.hadoop.mapred.Master
+import org.apache.hadoop.mapreduce.MRJobConfig
 import org.apache.hadoop.net.NetUtils
 import org.apache.hadoop.security.UserGroupInformation
+import org.apache.hadoop.util.StringUtils
 import org.apache.hadoop.yarn.api._
 import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
 import org.apache.hadoop.yarn.api.protocolrecords._
@@ -379,7 +381,17 @@ object ClientBase {
 
   // Based on code from org.apache.hadoop.mapreduce.v2.util.MRApps
   def populateHadoopClasspath(conf: Configuration, env: HashMap[String, String]) {
-    for (c <- conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH)) {
+    val classpathEntries = Option(conf.getStrings(
+      YarnConfiguration.YARN_APPLICATION_CLASSPATH)).getOrElse(
+        YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)
+    for (c <- classpathEntries) {
+      Apps.addToEnvironment(env, Environment.CLASSPATH.name, c.trim)
+    }
+
+    val mrClasspathEntries = Option(conf.getStrings(
+      MRJobConfig.MAPREDUCE_APPLICATION_CLASSPATH)).getOrElse(
+        StringUtils.getStrings(MRJobConfig.DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH))
+    for (c <- mrClasspathEntries) {
       Apps.addToEnvironment(env, Environment.CLASSPATH.name, c.trim)
     }
   }

From 270e4903e6f2f03433871ae84050c3210a8ecbd8 Mon Sep 17 00:00:00 2001
From: Sandy Ryza
Date: Tue, 11 Mar 2014 15:10:13 -0700
Subject: [PATCH 2/2] Handle different application classpath variables in
 different versions

---
 pom.xml                                       |  1 +
 .../apache/spark/deploy/yarn/ClientBase.scala | 39 ++++++++++++++++----
 2 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/pom.xml b/pom.xml
index 99f0d410de8a..d053651dffc0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -801,6 +801,7 @@
       </build>
     </profile>
+
     <profile>
       <id>hadoop-provided</id>
       <activation>
         <activeByDefault>false</activeByDefault>
diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
index d0817fc2baef..74de4293d909 100644
--- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
+++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
@@ -383,16 +383,45 @@ object ClientBase {
   def populateHadoopClasspath(conf: Configuration, env: HashMap[String, String]) {
     val classpathEntries = Option(conf.getStrings(
       YarnConfiguration.YARN_APPLICATION_CLASSPATH)).getOrElse(
-        YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH)
+        getDefaultYarnApplicationClasspath())
     for (c <- classpathEntries) {
       Apps.addToEnvironment(env, Environment.CLASSPATH.name, c.trim)
     }
 
     val mrClasspathEntries = Option(conf.getStrings(
-      MRJobConfig.MAPREDUCE_APPLICATION_CLASSPATH)).getOrElse(
-        StringUtils.getStrings(MRJobConfig.DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH))
-    for (c <- mrClasspathEntries) {
-      Apps.addToEnvironment(env, Environment.CLASSPATH.name, c.trim)
+      "mapreduce.application.classpath")).getOrElse(
+        getDefaultMRApplicationClasspath())
+    if (mrClasspathEntries != null) {
+      for (c <- mrClasspathEntries) {
+        Apps.addToEnvironment(env, Environment.CLASSPATH.name, c.trim)
+      }
+    }
+  }
+
+  def getDefaultYarnApplicationClasspath(): Array[String] = {
+    try {
+      val field = classOf[YarnConfiguration].getField("DEFAULT_YARN_APPLICATION_CLASSPATH")
+      field.get(null).asInstanceOf[Array[String]]
+    } catch {
+      case _: NoSuchFieldError | _: NoSuchFieldException => null
+    }
+  }
+
+  /**
+   * In Hadoop 0.23, the MR application classpath comes with the YARN application
+   * classpath. In Hadoop 2.0, it's an array of Strings, and in 2.2+ it's a String.
+   * So we need to use reflection to retrieve it.
+   */
+  def getDefaultMRApplicationClasspath(): Array[String] = {
+    try {
+      val field = classOf[MRJobConfig].getField("DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH")
+      if (field.getType == classOf[String]) {
+        StringUtils.getStrings(field.get(null).asInstanceOf[String])
+      } else {
+        field.get(null).asInstanceOf[Array[String]]
+      }
+    } catch {
+      case _: NoSuchFieldError | _: NoSuchFieldException => null
     }
   }
 
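
Note on building with the new profile: an invocation along the lines of
"mvn -Pyarn -Phadoop-provided -DskipTests package" should produce an assembly
that leaves the provided jars out. The -Pyarn and -DskipTests flags here are
illustrative assumptions about the surrounding build, not something these
patches introduce; combine -Phadoop-provided with whatever YARN profile and
Hadoop version flags the build already uses.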
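
A minimal sketch of how the patched helper gets driven. ClientBase and
populateHadoopClasspath come from the patches above; the bare Configuration
setup is a hypothetical stand-in for however the YARN client is normally
wired up:

    import scala.collection.mutable.HashMap

    import org.apache.hadoop.conf.Configuration
    import org.apache.spark.deploy.yarn.ClientBase

    // Build a container launch environment whose CLASSPATH comes from the
    // node's Hadoop configuration instead of jars bundled in the assembly.
    val conf = new Configuration()  // picks up yarn-site.xml when on the classpath
    val env = new HashMap[String, String]()
    ClientBase.populateHadoopClasspath(conf, env)
    // env now carries the yarn.application.classpath entries (falling back to
    // the Hadoop-version-appropriate defaults resolved via reflection), plus
    // the MapReduce application classpath when it can be resolved.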