
Commit 47c0655

More work to make spark-submit work with Python:
- Launch Py4J gateway server in-process and execute Python main class
- Redirect its output to PythonRunner
- Various misc fixes to messages and error reporting in SparkSubmit
1 parent d4375bd commit 47c0655
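
As background for the gateway change above: the new PythonRunner (added below) starts an in-process py4j.GatewayServer on an ephemeral port and hands that port to the Python child process through the PYSPARK_GATEWAY_PORT environment variable. A minimal, standalone sketch of that handoff; the wrapper object and the println are illustrative only, the real code lives in PythonRunner.main:

    import py4j.GatewayServer

    object GatewayHandoffSketch {
      def main(args: Array[String]): Unit = {
        // entryPoint = null: the Python side only needs access to JVM classes and system
        // properties; port = 0: let py4j pick a free ephemeral port
        val gatewayServer = new GatewayServer(null, 0)
        gatewayServer.start()
        // PythonRunner exports this value as PYSPARK_GATEWAY_PORT for the child process
        println("PYSPARK_GATEWAY_PORT=" + gatewayServer.getListeningPort)
      }
    }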

File tree

10 files changed, +218 −104 lines


assembly/pom.xml

Lines changed: 0 additions & 13 deletions
@@ -40,14 +40,6 @@
     <deb.user>root</deb.user>
   </properties>
 
-  <repositories>
-    <!-- A repository in the local filesystem for the Py4J JAR, which is not in Maven central -->
-    <repository>
-      <id>lib</id>
-      <url>file://${project.basedir}/lib</url>
-    </repository>
-  </repositories>
-
   <dependencies>
     <dependency>
       <groupId>org.apache.spark</groupId>
@@ -84,11 +76,6 @@
       <artifactId>spark-sql_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
-    <dependency>
-      <groupId>net.sf.py4j</groupId>
-      <artifactId>py4j</artifactId>
-      <version>0.8.1</version>
-    </dependency>
   </dependencies>
 
   <build>

core/pom.xml

Lines changed: 5 additions & 0 deletions
@@ -247,6 +247,11 @@
       <artifactId>pyrolite</artifactId>
       <version>2.0.1</version>
     </dependency>
+    <dependency>
+      <groupId>net.sf.py4j</groupId>
+      <artifactId>py4j</artifactId>
+      <version>0.8.1</version>
+    </dependency>
   </dependencies>
   <build>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>

core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala

Lines changed: 2 additions & 2 deletions
@@ -78,7 +78,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
       serverSocket = new ServerSocket(0, 1, InetAddress.getByAddress(Array(127, 0, 0, 1)))
 
       // Create and start the worker
-      val pb = new ProcessBuilder(Seq(pythonExec, "-m", "pyspark.worker"))
+      val pb = new ProcessBuilder(Seq(pythonExec, "-u", "-m", "pyspark.worker"))
       val workerEnv = pb.environment()
       workerEnv.putAll(envVars)
       val worker = pb.start()
@@ -151,7 +151,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
 
     try {
       // Create and start the daemon
-      val pb = new ProcessBuilder(Seq(pythonExec, "-m", "pyspark.daemon"))
+      val pb = new ProcessBuilder(Seq(pythonExec, "-u", "-m", "pyspark.daemon"))
       val workerEnv = pb.environment()
      workerEnv.putAll(envVars)
       daemon = pb.start()
core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+package org.apache.spark.deploy
+
+import java.io.{IOException, File, InputStream, OutputStream}
+
+import scala.collection.mutable.ArrayBuffer
+import scala.collection.JavaConversions._
+
+import org.apache.spark.SparkContext
+
+/**
+ * A main class used by spark-submit to launch Python applications. It executes python as a
+ * subprocess and then has it connect back to the JVM to access system properties, etc.
+ */
+object PythonRunner {
+  def main(args: Array[String]) {
+    val primaryResource = args(0)
+    val pyFiles = args(1)
+    val otherArgs = args.slice(2, args.length)
+
+    val pythonExec = sys.env.get("PYSPARK_PYTHON").getOrElse("python") // TODO: get this from conf
+
+    // Launch a Py4J gateway server for the process to connect to; this will let it see our
+    // Java system properties and such
+    val gatewayServer = new py4j.GatewayServer(null, 0)
+    gatewayServer.start()
+
+    // Build up a PYTHONPATH that includes the Spark assembly JAR (where this class is), the
+    // python directories in SPARK_HOME (if set), and any files in the pyFiles argument
+    val sparkJar = SparkContext.jarOfObject(this).get
+    val pathSeparator: String = System.getProperty("path.separator")
+    val pythonPath = new ArrayBuffer[String]
+    pythonPath += sparkJar
+    pythonPath ++= pyFiles.split(",").filter(_ != "")
+    for (sparkHome <- sys.env.get("SPARK_HOME")) {
+      pythonPath += Seq(sparkHome, "python").mkString(File.separator)
+      pythonPath += Seq(sparkHome, "python", "lib", "py4j-0.8.1-src.zip").mkString(File.separator)
+    }
+    for (oldPythonPath <- sys.env.get("PYTHONPATH")) {
+      pythonPath ++= oldPythonPath.split(pathSeparator)
+    }
+
+    // Launch Python process
+    val builder = new ProcessBuilder(Seq(pythonExec, "-u", primaryResource) ++ otherArgs)
+    val env = builder.environment()
+    env.put("PYTHONPATH", pythonPath.mkString(pathSeparator))
+    env.put("PYSPARK_GATEWAY_PORT", "" + gatewayServer.getListeningPort)
+    builder.redirectErrorStream(true) // Ugly but needed for stdout and stderr to synchronize
+    val process = builder.start()
+
+    new RedirectThread(process.getInputStream, System.out, "redirect output").start()
+
+    System.exit(process.waitFor())
+  }
+
+  /**
+   * A utility class to redirect the child process's stdout or stderr
+   */
+  class RedirectThread(in: InputStream, out: OutputStream, name: String) extends Thread(name) {
+    setDaemon(true)
+    override def run() {
+      scala.util.control.Exception.ignoring(classOf[IOException]) {
+        // FIXME: We copy the stream on the level of bytes to avoid encoding problems.
+        val buf = new Array[Byte](1024)
+        var len = in.read(buf)
+        while (len != -1) {
+          out.write(buf, 0, len)
+          out.flush()
+          len = in.read(buf)
+        }
+      }
+    }
+  }
+}
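
As wired up in SparkSubmit (next file), PythonRunner receives the primary .py file as args(0), the comma-separated --py-files list (possibly empty) as args(1), and the remaining application arguments after that. A hypothetical invocation, with made-up file names, would therefore look like:

    object PythonRunnerInvocationSketch {
      def main(unused: Array[String]): Unit = {
        val childArgs = Array(
          "/path/to/my_app.py",    // args(0): primary resource, executed by the Python interpreter
          "deps.zip,helpers.egg",  // args(1): --py-files entries, prepended to PYTHONPATH ("" if none)
          "--input", "data.txt")   // everything else is forwarded to the script unchanged
        org.apache.spark.deploy.PythonRunner.main(childArgs)
      }
    }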

core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala

Lines changed: 65 additions & 41 deletions
@@ -60,11 +60,11 @@ object SparkSubmit {
   private[spark] var exitFn: () => Unit = () => System.exit(-1)
 
   private[spark] def printErrorAndExit(str: String) = {
-    printStream.println("error: " + str)
-    printStream.println("run with --help for more information or --verbose for debugging output")
+    printStream.println("Error: " + str)
+    printStream.println("Run with --help for usage help or --verbose for debug output")
     exitFn()
   }
-  private[spark] def printWarning(str: String) = printStream.println("warning: " + str)
+  private[spark] def printWarning(str: String) = printStream.println("Warning: " + str)
 
   /**
    * @return
@@ -83,7 +83,7 @@ object SparkSubmit {
     } else if (appArgs.master.startsWith("mesos")) {
       clusterManager = MESOS
     } else {
-      printErrorAndExit("master must start with yarn, mesos, spark, or local")
+      printErrorAndExit("Master must start with yarn, mesos, spark, or local")
     }
 
     // Because "yarn-cluster" and "yarn-client" encapsulate both the master
@@ -116,9 +116,20 @@ object SparkSubmit {
     var childMainClass = ""
 
     if (clusterManager == MESOS && deployOnCluster) {
-      printErrorAndExit("Mesos does not support running the driver on the cluster")
+      printErrorAndExit("Cannot run driver on the cluster in Mesos")
     }
 
+    // If we're running a Python app, set the Java class to run to be our PythonRunner, add
+    // Python files to deployment list, and pass the main file and Python path to PythonRunner
+    if (appArgs.isPython) {
+      appArgs.mainClass = "org.apache.spark.deploy.PythonRunner"
+      appArgs.files = mergeFileLists(appArgs.files, appArgs.pyFiles, appArgs.primaryResource)
+      val pyFiles = Option(appArgs.pyFiles).getOrElse("")
+      appArgs.childArgs = ArrayBuffer(appArgs.primaryResource, pyFiles) ++ appArgs.childArgs
+      appArgs.primaryResource = RESERVED_JAR_NAME
+    }
+
+    // If we're deploying into YARN, use yarn.Client as a wrapper around the user class
     if (!deployOnCluster) {
       childMainClass = appArgs.mainClass
       if (appArgs.primaryResource != RESERVED_JAR_NAME) {
@@ -130,8 +141,8 @@ object SparkSubmit {
       childArgs += ("--class", appArgs.mainClass)
     }
 
+    // Make sure YARN is included in our build if we're trying to use it
     if (clusterManager == YARN) {
-      // The choice of class is arbitrary, could use any spark-yarn class
       if (!Utils.classIsLoadable("org.apache.spark.deploy.yarn.Client") && !Utils.isTesting) {
         val msg = "Could not load YARN classes. This copy of Spark may not have been compiled " +
           "with YARN support."
@@ -142,38 +153,39 @@ object SparkSubmit {
     // Special flag to avoid deprecation warnings at the client
     sysProps("SPARK_SUBMIT") = "true"
 
+    // A list of rules to map each argument to system properties or command-line options in
+    // each deploy mode; we iterate through these below
     val options = List[OptionAssigner](
-      new OptionAssigner(appArgs.master, ALL_CLUSTER_MGRS, false, sysProp = "spark.master"),
-      new OptionAssigner(appArgs.driverExtraClassPath, STANDALONE | YARN, true,
+      OptionAssigner(appArgs.master, ALL_CLUSTER_MGRS, false, sysProp = "spark.master"),
+      OptionAssigner(appArgs.driverExtraClassPath, STANDALONE | YARN, true,
        sysProp = "spark.driver.extraClassPath"),
-      new OptionAssigner(appArgs.driverExtraJavaOptions, STANDALONE | YARN, true,
+      OptionAssigner(appArgs.driverExtraJavaOptions, STANDALONE | YARN, true,
        sysProp = "spark.driver.extraJavaOptions"),
-      new OptionAssigner(appArgs.driverExtraLibraryPath, STANDALONE | YARN, true,
+      OptionAssigner(appArgs.driverExtraLibraryPath, STANDALONE | YARN, true,
        sysProp = "spark.driver.extraLibraryPath"),
-      new OptionAssigner(appArgs.driverMemory, YARN, true, clOption = "--driver-memory"),
-      new OptionAssigner(appArgs.name, YARN, true, clOption = "--name"),
-      new OptionAssigner(appArgs.queue, YARN, true, clOption = "--queue"),
-      new OptionAssigner(appArgs.queue, YARN, false, sysProp = "spark.yarn.queue"),
-      new OptionAssigner(appArgs.numExecutors, YARN, true, clOption = "--num-executors"),
-      new OptionAssigner(appArgs.numExecutors, YARN, false, sysProp = "spark.executor.instances"),
-      new OptionAssigner(appArgs.executorMemory, YARN, true, clOption = "--executor-memory"),
-      new OptionAssigner(appArgs.executorMemory, STANDALONE | MESOS | YARN, false,
+      OptionAssigner(appArgs.driverMemory, YARN, true, clOption = "--driver-memory"),
+      OptionAssigner(appArgs.name, YARN, true, clOption = "--name"),
+      OptionAssigner(appArgs.queue, YARN, true, clOption = "--queue"),
+      OptionAssigner(appArgs.queue, YARN, false, sysProp = "spark.yarn.queue"),
+      OptionAssigner(appArgs.numExecutors, YARN, true, clOption = "--num-executors"),
+      OptionAssigner(appArgs.numExecutors, YARN, false, sysProp = "spark.executor.instances"),
+      OptionAssigner(appArgs.executorMemory, YARN, true, clOption = "--executor-memory"),
+      OptionAssigner(appArgs.executorMemory, STANDALONE | MESOS | YARN, false,
        sysProp = "spark.executor.memory"),
-      new OptionAssigner(appArgs.driverMemory, STANDALONE, true, clOption = "--memory"),
-      new OptionAssigner(appArgs.driverCores, STANDALONE, true, clOption = "--cores"),
-      new OptionAssigner(appArgs.executorCores, YARN, true, clOption = "--executor-cores"),
-      new OptionAssigner(appArgs.executorCores, YARN, false, sysProp = "spark.executor.cores"),
-      new OptionAssigner(appArgs.totalExecutorCores, STANDALONE | MESOS, false,
+      OptionAssigner(appArgs.driverMemory, STANDALONE, true, clOption = "--memory"),
+      OptionAssigner(appArgs.driverCores, STANDALONE, true, clOption = "--cores"),
+      OptionAssigner(appArgs.executorCores, YARN, true, clOption = "--executor-cores"),
+      OptionAssigner(appArgs.executorCores, YARN, false, sysProp = "spark.executor.cores"),
+      OptionAssigner(appArgs.totalExecutorCores, STANDALONE | MESOS, false,
        sysProp = "spark.cores.max"),
-      new OptionAssigner(appArgs.files, YARN, false, sysProp = "spark.yarn.dist.files"),
-      new OptionAssigner(appArgs.files, YARN, true, clOption = "--files"),
-      new OptionAssigner(appArgs.archives, YARN, false, sysProp = "spark.yarn.dist.archives"),
-      new OptionAssigner(appArgs.archives, YARN, true, clOption = "--archives"),
-      new OptionAssigner(appArgs.jars, YARN, true, clOption = "--addJars"),
-      new OptionAssigner(appArgs.files, LOCAL | STANDALONE | MESOS, true, sysProp = "spark.files"),
-      new OptionAssigner(appArgs.jars, LOCAL | STANDALONE | MESOS, false, sysProp = "spark.jars"),
-      new OptionAssigner(appArgs.name, LOCAL | STANDALONE | MESOS, false,
-        sysProp = "spark.app.name")
+      OptionAssigner(appArgs.files, YARN, false, sysProp = "spark.yarn.dist.files"),
+      OptionAssigner(appArgs.files, YARN, true, clOption = "--files"),
+      OptionAssigner(appArgs.archives, YARN, false, sysProp = "spark.yarn.dist.archives"),
+      OptionAssigner(appArgs.archives, YARN, true, clOption = "--archives"),
+      OptionAssigner(appArgs.jars, YARN, true, clOption = "--addJars"),
+      OptionAssigner(appArgs.files, LOCAL | STANDALONE | MESOS, true, sysProp = "spark.files"),
+      OptionAssigner(appArgs.jars, LOCAL | STANDALONE | MESOS, false, sysProp = "spark.jars"),
+      OptionAssigner(appArgs.name, LOCAL | STANDALONE | MESOS, false, sysProp = "spark.app.name")
     )
 
     // For client mode make any added jars immediately visible on the classpath
@@ -183,9 +195,10 @@ object SparkSubmit {
       }
     }
 
+    // Map all arguments to command-line options or system properties for our chosen mode
     for (opt <- options) {
       if (opt.value != null && deployOnCluster == opt.deployOnCluster &&
-        (clusterManager & opt.clusterManager) != 0) {
+          (clusterManager & opt.clusterManager) != 0) {
         if (opt.clOption != null) {
           childArgs += (opt.clOption, opt.value)
         } else if (opt.sysProp != null) {
@@ -230,8 +243,8 @@ object SparkSubmit {
   }
 
   private def launch(childArgs: ArrayBuffer[String], childClasspath: ArrayBuffer[String],
-      sysProps: Map[String, String], childMainClass: String, verbose: Boolean = false) {
-
+      sysProps: Map[String, String], childMainClass: String, verbose: Boolean = false)
+  {
     if (verbose) {
       printStream.println(s"Main class:\n$childMainClass")
       printStream.println(s"Arguments:\n${childArgs.mkString("\n")}")
@@ -273,15 +286,26 @@ object SparkSubmit {
     val url = localJarFile.getAbsoluteFile.toURI.toURL
     loader.addURL(url)
   }
+
+  /**
+   * Merge a sequence of comma-separated file lists, some of which may be null to indicate
+   * no files, into a single comma-separated string.
+   */
+  private[spark] def mergeFileLists(lists: String*): String = {
+    val merged = lists.filter(_ != null)
+      .flatMap(_.split(","))
+      .mkString(",")
+    if (merged == "") null else merged
+  }
 }
 
 /**
 * Provides an indirection layer for passing arguments as system properties or flags to
 * the user's driver program or to downstream launcher tools.
 */
-private[spark] class OptionAssigner(val value: String,
-  val clusterManager: Int,
-  val deployOnCluster: Boolean,
-  val clOption: String = null,
-  val sysProp: String = null
-) { }
+private[spark] case class OptionAssigner(
+  value: String,
+  clusterManager: Int,
+  deployOnCluster: Boolean,
+  clOption: String = null,
+  sysProp: String = null)

core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala

Lines changed: 23 additions & 13 deletions
@@ -79,7 +79,7 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
   }
 
   /** Fill in any undefined values based on the current properties file or built-in defaults. */
-  private def loadDefaults() = {
+  private def loadDefaults(): Unit = {
 
     // Use common defaults file, if not specified by user
     if (propertiesFile == null) {
@@ -112,16 +112,25 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
 
     // Try to set main class from JAR if no --class argument is given
     if (mainClass == null && !isPython && primaryResource != null) {
-      val jar = new JarFile(primaryResource)
-      // Note that this might still return null if no main-class is set; we catch that later
-      mainClass = jar.getManifest.getMainAttributes.getValue("Main-Class")
+      try {
+        val jar = new JarFile(primaryResource)
+        // Note that this might still return null if no main-class is set; we catch that later
+        mainClass = jar.getManifest.getMainAttributes.getValue("Main-Class")
+      } catch {
+        case e: Exception =>
+          SparkSubmit.printErrorAndExit("Failed to read JAR: " + primaryResource)
+          return
+      }
     }
 
     // Global defaults. These should be keep to minimum to avoid confusing behavior.
     master = Option(master).getOrElse("local[*]")
 
     // Set name from main class if not given
-    name = Option(name).orElse(Option(mainClass)).getOrElse(new File(primaryResource).getName)
+    name = Option(name).orElse(Option(mainClass)).orNull
+    if (name == null && primaryResource != null) {
+      name = Utils.stripDirectory(primaryResource)
+    }
   }
 
   /** Ensure that required fields exists. Call this only once all defaults are loaded. */
@@ -133,7 +142,7 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
       SparkSubmit.printErrorAndExit("Must specify a primary resource (JAR or Python file)")
     }
     if (mainClass == null && !isPython) {
-      SparkSubmit.printErrorAndExit("Must specify a main class with --class")
+      SparkSubmit.printErrorAndExit("No main class set in JAR; please specify one with --class")
     }
     if (pyFiles != null && !isPython) {
       SparkSubmit.printErrorAndExit("--py-files given but primary resource is not a Python script")
@@ -165,6 +174,7 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
    |  queue                   $queue
    |  numExecutors            $numExecutors
    |  files                   $files
+    |  pyFiles                 $pyFiles
    |  archives                $archives
    |  mainClass               $mainClass
    |  primaryResource         $primaryResource
@@ -309,15 +319,15 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
     """Usage: spark-submit [options] <app jar | python file> [app options]
       |Options:
       |  --master MASTER_URL         spark://host:port, mesos://host:port, yarn, or local.
-      |  --deploy-mode DEPLOY_MODE   Where to run the driver program: either 'client' to run
-      |                              on the local machine, or 'cluster' to run inside cluster.
-      |  --class CLASS_NAME          Your application's main class (for Java apps).
+      |  --deploy-mode DEPLOY_MODE   Where to run the driver program: either "client" to run
+      |                              on the local machine, or "cluster" to run inside cluster.
+      |  --class CLASS_NAME          Your application's main class (for Java / Scala apps).
       |  --name NAME                 A name of your application.
       |  --jars JARS                 Comma-separated list of local jars to include on the driver
       |                              and executor classpaths. Doesn't work for drivers in
-      |                              standalone mode with 'cluster' deploy mode.
-      |  --py-files PY_FILES         Comma-separated list of files to place on the PYTHONPATH
-      |                              for Python apps. Can be .py, .zip, or .egg files.
+      |                              standalone mode with "cluster" deploy mode.
+      |  --py-files PY_FILES         Comma-separated list of .zip or .egg files to place on the
+      |                              PYTHONPATH for Python apps.
      |  --files FILES               Comma-separated list of files to be placed in the working
      |                              directory of each executor.
      |  --properties-file FILE      Path to a file from which to load extra properties. If not
@@ -341,7 +351,7 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
      |
      | YARN-only:
      |  --executor-cores NUM        Number of cores per executor (Default: 1).
-      |  --queue QUEUE_NAME          The YARN queue to submit to (Default: 'default').
+      |  --queue QUEUE_NAME          The YARN queue to submit to (Default: "default").
      |  --num-executors NUM         Number of executors to launch (Default: 2).
      |  --archives ARCHIVES         Comma separated list of archives to be extracted into the
      |                              working directory of each executor.""".stripMargin
