3 changes: 2 additions & 1 deletion core/src/main/scala/org/apache/spark/SparkEnv.scala
@@ -114,7 +114,8 @@ class SparkEnv (
def createPythonWorker(pythonExec: String, envVars: Map[String, String]): java.net.Socket = {
synchronized {
val key = (pythonExec, envVars)
pythonWorkers.getOrElseUpdate(key, new PythonWorkerFactory(pythonExec, envVars)).create()
pythonWorkers.getOrElseUpdate(key,
new PythonWorkerFactory(pythonExec, envVars, conf)).create()
}
}

core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala
@@ -21,6 +21,7 @@ import java.io.{DataInputStream, DataOutputStream, EOFException, InputStream, Ou
import java.net.{InetAddress, ServerSocket, Socket, SocketException}
import java.nio.charset.StandardCharsets
import java.util.Arrays
import java.util.concurrent.atomic.AtomicInteger

import scala.collection.JavaConverters._
import scala.collection.mutable
@@ -30,7 +31,11 @@ import org.apache.spark.internal.Logging
import org.apache.spark.security.SocketAuthHelper
import org.apache.spark.util.{RedirectThread, Utils}

private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String, String])

private[spark] class PythonWorkerFactory(
pythonExec: String,
envVars: Map[String, String],
conf: SparkConf)
extends Logging {

import PythonWorkerFactory._
@@ -76,6 +81,14 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
val daemonWorkers = new mutable.WeakHashMap[Socket, Int]()
val idleWorkers = new mutable.Queue[Socket]()
var lastActivity = 0L
val virtualEnvEnabled = conf.getBoolean("spark.pyspark.virtualenv.enabled", false)
val virtualenvPythonExec = if (virtualEnvEnabled) {
val virtualEnvFactory = new VirtualEnvFactory(pythonExec, conf, false)
virtualEnvFactory.setupVirtualEnv()
} else {
pythonExec
}

new MonitorThread().start()

var simpleWorkers = new mutable.WeakHashMap[Socket, Process]()
@@ -144,7 +157,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
serverSocket = new ServerSocket(0, 1, InetAddress.getByAddress(Array(127, 0, 0, 1)))

// Create and start the worker
val pb = new ProcessBuilder(Arrays.asList(pythonExec, "-m", workerModule))
val pb = new ProcessBuilder(Arrays.asList(virtualenvPythonExec, "-m", workerModule))
val workerEnv = pb.environment()
workerEnv.putAll(envVars.asJava)
workerEnv.put("PYTHONPATH", pythonPath)
@@ -186,7 +199,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String

try {
// Create and start the daemon
val command = Arrays.asList(pythonExec, "-m", daemonModule)
val command = Arrays.asList(virtualenvPythonExec, "-m", daemonModule)
val pb = new ProcessBuilder(command)
val workerEnv = pb.environment()
workerEnv.putAll(envVars.asJava)
core/src/main/scala/org/apache/spark/api/python/VirtualEnvFactory.scala (new file)
@@ -0,0 +1,164 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.api.python

import java.io.File
import java.util.{Map => JMap}
import java.util.Arrays
import java.util.concurrent.atomic.AtomicInteger

import scala.collection.JavaConverters._

import com.google.common.io.Files

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging


class VirtualEnvFactory(pythonExec: String, conf: SparkConf, isDriver: Boolean)
extends Logging {

private val virtualEnvType = conf.get("spark.pyspark.virtualenv.type", "native")
private val virtualEnvBinPath = conf.get("spark.pyspark.virtualenv.bin.path", "")
private val initPythonPackages = conf.getOption("spark.pyspark.virtualenv.packages")
Contributor: So if the factory is made once, then how will these get updated?

Contributor (author): For an existing running executor, the only way to install additional packages is via sc.install_packages. spark.pyspark.virtualenv.packages is updated on the driver side first when sc.install_packages is called, and a newly allocated executor will then fetch this property and install all the additional packages.
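A minimal sketch of the driver-side bookkeeping described above (illustrative only: the helper name is hypothetical, the Python-side sc.install_packages is assumed to end up doing something equivalent, and the ":"-separated format matches how initPythonPackages is split later in this file):

// Sketch, not part of the PR: append a package to the property that newly
// allocated executors read when they set up their virtualenv.
def appendVirtualenvPackage(conf: SparkConf, pkg: String): Unit = {
  val key = "spark.pyspark.virtualenv.packages"
  // Packages are kept as a ":"-separated list (see initPythonPackages.get.split(":") below).
  val updated = conf.getOption(key).map(_ + ":" + pkg).getOrElse(pkg)
  conf.set(key, updated)
}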

private var virtualEnvName: String = _
private var virtualPythonExec: String = _
private val VIRTUALENV_ID = new AtomicInteger()
private var isLauncher: Boolean = false

// Used by the launcher when the user wants to use virtualenv in the PySpark shell. The launcher
// needs this class to create the virtualenv for the driver.
def this(pythonExec: String, properties: JMap[String, String], isDriver: java.lang.Boolean) {
Member: Do we need to use java.lang.Boolean instead of Boolean?

Contributor (author): This is because it will also be called from the Java side via reflection.

Member: I guess we can use boolean.class in java reflection.

Contributor (author): It is used by the launcher module, which doesn't depend on Scala.

this(pythonExec, new SparkConf().setAll(properties.asScala), isDriver)
this.isLauncher = true
}
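For context on the java.lang.Boolean question above, a hedged sketch of how a Java-only caller such as the launcher might construct this class reflectively (illustrative only; this is not the actual launcher code):

// Sketch, not part of the PR: reflective construction from non-Scala code, which is
// why the boxed java.lang.Boolean appears in the auxiliary constructor signature.
val cls = Class.forName("org.apache.spark.api.python.VirtualEnvFactory")
val ctor = cls.getConstructor(
  classOf[String], classOf[java.util.Map[_, _]], classOf[java.lang.Boolean])
val props = new java.util.HashMap[String, String]()
props.put("spark.pyspark.virtualenv.enabled", "true")
val factory = ctor.newInstance("python3", props, java.lang.Boolean.TRUE)
val pythonExec = cls.getMethod("setupVirtualEnv").invoke(factory).asInstanceOf[String]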

/*
 * Create virtualenv using native virtualenv or conda
 *
 */

Contributor: I wouldn't document the commands called in the function doc string.
def setupVirtualEnv(): String = {
Contributor: Might be better to pass args into this function so that it could be properly unit-tested. It seems that there are no unit tests for this class, so that seems to be a necessary addition.

/*
*
* Native Virtualenv:
* - Execute command: virtualenv -p <pythonExec> --no-site-packages <virtualenvName>
* - Execute command: python -m pip --cache-dir <cache-dir> install -r <requirement_file>
*
* Conda
* - Execute command: conda create --prefix <prefix> --file <requirement_file> -y
*
*/
logInfo("Start to setup virtualenv...")
logDebug("user.dir=" + System.getProperty("user.dir"))
logDebug("user.home=" + System.getProperty("user.home"))

require(virtualEnvType == "native" || virtualEnvType == "conda",
s"VirtualEnvType: $virtualEnvType is not supported." )
require(new File(virtualEnvBinPath).exists(),
s"VirtualEnvBinPath: $virtualEnvBinPath is not defined or doesn't exist.")
Contributor: In addition, how are we handling the case of an existing s"$virtualEnvBinPath/$virtualEnvName"?

// Two scenarios of creating the virtualenv:
// 1. Created in a YARN container. YARN will clean it up after the container exits.
// 2. Created outside a YARN container. Spark needs to create a temp directory and clean it up
//    after the app finishes:
//    - driver of the PySpark shell
//    - driver in yarn-client mode
Contributor: In the case of Kubernetes, this will be created in the base spark-py Docker image, which is shared between the driver and executors, and the containers will be cleaned up upon termination of the job via owner labels (for the executors) and the k8s API server (for the driver). As such (hopefully with client-mode support being completed soon), the logic below should hold as well. Is this work going to be cluster-manager agnostic, or is it only supposed to support YARN? I would like to see this be applicable to all first-class cluster-management systems. I can help with appending to this PR: k8s support and the appropriate integration tests.

if (isLauncher ||
(isDriver && conf.get("spark.submit.deployMode") == "client")) {
val virtualenvBasedir = Files.createTempDir()
virtualenvBasedir.deleteOnExit()

Reviewer: The temporary directory is not being deleted on exit.

virtualEnvName = virtualenvBasedir.getAbsolutePath
} else if (isDriver && conf.get("spark.submit.deployMode") == "cluster") {
virtualEnvName = "virtualenv_driver"
} else {
// use the working directory of Executor
virtualEnvName = "virtualenv_" + conf.getAppId + "_" + VIRTUALENV_ID.getAndIncrement()
}

// Use the absolute path of the requirement file in the following cases:
// 1. driver of the PySpark shell
// 2. driver in yarn-client mode
// Otherwise just use the file name, as the file will be downloaded to the executor's working directory.
Contributor: In the Kubernetes world, I might want to use a requirements.txt file that is stored locally in the base Docker image, regardless of client or cluster mode. Is that something that you think should be supported? Maybe a config variable spark.pyspark.virtualenv.kubernetes.localrequirements that points to a file stored as local:///var/files/requirements.txt, for example. Furthermore, when we introduce a Resource Staging Server that allows us to stage files locally, this setting will be interchangeable between something that is locally baked in vs. staged.

val pysparkRequirements =
if (isLauncher ||
(isDriver && conf.get("spark.submit.deployMode") == "client")) {
conf.getOption("spark.pyspark.virtualenv.requirements")
} else {
conf.getOption("spark.pyspark.virtualenv.requirements").map(_.split("/").last)
}

val createEnvCommand =
if (virtualEnvType == "native") {
List(virtualEnvBinPath,
"-p", pythonExec,
"--no-site-packages", virtualEnvName)
} else {
// Two cases of conda
// 1. requirement file is specified. (Batch mode)
// 2. requirement file is not specified. (Interactive mode).
// In this case `spark.pyspark.virtualenv.python_version` must be specified.

if (pysparkRequirements.isDefined) {
Contributor: Please rewrite as a .map(...).getOrElse(...), as if (x.isDefined) is not idiomatic Scala.

List(virtualEnvBinPath,
"create", "--prefix", virtualEnvName,
"--file", pysparkRequirements.get, "-y")
} else {
require(conf.contains("spark.pyspark.virtualenv.python_version"),
"spark.pyspark.virtualenv.python_version is not set when using conda " +
"in interactive mode")
val pythonVersion = conf.get("spark.pyspark.virtualenv.python_version")
List(virtualEnvBinPath,
"create", "--prefix", virtualEnvName,
"python=" + pythonVersion, "-y")
Contributor: So we only use the pythonVersion if there is no requirements file? Why?

Contributor (author, zjffdu, Jan 26, 2018): It is only for conda. Conda's requirement file contains python itself, while a native virtualenv requirement file doesn't. That's why the user doesn't need to specify python_version when a requirement file is used, but has to specify python_version when no requirement file is specified for conda; otherwise conda would create a virtualenv without python.

}
}
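A hedged sketch of the .map(...).getOrElse(...) shape the reviewer asks for above, applied to the conda branch (illustrative only, not part of the PR):

// Sketch, not part of the PR: the conda branch without if (pysparkRequirements.isDefined).
val condaCreateCommand = pysparkRequirements.map { req =>
  List(virtualEnvBinPath, "create", "--prefix", virtualEnvName, "--file", req, "-y")
}.getOrElse {
  require(conf.contains("spark.pyspark.virtualenv.python_version"),
    "spark.pyspark.virtualenv.python_version is not set when using conda in interactive mode")
  List(virtualEnvBinPath, "create", "--prefix", virtualEnvName,
    "python=" + conf.get("spark.pyspark.virtualenv.python_version"), "-y")
}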
execCommand(createEnvCommand)

virtualPythonExec = virtualEnvName + "/bin/python"
if (virtualEnvType == "native" && pysparkRequirements.isDefined) {
Contributor: .foreach, not .isDefined.

// requirement file for native is not mandatory, run this only when requirement file
// is specified.
execCommand(List(virtualPythonExec, "-m", "pip",
"--cache-dir", System.getProperty("user.home"),
Contributor: Why caching in the user home dir?

Contributor (author): In YARN mode the executor runs as the yarn user, and pip would otherwise store its cache in a directory that the yarn user doesn't have permission to write to. So here I point the cache dir at the yarn user's home directory.

Contributor: How does this logic carry across cluster managers? Has this been considered for Mesos use cases? In Kubernetes this should be fine, but we should also document this somewhere and cover it with an integration test.

Reviewer: This builds a cache for each user, so you'd end up with multiple caches for the same packages. How about a config to set a common cache (which would have to be writable by everybody, of course)? Also, I'm wondering about runaway disk usage: who cleans up when things get tight?

"install", "-r", pysparkRequirements.get))
}
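A hedged sketch of the configurable cache directory suggested in the discussion above (the property name spark.pyspark.virtualenv.pip.cacheDir is hypothetical and not part of the PR):

// Sketch, not part of the PR: let a shared, writable cache dir override the
// per-user default, falling back to the current behaviour (user.home).
val pipCacheDir = conf.get("spark.pyspark.virtualenv.pip.cacheDir",
  System.getProperty("user.home"))
// pipCacheDir would then be passed to pip via "--cache-dir" as in the code above.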
// install additional packages
if (initPythonPackages.isDefined) {
Contributor: .foreach, not isDefined.

execCommand(List(virtualPythonExec, "-m", "pip",
"install") ::: initPythonPackages.get.split(":").toList);
}
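A hedged sketch of the Option.foreach pattern asked for here and at the similar comment above (illustrative only, not part of the PR):

// Sketch, not part of the PR: install additional packages without isDefined/get.
initPythonPackages.foreach { packages =>
  execCommand(List(virtualPythonExec, "-m", "pip", "install") ::: packages.split(":").toList)
}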
logInfo(s"virtualenv is created at $virtualPythonExec")
virtualPythonExec
}

private def execCommand(commands: List[String]): Unit = {
logInfo("Running command:" + commands.mkString(" "))
val pb = new ProcessBuilder(commands.asJava)
// don't inheritIO when it is used in launcher, because launcher would capture the standard
// output to assemble the spark-submit command.
if(!isLauncher) {
pb.inheritIO();
}
// pip internally uses the environment variable `HOME`
pb.environment().put("HOME", System.getProperty("user.home"))
val proc = pb.start()
val exitCode = proc.waitFor()
if (exitCode != 0) {
throw new RuntimeException("Fail to run command: " + commands.mkString(" "))
}
}
}
core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala
@@ -26,7 +26,7 @@ import scala.collection.mutable.ArrayBuffer
import scala.util.Try

import org.apache.spark.{SparkConf, SparkUserAppException}
import org.apache.spark.api.python.PythonUtils
import org.apache.spark.api.python.{PythonUtils, VirtualEnvFactory}
import org.apache.spark.internal.config._
import org.apache.spark.util.{RedirectThread, Utils}

@@ -41,12 +41,17 @@ object PythonRunner {
val otherArgs = args.slice(2, args.length)
val sparkConf = new SparkConf()
val secret = Utils.createSecret(sparkConf)
val pythonExec = sparkConf.get(PYSPARK_DRIVER_PYTHON)
var pythonExec = sparkConf.get(PYSPARK_DRIVER_PYTHON)
.orElse(sparkConf.get(PYSPARK_PYTHON))
.orElse(sys.env.get("PYSPARK_DRIVER_PYTHON"))
.orElse(sys.env.get("PYSPARK_PYTHON"))
.getOrElse("python")

if (sparkConf.getBoolean("spark.pyspark.virtualenv.enabled", false)) {
val virtualEnvFactory = new VirtualEnvFactory(pythonExec, sparkConf, true)
pythonExec = virtualEnvFactory.setupVirtualEnv()
Member: Correct me if I misunderstood, but couldn't we have some tests to check that setupVirtualEnv returns a proper string, at least?

Contributor: +1

}
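A hedged sketch of the kind of test the reviewers ask for above (illustrative only; the suite name is hypothetical, and the test assumes a virtualenv binary and python3 are available on the machine running it, since setupVirtualEnv shells out):

// Sketch, not part of the PR.
import java.io.File
import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.api.python.VirtualEnvFactory

class VirtualEnvFactorySuite extends SparkFunSuite {
  test("setupVirtualEnv returns a python executable inside the created env") {
    val virtualenvBin = "/usr/bin/virtualenv" // assumption: present on the test machine
    assume(new File(virtualenvBin).exists())
    val conf = new SparkConf()
      .set("spark.pyspark.virtualenv.enabled", "true")
      .set("spark.pyspark.virtualenv.type", "native")
      .set("spark.pyspark.virtualenv.bin.path", virtualenvBin)
      .set("spark.submit.deployMode", "client")
    val pythonExec = new VirtualEnvFactory("python3", conf, true).setupVirtualEnv()
    assert(pythonExec.endsWith("/bin/python"))
  }
}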

// Format python file paths before adding them to the PYTHONPATH
val formattedPythonFile = formatPath(pythonFile)
val formattedPyFiles = resolvePyFiles(formatPaths(pyFiles))
13 changes: 13 additions & 0 deletions core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -439,6 +439,19 @@ private[spark] class SparkSubmit extends Logging {
sparkConf.set("spark.submit.pyFiles", localPyFiles)
}

// for pyspark virtualenv
if (args.isPython) {
if (clusterManager != YARN &&
args.sparkProperties.getOrElse("spark.pyspark.virtualenv.enabled", "false") == "true") {
printErrorAndExit("virtualenv is only supported in yarn mode")
Contributor: Why doesn't this work in standalone?

Contributor (author): I haven't tested it in standalone mode, so it isn't guaranteed to work there; supporting standalone is on my plan for later.

Contributor: Ah, please make a JIRA to track this then.

Contributor: +1 for Kubernetes.

}
if (args.sparkProperties.contains("spark.pyspark.virtualenv.requirements")) {
// distribute virtualenv requirement file as --files
args.files = mergeFileLists(args.files,
args.sparkProperties("spark.pyspark.virtualenv.requirements"))
}
}
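For reference, a hedged sketch pulling together the virtualenv-related properties this block and the factory above read (property keys are taken from the diff; the values are illustrative only):

// Sketch, not part of the PR: the configuration a virtualenv-enabled PySpark job
// would carry; in practice these come from spark-submit --conf flags.
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.pyspark.virtualenv.enabled", "true")
  .set("spark.pyspark.virtualenv.type", "conda")                              // "native" or "conda"
  .set("spark.pyspark.virtualenv.bin.path", "/opt/conda/bin/conda")           // example path
  .set("spark.pyspark.virtualenv.requirements", "/path/to/requirements.txt")  // optional; shipped via --files
  .set("spark.pyspark.virtualenv.python_version", "3.6")                      // conda interactive mode only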

// In YARN mode for an R app, add the SparkR package archive and the R package
// archive containing all of the built R libraries to archives so that they can
// be distributed with the job
core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
@@ -24,7 +24,7 @@ import javax.annotation.concurrent.GuardedBy
import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
import scala.concurrent.Future

import org.apache.spark.{ExecutorAllocationClient, SparkEnv, SparkException, TaskState}
import org.apache.spark._
import org.apache.spark.internal.Logging
import org.apache.spark.rpc._
import org.apache.spark.scheduler._
@@ -98,7 +98,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
private val reviveThread =
ThreadUtils.newDaemonSingleThreadScheduledExecutor("driver-revive-thread")

class DriverEndpoint(override val rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)])
class DriverEndpoint(override val rpcEnv: RpcEnv)
Contributor: My question here is why this change is needed. Changing the scheduler backend is weird for this change.

Contributor (author, zjffdu, Jan 26, 2018): Without this change, the following scenario won't work:

  1. Launch a Spark app.
  2. Call sc.install_packages("numpy").
  3. Run sc.range(3).map(lambda x: np.__version__).collect().
  4. Restart an executor (by killing it; the scheduler will schedule another executor).
  5. Run sc.range(3).map(lambda x: np.__version__).collect() again. This time it would fail, because the newly scheduled executor cannot set up the virtualenv correctly, as it cannot get the updated spark.pyspark.virtualenv.packages.

That's why I made this change in core. Now the executor always gets the updated SparkConf instead of the SparkConf created when the Spark app started. There's some overhead, but I believe it is very trivial, and it could be improved later.

extends ThreadSafeRpcEndpoint with Logging {

// Executors that have been lost, but for which we don't yet know the real exit reason.
@@ -228,6 +228,12 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
context.reply(true)

case RetrieveSparkAppConfig =>
val sparkProperties = new ArrayBuffer[(String, String)]
for ((key, value) <- scheduler.sc.conf.getAll) {
if (key.startsWith("spark.")) {
sparkProperties += ((key, value))
}
}
val reply = SparkAppConfig(
sparkProperties,
SparkEnv.get.securityManager.getIOEncryptionKey(),
@@ -380,24 +386,16 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
protected def minRegisteredRatio: Double = _minRegisteredRatio

override def start() {
val properties = new ArrayBuffer[(String, String)]
for ((key, value) <- scheduler.sc.conf.getAll) {
if (key.startsWith("spark.")) {
properties += ((key, value))
}
}

// TODO (prashant) send conf instead of properties
driverEndpoint = createDriverEndpointRef(properties)
driverEndpoint = createDriverEndpointRef()
}

protected def createDriverEndpointRef(
properties: ArrayBuffer[(String, String)]): RpcEndpointRef = {
rpcEnv.setupEndpoint(ENDPOINT_NAME, createDriverEndpoint(properties))
protected def createDriverEndpointRef(): RpcEndpointRef = {
rpcEnv.setupEndpoint(ENDPOINT_NAME, createDriverEndpoint())
}

protected def createDriverEndpoint(properties: Seq[(String, String)]): DriverEndpoint = {
new DriverEndpoint(rpcEnv, properties)
protected def createDriverEndpoint(): DriverEndpoint = {
new DriverEndpoint(rpcEnv)
}

def stopExecutors() {