
Commit 38a413e

Connect decommissioning to dynamic scaling
- Because the mock always says there is an RDD we may replicate more than once, and now that there are independent threads.
- Make Spark's dynamic allocation use decommissioning.
- Track the decommissioning executors in the core dynamic scheduler so we don't scale down too low; update the streaming ExecutorAllocationManager to also delegate to decommission.
- Fix up executor add for resource profile.
- Fix our exiting and cleanup thread for better debugging next time.
- Clean up the locks we use in decommissioning and clarify some more bits.
- Verify executors decommissioned, then killed by the external cluster manager, are re-launched.
- Verify some additional calls are not occurring in the executor allocation manager suite.
- Don't close the watcher until the end of the test.
- Use decommissionExecutors and set adjustTargetNumExecutors to false so that we can match the pattern for killExecutor/killExecutors.
- Bump numparts up to 6.
- Revert "bump numparts up to 6"; this reverts commit daf96dd.
- Small comment & visibility cleanup.
1 parent 375d348 commit 38a413e
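For context, the combination of settings this change targets looks roughly like the sketch below. The config key strings are assumptions inferred from the constants referenced in the diff (WORKER_DECOMMISSION_ENABLED, STORAGE_DECOMMISSION_ENABLED, STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED, DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED) and may differ across Spark versions.

```scala
import org.apache.spark.SparkConf

// A minimal sketch of the configuration this change exercises: dynamic allocation
// without the external shuffle service, relying on the experimental decommission path.
// Key names are assumptions matching the config constants referenced in the diff.
val conf = new SparkConf()
  .set("spark.dynamicAllocation.enabled", "true")
  // No external shuffle service; decommissioning is used to migrate state instead.
  .set("spark.shuffle.service.enabled", "false")
  // Assumed key behind WORKER_DECOMMISSION_ENABLED.
  .set("spark.worker.decommission.enabled", "true")
  // Assumed keys behind STORAGE_DECOMMISSION_ENABLED / STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED.
  .set("spark.storage.decommission.enabled", "true")
  .set("spark.storage.decommission.shuffleBlocks.enabled", "true")
```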

File tree: 16 files changed, +349 −57 lines

core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala

Lines changed: 38 additions & 0 deletions
@@ -17,6 +17,7 @@
 
 package org.apache.spark
 
+import org.apache.spark.scheduler.ExecutorDecommissionInfo
 /**
  * A client that communicates with the cluster manager to request or kill executors.
  * This is currently supported only in YARN mode.
@@ -81,6 +82,43 @@ private[spark] trait ExecutorAllocationClient {
       countFailures: Boolean,
       force: Boolean = false): Seq[String]
 
+  /**
+   * Request that the cluster manager decommission the specified executors.
+   * Default implementation delegates to kill, scheduler must override
+   * if it supports graceful decommissioning.
+   *
+   * @param executors identifiers of executors & decom info.
+   * @param adjustTargetNumExecutors whether the target number of executors will be adjusted down
+   *                                 after these executors have been decommissioned.
+   * @return the ids of the executors acknowledged by the cluster manager to be removed.
+   */
+  def decommissionExecutors(
+      executors: Seq[(String, ExecutorDecommissionInfo)],
+      adjustTargetNumExecutors: Boolean): Seq[String] = {
+    killExecutors(executors.map(_._1),
+      adjustTargetNumExecutors,
+      countFailures = false)
+  }
+
+
+  /**
+   * Request that the cluster manager decommission the specified executor.
+   * Default implementation delegates to decommissionExecutors, scheduler can override
+   * if it supports graceful decommissioning.
+   *
+   * @param executorId identifiers of executor to decommission
+   * @param decommissionInfo information about the decommission (reason, host loss)
+   * @return whether the request is acknowledged by the cluster manager.
+   */
+  def decommissionExecutor(executorId: String,
+      decommissionInfo: ExecutorDecommissionInfo): Boolean = {
+    val decommissionedExecutors = decommissionExecutors(
+      Seq((executorId, decommissionInfo)),
+      adjustTargetNumExecutors = true)
+    decommissionedExecutors.nonEmpty && decommissionedExecutors(0).equals(executorId)
+  }
+
+
   /**
    * Request that the cluster manager kill every executor on the specified host.
    *
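The trait change above adds a decommission API whose default implementation simply falls back to killing. Below is a minimal, self-contained sketch of that delegation pattern; it uses simplified stand-in types rather than Spark's real ExecutorAllocationClient and ExecutorDecommissionInfo, so treat the names as illustrative only.

```scala
// Simplified stand-ins for illustration only; Spark's real trait has more members
// and ExecutorDecommissionInfo lives in org.apache.spark.scheduler.
final case class DecomInfo(message: String, isHostDecommissioned: Boolean)

trait AllocationClientSketch {
  // Stand-in for killExecutors(ids, adjustTargetNumExecutors, countFailures, force).
  def killExecutors(ids: Seq[String], adjustTargetNumExecutors: Boolean): Seq[String]

  // Default: treat a decommission request as a plain kill. Backends that support
  // graceful decommissioning (as CoarseGrainedSchedulerBackend does in this commit)
  // override this with a real implementation.
  def decommissionExecutors(
      executors: Seq[(String, DecomInfo)],
      adjustTargetNumExecutors: Boolean): Seq[String] =
    killExecutors(executors.map(_._1), adjustTargetNumExecutors)

  // Single-executor convenience wrapper, mirroring decommissionExecutor in the diff.
  def decommissionExecutor(executorId: String, info: DecomInfo): Boolean = {
    val acked = decommissionExecutors(Seq(executorId -> info), adjustTargetNumExecutors = true)
    acked.headOption.contains(executorId)
  }
}
```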

core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala

Lines changed: 23 additions & 5 deletions
@@ -28,6 +28,7 @@ import com.codahale.metrics.{Gauge, MetricRegistry}
 import org.apache.spark.internal.{config, Logging}
 import org.apache.spark.internal.config._
 import org.apache.spark.internal.config.Tests.TEST_SCHEDULE_INTERVAL
+import org.apache.spark.internal.config.Worker.WORKER_DECOMMISSION_ENABLED
 import org.apache.spark.metrics.source.Source
 import org.apache.spark.resource.ResourceProfile.UNKNOWN_RESOURCE_PROFILE_ID
 import org.apache.spark.resource.ResourceProfileManager
@@ -204,7 +205,12 @@ private[spark] class ExecutorAllocationManager(
         s"s${DYN_ALLOCATION_SUSTAINED_SCHEDULER_BACKLOG_TIMEOUT.key} must be > 0!")
     }
     if (!conf.get(config.SHUFFLE_SERVICE_ENABLED)) {
-      if (conf.get(config.DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED)) {
+      // If dynamic allocation shuffle tracking or worker decommissioning along with
+      // storage shuffle decommissioning is enabled we have *experimental* support for
+      // decommissioning without a shuffle service.
+      if (conf.get(config.DYN_ALLOCATION_SHUFFLE_TRACKING_ENABLED) ||
+          (conf.get(WORKER_DECOMMISSION_ENABLED) &&
+          conf.get(config.STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED))) {
         logWarning("Dynamic allocation without a shuffle service is an experimental feature.")
       } else if (!testing) {
         throw new SparkException("Dynamic allocation of executors requires the external " +
@@ -539,7 +545,9 @@
       // get the running total as we remove or initialize it to the count - pendingRemoval
       val newExecutorTotal = numExecutorsTotalPerRpId.getOrElseUpdate(rpId,
         (executorMonitor.executorCountWithResourceProfile(rpId) -
-        executorMonitor.pendingRemovalCountPerResourceProfileId(rpId)))
+        executorMonitor.pendingRemovalCountPerResourceProfileId(rpId) -
+        executorMonitor.pendingDecommissioningPerResourceProfileId(rpId)
+        ))
       if (newExecutorTotal - 1 < minNumExecutors) {
         logDebug(s"Not removing idle executor $executorIdToBeRemoved because there " +
           s"are only $newExecutorTotal executor(s) left (minimum number of executor limit " +
@@ -565,8 +573,14 @@
     } else {
       // We don't want to change our target number of executors, because we already did that
       // when the task backlog decreased.
-      client.killExecutors(executorIdsToBeRemoved.toSeq, adjustTargetNumExecutors = false,
-        countFailures = false, force = false)
+      if (conf.get(WORKER_DECOMMISSION_ENABLED)) {
+        val executorIdsWithoutHostLoss = executorIdsToBeRemoved.toSeq.map(
+          id => (id, ExecutorDecommissionInfo("spark scale down", false)))
+        client.decommissionExecutors(executorIdsWithoutHostLoss, adjustTargetNumExecutors = false)
+      } else {
+        client.killExecutors(executorIdsToBeRemoved.toSeq, adjustTargetNumExecutors = false,
+          countFailures = false, force = false)
+      }
     }
 
     // [SPARK-21834] killExecutors api reduces the target number of executors.
@@ -578,7 +592,11 @@
 
     // reset the newExecutorTotal to the existing number of executors
     if (testing || executorsRemoved.nonEmpty) {
-      executorMonitor.executorsKilled(executorsRemoved.toSeq)
+      if (conf.get(WORKER_DECOMMISSION_ENABLED)) {
+        executorMonitor.executorsDecommissioned(executorsRemoved)
+      } else {
+        executorMonitor.executorsKilled(executorsRemoved.toSeq)
+      }
       logInfo(s"Executors ${executorsRemoved.mkString(",")} removed due to idle timeout.")
       executorsRemoved.toSeq
     } else {
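The allocation manager change above routes idle-executor removal through decommissioning when worker decommissioning is enabled. Here is a condensed sketch of that branch, reusing the stand-in types from the earlier sketch; the "spark scale down" reason string comes from the diff, everything else is illustrative.

```scala
// Illustrative only: decomEnabled stands in for conf.get(WORKER_DECOMMISSION_ENABLED),
// and `client` is any implementation of the sketched trait above.
def removeIdleExecutors(
    client: AllocationClientSketch,
    executorIdsToBeRemoved: Seq[String],
    decomEnabled: Boolean): Seq[String] = {
  if (decomEnabled) {
    // Scale-down is not a host loss, hence isHostDecommissioned = false.
    val withInfo = executorIdsToBeRemoved.map(id => id -> DecomInfo("spark scale down", false))
    // The target was already lowered when the task backlog shrank, so don't adjust it again.
    client.decommissionExecutors(withInfo, adjustTargetNumExecutors = false)
  } else {
    client.killExecutors(executorIdsToBeRemoved, adjustTargetNumExecutors = false)
  }
}
```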

core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala

Lines changed: 98 additions & 13 deletions
@@ -193,7 +193,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
 
       case DecommissionExecutor(executorId, decommissionInfo) =>
         logError(s"Received decommission executor message ${executorId}: $decommissionInfo")
-        decommissionExecutor(executorId, decommissionInfo)
+        decommissionExecutors(Seq((executorId, decommissionInfo)),
+          adjustTargetNumExecutors = false)
 
       case RemoveWorker(workerId, host, message) =>
         removeWorker(workerId, host, message)
@@ -274,7 +275,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
 
       case DecommissionExecutor(executorId, decommissionInfo) =>
         logError(s"Received decommission executor message ${executorId}: ${decommissionInfo}.")
-        decommissionExecutor(executorId, decommissionInfo)
+        decommissionExecutors(Seq((executorId, decommissionInfo)),
+          adjustTargetNumExecutors = false)
         context.reply(true)
 
       case RetrieveSparkAppConfig(resourceProfileId) =>
@@ -503,6 +505,100 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
 
   protected def minRegisteredRatio: Double = _minRegisteredRatio
 
+  /**
+   * Request that the cluster manager decommission the specified executors.
+   *
+   * @param executors Identifiers of executors & decommission info.
+   * @param adjustTargetNumExecutors whether the target number of executors will be adjusted down
+   *                                 after these executors have been decommissioned.
+   * @return the ids of the executors acknowledged by the cluster manager to be removed.
+   */
+  override def decommissionExecutors(
+      executors: Seq[(String, ExecutorDecommissionInfo)],
+      adjustTargetNumExecutors: Boolean): Seq[String] = {
+
+    val executorsToDecommission = executors.filter{case (executorId, _) =>
+      CoarseGrainedSchedulerBackend.this.synchronized {
+        // Only bother decommissioning executors which are alive.
+        if (isExecutorActive(executorId)) {
+          executorsPendingDecommission += executorId
+          true
+        } else {
+          false
+        }
+      }
+    }
+
+    // If we don't want to replace the executors we are decommissioning
+    if (adjustTargetNumExecutors) {
+      executorsToDecommission.foreach { case (exec, _) =>
+        val rpId = executorDataMap(exec).resourceProfileId
+        val rp = scheduler.sc.resourceProfileManager.resourceProfileFromId(rpId)
+        if (requestedTotalExecutorsPerResourceProfile.isEmpty) {
+          // Assume that we are killing an executor that was started by default and
+          // not through the request api
+          requestedTotalExecutorsPerResourceProfile(rp) = 0
+        } else {
+          val requestedTotalForRp = requestedTotalExecutorsPerResourceProfile(rp)
+          requestedTotalExecutorsPerResourceProfile(rp) = math.max(requestedTotalForRp - 1, 0)
+        }
+      }
+      doRequestTotalExecutors(requestedTotalExecutorsPerResourceProfile.toMap)
+    }
+
+    val decommissioned = executorsToDecommission.filter{case (executorId, decomInfo) =>
+      doDecommission(executorId, decomInfo)
+    }.map(_._1)
+    decommissioned
+  }
+
+
+  private def doDecommission(executorId: String,
+      decomInfo: ExecutorDecommissionInfo): Boolean = {
+
+    logInfo(s"Starting decommissioning executor $executorId.")
+    try {
+      scheduler.executorDecommission(executorId, decomInfo)
+      if (driverEndpoint != null) {
+        logInfo("Propagating executor decommission to driver.")
+        driverEndpoint.send(DecommissionExecutor(executorId, decomInfo))
+      }
+    } catch {
+      case e: Exception =>
+        logError(s"Unexpected error during decommissioning ${e.toString}", e)
+        return false
+    }
+    // Send decommission message to the executor (it could have originated on the executor
+    // but not necessarily.
+    CoarseGrainedSchedulerBackend.this.synchronized {
+      executorDataMap.get(executorId) match {
+        case Some(executorInfo) =>
+          executorInfo.executorEndpoint.send(DecommissionSelf)
+        case None =>
+          // Ignoring the executor since it is not registered.
+          logWarning(s"Attempted to decommission unknown executor $executorId.")
+          return false
+      }
+    }
+    logInfo(s"Finished decommissioning executor $executorId.")
+
+    if (conf.get(STORAGE_DECOMMISSION_ENABLED)) {
+      try {
+        logInfo("Starting decommissioning block manager corresponding to " +
+          s"executor $executorId.")
+        scheduler.sc.env.blockManager.master.decommissionBlockManagers(Seq(executorId))
+      } catch {
+        case e: Exception =>
+          logError("Unexpected error during block manager " +
+            s"decommissioning for executor $executorId: ${e.toString}", e)
+          return false
+      }
+      logInfo(s"Acknowledged decommissioning block manager corresponding to $executorId.")
+    }
+    true
+  }
+
+
   override def start(): Unit = {
     if (UserGroupInformation.isSecurityEnabled()) {
       delegationTokenManager = createTokenManager()
@@ -598,17 +694,6 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
     driverEndpoint.send(RemoveWorker(workerId, host, message))
   }
 
-  /**
-   * Called by subclasses when notified of a decommissioning executor.
-   */
-  private[spark] def decommissionExecutor(
-      executorId: String, decommissionInfo: ExecutorDecommissionInfo): Unit = {
-    if (driverEndpoint != null) {
-      logInfo("Propagating executor decommission to driver.")
-      driverEndpoint.send(DecommissionExecutor(executorId, decommissionInfo))
-    }
-  }
-
   def sufficientResourcesRegistered(): Boolean = true
 
   override def isReady(): Boolean = {
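The backend now implements decommissionExecutors in two phases: filter to live executors and mark them pending, optionally lower the requested totals so replacements are not launched, then notify the scheduler, driver, and executor, and, if storage decommissioning is enabled, the block manager. The sketch below is a condensed, illustrative version of that control flow with stubbed-out collaborators; none of these names are Spark's real internals.

```scala
// Condensed, illustrative control flow; the real code is synchronized on the backend
// and talks to RPC endpoints and the BlockManagerMaster instead of these stubs.
final case class DecomRequest(executorId: String, reason: String, hostLoss: Boolean)

class DecommissionFlowSketch(
    isActive: String => Boolean,               // is the executor still registered and alive?
    lowerExecutorTarget: String => Unit,       // shrink the requested total for its profile
    notifyExecutor: String => Boolean,         // stands in for sending DecommissionSelf
    decommissionBlockManager: String => Unit,  // stands in for the block manager master call
    storageDecomEnabled: Boolean) {

  def decommission(requests: Seq[DecomRequest], adjustTargetNumExecutors: Boolean): Seq[String] = {
    // 1. Only bother with executors that are still alive.
    val live = requests.filter(r => isActive(r.executorId))

    // 2. Optionally shrink the requested totals so replacements are not launched.
    if (adjustTargetNumExecutors) live.foreach(r => lowerExecutorTarget(r.executorId))

    // 3. Tell each executor to decommission itself; on success, optionally move blocks off it.
    live.filter { r =>
      val ok = notifyExecutor(r.executorId)
      if (ok && storageDecomEnabled) decommissionBlockManager(r.executorId)
      ok
    }.map(_.executorId)
  }
}
```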

core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala

Lines changed: 2 additions & 1 deletion
@@ -176,7 +176,8 @@ private[spark] class StandaloneSchedulerBackend(
 
   override def executorDecommissioned(fullId: String, decommissionInfo: ExecutorDecommissionInfo) {
     logInfo("Asked to decommission executor")
-    decommissionExecutor(fullId.split("/")(1), decommissionInfo)
+    val execId = fullId.split("/")(1)
+    decommissionExecutors(Seq((execId, decommissionInfo)), adjustTargetNumExecutors = false)
     logInfo("Executor %s decommissioned: %s".format(fullId, decommissionInfo))
   }
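For reference, the standalone backend's fullId is assumed here to have the "<appId>/<execNumber>" shape, so the split above just extracts the executor id; the id in this tiny sketch is made up.

```scala
// Hypothetical full id in the assumed "<appId>/<execNumber>" shape used by the split above.
val fullId = "app-20200626120000-0001/3"
val execId = fullId.split("/")(1)
assert(execId == "3")
```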

core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala

Lines changed: 54 additions & 7 deletions
@@ -28,7 +28,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config._
 import org.apache.spark.resource.ResourceProfile.UNKNOWN_RESOURCE_PROFILE_ID
 import org.apache.spark.scheduler._
-import org.apache.spark.storage.RDDBlockId
+import org.apache.spark.storage.{RDDBlockId, ShuffleDataBlockId}
 import org.apache.spark.util.Clock
 
 /**
@@ -114,7 +114,8 @@
 
     var newNextTimeout = Long.MaxValue
     timedOutExecs = executors.asScala
-      .filter { case (_, exec) => !exec.pendingRemoval && !exec.hasActiveShuffle }
+      .filter { case (_, exec) =>
+        !exec.pendingRemoval && !exec.hasActiveShuffle && !exec.pendingDecommissioning}
       .filter { case (_, exec) =>
         val deadline = exec.timeoutAt
         if (deadline > now) {
@@ -135,6 +136,7 @@
 
   /**
    * Mark the given executors as pending to be removed. Should only be called in the EAM thread.
+   * This covers both kills and decommissions.
    */
   def executorsKilled(ids: Seq[String]): Unit = {
     ids.foreach { id =>
@@ -149,6 +151,19 @@
     nextTimeout.set(Long.MinValue)
   }
 
+  private[spark] def executorsDecommissioned(ids: Seq[String]): Unit = {
+    ids.foreach { id =>
+      val tracker = executors.get(id)
+      if (tracker != null) {
+        tracker.pendingDecommissioning = true
+      }
+    }
+
+    // Recompute timed out executors in the next EAM callback, since this call invalidates
+    // the current list.
+    nextTimeout.set(Long.MinValue)
+  }
+
   def executorCount: Int = executors.size()
 
   def executorCountWithResourceProfile(id: Int): Int = {
@@ -171,6 +186,16 @@
     executors.asScala.filter { case (k, v) => v.resourceProfileId == id && v.pendingRemoval }.size
   }
 
+  def pendingDecommissioningCount: Int = executors.asScala.count { case (_, exec) =>
+    exec.pendingDecommissioning
+  }
+
+  def pendingDecommissioningPerResourceProfileId(id: Int): Int = {
+    executors.asScala.filter { case (k, v) =>
+      v.resourceProfileId == id && v.pendingDecommissioning
+    }.size
+  }
+
   override def onJobStart(event: SparkListenerJobStart): Unit = {
     if (!shuffleTrackingEnabled) {
       return
@@ -298,6 +323,7 @@
     //
     // This means that an executor may be marked as having shuffle data, and thus prevented
     // from being removed, even though the data may not be used.
+    // TODO: Only track used files (SPARK-31974)
     if (shuffleTrackingEnabled && event.reason == Success) {
       stageToShuffleID.get(event.stageId).foreach { shuffleId =>
         exec.addShuffle(shuffleId)
@@ -326,18 +352,33 @@
     val removed = executors.remove(event.executorId)
     if (removed != null) {
       decrementExecResourceProfileCount(removed.resourceProfileId)
-      if (!removed.pendingRemoval) {
+      if (!removed.pendingRemoval || !removed.pendingDecommissioning) {
         nextTimeout.set(Long.MinValue)
       }
     }
   }
 
   override def onBlockUpdated(event: SparkListenerBlockUpdated): Unit = {
-    if (!event.blockUpdatedInfo.blockId.isInstanceOf[RDDBlockId]) {
-      return
-    }
     val exec = ensureExecutorIsTracked(event.blockUpdatedInfo.blockManagerId.executorId,
       UNKNOWN_RESOURCE_PROFILE_ID)
+
+    // Check if it is a shuffle file, or RDD to pick the correct codepath for update
+    if (event.blockUpdatedInfo.blockId.isInstanceOf[ShuffleDataBlockId] && shuffleTrackingEnabled) {
+      /**
+       * The executor monitor keeps track of locations of cache and shuffle blocks and this can be
+       * used to decide which executor(s) Spark should shutdown first. Since we move shuffle blocks
+       * around now this wires it up so that it keeps track of it. We only do this for data blocks
+       * as index and other blocks blocks do not necessarily mean the entire block has been
+       * committed.
+       */
+      event.blockUpdatedInfo.blockId match {
+        case ShuffleDataBlockId(shuffleId, _, _) => exec.addShuffle(shuffleId)
+        case _ => // For now we only update on data blocks
+      }
+      return
+    } else if (!event.blockUpdatedInfo.blockId.isInstanceOf[RDDBlockId]) {
+      return
+    }
     val storageLevel = event.blockUpdatedInfo.storageLevel
     val blockId = event.blockUpdatedInfo.blockId.asInstanceOf[RDDBlockId]
@@ -410,10 +451,15 @@
   }
 
   // Visible for testing
-  def executorsPendingToRemove(): Set[String] = {
+  private[spark] def executorsPendingToRemove(): Set[String] = {
     executors.asScala.filter { case (_, exec) => exec.pendingRemoval }.keys.toSet
   }
 
+  // Visible for testing
+  private[spark] def executorsDecommissioning(): Set[String] = {
+    executors.asScala.filter { case (_, exec) => exec.pendingDecommissioning }.keys.toSet
+  }
+
   /**
    * This method should be used when updating executor state. It guards against a race condition in
    * which the `SparkListenerTaskStart` event is posted before the `SparkListenerBlockManagerAdded`
@@ -466,6 +512,7 @@
     @volatile var timedOut: Boolean = false
 
     var pendingRemoval: Boolean = false
+    var pendingDecommissioning: Boolean = false
     var hasActiveShuffle: Boolean = false
 
     private var idleStart: Long = -1
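The monitor changes above route block updates by type: shuffle data blocks feed shuffle tracking, RDD blocks keep the existing cache-tracking path, and everything else is ignored. Below is a standalone sketch of that routing with a simplified stand-in for the storage BlockId hierarchy; the types and fields here are illustrative, not Spark's.

```scala
import scala.collection.mutable

// Simplified stand-ins for Spark's storage.BlockId hierarchy, for illustration only.
sealed trait BlockIdSketch
final case class ShuffleData(shuffleId: Int, mapId: Long, reduceId: Int) extends BlockIdSketch
final case class RddBlock(rddId: Int, splitIndex: Int) extends BlockIdSketch
final case class OtherBlock(name: String) extends BlockIdSketch

class ExecutorTrackerSketch(shuffleTrackingEnabled: Boolean) {
  val shuffleIds = mutable.Set.empty[Int]   // shuffles whose data lives on this executor
  val cachedRdds = mutable.Set.empty[Int]   // RDDs with cached blocks on this executor

  def onBlockUpdated(blockId: BlockIdSketch): Unit = blockId match {
    // Only data blocks count: an index (or other) block does not imply the
    // whole shuffle output has been committed.
    case ShuffleData(shuffleId, _, _) if shuffleTrackingEnabled =>
      shuffleIds += shuffleId
    case RddBlock(rddId, _) =>
      cachedRdds += rddId // the real monitor also inspects the block's storage level here
    case _ => // ignore broadcast, stream, shuffle index blocks, etc.
  }
}
```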
