
Commit 1496b78

Author: Marcelo Vanzin (authored and committed)
[SPARK-20079][yarn] Fix client AM not allocating executors after restart.
The main goal of this change is to avoid the situation described in the bug, where an AM restart in the middle of a job may cause no new executors to be allocated because of faulty logic in the reset path.

The change does two things:

- fixes the executor allocation manager's reset() so that it does not stop allocation after a reset() in the middle of a job
- re-orders the initialization of the YarnAllocator class so that it fetches the current executor ID before triggering the reset() above

This ensures both that the new allocator gets new requests for executors, and that it starts from the correct executor ID.

Tested with unit tests and by manually causing AM restarts while running jobs using spark-shell in YARN mode.
1 parent ba8bf28 commit 1496b78

File tree: 8 files changed (+74 −100 lines)


core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala

Lines changed: 19 additions & 5 deletions
@@ -21,7 +21,7 @@ import java.util.concurrent.TimeUnit
 
 import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
-import scala.util.control.ControlThrowable
+import scala.util.control.{ControlThrowable, NonFatal}
 
 import com.codahale.metrics.{Gauge, MetricRegistry}
 
@@ -245,10 +245,15 @@ private[spark] class ExecutorAllocationManager(
   }
 
   /**
-   * Reset the allocation manager to the initial state. Currently this will only be called in
-   * yarn-client mode when AM re-registers after a failure.
+   * Reset the allocation manager when the cluster manager loses track of the driver's state.
+   * This is currently only done in YARN client mode, when the AM is restarted.
+   *
+   * This method forgets about any state about existing executors, and forces the scheduler to
+   * re-evaluate the number of needed executors the next time it's run.
    */
   def reset(): Unit = synchronized {
+    addTime = 0L
+    numExecutorsTarget = initialNumExecutors
     executorsPendingToRemove.clear()
     removeTimes.clear()
   }
@@ -372,8 +377,17 @@
       return 0
     }
 
-    val addRequestAcknowledged = testing ||
-      client.requestTotalExecutors(numExecutorsTarget, localityAwareTasks, hostToLocalTaskCount)
+    val addRequestAcknowledged = try {
+      testing ||
+        client.requestTotalExecutors(numExecutorsTarget, localityAwareTasks, hostToLocalTaskCount)
+    } catch {
+      case NonFatal(e) =>
+        // Use INFO level so the error it doesn't show up by default in shells. Errors here are more
+        // commonly caused by YARN AM restarts, which is a recoverable issue, and generate a lot of
+        // noisy output.
+        logInfo("Error reaching cluster manager.", e)
+        false
+    }
     if (addRequestAcknowledged) {
       val executorsString = "executor" + { if (delta > 1) "s" else "" }
       logInfo(s"Requesting $delta new $executorsString because tasks are backlogged" +

core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ private[spark] object CoarseGrainedClusterMessages {
       ioEncryptionKey: Option[Array[Byte]])
     extends CoarseGrainedClusterMessage
 
-  case object GetAMInitialState extends CoarseGrainedClusterMessage
+  case object RetrieveLastAllocatedExecutorId extends CoarseGrainedClusterMessage
 
   // Driver to executors
   case class LaunchTask(data: SerializableBuffer) extends CoarseGrainedClusterMessage

core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala

Lines changed: 6 additions & 4 deletions
@@ -93,6 +93,9 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
   @GuardedBy("CoarseGrainedSchedulerBackend.this")
   protected var localityAwareTasks = 0
 
+  // The num of current max ExecutorId used to re-register appMaster
+  @volatile protected var currentExecutorIdCounter = 0
+
   class DriverEndpoint(override val rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)])
     extends ThreadSafeRpcEndpoint with Logging {
 
@@ -181,7 +184,9 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
           // in this block are read when requesting executors
           CoarseGrainedSchedulerBackend.this.synchronized {
             executorDataMap.put(executorId, data)
-            setCurrentExecutorIdCounter(executorId.toInt)
+            if (currentExecutorIdCounter < executorId.toInt) {
+              currentExecutorIdCounter = executorId.toInt
+            }
             if (numPendingExecutors > 0) {
               numPendingExecutors -= 1
               logDebug(s"Decremented number of pending executors ($numPendingExecutors left)")
@@ -649,9 +654,6 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
     defaultAskTimeout.awaitResult(response)
   }
 
-  // Set the num of current max ExecutorId used to re-register appMaster
-  protected def setCurrentExecutorIdCounter(executorId: Int): Unit = {}
-
   /**
    * Kill the given list of executors through the cluster manager.
    * @return whether the kill request is acknowledged.
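
Note: the new `currentExecutorIdCounter` follows a write-under-lock, volatile-read pattern — it is bumped while executor registration already holds the backend lock, and read lock-free when the YARN endpoint replies to `RetrieveLastAllocatedExecutorId`. A rough standalone illustration, with names (`IdTracker`, `onExecutorRegistered`) invented for the sketch:

```scala
// Stand-alone illustration of the max-executor-id tracking pattern; the real
// field lives in CoarseGrainedSchedulerBackend and is updated on executor registration.
class IdTracker {
  @volatile private var currentExecutorIdCounter = 0

  // Called under the backend's lock when an executor registers.
  def onExecutorRegistered(executorId: String): Unit = synchronized {
    val id = executorId.toInt
    if (currentExecutorIdCounter < id) {
      currentExecutorIdCounter = id
    }
  }

  // Read without the lock (e.g. when replying to RetrieveLastAllocatedExecutorId);
  // @volatile guarantees the latest write is visible to the reader thread.
  def lastAllocatedId: Int = currentExecutorIdCounter
}

object IdTrackerSketch {
  def main(args: Array[String]): Unit = {
    val t = new IdTracker()
    Seq("1", "3", "2").foreach(t.onExecutorRegistered)
    assert(t.lastAllocatedId == 3) // keeps the maximum id seen, not the most recent
  }
}
```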

resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala

Lines changed: 23 additions & 49 deletions
@@ -127,7 +127,6 @@ private[spark] class ApplicationMaster(
   private var nextAllocationInterval = initialAllocationInterval
 
   private var rpcEnv: RpcEnv = null
-  private var amEndpoint: RpcEndpointRef = _
 
   // In cluster mode, used to tell the AM when the user's SparkContext has been initialized.
   private val sparkContextPromise = Promise[SparkContext]()
@@ -388,53 +387,35 @@ private[spark] class ApplicationMaster(
       dummyRunner.launchContextDebugInfo()
     }
 
-    /**
-     * (executorIdCounter, requestExecutors) should be the initial state
-     * or the last state AM restart.
-     *
-     * @see SPARK-12864, SPARK-20079
-     */
-    val (executorIdCounter, requestExecutors) =
-      driverRef.askSync[(Int, RequestExecutors)](GetAMInitialState)
     allocator = client.register(driverUrl,
       driverRef,
       yarnConf,
      _sparkConf,
       uiAddress,
       historyAddress,
       securityMgr,
-      localResources,
-      executorIdCounter)
-    if (requestExecutors.requestedTotal != allocator.getTargetNumExecutors) {
-      amEndpoint.send(requestExecutors)
-    }
+      localResources)
+
+    // Initialize the AM endpoint *after* the allocator has been initialized. This ensures
+    // that when the driver sends an initial executor request (e.g. after an AM restart),
+    // the allocator is ready to service requests.
+    rpcEnv.setupEndpoint("YarnAM", new AMEndpoint(rpcEnv, driverRef))
 
     allocator.allocateResources()
     reporterThread = launchReporterThread()
   }
 
   /**
-   * Create an [[RpcEndpoint]] that communicates with the driver.
-   *
-   * In cluster mode, the AM and the driver belong to same process
-   * so the AMEndpoint need not monitor lifecycle of the driver.
-   *
-   * @return A reference to the driver's RPC endpoint.
+   * @return An [[RpcEndpoint]] that communicates with the driver's scheduler backend.
    */
-  private def runAMEndpoint(
-      host: String,
-      port: String,
-      isClusterMode: Boolean): RpcEndpointRef = {
-    val driverEndpoint = rpcEnv.setupEndpointRef(
+  private def createSchedulerRef(host: String, port: String): RpcEndpointRef = {
+    rpcEnv.setupEndpointRef(
       RpcAddress(host, port.toInt),
       YarnSchedulerBackend.ENDPOINT_NAME)
-    amEndpoint =
-      rpcEnv.setupEndpoint("YarnAM", new AMEndpoint(rpcEnv, driverEndpoint, isClusterMode))
-    driverEndpoint
   }
 
   private def runDriver(securityMgr: SecurityManager): Unit = {
-    addAmIpFilter()
+    addAmIpFilter(None)
     userClassThread = startUserApplication()
 
     // This a bit hacky, but we need to wait until the spark.driver.port property has
@@ -446,10 +427,9 @@ private[spark] class ApplicationMaster(
         Duration(totalWaitTime, TimeUnit.MILLISECONDS))
       if (sc != null) {
         rpcEnv = sc.env.rpcEnv
-        val driverRef = runAMEndpoint(
+        val driverRef = createSchedulerRef(
           sc.getConf.get("spark.driver.host"),
-          sc.getConf.get("spark.driver.port"),
-          isClusterMode = true)
+          sc.getConf.get("spark.driver.port"))
         registerAM(sc.getConf, rpcEnv, driverRef, sc.ui.map(_.webUrl), securityMgr)
       } else {
         // Sanity check; should never happen in normal operation, since sc should only be null
@@ -474,7 +454,7 @@ private[spark] class ApplicationMaster(
     rpcEnv = RpcEnv.create("sparkYarnAM", Utils.localHostName, -1, sparkConf, securityMgr,
       clientMode = true)
     val driverRef = waitForSparkDriver()
-    addAmIpFilter()
+    addAmIpFilter(Some(driverRef))
     registerAM(sparkConf, rpcEnv, driverRef, sparkConf.getOption("spark.driver.appUIAddress"),
       securityMgr)
 
@@ -622,20 +602,21 @@ private[spark] class ApplicationMaster(
 
     sparkConf.set("spark.driver.host", driverHost)
     sparkConf.set("spark.driver.port", driverPort.toString)
-
-    runAMEndpoint(driverHost, driverPort.toString, isClusterMode = false)
+    createSchedulerRef(driverHost, driverPort.toString)
   }
 
   /** Add the Yarn IP filter that is required for properly securing the UI. */
-  private def addAmIpFilter() = {
+  private def addAmIpFilter(driver: Option[RpcEndpointRef]) = {
     val proxyBase = System.getenv(ApplicationConstants.APPLICATION_WEB_PROXY_BASE_ENV)
     val amFilter = "org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter"
    val params = client.getAmIpFilterParams(yarnConf, proxyBase)
-    if (isClusterMode) {
-      System.setProperty("spark.ui.filters", amFilter)
-      params.foreach { case (k, v) => System.setProperty(s"spark.$amFilter.param.$k", v) }
-    } else {
-      amEndpoint.send(AddWebUIFilter(amFilter, params.toMap, proxyBase))
+    driver match {
+      case Some(d) =>
+        d.send(AddWebUIFilter(amFilter, params.toMap, proxyBase))
+
+      case None =>
+        System.setProperty("spark.ui.filters", amFilter)
+        params.foreach { case (k, v) => System.setProperty(s"spark.$amFilter.param.$k", v) }
     }
   }
 
@@ -706,20 +687,13 @@ private[spark] class ApplicationMaster(
   /**
    * An [[RpcEndpoint]] that communicates with the driver's scheduler backend.
    */
-  private class AMEndpoint(
-    override val rpcEnv: RpcEnv, driver: RpcEndpointRef, isClusterMode: Boolean)
+  private class AMEndpoint(override val rpcEnv: RpcEnv, driver: RpcEndpointRef)
     extends RpcEndpoint with Logging {
 
     override def onStart(): Unit = {
      driver.send(RegisterClusterManager(self))
    }
 
-    override def receive: PartialFunction[Any, Unit] = {
-      case x: AddWebUIFilter =>
-        logInfo(s"Add WebUI Filter. $x")
-        driver.send(x)
-    }
-
     override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
       case r: RequestExecutors =>
         Option(allocator) match {
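
Note: the key change in `registerAM` is ordering — the `YarnAM` endpoint is only exposed once the allocator exists, so a `RequestExecutors` message sent by the driver immediately after (re)registration always finds an allocator to act on. A toy sketch of that invariant, with `Allocator` and `Endpoint` invented for illustration rather than taken from Spark:

```scala
// Minimal sketch of the initialization-order fix, using plain Scala in place of
// Spark's RPC layer. `Allocator`, `Endpoint`, and `requestTotal` are hypothetical.
class Allocator {
  @volatile var target: Int = 0
  def requestTotal(n: Int): Unit = { target = n }
}

// The endpoint is constructed with a non-null allocator to forward requests to,
// so a driver request arriving right after (re)registration cannot be dropped.
class Endpoint(allocator: Allocator) {
  def receive(requestedTotal: Int): Boolean = {
    allocator.requestTotal(requestedTotal)
    true
  }
}

object RegistrationOrderSketch {
  def register(): Endpoint = {
    val allocator = new Allocator()        // 1. create the allocator first
    val endpoint = new Endpoint(allocator) // 2. only then expose the endpoint to the driver
    endpoint
  }

  def main(args: Array[String]): Unit = {
    val ep = register()
    assert(ep.receive(8)) // a request arriving immediately after registration is serviced
  }
}
```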

resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala

Lines changed: 18 additions & 4 deletions
@@ -40,6 +40,7 @@ import org.apache.spark.internal.config._
 import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef}
 import org.apache.spark.scheduler.{ExecutorExited, ExecutorLossReason}
 import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RemoveExecutor
+import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RetrieveLastAllocatedExecutorId
 import org.apache.spark.util.{Clock, SystemClock, ThreadUtils}
 
 /**
@@ -64,8 +65,7 @@ private[yarn] class YarnAllocator(
     appAttemptId: ApplicationAttemptId,
     securityMgr: SecurityManager,
     localResources: Map[String, LocalResource],
-    resolver: SparkRackResolver,
-    private var executorIdCounter: Int)
+    resolver: SparkRackResolver)
   extends Logging {
 
   import YarnAllocator._
@@ -82,6 +82,22 @@ private[yarn] class YarnAllocator(
 
   @volatile private var numExecutorsRunning = 0
 
+  /**
+   * Used to generate a unique ID per executor
+   *
+   * Init `executorIdCounter`. when AM restart, `executorIdCounter` will reset to 0. Then
+   * the id of new executor will start from 1, this will conflict with the executor has
+   * already created before. So, we should initialize the `executorIdCounter` by getting
+   * the max executorId from driver.
+   *
+   * And this situation of executorId conflict is just in yarn client mode, so this is an issue
+   * in yarn client mode. For more details, can check in jira.
+   *
+   * @see SPARK-12864
+   */
+  private var executorIdCounter: Int =
+    driverRef.askSync[Int](RetrieveLastAllocatedExecutorId)
+
   // Queue to store the timestamp of failed executors
   private val failedExecutorsTimeStamps = new Queue[Long]()
 
@@ -147,8 +163,6 @@ private[yarn] class YarnAllocator(
     clock = newClock
   }
 
-  def getTargetNumExecutors: Int = targetNumExecutors
-
  def getNumExecutorsRunning: Int = numExecutorsRunning
 
   def getNumExecutorsFailed: Int = synchronized {
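
Note: the `askSync` initializer added above is how a restarted AM avoids reusing executor IDs — instead of starting its counter at 0, the allocator seeds it with the highest ID the driver has already seen. A simplified model of that round trip follows; `Driver` and `askSync` here are stand-ins for Spark's RpcEndpointRef API, not the real classes.

```scala
// A toy ask/reply round-trip modeled on the RetrieveLastAllocatedExecutorId exchange.
case object RetrieveLastAllocatedExecutorId

class Driver {
  // Tracks the highest executor id ever registered (currentExecutorIdCounter in the backend).
  @volatile var currentExecutorIdCounter: Int = 0

  def askSync(msg: Any): Int = msg match {
    case RetrieveLastAllocatedExecutorId => currentExecutorIdCounter
  }
}

object ExecutorIdSketch {
  def main(args: Array[String]): Unit = {
    val driver = new Driver()
    driver.currentExecutorIdCounter = 7 // executors 1..7 registered before the AM died

    // A freshly restarted allocator seeds its counter from the driver,
    // so new executors get ids 8, 9, ... instead of colliding with 1..7.
    var executorIdCounter = driver.askSync(RetrieveLastAllocatedExecutorId)
    executorIdCounter += 1
    assert(executorIdCounter == 8)
  }
}
```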

resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala

Lines changed: 2 additions & 3 deletions
@@ -58,8 +58,7 @@ private[spark] class YarnRMClient extends Logging {
       uiAddress: Option[String],
       uiHistoryAddress: String,
       securityMgr: SecurityManager,
-      localResources: Map[String, LocalResource],
-      executorIdCounter: Int
+      localResources: Map[String, LocalResource]
     ): YarnAllocator = {
     amClient = AMRMClient.createAMRMClient()
     amClient.init(conf)
@@ -76,7 +75,7 @@ private[spark] class YarnRMClient extends Logging {
       registered = true
     }
     new YarnAllocator(driverUrl, driverRef, conf, sparkConf, amClient, getAttemptId(), securityMgr,
-      localResources, new SparkRackResolver(), executorIdCounter)
+      localResources, new SparkRackResolver())
   }
 
   /**

resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala

Lines changed: 4 additions & 32 deletions
@@ -69,15 +69,6 @@ private[spark] abstract class YarnSchedulerBackend(
   /** Scheduler extension services. */
   private val services: SchedulerExtensionServices = new SchedulerExtensionServices()
 
-  // Flag to specify whether this schedulerBackend should be reset.
-  private var shouldResetOnAmRegister = false
-
-  private val currentState = new CurrentAMState(0,
-    RequestExecutors(Utils.getDynamicAllocationInitialExecutors(conf), 0, Map.empty, Set.empty))
-
-  protected class CurrentAMState(
-      var executorIdCounter: Int,
-      var requestExecutors: RequestExecutors)
   /**
    * Bind to YARN. This *must* be done before calling [[start()]].
    *
@@ -146,20 +137,7 @@ private[spark] abstract class YarnSchedulerBackend(
    * This includes executors already pending or running.
    */
   override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = {
-    val requestExecutors = prepareRequestExecutors(requestedTotal)
-    val future = yarnSchedulerEndpointRef.ask[Boolean](requestExecutors)
-    setCurrentRequestExecutors(requestExecutors)
-    future
-  }
-
-  override def setCurrentExecutorIdCounter(executorId: Int): Unit = synchronized {
-    if (currentState.executorIdCounter < executorId.toInt) {
-      currentState.executorIdCounter = executorId.toInt
-    }
-  }
-
-  def setCurrentRequestExecutors(requestExecutors: RequestExecutors): Unit = synchronized {
-    currentState.requestExecutors = requestExecutors
+    yarnSchedulerEndpointRef.ask[Boolean](prepareRequestExecutors(requestedTotal))
   }
 
   /**
@@ -281,13 +259,7 @@ private[spark] abstract class YarnSchedulerBackend(
       case RegisterClusterManager(am) =>
         logInfo(s"ApplicationMaster registered as $am")
         amEndpoint = Option(am)
-        if (!shouldResetOnAmRegister) {
-          shouldResetOnAmRegister = true
-        } else {
-          // AM is already registered before, this potentially means that AM failed and
-          // a new one registered after the failure. This will only happen in yarn-client mode.
-          reset()
-        }
+        reset()
 
       case AddWebUIFilter(filterName, filterParams, proxyBase) =>
         addWebUIFilter(filterName, filterParams, proxyBase)
@@ -331,8 +303,8 @@ private[spark] abstract class YarnSchedulerBackend(
             context.reply(false)
         }
 
-      case GetAMInitialState =>
-        context.reply((currentState.executorIdCounter, currentState.requestExecutors))
+      case RetrieveLastAllocatedExecutorId =>
+        context.reply(currentExecutorIdCounter)
     }
 
     override def onDisconnected(remoteAddress: RpcAddress): Unit = {
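
Note: with ExecutorAllocationManager.reset() made safe to call repeatedly (it re-seeds the target and clears per-executor bookkeeping rather than freezing allocation), the backend can simply call reset() on every RegisterClusterManager and drop the shouldResetOnAmRegister flag. A hedged sketch of why an idempotent reset makes that unconditional call safe; the field names below only loosely mirror the real manager and the Long.MaxValue sentinel is an assumption.

```scala
// Sketch of a reset that is safe to run on every (re)registration: it restores
// the manager to its initial allocation state rather than freezing allocation.
// Not Spark code; fields loosely mirror those touched by reset().
class AllocationState(val initialTarget: Int) {
  var target: Int = initialTarget
  var addTime: Long = Long.MaxValue // stand-in for a "no add scheduled" sentinel (assumed)
  val pendingRemovals = scala.collection.mutable.Set.empty[String]

  def reset(): Unit = synchronized {
    addTime = 0L            // let the next schedule() pass request executors again
    target = initialTarget  // forget the (possibly stale) previous target
    pendingRemovals.clear()
  }
}

object ResetSketch {
  def main(args: Array[String]): Unit = {
    val state = new AllocationState(initialTarget = 2)
    state.target = 10
    state.pendingRemovals += "executor-3"

    // Calling reset() once or many times (one per AM registration) yields the same state.
    state.reset(); state.reset()
    assert(state.target == 2 && state.addTime == 0L && state.pendingRemovals.isEmpty)
  }
}
```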

resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala

Lines changed: 1 addition & 2 deletions
@@ -101,8 +101,7 @@ class YarnAllocatorSuite extends SparkFunSuite with Matchers with BeforeAndAfter
       appAttemptId,
       new SecurityManager(sparkConf),
       Map(),
-      new MockResolver(),
-      0)
+      new MockResolver())
   }
 
   def createContainer(host: String): Container = {
