[SPARK-34509][K8S] Make dynamic allocation upscaling more progressive on K8S #31790
Changes from all commits: 941acfc, 5f5ecdd, 7c39a1f, 2f7ff59, a93b5ae, 61cdaab
Config.scala

@@ -292,6 +292,14 @@ private[spark] object Config extends Logging {
       .checkValue(value => value > 0, "Allocation batch size should be a positive integer")
       .createWithDefault(5)

+  val KUBERNETES_MAX_PENDING_PODS =
+    ConfigBuilder("spark.kubernetes.allocation.max.pendingPods")
+      .doc("Maximum number of pending pods allowed during executor allocation for this application.")
+      .version("3.2.0")
+      .intConf
+      .checkValue(value => value > 0, "Maximum number of pending pods should be a positive integer")
+      .createWithDefault(150)
Contributor:
This default seems high, can you explain why 150?
Contributor (Author):
Thanks for the review! My main intention was to come up with a limit that protects us from overloading the K8s scheduler while still allowing progressive upscaling. When pods spend a long time in the pending state, we should still make new allocations as early as possible, but carefully, because overloading the K8s scheduler would be counterproductive for the allocations. And since we still apply the batch size during upscaling, the number of active new pod requests from a single Spark application is limited, which also helps to avoid the overloading.

The second reason is that I hoped this would be a good default for environments where the batch size is already increased (I have seen examples where the batch size was set to 50). But I just ran a few tests, and although 150 pending pods did not cause any problems during resource allocation, my test was running on an EKS cluster where only one Spark app (my test app) was submitted, and even the cluster size was small. So nevertheless we can go for a different solution: @holdenk WDYT?
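To make the interplay concrete, here is a minimal, self-contained sketch of how the batch size and the pending-pod cap would jointly bound a single allocation round. The variable names follow this diff; all numbers are made up for illustration.

```scala
// Sketch of one allocation round, using the bounds introduced in this PR.
// Names mirror the diff; the concrete values are illustrative only.
object AllocationRoundSketch extends App {
  val podAllocationSize = 5   // spark.kubernetes.allocation.batch.size
  val maxPendingPods = 150    // spark.kubernetes.allocation.max.pendingPods (this PR)

  val targetNum = 200         // executors requested by dynamic allocation
  val knownPodCount = 60      // running + pending executors already known for this profile
  val newlyCreatedCount = 2   // pod requests not yet seen in any snapshot
  val sumPendingPods = 58     // pending pods summed over all resource profiles

  val allMissingExecutors = targetNum - knownPodCount                  // 140 still missing
  val remainingBatchAllocSize = podAllocationSize - newlyCreatedCount  // 3 slots left in the batch
  val remainingPendingPods = maxPendingPods - sumPendingPods           // 92 below the pending cap

  // The request size is the tightest of the three bounds: 3 pods in this round.
  val numExecutorsToAllocate =
    math.min(math.min(allMissingExecutors, remainingBatchAllocSize), remainingPendingPods)
  println(s"request $numExecutorsToAllocate executors this round")
}
```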
Member:
I'd like to propose disabling this feature in Apache Spark 3.2.0 to remove the side effect completely. For example, we can use
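The concrete suggestion is truncated here. One plausible way to ship such a cap disabled by default (an assumption on my part, not necessarily what was proposed) is to give it an effectively unbounded default, so existing deployments see no behavior change unless they opt in:

```scala
// Hypothetical sketch only: an "off by default" variant of the new config.
// The Int.MaxValue default is an assumption, not taken from the review thread.
val KUBERNETES_MAX_PENDING_PODS =
  ConfigBuilder("spark.kubernetes.allocation.max.pendingPods")
    .doc("Maximum number of pending pods allowed during executor allocation for this application.")
    .version("3.2.0")
    .intConf
    .checkValue(value => value > 0, "Maximum number of pending pods should be a positive integer")
    .createWithDefault(Int.MaxValue)  // effectively no limit unless the user sets one
```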
Contributor (Author):
So this PR has 3 small features which relate to each other. Let me separate them into different PRs (at least two of them); this will make the review easier.
Contributor:
Sounds good, ping me on your split PRs and I'll be happy to take a look.
   val KUBERNETES_ALLOCATION_BATCH_DELAY =
     ConfigBuilder("spark.kubernetes.allocation.batch.delay")
       .doc("Time to wait between each round of executor allocation.")
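For context, these allocation knobs are plain Spark configuration properties, so the new cap would be tuned alongside the existing batch settings. A minimal sketch (property names taken from this diff, values illustrative only):

```scala
import org.apache.spark.SparkConf

// Sketch: tuning the allocation settings touched by this PR for a K8s application.
// The values here are examples, not recommendations.
val conf = new SparkConf()
  .set("spark.kubernetes.allocation.batch.size", "10")        // pods requested per allocation round
  .set("spark.kubernetes.allocation.batch.delay", "1s")       // wait between allocation rounds
  .set("spark.kubernetes.allocation.max.pendingPods", "150")  // new cap introduced in this PR
```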
ExecutorPodsAllocator.scala
@@ -57,6 +57,8 @@ private[spark] class ExecutorPodsAllocator(

   private val podAllocationDelay = conf.get(KUBERNETES_ALLOCATION_BATCH_DELAY)

+  private val maxPendingPods = conf.get(KUBERNETES_MAX_PENDING_PODS)
+
   private val podCreationTimeout = math.max(
     podAllocationDelay * 5,
     conf.get(KUBERNETES_ALLOCATION_EXECUTOR_TIMEOUT))

@@ -91,7 +93,11 @@ private[spark] class ExecutorPodsAllocator(

   private val dynamicAllocationEnabled = Utils.isDynamicAllocationEnabled(conf)

   // visible for tests
-  private[k8s] val numOutstandingPods = new AtomicInteger()
+  private[k8s] val numNewlyCreatedUnknownPods = new AtomicInteger()
+
+  // visible for tests
+  // number of pending PODs: including the known and unknown ones
+  private[k8s] val numPendingPods = new AtomicInteger()

   private var lastSnapshot = ExecutorPodsSnapshot()

@@ -122,7 +128,7 @@ private[spark] class ExecutorPodsAllocator(
       totalExpectedExecutorsPerResourceProfileId.put(rp.id, numExecs)
     }
     logDebug(s"Set total expected execs to $totalExpectedExecutorsPerResourceProfileId")
-    if (numOutstandingPods.get() == 0) {
+    if (numNewlyCreatedUnknownPods.get() < podAllocationSize) {
       snapshotsStore.notifySubscribers()
     }
   }
@@ -216,10 +222,13 @@ private[spark] class ExecutorPodsAllocator(
         execPods(execId) = execPodState
       }
     }

-    var totalPendingCount = 0
+    var sumPendingPods = 0
Member:
Can we reuse the old variable
     // The order we request executors for each ResourceProfile is not guaranteed.
-    totalExpectedExecutorsPerResourceProfileId.asScala.foreach { case (rpId, targetNum) =>
+    val knownPodsPerTargetForRpId = totalExpectedExecutorsPerResourceProfileId
+      .asScala
+      .toSeq
+      .sortBy(_._1)
+      .map { case (rpId, targetNum) =>
       val podsForRpId = rpIdToExecsAndPodState.getOrElse(rpId, mutable.HashMap.empty)

       val currentRunningCount = podsForRpId.values.count {

@@ -233,9 +242,8 @@ private[spark] class ExecutorPodsAllocator(
       }.partition { case (k, _) =>
         schedulerKnownExecs.contains(k)
       }
-      // This variable is used later to print some debug logs. It's updated when cleaning up
-      // excess pod requests, since currentPendingExecutorsForRpId is immutable.
-      var knownPendingCount = currentPendingExecutorsForRpId.size
+      sumPendingPods += currentPendingExecutorsForRpId.size
+      sumPendingPods += schedulerKnownPendingExecsForRpId.size

       val newlyCreatedExecutorsForRpId =
         newlyCreatedExecutors.filter { case (_, (waitingRpId, _)) =>

@@ -248,7 +256,7 @@ private[spark] class ExecutorPodsAllocator(
       }

       if (podsForRpId.nonEmpty) {
-        logDebug(s"ResourceProfile Id: $rpId " +
+        logDebug(s"ResourceProfile Id: $rpId, " +
           s"pod allocation status: $currentRunningCount running, " +
           s"${currentPendingExecutorsForRpId.size} unknown pending, " +
           s"${schedulerKnownPendingExecsForRpId.size} scheduler backend known pending, " +

@@ -293,39 +301,51 @@ private[spark] class ExecutorPodsAllocator(
             .withLabelIn(SPARK_EXECUTOR_ID_LABEL, toDelete.sorted.map(_.toString): _*)
             .delete()
           newlyCreatedExecutors --= newlyCreatedToDelete
-          knownPendingCount -= knownPendingToDelete.size
+          sumPendingPods -= knownPendingToDelete.size
         }
       }
     }
+      (rpId -> (knownPodCount, targetNum))
+    }

-      if (newlyCreatedExecutorsForRpId.isEmpty
-        && knownPodCount < targetNum) {
-        requestNewExecutors(targetNum, knownPodCount, applicationId, rpId, k8sKnownPVCNames)
-      }
-      totalPendingCount += knownPendingCount

-      // The code below just prints debug messages, which are only useful when there's a change
-      // in the snapshot state. Since the messages are a little spammy, avoid them when we know
-      // there are no useful updates.
-      if (log.isDebugEnabled && snapshots.nonEmpty) {
-        val outstanding = knownPendingCount + newlyCreatedExecutorsForRpId.size
-        if (currentRunningCount >= targetNum && !dynamicAllocationEnabled) {
+    numPendingPods.set(sumPendingPods)
+    // after the downscale is triggered for all the resource profiles the size of
+    // newlyCreatedExecutors can be used for calculating the remaining batch size for upscaling
+    knownPodsPerTargetForRpId.foreach { case (rpId, (knownPodCount, targetNum)) =>
+      val allMissingExecutors = targetNum - knownPodCount
+      val remainingBatchAllocSize = podAllocationSize - newlyCreatedExecutors.size
+      val remainingPendingPods = maxPendingPods - sumPendingPods
+      if (allMissingExecutors <= 0) {
+        if (allMissingExecutors == 0 && !dynamicAllocationEnabled && snapshots.nonEmpty) {
           logDebug(s"Current number of running executors for ResourceProfile Id $rpId is " +
             "equal to the number of requested executors. Not scaling up further.")
-        } else {
-          if (outstanding > 0) {
-            logDebug(s"Still waiting for $outstanding executors for ResourceProfile " +
-              s"Id $rpId before requesting more.")
-          }
         }
+      } else if (remainingBatchAllocSize <= 0) {
+        if (snapshots.nonEmpty) {
+          logDebug("Batch size limit is reached for ResourceProfile " +
+            s"Id $rpId before requesting more.")
+        }
+      } else if (remainingPendingPods <= 0) {
+        if (snapshots.nonEmpty) {
+          logDebug("Max number of pending pod limit is reached for ResourceProfile " +
+            s"Id $rpId waiting for pods to become running.")
+        }
+      } else {
+        val numExecutorsToAllocate =
+          math.min(math.min(allMissingExecutors, remainingBatchAllocSize), remainingPendingPods)
+        logInfo(s"Going to request $numExecutorsToAllocate executors from Kubernetes for " +
+          s"ResourceProfile Id: $rpId, target: $targetNum, running: $knownPodCount, " +
+          s"remainingBatchAllocSize: $remainingBatchAllocSize, " +
+          s"remainingPendingPods: $remainingPendingPods.")
+        requestNewExecutors(numExecutorsToAllocate, applicationId, rpId, k8sKnownPVCNames)
+      }
+    }
     deletedExecutorIds = _deletedExecutorIds

     // Update the flag that helps the setTotalExpectedExecutors() callback avoid triggering this
-    // update method when not needed.
+    // update method when not needed. PODs known by the scheduler backend are not counted here as
+    // they considered running PODs and they should not block upscaling.
-    numOutstandingPods.set(totalPendingCount + newlyCreatedExecutors.size)
+    numNewlyCreatedUnknownPods.set(newlyCreatedExecutors.size)
   }

   private def getReusablePVCs(applicationId: String, pvcsInUse: Seq[String]) = {
@@ -347,14 +367,10 @@ private[spark] class ExecutorPodsAllocator(
   }

   private def requestNewExecutors(
-      expected: Int,
-      running: Int,
+      numExecutorsToAllocate: Int,
       applicationId: String,
       resourceProfileId: Int,
       pvcsInUse: Seq[String]): Unit = {
-    val numExecutorsToAllocate = math.min(expected - running, podAllocationSize)
-    logInfo(s"Going to request $numExecutorsToAllocate executors from Kubernetes for " +
-      s"ResourceProfile Id: $resourceProfileId, target: $expected running: $running.")
     // Check reusable PVCs for this executor allocation batch
     val reusablePVCs = getReusablePVCs(applicationId, pvcsInUse)
     for ( _ <- 0 until numExecutorsToAllocate) {
This introduces a new namespace, `max`, with only one child, `pendingPods`. Do you have a plan to add more? Otherwise, we need to reduce the depth, like `maxPendingPods`.
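For comparison, the two key layouts under discussion differ only in depth; a quick sketch:

```scala
// Current diff: a nested "max" namespace holding a single child key.
val nestedKey = "spark.kubernetes.allocation.max.pendingPods"
// Flattened alternative suggested here: one level less depth.
val flatKey = "spark.kubernetes.allocation.maxPendingPods"
```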