Commit 2cea5f4

Make the block manager decommissioning test be less flaky
An interesting failure happens when migrateDuring = true (and persist or shuffle is true):
- We schedule the job with tasks on executors 0, 1, 2.
- We wait 300 ms and then decommission executor 0.
- If the task is not yet done on executor 0, it will now fail because the block manager won't be able to save the block. This condition is easy to trigger on a loaded machine where the GitHub checks run.
- The task will retry on a different executor (1 or 2) and its shuffle blocks will land there.
- No actual block migration happens here because the decommissioned executor technically failed before it could even produce a block.

So this change makes two fixes to remove the above race condition:
- When migrateDuring = true, wait for a task to complete and write its block, and then decommission that executor.
- When migrateDuring = false, it is still possible (because of delay scheduling) for two tasks to run serially on the same executor while another executor stays idle. In that case, we must make sure to decommission an executor that actually had a task run on it.
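At its core the fix funnels every decommission request through one compare-and-set guard and triggers it from the task-end listener only once a task has succeeded on the candidate executor. A condensed excerpt of that logic (lifted from the diff below, so the names sched, migrateDuring, maybeDoDecommission and taskEndEvents come from the test itself and this is only an outline, not the full change):

// Condensed from the diff below: a single AtomicReference guards the choice of
// executor, so exactly one executor gets decommissioned no matter how many
// task-end events race to pick one.
val execToDecommission = new AtomicReference[String](null)

def maybeDoDecommission(candidateExecToDecom: String): Boolean = {
  if (execToDecommission.compareAndSet(null, candidateExecToDecom)) {
    sched.decommissionExecutor(candidateExecToDecom,
      ExecutorDecommissionInfo(s"Decommissioning executor ${candidateExecToDecom}", false))
    true
  } else {
    false
  }
}

// Inside the test's SparkListener: when migrateDuring = true, decommission an
// executor only after it has finished a task successfully, so its block exists
// and can actually be migrated.
override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
  taskEndEvents.append(taskEnd)
  if (migrateDuring && taskEnd.taskInfo.successful) {
    maybeDoDecommission(taskEnd.taskInfo.executorId)
  }
}

The migrateDuring = false path reuses the same guard after the job finishes, picking an executor that actually ran a successful task, as the diff below shows.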
1 parent d1301af commit 2cea5f4


core/src/test/scala/org/apache/spark/storage/BlockManagerDecommissionIntegrationSuite.scala

Lines changed: 43 additions & 13 deletions
@@ -18,6 +18,7 @@
 package org.apache.spark.storage
 
 import java.util.concurrent.Semaphore
+import java.util.concurrent.atomic.AtomicReference
 
 import scala.collection.mutable.ArrayBuffer
 import scala.concurrent.duration._
@@ -92,8 +93,22 @@ class BlockManagerDecommissionIntegrationSuite extends SparkFunSuite with LocalS
     val executorRemovedSem = new Semaphore(0)
     val taskEndEvents = ArrayBuffer.empty[SparkListenerTaskEnd]
     val blocksUpdated = ArrayBuffer.empty[SparkListenerBlockUpdated]
-    sc.addSparkListener(new SparkListener {
+    val sched = sc.schedulerBackend.asInstanceOf[StandaloneSchedulerBackend]
+    val execToDecommission = new AtomicReference[String](null)
+
+    def maybeDoDecommission(candidateExecToDecom: String): Boolean = {
+      if (execToDecommission.compareAndSet(null, candidateExecToDecom)) {
+        val decomContext = s"Decommissioning executor ${candidateExecToDecom}"
+        logInfo(decomContext)
+        sched.decommissionExecutor(candidateExecToDecom,
+          ExecutorDecommissionInfo(decomContext, false))
+        true
+      } else {
+        false
+      }
+    }
 
+    sc.addSparkListener(new SparkListener {
       override def onExecutorRemoved(execRemoved: SparkListenerExecutorRemoved): Unit = {
         executorRemovedSem.release()
       }
@@ -104,6 +119,15 @@ class BlockManagerDecommissionIntegrationSuite extends SparkFunSuite with LocalS
 
       override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
         taskEndEvents.append(taskEnd)
+        if (migrateDuring && taskEnd.taskInfo.successful) {
+          // When migrating during, we decommission an executor _after_ it is done with a task.
+          // We don't decommission an executor while it is running, because doing so might cause
+          // a task to fail. The task fails because a decommissioned block manager cannot save
+          // blocks. When this happens the task will be re-run on a different executor, but by
+          // which time we have already decommissioned a "wrong" executor. To avoid this condition,
+          // we wait for a task to finish successfully and go and decommission its executor.
+          maybeDoDecommission(taskEnd.taskInfo.executorId)
+        }
       }
 
       override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = {
@@ -139,22 +163,28 @@ class BlockManagerDecommissionIntegrationSuite extends SparkFunSuite with LocalS
       ThreadUtils.awaitResult(asyncCount, 15.seconds)
     }
 
-    // Decommission one of the executors.
-    val sched = sc.schedulerBackend.asInstanceOf[StandaloneSchedulerBackend]
-    val execs = sched.getExecutorIds()
-    assert(execs.size == numExecs, s"Expected ${numExecs} executors but found ${execs.size}")
+    sc.listenerBus.waitUntilEmpty()
 
-    val execToDecommission = execs.head
-    logDebug(s"Decommissioning executor ${execToDecommission}")
-    sched.decommissionExecutor(execToDecommission, ExecutorDecommissionInfo("", false))
+    val asyncCountResult = if (migrateDuring) {
+      // Wait for the job to finish
+      ThreadUtils.awaitResult(asyncCount, 15.seconds)
+    } else {
+      // (See comment in listener's onTaskEnd method.)
+      // But when migrating after a job is done, we can decommission any executor that had a task
+      val executorWithSuccessfulTask = taskEndEvents
+        .filter(_.taskInfo.successful)
+        .map(_.taskInfo.executorId)
+        .head
+      val decommissioned = maybeDoDecommission(executorWithSuccessfulTask)
+      assert(decommissioned, s"No prior decommissioning should have happened")
+      // We had already waited for the job to finish above, so the timeout is zero
+      ThreadUtils.awaitResult(asyncCount, 0.seconds)
+    }
 
-    // Wait for job to finish.
-    val asyncCountResult = ThreadUtils.awaitResult(asyncCount, 15.seconds)
     assert(asyncCountResult === numParts)
     // All tasks finished, so accum should have been increased numParts times.
     assert(accum.value === numParts)
 
-    sc.listenerBus.waitUntilEmpty()
     if (shuffle) {
       // mappers & reducers which succeeded
       assert(taskEndEvents.count(_.reason == Success) === 2 * numParts,
@@ -206,15 +236,15 @@ class BlockManagerDecommissionIntegrationSuite extends SparkFunSuite with LocalS
     val execIdToBlocksMapping = storageStatus.map(
       status => (status.blockManagerId.executorId, status.blocks)).toMap
     // No cached blocks should be present on executor which was decommissioned
-    assert(execIdToBlocksMapping(execToDecommission).keys.filter(_.isRDD).toSeq === Seq(),
+    assert(execIdToBlocksMapping(execToDecommission.get()).keys.filter(_.isRDD).toSeq === Seq(),
       "Cache blocks should be migrated")
     if (persist) {
       // There should still be all the RDD blocks cached
       assert(execIdToBlocksMapping.values.flatMap(_.keys).count(_.isRDD) === numParts)
     }
 
     // Make the executor we decommissioned exit
-    sched.client.killExecutors(List(execToDecommission))
+    sched.client.killExecutors(List(execToDecommission.get()))
 
     // Wait for the executor to be removed
     executorRemovedSem.acquire(1)
