
Commit 9a4cce6

@cloudfan's comments
1 parent 343bf8b commit 9a4cce6

4 files changed: 13 additions & 3 deletions

core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala

Lines changed: 1 addition & 1 deletion
@@ -297,7 +297,7 @@ private[spark] class CoarseGrainedExecutorBackend(
       // This config is internal and only used by unit tests to force an executor
       // to hang around for longer when decommissioned.
       val initialSleepMillis = env.conf.getInt(
-        "spark.executor.decommission.initial.sleep.millis", sleep_time)
+        "spark.test.executor.decommission.initial.sleep.millis", sleep_time)
       if (initialSleepMillis > 0) {
         Thread.sleep(initialSleepMillis)
       }
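
For context, a minimal standalone sketch (not part of this commit) of how the renamed, test-only key is read back through SparkConf.getInt; the value here is illustrative:

import org.apache.spark.SparkConf

// A test sets the renamed key; the executor backend's getInt read then
// returns this value instead of the sleep_time default.
val conf = new SparkConf()
  .set("spark.test.executor.decommission.initial.sleep.millis", "2000")
val sleepMillis =
  conf.getInt("spark.test.executor.decommission.initial.sleep.millis", 0)
assert(sleepMillis == 2000)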

core/src/main/scala/org/apache/spark/internal/config/package.scala

Lines changed: 10 additions & 0 deletions
@@ -1877,6 +1877,16 @@ package object config {
       .timeConf(TimeUnit.SECONDS)
       .createOptional
 
+  private[spark] val DECOMMISSIONED_EXECUTORS_REMEMBER_AFTER_REMOVAL_TTL =
+    ConfigBuilder("spark.executor.decommission.removed.infoCacheTTL")
+      .doc("Duration for which a decommissioned executor's information will be kept after its " +
+        "removal. Keeping the decommissioned info after removal helps pinpoint fetch failures to " +
+        "decommissioning even after the mapper executor has been decommissioned. This allows " +
+        "eager recovery from fetch failures caused by decommissioning, increasing job robustness.")
+      .version("3.1.0")
+      .timeConf(TimeUnit.SECONDS)
+      .createWithDefaultString("5m")
+
   private[spark] val STAGING_DIR = ConfigBuilder("spark.yarn.stagingDir")
     .doc("Staging directory used while submitting applications.")
     .version("2.0.0")
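
ConfigBuilder is private[spark], so user code sets the new TTL as a plain string. A minimal sketch (illustrative, not from this commit) of how a duration string resolves through the public SparkConf.getTimeAsSeconds accessor:

import org.apache.spark.SparkConf

// Time-typed configs accept duration strings: "10m" resolves to 600
// seconds; left unset, the "5m" default would resolve to 300.
val conf = new SparkConf()
  .set("spark.executor.decommission.removed.infoCacheTTL", "10m")
val ttlSeconds =
  conf.getTimeAsSeconds("spark.executor.decommission.removed.infoCacheTTL", "5m")
assert(ttlSeconds == 600L)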

core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala

Lines changed: 1 addition & 1 deletion
@@ -148,7 +148,7 @@ private[spark] class TaskSchedulerImpl(
   // decommissioned.
   lazy val decommissionedExecutorsRemoved = CacheBuilder.newBuilder()
     .expireAfterWrite(
-      conf.getLong("spark.decommissioningRememberAfterRemoval.seconds", 60L), TimeUnit.SECONDS)
+      conf.get(DECOMMISSIONED_EXECUTORS_REMEMBER_AFTER_REMOVAL_TTL), TimeUnit.SECONDS)
     .ticker(new Ticker{
       override def read(): Long = TimeUnit.MILLISECONDS.toNanos(clock.getTimeMillis())
     })
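
The cache above combines Guava's expireAfterWrite with a custom Ticker driven by Spark's Clock, so expiry can be exercised in tests without real sleeps. A self-contained sketch of that pattern, using a hypothetical manual clock (nowMillis) in place of Spark's Clock:

import java.util.concurrent.TimeUnit
import com.google.common.base.Ticker
import com.google.common.cache.CacheBuilder

// Hypothetical manual clock standing in for Spark's Clock.
var nowMillis = 0L
val removed = CacheBuilder.newBuilder()
  .expireAfterWrite(300L, TimeUnit.SECONDS)  // TTL, e.g. the 5m default
  .ticker(new Ticker {
    override def read(): Long = TimeUnit.MILLISECONDS.toNanos(nowMillis)
  })
  .build[String, java.lang.Boolean]()

removed.put("executor-1", java.lang.Boolean.TRUE)
nowMillis += TimeUnit.SECONDS.toMillis(301)        // advance past the TTL
assert(removed.getIfPresent("executor-1") == null) // entry has expired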

core/src/test/scala/org/apache/spark/deploy/DecommissionWorkerSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -207,7 +207,7 @@ class DecommissionWorkerSuite
     createWorkers(2)
     sc = createSparkContext(
       config.Tests.TEST_NO_STAGE_RETRY.key -> "false",
-      "spark.executor.decommission.initial.sleep.millis" -> initialSleepMillis.toString,
+      "spark.test.executor.decommission.initial.sleep.millis" -> initialSleepMillis.toString,
       config.UNREGISTER_OUTPUT_ON_HOST_ON_FETCH_FAILURE.key -> "true")
     val executorIdToWorkerInfo = getExecutorToWorkerAssignments
     val executorToDecom = executorIdToWorkerInfo.keysIterator.next
