From 636959afbb7d12550c36d864aa52cb871686a56f Mon Sep 17 00:00:00 2001 From: antiout Date: Wed, 12 Apr 2017 02:17:30 -0400 Subject: [PATCH 01/12] Removed hardcoded blacklist functionality, must be controled by BlacklistTracker --- .../mesos/MesosCoarseGrainedSchedulerBackend.scala | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index 53f5f61cca486..75f49e068cf04 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -62,9 +62,6 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( private lazy val hadoopDelegationTokenManager: MesosHadoopDelegationTokenManager = new MesosHadoopDelegationTokenManager(conf, sc.hadoopConfiguration, driverEndpoint) - // Blacklist a slave after this many failures - private val MAX_SLAVE_FAILURES = 2 - private val maxCoresOption = conf.getOption("spark.cores.max").map(_.toInt) private val executorCoresOption = conf.getOption("spark.executor.cores").map(_.toInt) @@ -571,7 +568,6 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( cpus + totalCoresAcquired <= maxCores && mem <= offerMem && numExecutors < executorLimit && - slaves.get(slaveId).map(_.taskFailures).getOrElse(0) < MAX_SLAVE_FAILURES && meetsPortRequirements && satisfiesLocality(offerHostname) } @@ -648,15 +644,6 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( totalGpusAcquired -= gpus gpusByTaskId -= taskId } - // If it was a failure, mark the slave as failed for blacklisting purposes - if (TaskState.isFailed(state)) { - slave.taskFailures += 1 - - if (slave.taskFailures >= MAX_SLAVE_FAILURES) { - logInfo(s"Blacklisting Mesos slave $slaveId due to too many failures; " + - "is Spark installed on it?") - } - } executorTerminated(d, slaveId, taskId, s"Executor finished with state $state") // In case we'd rejected everything before but have now lost a node d.reviveOffers() @@ -798,7 +785,6 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( private class Slave(val hostname: String) { val taskIDs = new mutable.HashSet[String]() - var taskFailures = 0 var shuffleRegistered = false } From f09faf71ee050f0f3bf4c379d72dabb95432f5d7 Mon Sep 17 00:00:00 2001 From: antiout Date: Wed, 12 Apr 2017 02:35:28 -0400 Subject: [PATCH 02/12] Removed hardcoded blacklist functionality, must be controled by BlacklistTracker --- .../cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index 75f49e068cf04..d099ca5bfeb81 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -792,4 +792,4 @@ object IdHelper { // Use atomic values since Spark contexts can be initialized in parallel private[mesos] val nextSCNumber = new AtomicLong(0) private[mesos] val startedBefore = new AtomicBoolean(false) -} +} \ No newline at end of file From e2ddc1be19e2f978df4fe84073aff3f5b46afe45 Mon Sep 17 00:00:00 2001 From: IgorBerman Date: Tue, 20 Feb 2018 22:19:06 +0200 Subject: [PATCH 03/12] SPARK-19755 declining offers from blacklisted slave by BlacklistTracker --- .../MesosCoarseGrainedSchedulerBackend.scala | 3 ++- ...osCoarseGrainedSchedulerBackendSuite.scala | 23 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index d099ca5bfeb81..30301a19c0bbb 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -568,6 +568,7 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( cpus + totalCoresAcquired <= maxCores && mem <= offerMem && numExecutors < executorLimit && + !scheduler.nodeBlacklist().contains(offerHostname) && meetsPortRequirements && satisfiesLocality(offerHostname) } @@ -792,4 +793,4 @@ object IdHelper { // Use atomic values since Spark contexts can be initialized in parallel private[mesos] val nextSCNumber = new AtomicLong(0) private[mesos] val startedBefore = new AtomicBoolean(false) -} \ No newline at end of file +} diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala index f4bd1ee9da6f7..1646e57097639 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala @@ -108,6 +108,28 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite verifyTaskLaunched(driver, "o2") } + test("mesos declines offers from blacklisted slave") { + setBackend() + + // launches a task on a valid offer on slave s1 + val minMem = backend.executorMemory(sc) + 1024 + val minCpu = 4 + val offer1 = Resources(minMem, minCpu) + offerResources(List(offer1)) + verifyTaskLaunched(driver, "o1") + + // for any reason executor(aka mesos task) failed on s1 + val status = createTaskStatus("0", "s1", TaskState.TASK_FAILED) + backend.statusUpdate(driver, status) + when(taskScheduler.nodeBlacklist()).thenReturn(Set("hosts1")) + + val offer2 = Resources(minMem, minCpu) + // Offer resources from the same slave + offerResources(List(offer2)) + // but since it's blacklisted the offer is declined + verifyDeclinedOffer(driver, createOfferId("o1")) + } + test("mesos supports spark.executor.cores") { val executorCores = 4 setBackend(Map("spark.executor.cores" -> executorCores.toString)) @@ -790,6 +812,7 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite taskScheduler = mock[TaskSchedulerImpl] when(taskScheduler.sc).thenReturn(sc) + when(taskScheduler.nodeBlacklist()).thenReturn(Set[String]()) externalShuffleClient = mock[MesosExternalShuffleClient] From 66ed5afae1a5b4856e84c805c7858d552f38b26a Mon Sep 17 00:00:00 2001 From: IgorBerman Date: Wed, 21 Mar 2018 22:53:52 +0200 Subject: [PATCH 04/12] [SPARK-19755][Mesos] reverting logging on mesos slave task(executor) failure --- .../cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index 30301a19c0bbb..21f515adfc3ac 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -645,6 +645,9 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( totalGpusAcquired -= gpus gpusByTaskId -= taskId } + if (TaskState.isFailed(state)) { + logWarning(s"Mesos slave $slaveId failed") + } executorTerminated(d, slaveId, taskId, s"Executor finished with state $state") // In case we'd rejected everything before but have now lost a node d.reviveOffers() From a7ff8cccd1b7e5564880c40c503c169c6bed46b9 Mon Sep 17 00:00:00 2001 From: IgorBerman Date: Tue, 27 Mar 2018 17:11:54 +0300 Subject: [PATCH 05/12] [SPARK-19755][Mesos] rewording log message and changing it to error level --- .../cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index 21f515adfc3ac..fe981638ecfd5 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -646,7 +646,7 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( gpusByTaskId -= taskId } if (TaskState.isFailed(state)) { - logWarning(s"Mesos slave $slaveId failed") + logError(s"Task $taskId failed on Mesos slave $slaveId.") } executorTerminated(d, slaveId, taskId, s"Executor finished with state $state") // In case we'd rejected everything before but have now lost a node From 2c47271176b82e4859667ede9bb02b28b8fba50e Mon Sep 17 00:00:00 2001 From: IgorBerman Date: Fri, 15 Jun 2018 14:34:34 +0300 Subject: [PATCH 06/12] [SPARK-19755][Mesos] adding comment regarding failures of mesos task failures and linking to relevant jira --- .../cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index fe981638ecfd5..de142df7eab9e 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -568,6 +568,10 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( cpus + totalCoresAcquired <= maxCores && mem <= offerMem && numExecutors < executorLimit && + // nodeBlacklist() currently only gets updated based on failures in spark tasks. + // If a mesos task fails to even start -- that is, + // if a spark executor fails to launch on a node -- nodeBlacklist does not get updated + // see SPARK-24567 for details !scheduler.nodeBlacklist().contains(offerHostname) && meetsPortRequirements && satisfiesLocality(offerHostname) From 5eda874e1b9b05396c57413b743995201e02ec3d Mon Sep 17 00:00:00 2001 From: IgorBerman Date: Fri, 15 Jun 2018 21:29:20 +0300 Subject: [PATCH 07/12] [SPARK-19755][Mesos] specifying that it's mesos task failing --- .../cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index de142df7eab9e..42001c1a1f624 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -650,7 +650,7 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( gpusByTaskId -= taskId } if (TaskState.isFailed(state)) { - logError(s"Task $taskId failed on Mesos slave $slaveId.") + logError(s"Mesos task $taskId failed on Mesos slave $slaveId.") } executorTerminated(d, slaveId, taskId, s"Executor finished with state $state") // In case we'd rejected everything before but have now lost a node From 95ca22c9765ee6a941cf991e225678a4f829fc5d Mon Sep 17 00:00:00 2001 From: IgorBerman Date: Wed, 25 Sep 2019 10:09:03 +0300 Subject: [PATCH 08/12] SPARK-19755 fixing merge --- .../cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala | 3 --- 1 file changed, 3 deletions(-) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index 96a553510db16..5b5f449548988 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -61,9 +61,6 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) with org.apache.mesos.Scheduler with MesosSchedulerUtils { - private lazy val hadoopDelegationTokenManager: MesosHadoopDelegationTokenManager = - new MesosHadoopDelegationTokenManager(conf, sc.hadoopConfiguration, driverEndpoint) - private val maxCoresOption = conf.get(config.CORES_MAX) private val executorCoresOption = conf.getOption(config.EXECUTOR_CORES.key).map(_.toInt) From cb8cb576ac5f16c343df2d815abe1e2ea48083c4 Mon Sep 17 00:00:00 2001 From: IgorBerman Date: Wed, 25 Sep 2019 15:27:50 +0300 Subject: [PATCH 09/12] SPARK-19755 adding jira to testcase name --- .../cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala index adb2fb626677f..fe6020b56fa0e 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala @@ -107,7 +107,7 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite verifyTaskLaunched(driver, "o2") } - test("mesos declines offers from blacklisted slave") { + test("SPARK-19755 mesos declines offers from blacklisted slave") { setBackend() // launches a task on a valid offer on slave s1 From 83cabff157115c36df7429381c3edc21733b4d5f Mon Sep 17 00:00:00 2001 From: IgorBerman Date: Wed, 25 Sep 2019 15:57:37 +0300 Subject: [PATCH 10/12] SPARK-19755 removing duplicated line --- .../cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala index fe6020b56fa0e..aa8fd1c86aa80 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala @@ -841,7 +841,6 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite taskScheduler = mock[TaskSchedulerImpl] when(taskScheduler.nodeBlacklist).thenReturn(Set[String]()) when(taskScheduler.sc).thenReturn(sc) - when(taskScheduler.nodeBlacklist()).thenReturn(Set[String]()) externalShuffleClient = mock[MesosExternalBlockStoreClient] From 2fc02885f70c789d1c7912c618534a644ec73021 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Wed, 25 Sep 2019 13:37:12 -0700 Subject: [PATCH 11/12] alternative --- .../MesosCoarseGrainedSchedulerBackend.scala | 16 ++++++++-------- ...MesosCoarseGrainedSchedulerBackendSuite.scala | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index 5b5f449548988..4a14fbbb62eb1 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -38,7 +38,7 @@ import org.apache.spark.internal.config.Tests.IS_TESTING import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle} import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.network.shuffle.mesos.MesosExternalBlockStoreClient -import org.apache.spark.rpc.{RpcEndpointAddress, RpcEndpointRef} +import org.apache.spark.rpc.RpcEndpointAddress import org.apache.spark.scheduler.{SlaveLost, TaskSchedulerImpl} import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend import org.apache.spark.util.Utils @@ -375,9 +375,14 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( logDebug(s"Received ${offers.size} resource offers.") + val blacklist = scheduler.nodeBlacklist() val (matchedOffers, unmatchedOffers) = offers.asScala.partition { offer => - val offerAttributes = toAttributeMap(offer.getAttributesList) - matchesAttributeRequirements(slaveOfferConstraints, offerAttributes) + if (blacklist.contains(offer.getHostname)) { + false + } else { + val offerAttributes = toAttributeMap(offer.getAttributesList) + matchesAttributeRequirements(slaveOfferConstraints, offerAttributes) + } } declineUnmatchedOffers(d, unmatchedOffers) @@ -580,11 +585,6 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( cpus + totalCoresAcquired <= maxCores && mem <= offerMem && numExecutors < executorLimit && - // nodeBlacklist() currently only gets updated based on failures in spark tasks. - // If a mesos task fails to even start -- that is, - // if a spark executor fails to launch on a node -- nodeBlacklist does not get updated - // see SPARK-24567 for details - !scheduler.nodeBlacklist().contains(offerHostname) && meetsPortRequirements && satisfiesLocality(offerHostname) } diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala index aa8fd1c86aa80..d2e21b49caa95 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackendSuite.scala @@ -36,7 +36,7 @@ import org.apache.spark.internal.config._ import org.apache.spark.network.shuffle.mesos.MesosExternalBlockStoreClient import org.apache.spark.rpc.{RpcAddress, RpcEndpointRef} import org.apache.spark.scheduler.TaskSchedulerImpl -import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.{RegisterExecutor} +import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RegisterExecutor import org.apache.spark.scheduler.cluster.mesos.Utils._ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite @@ -126,7 +126,7 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite // Offer resources from the same slave offerResources(List(offer2)) // but since it's blacklisted the offer is declined - verifyDeclinedOffer(driver, createOfferId("o1")) + verifyDeclinedOffer(driver, createOfferId("o1"), true) } test("mesos supports spark.executor.cores") { From 04f931d0348b4b6583d704a01b3421a4a8d91d3c Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Wed, 25 Sep 2019 13:41:18 -0700 Subject: [PATCH 12/12] Recover comments --- .../cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index 4a14fbbb62eb1..59c0fd81475a7 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -375,6 +375,10 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( logDebug(s"Received ${offers.size} resource offers.") + // nodeBlacklist() currently only gets updated based on failures in spark tasks. + // If a mesos task fails to even start -- that is, + // if a spark executor fails to launch on a node -- nodeBlacklist does not get updated + // see SPARK-24567 for details val blacklist = scheduler.nodeBlacklist() val (matchedOffers, unmatchedOffers) = offers.asScala.partition { offer => if (blacklist.contains(offer.getHostname)) {