From 70ed95ad8c73c1c1ff46dcf191b26f96c51ea09e Mon Sep 17 00:00:00 2001 From: antiout Date: Wed, 12 Apr 2017 02:17:30 -0400 Subject: [PATCH 1/2] Removed hardcoded blacklist functionality, must be controlled by BlacklistTracker --- .../MesosCoarseGrainedSchedulerBackend.scala | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index 5bdc2a2b840e..a49f5f430101 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -55,9 +55,6 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( with org.apache.mesos.Scheduler with MesosSchedulerUtils { - // Blacklist a slave after this many failures - private val MAX_SLAVE_FAILURES = 2 - private val maxCoresOption = conf.getOption("spark.cores.max").map(_.toInt) // Maximum number of cores to acquire @@ -460,7 +457,6 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( cpus + totalCoresAcquired <= maxCores && mem <= offerMem && numExecutors() < executorLimit && - slaves.get(slaveId).map(_.taskFailures).getOrElse(0) < MAX_SLAVE_FAILURES && meetsPortRequirements } @@ -516,15 +512,6 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( totalGpusAcquired -= gpus gpusByTaskId -= taskId } - // If it was a failure, mark the slave as failed for blacklisting purposes - if (TaskState.isFailed(state)) { - slave.taskFailures += 1 - - if (slave.taskFailures >= MAX_SLAVE_FAILURES) { - logInfo(s"Blacklisting Mesos slave $slaveId due to too many failures; " + - "is Spark installed on it?") - } - } executorTerminated(d, slaveId, taskId, s"Executor finished 
with state $state") // In case we'd rejected everything before but have now lost a node d.reviveOffers() @@ -650,6 +637,5 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( private class Slave(val hostname: String) { val taskIDs = new mutable.HashSet[String]() - var taskFailures = 0 var shuffleRegistered = false -} +} \ No newline at end of file From df2f319a518e1a533dae04d5d6bfa019a8b6845c Mon Sep 17 00:00:00 2001 From: antiout Date: Wed, 12 Apr 2017 02:35:28 -0400 Subject: [PATCH 2/2] Removed hardcoded blacklist functionality, must be controlled by BlacklistTracker --- .../cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index ffa2378c45c4..8ef996358ca7 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -662,4 +662,4 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( private class Slave(val hostname: String) { val taskIDs = new mutable.HashSet[String]() var shuffleRegistered = false -} \ No newline at end of file +}