@@ -55,9 +55,6 @@ private[spark] class MesosCoarseGrainedSchedulerBackend(
   with org.apache.mesos.Scheduler
   with MesosSchedulerUtils {
 
-  // Blacklist a slave after this many failures
-  private val MAX_SLAVE_FAILURES = 2
-
   private val maxCoresOption = conf.getOption("spark.cores.max").map(_.toInt)
 
   // Maximum number of cores to acquire
@@ -484,7 +481,6 @@ private[spark] class MesosCoarseGrainedSchedulerBackend(
       cpus + totalCoresAcquired <= maxCores &&
       mem <= offerMem &&
       numExecutors() < executorLimit &&
-      slaves.get(slaveId).map(_.taskFailures).getOrElse(0) < MAX_SLAVE_FAILURES &&
Contributor commented:
Rather than just deleting this, we should replace it with a check against `scheduler.nodeBlacklist()`, the way the YarnScheduler does here:

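A minimal sketch of the suggested direction, not the PR's actual code: it assumes `nodeBlacklist` is the `Set[String]` that the driver-side `TaskSchedulerImpl.nodeBlacklist()` would return (this is what YarnSchedulerBackend consults), and `offerHostname` stands in for the hostname carried by the Mesos offer:

```scala
// Sketch only: gate offers on the driver-side blacklist instead of the
// removed per-slave failure counter. Both parameter names are illustrative.
def hostNotBlacklisted(nodeBlacklist: Set[String], offerHostname: String): Boolean =
  !nodeBlacklist.contains(offerHostname)

// canLaunchTask's condition chain would then read, roughly:
//   cpus + totalCoresAcquired <= maxCores &&
//   mem <= offerMem &&
//   numExecutors() < executorLimit &&
//   hostNotBlacklisted(scheduler.nodeBlacklist(), offerHostname) &&
//   meetsPortRequirements
```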
Contributor @skonto commented Feb 20, 2018:
@squito regarding 3: that part of the code aggressively launches tasks on matched offers, and the remainder of each offer is implicitly declined (https://mail-archives.apache.org/mod_mbox/mesos-user/201507.mbox/%[email protected]%3E).
In addition, if an offer cannot be used to launch a task at all, it is declined later on here:
https://github.com/apache/spark/blob/83c008762af444eef73d835eb6f506ecf5aebc17/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala#L440-#L444

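For reference, a minimal sketch of the explicit decline path through the Mesos driver; `declineOffer(OfferID, Filters)` is part of the `org.apache.mesos.SchedulerDriver` Java API, while the function name and the 120-second refusal window here are illustrative, not Spark's actual settings:

```scala
import org.apache.mesos.{Protos, SchedulerDriver}

// Sketch: explicitly decline an offer that cannot host a task, attaching a
// refusal filter so Mesos holds the resources back instead of re-offering
// them immediately.
def declineUnusableOffer(driver: SchedulerDriver, offer: Protos.Offer): Unit = {
  val filters = Protos.Filters.newBuilder()
    .setRefuseSeconds(120.0) // illustrative back-off before the offer returns
    .build()
  driver.declineOffer(offer.getId, filters)
}
```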
Contributor commented:

Thanks @skonto. Also, after a closer read of the code, I realized there were more explicit calls to declineOffer than I initially thought. BTW, @IgorBerman has opened an updated version of this PR at #20640 -- would appreciate a review over there.

       meetsPortRequirements
   }

@@ -540,15 +536,6 @@ private[spark] class MesosCoarseGrainedSchedulerBackend(
           totalGpusAcquired -= gpus
           gpusByTaskId -= taskId
         }
-        // If it was a failure, mark the slave as failed for blacklisting purposes
-        if (TaskState.isFailed(state)) {
-          slave.taskFailures += 1
-
-          if (slave.taskFailures >= MAX_SLAVE_FAILURES) {
-            logInfo(s"Blacklisting Mesos slave $slaveId due to too many failures; " +
-              "is Spark installed on it?")
-          }
-        }
         executorTerminated(d, slaveId, taskId, s"Executor finished with state $state")
         // In case we'd rejected everything before but have now lost a node
         d.reviveOffers()
@@ -674,6 +661,5 @@ private[spark] class MesosCoarseGrainedSchedulerBackend(
 
 private class Slave(val hostname: String) {
   val taskIDs = new mutable.HashSet[String]()
-  var taskFailures = 0
   var shuffleRegistered = false
 }