Bugfixes/improvements to scheduler #159
Closed
Changes from all commits (5 commits):
- fa5d9f1 (mridulm): Bugfixes/improvements to scheduler : PR #517
- 270d841 (mridulm): Address review comments
- 9bda70e (mridulm): Address review comments, always add to failedExecutors
- 167fad8 (mridulm): Address review comments
- 5ff59c2 (mridulm): Change property in suite also
Diff view
@@ -59,6 +59,15 @@ private[spark] class TaskSetManager(
   // CPUs to request per task
   val CPUS_PER_TASK = conf.getInt("spark.task.cpus", 1)
 
+  /*
+   * Sometimes if an executor is dead or in an otherwise invalid state, the driver
+   * does not realize right away leading to repeated task failures. If enabled,
+   * this temporarily prevents a task from re-launching on an executor where
+   * it just failed.
+   */
+  private val EXECUTOR_TASK_BLACKLIST_TIMEOUT =
+    conf.getLong("spark.scheduler.executorTaskBlacklistTime", 0L)
+
   // Quantile of tasks at which to start speculation
   val SPECULATION_QUANTILE = conf.getDouble("spark.speculation.quantile", 0.75)
   val SPECULATION_MULTIPLIER = conf.getDouble("spark.speculation.multiplier", 1.5)
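For context: the feature is off by default (0L) and is enabled by setting the property to a positive number of milliseconds. A minimal sketch of a driver that would opt in; the app name and the 5000 ms value are illustrative, not taken from this PR:

    import org.apache.spark.{SparkConf, SparkContext}

    // Hypothetical opt-in: a failed (task, executor) pair is skipped on that
    // executor for 5 seconds. The default of 0 keeps the old behavior.
    val conf = new SparkConf()
      .setAppName("blacklist-demo") // illustrative name
      .set("spark.scheduler.executorTaskBlacklistTime", "5000")
    val sc = new SparkContext(conf)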
@@ -71,7 +80,9 @@ private[spark] class TaskSetManager(
   val numTasks = tasks.length
   val copiesRunning = new Array[Int](numTasks)
   val successful = new Array[Boolean](numTasks)
-  val numFailures = new Array[Int](numTasks)
+  private val numFailures = new Array[Int](numTasks)
+  // key is taskId, value is a Map of executor id to when it failed
+  private val failedExecutors = new HashMap[Int, HashMap[String, Long]]()
   val taskAttempts = Array.fill[List[TaskInfo]](numTasks)(Nil)
   var tasksSuccessful = 0
@@ -228,12 +239,18 @@ private[spark] class TaskSetManager(
    * This method also cleans up any tasks in the list that have already
    * been launched, since we want that to happen lazily.
    */
-  private def findTaskFromList(list: ArrayBuffer[Int]): Option[Int] = {
-    while (!list.isEmpty) {
-      val index = list.last
-      list.trimEnd(1)
-      if (copiesRunning(index) == 0 && !successful(index)) {
-        return Some(index)
+  private def findTaskFromList(execId: String, list: ArrayBuffer[Int]): Option[Int] = {
+    var indexOffset = list.size
+
+    while (indexOffset > 0) {
+      indexOffset -= 1
+      val index = list(indexOffset)
+      if (!executorIsBlacklisted(execId, index)) {
+        // This should almost always be list.trimEnd(1) to remove tail
+        list.remove(indexOffset)
+        if (copiesRunning(index) == 0 && !successful(index)) {
+          return Some(index)
+        }
       }
     }
     None
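To see what the rewritten scan does, here is a self-contained simplification (the copiesRunning/successful checks are dropped and the blacklist is passed in as a function, so this is a sketch, not the PR's code):

    import scala.collection.mutable.ArrayBuffer

    // Scan from the tail, skip indices blacklisted for this executor, and
    // remove only the entry actually taken, leaving skipped ones for others.
    def findFromList(execId: String, list: ArrayBuffer[Int],
        blacklisted: (String, Int) => Boolean): Option[Int] = {
      var offset = list.size
      while (offset > 0) {
        offset -= 1
        val index = list(offset)
        if (!blacklisted(execId, index)) {
          list.remove(offset) // almost always the tail, as the PR comment notes
          return Some(index)
        }
      }
      None
    }

    val pending = ArrayBuffer(0, 1, 2)
    val isBad = (e: String, i: Int) => e == "exec-1" && i == 2 // task 2 just failed there
    println(findFromList("exec-1", pending, isBad)) // Some(1): task 2 is skipped
    println(pending)                                // ArrayBuffer(0, 2): task 2 stays pending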
@@ -244,6 +261,21 @@ private[spark] class TaskSetManager(
     taskAttempts(taskIndex).exists(_.host == host)
   }
 
+  /**
+   * Is this re-execution of a failed task on an executor it already failed in before
+   * EXECUTOR_TASK_BLACKLIST_TIMEOUT has elapsed ?
+   */
+  private def executorIsBlacklisted(execId: String, taskId: Int): Boolean = {
+    if (failedExecutors.contains(taskId)) {
+      val failed = failedExecutors.get(taskId).get
+
+      return failed.contains(execId) &&
+        clock.getTime() - failed.get(execId).get < EXECUTOR_TASK_BLACKLIST_TIMEOUT
+    }
+
+    false
+  }
+
   /**
    * Return a speculative task for a given executor if any are available. The task should not have
    * an attempt running on this host, in case the host is slow. In addition, the task should meet
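The predicate reads: a (task, executor) pair stays blacklisted while the recorded failure is younger than the timeout. Here is the same logic over a plain map, with the clock replaced by an explicit now argument so it is easy to test (all values are made up):

    import scala.collection.mutable.HashMap

    val failedExecutors = new HashMap[Int, HashMap[String, Long]]()
    failedExecutors.getOrElseUpdate(7, new HashMap[String, Long]()).put("exec-1", 1000L)

    def isBlacklisted(taskId: Int, execId: String, now: Long, timeout: Long): Boolean =
      failedExecutors.get(taskId).exists { failed =>
        failed.get(execId).exists(failedAt => now - failedAt < timeout)
      }

    println(isBlacklisted(7, "exec-1", now = 3000L, timeout = 5000L)) // true: 2000 ms < 5000 ms
    println(isBlacklisted(7, "exec-1", now = 9000L, timeout = 5000L)) // false: entry has expired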
@@ -254,10 +286,13 @@ private[spark] class TaskSetManager(
   {
     speculatableTasks.retain(index => !successful(index)) // Remove finished tasks from set
 
+    def canRunOnHost(index: Int): Boolean =
+      !hasAttemptOnHost(index, host) && !executorIsBlacklisted(execId, index)
+
     if (!speculatableTasks.isEmpty) {
       // Check for process-local or preference-less tasks; note that tasks can be process-local
       // on multiple nodes when we replicate cached blocks, as in Spark Streaming
-      for (index <- speculatableTasks if !hasAttemptOnHost(index, host)) {
+      for (index <- speculatableTasks if canRunOnHost(index)) {
         val prefs = tasks(index).preferredLocations
         val executors = prefs.flatMap(_.executorId)
         if (prefs.size == 0 || executors.contains(execId)) {
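All four speculative-task loops below swap the bare hasAttemptOnHost filter for this combined predicate. A toy illustration of the composition (all names and data are made up):

    // Task 3 already has an attempt on host-A; task 4 recently failed on exec-1.
    val hasAttemptOnHost = Set((3, "host-A"))
    val blacklisted = Set(("exec-1", 4))

    def canRunOnHost(index: Int, host: String, execId: String): Boolean =
      !hasAttemptOnHost((index, host)) && !blacklisted((execId, index))

    println((3 to 5).filter(canRunOnHost(_, "host-A", "exec-1"))) // Vector(5)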
@@ -268,7 +303,7 @@ private[spark] class TaskSetManager(
 
     // Check for node-local tasks
     if (TaskLocality.isAllowed(locality, TaskLocality.NODE_LOCAL)) {
-      for (index <- speculatableTasks if !hasAttemptOnHost(index, host)) {
+      for (index <- speculatableTasks if canRunOnHost(index)) {
         val locations = tasks(index).preferredLocations.map(_.host)
         if (locations.contains(host)) {
           speculatableTasks -= index
@@ -280,7 +315,7 @@ private[spark] class TaskSetManager(
     // Check for rack-local tasks
     if (TaskLocality.isAllowed(locality, TaskLocality.RACK_LOCAL)) {
       for (rack <- sched.getRackForHost(host)) {
-        for (index <- speculatableTasks if !hasAttemptOnHost(index, host)) {
+        for (index <- speculatableTasks if canRunOnHost(index)) {
           val racks = tasks(index).preferredLocations.map(_.host).map(sched.getRackForHost)
           if (racks.contains(rack)) {
             speculatableTasks -= index
@@ -292,7 +327,7 @@ private[spark] class TaskSetManager(
 
     // Check for non-local tasks
     if (TaskLocality.isAllowed(locality, TaskLocality.ANY)) {
-      for (index <- speculatableTasks if !hasAttemptOnHost(index, host)) {
+      for (index <- speculatableTasks if canRunOnHost(index)) {
         speculatableTasks -= index
         return Some((index, TaskLocality.ANY))
       }
@@ -309,32 +344,32 @@ private[spark] class TaskSetManager(
   private def findTask(execId: String, host: String, locality: TaskLocality.Value)
     : Option[(Int, TaskLocality.Value)] =
   {
-    for (index <- findTaskFromList(getPendingTasksForExecutor(execId))) {
+    for (index <- findTaskFromList(execId, getPendingTasksForExecutor(execId))) {
       return Some((index, TaskLocality.PROCESS_LOCAL))
     }
 
     if (TaskLocality.isAllowed(locality, TaskLocality.NODE_LOCAL)) {
-      for (index <- findTaskFromList(getPendingTasksForHost(host))) {
+      for (index <- findTaskFromList(execId, getPendingTasksForHost(host))) {
         return Some((index, TaskLocality.NODE_LOCAL))
       }
     }
 
     if (TaskLocality.isAllowed(locality, TaskLocality.RACK_LOCAL)) {
       for {
         rack <- sched.getRackForHost(host)
-        index <- findTaskFromList(getPendingTasksForRack(rack))
+        index <- findTaskFromList(execId, getPendingTasksForRack(rack))
       } {
         return Some((index, TaskLocality.RACK_LOCAL))
       }
     }
 
     // Look for no-pref tasks after rack-local tasks since they can run anywhere.
-    for (index <- findTaskFromList(pendingTasksWithNoPrefs)) {
+    for (index <- findTaskFromList(execId, pendingTasksWithNoPrefs)) {
       return Some((index, TaskLocality.PROCESS_LOCAL))
     }
 
     if (TaskLocality.isAllowed(locality, TaskLocality.ANY)) {
-      for (index <- findTaskFromList(allPendingTasks)) {
+      for (index <- findTaskFromList(execId, allPendingTasks)) {
         return Some((index, TaskLocality.ANY))
       }
     }
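findTask now threads execId through every pending-list lookup, so the blacklist is honored at each locality level in the same fallback order as before. A condensed sketch of that cascade (rack handling and the locality-allowed checks are omitted, and the pending-list getters become parameters, so this is a reading aid rather than the PR's code):

    import scala.collection.mutable.ArrayBuffer

    // First hit wins: executor-local, then host-local, then no-pref, then any.
    def findTaskSketch(
        execId: String,
        find: (String, ArrayBuffer[Int]) => Option[Int], // stand-in for findTaskFromList
        forExecutor: ArrayBuffer[Int],
        forHost: ArrayBuffer[Int],
        noPrefs: ArrayBuffer[Int],
        all: ArrayBuffer[Int]): Option[(Int, String)] = {
      find(execId, forExecutor).map(i => (i, "PROCESS_LOCAL"))
        .orElse(find(execId, forHost).map(i => (i, "NODE_LOCAL")))
        .orElse(find(execId, noPrefs).map(i => (i, "PROCESS_LOCAL")))
        .orElse(find(execId, all).map(i => (i, "ANY")))
    }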
@@ -460,6 +495,7 @@ private[spark] class TaskSetManager(
       logInfo("Ignorning task-finished event for TID " + tid + " because task " +
         index + " has already completed successfully")
     }
+    failedExecutors.remove(index)
     maybeFinishTaskSet()
   }
@@ -480,17 +516,19 @@ private[spark] class TaskSetManager(
       logWarning("Lost TID %s (task %s:%d)".format(tid, taskSet.id, index))
     }
     var taskMetrics : TaskMetrics = null
-    var failureReason = "unknown"
+    var failureReason: String = null
     reason match {
       case fetchFailed: FetchFailed =>
         logWarning("Loss was due to fetch failure from " + fetchFailed.bmAddress)
         if (!successful(index)) {
           successful(index) = true
           tasksSuccessful += 1
         }
+        // Not adding to failed executors for FetchFailed.
         isZombie = true
 
       case TaskKilled =>
+        // Not adding to failed executors for TaskKilled.
         logWarning("Task %d was killed.".format(tid))
 
       case ef: ExceptionFailure =>
@@ -504,7 +542,8 @@ private[spark] class TaskSetManager(
           return
         }
         val key = ef.description
-        failureReason = "Exception failure: %s".format(ef.description)
+        failureReason = "Exception failure in TID %s on host %s: %s".format(
+          tid, info.host, ef.description)
         val now = clock.getTime()
         val (printFull, dupCount) = {
           if (recentExceptions.contains(key)) {
@@ -533,11 +572,16 @@ private[spark] class TaskSetManager(
         failureReason = "Lost result for TID %s on host %s".format(tid, info.host)
         logWarning(failureReason)
 
-      case _ => {}
+      case _ =>
+        failureReason = "TID %s on host %s failed for unknown reason".format(tid, info.host)
     }
+    // always add to failed executors
+    failedExecutors.getOrElseUpdate(index, new HashMap[String, Long]()).
+      put(info.executorId, clock.getTime())
     sched.dagScheduler.taskEnded(tasks(index), reason, null, null, info, taskMetrics)
     addPendingTask(index)
     if (!isZombie && state != TaskState.KILLED) {
+      assert (null != failureReason)
      numFailures(index) += 1
      if (numFailures(index) >= maxTaskFailures) {
        logError("Task %s:%d failed %d times; aborting job".format(

Review thread on the new default case:

Contributor: Can you move this to line 519 instead (or, if you prefer, remove the default setting on line 519, which now won't ever be used)?

Contributor (author, mridulm): Will make default null.
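Together with the failedExecutors.remove(index) added to the success path earlier, the bookkeeping lifecycle is: record the executor on every failure, drop the whole entry once the task finally succeeds. A standalone sketch of that lifecycle (timestamps are illustrative):

    import scala.collection.mutable.HashMap

    val failedExecutors = new HashMap[Int, HashMap[String, Long]]()

    def recordFailure(taskIndex: Int, execId: String, now: Long): Unit =
      // Same pattern as the PR: create the inner map on first failure.
      failedExecutors.getOrElseUpdate(taskIndex, new HashMap[String, Long]())
        .put(execId, now)

    def recordSuccess(taskIndex: Int): Unit =
      failedExecutors.remove(taskIndex) // nothing to avoid once the task succeeded

    recordFailure(2, "exec-1", 1000L)
    recordFailure(2, "exec-3", 2500L)
    println(failedExecutors(2).keySet)   // Set(exec-1, exec-3) (order may vary)
    recordSuccess(2)
    println(failedExecutors.contains(2)) // false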
Contributor (on executorIsBlacklisted): You can just write this as failedExecutors(taskId).
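The suggestion applies inside executorIsBlacklisted above: within the failedExecutors.contains(taskId) branch, Map.apply cannot throw, so get(taskId).get can be shortened. A small sketch contrasting the two forms:

    import scala.collection.mutable.HashMap

    val failedExecutors = new HashMap[Int, HashMap[String, Long]]()
    failedExecutors(7) = HashMap("exec-1" -> 1000L)

    val taskId = 7
    if (failedExecutors.contains(taskId)) {
      val verbose = failedExecutors.get(taskId).get // as written in the PR
      val concise = failedExecutors(taskId)         // the reviewer's suggestion
      assert(verbose eq concise)                    // same inner map either way
    }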