[SPARK-13704][CORE][YARN] Re-implement RackResolver to reduce resolving time #23951
Changes from all commits
TaskSchedulerImpl.scala
@@ -156,6 +156,12 @@ private[spark] class TaskSchedulerImpl(

   private[scheduler] var barrierCoordinator: RpcEndpoint = null

+  /**
+   * It can be override in different TaskScheduler, like Yarn.
+   * None by default. This should be initialized before any invocation.
+   */
+  protected val defaultRackValue: Option[String] = None
+
   private def maybeInitBarrierCoordinator(): Unit = {
     if (barrierCoordinator == null) {
       barrierCoordinator = new BarrierCoordinator(barrierSyncTimeout, sc.listenerBus,
@@ -375,9 +381,10 @@ private[spark] class TaskSchedulerImpl(
         executorIdToRunningTaskIds(o.executorId) = HashSet[Long]()
         newExecAvail = true
       }
-      for (rack <- getRackForHost(o.host)) {
-        hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
-      }
     }
+    val hosts = offers.map(_.host).toSet.toSeq
+    for ((host, Some(rack)) <- hosts.zip(getRacksForHosts(hosts))) {
+      hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += host
+    }

     // Before making any offers, remove any nodes from the blacklist whose blacklist has expired. Do
@@ -811,8 +818,38 @@ private[spark] class TaskSchedulerImpl(
     blacklistTrackerOpt.map(_.nodeBlacklist()).getOrElse(scala.collection.immutable.Set())
   }

+  // Add a on-off switch to save time for rack resolving
+  private def skipRackResolving: Boolean = sc.conf.get(LOCALITY_WAIT_RACK) == 0L
+
+  /**
+   * Get racks info for hosts. This is the internal method of [[getRacksForHosts]].
+   * It should be override in different TaskScheduler.
Contributor: Remove this line.
+   * The return racks must have to be the same length as the hosts passed in.
+   * Return [[defaultRackValue]] sequence by default.
+   */
+  protected def doGetRacksForHosts(hosts: Seq[String]): Seq[Option[String]] = {
+    hosts.map(_ => defaultRackValue)
+  }
+
-  // By default, rack is unknown
-  def getRackForHost(value: String): Option[String] = None
+  def getRackForHost(hosts: String): Option[String] = {
+    if (skipRackResolving) {
+      defaultRackValue
+    } else {
+      doGetRacksForHosts(Seq(hosts)).head
+    }
+  }
+
+  /**
+   * null in return sequences will be replaced to [[defaultRackValue]].
Contributor: Not sure what this is explaining? The code in this method doesn't seem to have any null handling.
+   */
+  def getRacksForHosts(hosts: Seq[String]): Seq[Option[String]] = {
+    if (skipRackResolving) {
+      hosts.map(_ => defaultRackValue)
+    } else {
+      doGetRacksForHosts(hosts)
+    }
+  }
+
   private def waitBackendReady(): Unit = {
     if (backend.isReady) {
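For context on how these new hooks are meant to be used: a cluster-manager-specific scheduler can override defaultRackValue and doGetRacksForHosts so that all hosts are resolved in one batched call instead of one call per host. Below is a minimal sketch of such an override built on Hadoop's DNSToSwitchMapping; the class name, the ScriptBasedMapping wiring, and the fallback rack are illustrative assumptions, not the PR's actual YARN-side code.

```scala
// Illustrative sketch only -- not the PR's actual YARN-side scheduler or resolver code.
package org.apache.spark.scheduler

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.net.{DNSToSwitchMapping, NetworkTopology, ScriptBasedMapping}

import org.apache.spark.SparkContext

private[spark] class RackAwareTaskScheduler(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  // Hosts with no topology entry fall back to Hadoop's default rack instead of None.
  override protected val defaultRackValue: Option[String] = Some(NetworkTopology.DEFAULT_RACK)

  // Built once; ScriptBasedMapping caches per-host results, so repeated lookups stay cheap.
  private val mapping: DNSToSwitchMapping = {
    val m = new ScriptBasedMapping()  // assumes the usual script-based topology plugin
    m.setConf(new Configuration())
    m
  }

  // Resolve all hosts with a single call; DNSToSwitchMapping.resolve returns one rack
  // per input host, in the same order as the input.
  override protected def doGetRacksForHosts(hosts: Seq[String]): Seq[Option[String]] = {
    val racks = mapping.resolve(hosts.toList.asJava).asScala
    hosts.indices.map(i => Option(racks(i)).orElse(defaultRackValue))
  }
}
```

With an override along these lines, the getRacksForHosts call in the resourceOffers hunk above becomes a single batched resolution per scheduling round instead of one lookup per offer.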
TaskSetManager.scala
@@ -186,8 +186,23 @@ private[spark] class TaskSetManager(

   // Add all our tasks to the pending lists. We do this in reverse order
   // of task index so that tasks with low indices get launched first.
-  for (i <- (0 until numTasks).reverse) {
-    addPendingTask(i)
+  addPendingTasks()
+
+  private def addPendingTasks(): Unit = {
+    val (_, duration) = Utils.timeTakenMs {
+      val hostToIndices = new HashMap[String, ArrayBuffer[Int]]()
+      for (i <- (0 until numTasks).reverse) {
+        addPendingTask(i, initializing = true, Some(hostToIndices))
+      }
+      // Resolve the rack for each host. This can be slow, so de-dupe the list of hosts,
+      // and assign the rack to all relevant task indices.
+      for (
+        (Some(rack), indices) <- sched.getRacksForHosts(hostToIndices.keySet.toSeq)
+          .zip(hostToIndices.values)) {
+        pendingTasksForRack.getOrElseUpdate(rack, new ArrayBuffer) ++= indices
+      }
+    }
+    logDebug(s"Adding pending tasks take $duration ms")
Contributor: s/take/took
+  }

   /**
@@ -214,7 +229,10 @@ private[spark] class TaskSetManager(
   private[scheduler] var emittedTaskSizeWarning = false

   /** Add a task to all the pending-task lists that it should be on. */
-  private[spark] def addPendingTask(index: Int) {
+  private[spark] def addPendingTask(
+      index: Int,
+      initializing: Boolean = false,
Contributor: I'm not sure I like passing these arguments to this function. Could this be solved with a [...]? It seems that [...]

Contributor: In case it was unclear, that was exactly my suggestion earlier as well. Sorry for any confusion.
+      initializingMap: Option[HashMap[String, ArrayBuffer[Int]]] = None) {
     for (loc <- tasks(index).preferredLocations) {
       loc match {
         case e: ExecutorCacheTaskLocation =>
@@ -234,8 +252,14 @@ private[spark] class TaskSetManager(
         case _ =>
       }
       pendingTasksForHost.getOrElseUpdate(loc.host, new ArrayBuffer) += index
-      for (rack <- sched.getRackForHost(loc.host)) {
-        pendingTasksForRack.getOrElseUpdate(rack, new ArrayBuffer) += index
+
+      if (initializing) {
+        // preferredLocation -> task indices, initializingMap used when TaskSetManager initializing
+        initializingMap.foreach(_.getOrElseUpdate(loc.host, new ArrayBuffer) += index)
+      } else {
+        for (rack <- sched.getRackForHost(loc.host)) {
+          pendingTasksForRack.getOrElseUpdate(rack, new ArrayBuffer) += index
+        }
       }
     }
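The "de-dupe the list of hosts" comment in addPendingTasks above is the core of the TaskSetManager-side optimization. Here is a tiny self-contained sketch of that pattern; the host names, task indices, and the resolveRacks stand-in are all invented for illustration.

```scala
import scala.collection.mutable.{ArrayBuffer, HashMap}

object RackBatchingSketch {
  def main(args: Array[String]): Unit = {
    // Pretend tasks 0..5 prefer these hosts (duplicates on purpose).
    val taskHosts = Seq("host1", "host2", "host1", "host3", "host2", "host1")

    // Step 1: group task indices by preferred host, like hostToIndices in the diff above.
    val hostToIndices = new HashMap[String, ArrayBuffer[Int]]()
    taskHosts.zipWithIndex.foreach { case (host, i) =>
      hostToIndices.getOrElseUpdate(host, new ArrayBuffer[Int]()) += i
    }

    // Stand-in for TaskSchedulerImpl.getRacksForHosts: one batched lookup, results in input order.
    def resolveRacks(hosts: Seq[String]): Seq[Option[String]] =
      hosts.map(h => Some("/rack-" + h.last))

    // Step 2: resolve only the de-duplicated hosts (3 lookups instead of 6), then fan the
    // resolved rack back out to every task index that preferred that host.
    val hosts = hostToIndices.keys.toSeq
    val pendingTasksForRack = new HashMap[String, ArrayBuffer[Int]]()
    for ((Some(rack), indices) <- resolveRacks(hosts).zip(hosts.map(h => hostToIndices(h)))) {
      pendingTasksForRack.getOrElseUpdate(rack, new ArrayBuffer[Int]()) ++= indices
    }
    // e.g. HashMap(/rack-1 -> ArrayBuffer(0, 2, 5), /rack-2 -> ArrayBuffer(1, 4), /rack-3 -> ArrayBuffer(3))
    println(pendingTasksForRack)
  }
}
```

Grouping first bounds the number of rack lookups by the number of distinct hosts rather than the number of tasks, which is what makes large task sets cheap to initialize.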
@@ -249,24 +273,27 @@ private[spark] class TaskSetManager(
   /**
    * Return the pending tasks list for a given executor ID, or an empty list if
    * there is no map entry for that host
+   * This is visible for testing.
    */
-  private def getPendingTasksForExecutor(executorId: String): ArrayBuffer[Int] = {
+  private[scheduler] def getPendingTasksForExecutor(executorId: String): ArrayBuffer[Int] = {
     pendingTasksForExecutor.getOrElse(executorId, ArrayBuffer())
   }

   /**
    * Return the pending tasks list for a given host, or an empty list if
    * there is no map entry for that host
+   * This is visible for testing.
    */
-  private def getPendingTasksForHost(host: String): ArrayBuffer[Int] = {
+  private[scheduler] def getPendingTasksForHost(host: String): ArrayBuffer[Int] = {
     pendingTasksForHost.getOrElse(host, ArrayBuffer())
   }

   /**
    * Return the pending rack-local task list for a given rack, or an empty list if
    * there is no map entry for that rack
+   * This is visible for testing.
    */
-  private def getPendingTasksForRack(rack: String): ArrayBuffer[Int] = {
+  private[scheduler] def getPendingTasksForRack(rack: String): ArrayBuffer[Int] = {
     pendingTasksForRack.getOrElse(rack, ArrayBuffer())
   }
@@ -331,7 +358,7 @@ private[spark] class TaskSetManager(
         val executors = prefs.flatMap(_ match {
           case e: ExecutorCacheTaskLocation => Some(e.executorId)
           case _ => None
-        });
+        })
         if (executors.contains(execId)) {
           speculatableTasks -= index
           return Some((index, TaskLocality.PROCESS_LOCAL))
Review comment: I was just reviewing another patch related to delay scheduling, and I realized this optimization is a bit too aggressive. That configuration only controls whether you wait for a resource that is rack-local. Even when the wait is 0, Spark still tries to find a rack-local task for a given resource offer; it will just schedule a non-rack-local task if it can't find a rack-local one. But it won't be able to do that if it doesn't know what racks the resource offers are on.

So I think you either need to:

a) change this to use a new conf, with an extra check that you only turn off rack resolution entirely if it is also true that sc.conf.get(LOCALITY_WAIT_RACK) == 0L

b) is this optimization even needed, considering how much time the rest of this change should save? Maybe we should still always do the rack resolution, since it should be pretty fast after the rest of your change.
Reply: I knew that. Separated deployment of compute and storage is more and more popular in industry. If someone sets LOCALITY_WAIT to 0 on purpose, most of the time they do not need data locality at all (especially at the rack level). Rack resolving and the locality algorithm still spend time in this separated-deployment case. Could we open a new ticket to address this?
Reply: @squito #24175 also faces the same situation.
Reply: Thanks for pointing out the other PR that is similar -- yes, I agree. I can see cases where you would want to skip rack resolution entirely for the "disaggregated" clusters you are talking about. I'll comment on the other ticket as well. Would you like to follow up with that?
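To make option (a) from the review thread above concrete, here is one possible (hypothetical) shape of the gate; spark.locality.rackResolution.enabled is an invented key, not a real Spark configuration.

```scala
import org.apache.spark.SparkConf

// Hypothetical sketch of option (a): skip rack resolution only when an explicit opt-out
// conf is set AND rack-level delay scheduling is already disabled (rack locality wait of 0).
// "spark.locality.rackResolution.enabled" is an invented key, not a real Spark conf.
object RackResolutionGate {
  def skipRackResolving(conf: SparkConf): Boolean = {
    val rackResolutionDisabled =
      !conf.getBoolean("spark.locality.rackResolution.enabled", true)
    // "spark.locality.wait.rack" falls back to "spark.locality.wait" (default 3s), matching
    // the LOCALITY_WAIT_RACK config entry used in the diff above.
    val rackWaitIsZero =
      conf.getTimeAsMs("spark.locality.wait.rack", conf.get("spark.locality.wait", "3s")) == 0L
    rackResolutionDisabled && rackWaitIsZero
  }
}
```

Option (b) would instead drop the shortcut entirely and always call doGetRacksForHosts, relying on the batched resolution introduced in this PR to keep the cost low.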