[SPARK-13704][CORE][YARN] Reduce rack resolution time #24245
Changes from all commits: the hunk below rewrites `SparkRackResolver` (package `org.apache.spark.deploy.yarn`) from a thin wrapper around YARN's `RackResolver` into a batched re-implementation.

```diff
@@ -17,24 +17,100 @@
 package org.apache.spark.deploy.yarn
 
+import scala.collection.JavaConverters._
+import scala.collection.mutable.ArrayBuffer
+
+import com.google.common.base.Strings
 import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.CommonConfigurationKeysPublic
+import org.apache.hadoop.net._
+import org.apache.hadoop.util.ReflectionUtils
 import org.apache.hadoop.yarn.util.RackResolver
 import org.apache.log4j.{Level, Logger}
 
+import org.apache.spark.internal.Logging
+
 /**
- * Wrapper around YARN's [[RackResolver]]. This allows Spark tests to easily override the
- * default behavior, since YARN's class self-initializes the first time it's called, and
- * future calls all use the initial configuration.
+ * Re-implement YARN's [[RackResolver]] for hadoop releases without YARN-9332.
+ * This also allows Spark tests to easily override the default behavior, since YARN's class
+ * self-initializes the first time it's called, and future calls all use the initial configuration.
  */
-private[yarn] class SparkRackResolver {
+private[spark] class SparkRackResolver(conf: Configuration) extends Logging {
 
   // RackResolver logs an INFO message whenever it resolves a rack, which is way too often.
   if (Logger.getLogger(classOf[RackResolver]).getLevel == null) {
     Logger.getLogger(classOf[RackResolver]).setLevel(Level.WARN)
   }
 
-  def resolve(conf: Configuration, hostName: String): String = {
-    RackResolver.resolve(conf, hostName).getNetworkLocation()
+  private val dnsToSwitchMapping: DNSToSwitchMapping = {
+    val dnsToSwitchMappingClass =
+      conf.getClass(CommonConfigurationKeysPublic.NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY,
+        classOf[ScriptBasedMapping], classOf[DNSToSwitchMapping])
+    ReflectionUtils.newInstance(dnsToSwitchMappingClass, conf)
+      .asInstanceOf[DNSToSwitchMapping] match {
+      case c: CachedDNSToSwitchMapping => c
+      case o => new CachedDNSToSwitchMapping(o)
+    }
+  }
+
+  def resolve(hostName: String): String = {
+    coreResolve(Seq(hostName)).head.getNetworkLocation
   }
+
+  /**
+   * Added in SPARK-13704.
+   * This should be changed to `RackResolver.resolve(conf, hostNames)`
+   * in hadoop releases with YARN-9332.
+   */
+  def resolve(hostNames: Seq[String]): Seq[Node] = {
+    coreResolve(hostNames)
+  }
+
+  private def coreResolve(hostNames: Seq[String]): Seq[Node] = {
+    val nodes = new ArrayBuffer[Node]
+    // dnsToSwitchMapping is thread-safe
+    val rNameList = dnsToSwitchMapping.resolve(hostNames.toList.asJava).asScala
+    if (rNameList == null || rNameList.isEmpty) {
+      hostNames.foreach(nodes += new NodeBase(_, NetworkTopology.DEFAULT_RACK))
+      logInfo(s"Got an error when resolving hostNames. " +
+        s"Falling back to ${NetworkTopology.DEFAULT_RACK} for all")
+    } else {
+      for ((hostName, rName) <- hostNames.zip(rNameList)) {
+        if (Strings.isNullOrEmpty(rName)) {
+          nodes += new NodeBase(hostName, NetworkTopology.DEFAULT_RACK)
+          logDebug(s"Could not resolve $hostName. " +
+            s"Falling back to ${NetworkTopology.DEFAULT_RACK}")
+        } else {
+          nodes += new NodeBase(hostName, rName)
+        }
+      }
+    }
+    nodes.toList
+  }
 }
+
+/**
+ * Utility to resolve the rack for hosts in an efficient manner.
+ * It will cache the rack for individual hosts to avoid
+ * repeatedly performing the same expensive lookup.
+ */
+object SparkRackResolver extends Logging {
+  @volatile private var instance: SparkRackResolver = _
+
+  /**
+   * It will return the static resolver instance. If there is already an instance, the passed
+   * conf is entirely ignored. If there is not a shared instance, it will create one with the
+   * given conf.
```
Review thread on the `get(conf)` scaladoc:

Contributor: Should we explain how to instantiate a separate resolver with a separate config here?

Author: It's kinda obvious, no?
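For reference, a minimal sketch of what instantiating a separate resolver with its own config could look like. The topology-script path and host name are made up, and since the class is `private[spark]` this would have to live under `org.apache.spark`:

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.CommonConfigurationKeysPublic

import org.apache.spark.deploy.yarn.SparkRackResolver

// Hypothetical: bypass the shared instance returned by SparkRackResolver.get(conf)
// and build a resolver against a different topology configuration.
val otherConf = new Configuration()
otherConf.set(CommonConfigurationKeysPublic.NET_TOPOLOGY_SCRIPT_FILE_NAME_KEY,
  "/etc/hadoop/other-topology.sh")  // made-up script path

val separateResolver = new SparkRackResolver(otherConf)

// Single-host lookup; falls back to NetworkTopology.DEFAULT_RACK ("/default-rack")
// when the mapping cannot resolve the host.
val rack: String = separateResolver.resolve("some-host.example.com")
```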
The hunk continues below the comment:

```diff
+   */
+  def get(conf: Configuration): SparkRackResolver = {
+    if (instance == null) {
+      synchronized {
+        if (instance == null) {
+          instance = new SparkRackResolver(conf)
+        }
+      }
+    }
+    instance
+  }
+
+}
```
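And a hedged sketch of how the new batched entry point would typically be used. The host names are illustrative, and the actual call sites (e.g. in the YARN scheduler backend) are not part of this hunk:

```scala
import org.apache.hadoop.conf.Configuration

import org.apache.spark.deploy.yarn.SparkRackResolver

// Illustrative host names; one batched call replaces a per-host lookup loop.
val hosts = Seq("host-1", "host-2", "host-3")

val resolver = SparkRackResolver.get(new Configuration())  // shared, lazily created instance
val racks: Seq[String] = resolver.resolve(hosts).map(_.getNetworkLocation)
// e.g. Seq("/rack-a", "/rack-a", "/default-rack"); unresolved hosts get DEFAULT_RACK
```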
Review comment:

As addPendingTasks() is called during the construction of a TaskSetManager instance (and that is the only place where multiple hosts can be passed to getRacksForHosts()), I did not get why numBatchInvocation === 1 is so important that it is emphasised by this assert. I assume we are thinking about potential future code that would use getRacksForHosts() (since it is an expensive call), am I right?
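For what it's worth, the kind of test hook such an assert presumably relies on could look roughly like this. This is a sketch only: `CountingRackResolver` and the fixed rack name are invented here, and the actual test code is not shown in this hunk:

```scala
package org.apache.spark.deploy.yarn  // must live under org.apache.spark to extend the private[spark] class

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.net.{Node, NodeBase}

// Hypothetical test-only resolver: counts batched resolutions so a test can
// assert that constructing a TaskSetManager resolves all pending hosts in a
// single call rather than once per host. Only the batched entry point is faked;
// the single-host resolve(hostName) still goes through the real coreResolve.
class CountingRackResolver(conf: Configuration) extends SparkRackResolver(conf) {
  var numBatchInvocation = 0

  override def resolve(hostNames: Seq[String]): Seq[Node] = {
    numBatchInvocation += 1
    hostNames.map(new NodeBase(_, "/rack-under-test"))  // fixed rack, no real lookup
  }
}
```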