This repository was archived by the owner on Jan 9, 2020. It is now read-only.
forked from apache/spark
Use node affinity to launch executors on preferred nodes benefitting from data locality #316
Merged: ash211 merged 12 commits into apache-spark-on-k8s:branch-2.1-kubernetes from kimoonkim:locality-node-affinity on Jun 14, 2017.
Commits (12), showing changes from all commits:
343e677  Use node affinity to launch executors on data local nodes (kimoonkim)
beb3786  Fix comment style (kimoonkim)
c22fce0  Use JSON object mapper (kimoonkim)
89e81b6  Merge branch 'branch-2.1-kubernetes' into locality-node-affinity (kimoonkim)
74755c5  Address review comments (kimoonkim)
f0fe6f6  Fix a style issue (kimoonkim)
a409581  Merge branch 'branch-2.1-kubernetes' into locality-node-affinity (ash211)
b066b45  Clean up and add a TODO (kimoonkim)
ea58c11  Merge branch 'branch-2.1-kubernetes' into locality-node-affinity (kimoonkim)
6e92842  Merge remote-tracking branch 'origin/locality-node-affinity' into loc… (kimoonkim)
c67b8ab  Fix style issue (kimoonkim)
7eb6365  Address review comments (kimoonkim)
Diff for KubernetesClusterSchedulerBackend.scala:

@@ -17,9 +17,12 @@
 package org.apache.spark.scheduler.cluster.kubernetes
 
 import java.io.Closeable
+import java.net.InetAddress
 import java.util.concurrent.TimeUnit
 import java.util.concurrent.atomic.{AtomicInteger, AtomicLong, AtomicReference}
 
+import com.fasterxml.jackson.databind.ObjectMapper
+import com.fasterxml.jackson.module.scala.DefaultScalaModule
 import io.fabric8.kubernetes.api.model.{ContainerPortBuilder, EnvVarBuilder, EnvVarSourceBuilder, Pod, PodBuilder, QuantityBuilder}
 import io.fabric8.kubernetes.client.{KubernetesClient, KubernetesClientException, Watcher}
 import io.fabric8.kubernetes.client.Watcher.Action
@@ -177,16 +180,18 @@ private[spark] class KubernetesClusterSchedulerBackend(
     .newDaemonSingleThreadScheduledExecutor("kubernetes-pod-allocator")
 
   private val allocatorRunnable: Runnable = new Runnable {
     override def run(): Unit = {
       if (totalRegisteredExecutors.get() < runningExecutorPods.size) {
         logDebug("Waiting for pending executors before scaling")
       } else if (totalExpectedExecutors.get() <= runningExecutorPods.size) {
         logDebug("Maximum allowed executor limit reached. Not scaling up further.")
       } else {
+        val nodeToLocalTaskCount = getNodesWithLocalTaskCounts
         RUNNING_EXECUTOR_PODS_LOCK.synchronized {
           for (i <- 0 until math.min(
             totalExpectedExecutors.get - runningExecutorPods.size, podAllocationSize)) {
-            runningExecutorPods += allocateNewExecutorPod()
+            runningExecutorPods += allocateNewExecutorPod(nodeToLocalTaskCount)
             logInfo(
               s"Requesting a new executor, total executors is now ${runningExecutorPods.size}")
           }
@@ -195,6 +200,8 @@ private[spark] class KubernetesClusterSchedulerBackend(
     }
   }
 
+  private val objectMapper = new ObjectMapper().registerModule(DefaultScalaModule)
+
   private def getShuffleClient(): KubernetesExternalShuffleClient = {
     new KubernetesExternalShuffleClient(
       SparkTransportConf.fromSparkConf(conf, "shuffle"),
@@ -283,7 +290,70 @@
     }
   }
 
-  private def allocateNewExecutorPod(): (String, Pod) = {
+  /**
+   * @return A map of K8s cluster nodes to the number of tasks that could benefit from data
+   *         locality if an executor launches on the cluster node.
+   */
+  private def getNodesWithLocalTaskCounts() : Map[String, Int] = {
+    val executorPodsWithIPs = EXECUTOR_PODS_BY_IPS_LOCK.synchronized {
+      executorPodsByIPs.values.toList // toList makes a defensive copy.
+    }
+    val nodeToLocalTaskCount = mutable.Map[String, Int]() ++
+      KubernetesClusterSchedulerBackend.this.synchronized {
+        hostToLocalTaskCount
+      }
+    for (pod <- executorPodsWithIPs) {
+      // Remove cluster nodes that are running our executors already.
+      // TODO: This prefers spreading out executors across nodes. In case users want
+      // consolidating executors on fewer nodes, introduce a flag. See the spark.deploy.spreadOut
+      // flag that Spark standalone has: https://spark.apache.org/docs/latest/spark-standalone.html
+      nodeToLocalTaskCount.remove(pod.getSpec.getNodeName).nonEmpty ||
+        nodeToLocalTaskCount.remove(pod.getStatus.getHostIP).nonEmpty ||
+        nodeToLocalTaskCount.remove(
+          InetAddress.getByName(pod.getStatus.getHostIP).getCanonicalHostName).nonEmpty
+    }
+    nodeToLocalTaskCount.toMap[String, Int]
+  }
+
+  private def addNodeAffinityAnnotationIfUseful(basePodBuilder: PodBuilder,
+      nodeToTaskCount: Map[String, Int]): PodBuilder = {
+    def scaleToRange(value: Int, baseMin: Double, baseMax: Double,
+        rangeMin: Double, rangeMax: Double): Int =
+      (((rangeMax - rangeMin) * (value - baseMin) / (baseMax - baseMin)) + rangeMin).toInt
+
+    if (nodeToTaskCount.nonEmpty) {
+      val taskTotal = nodeToTaskCount.foldLeft(0)(_ + _._2)
+      // Normalize to node affinity weights in 1 to 100 range.
+      val nodeToWeight = nodeToTaskCount.map{
+        case (node, taskCount) =>
+          (node, scaleToRange(taskCount, 1, taskTotal, rangeMin = 1, rangeMax = 100))}
+      val weightToNodes = nodeToWeight.groupBy(_._2).mapValues(_.keys)
+      // @see https://kubernetes.io/docs/concepts/configuration/assign-pod-node
+      val nodeAffinityJson = objectMapper.writeValueAsString(SchedulerAffinity(NodeAffinity(
+        preferredDuringSchedulingIgnoredDuringExecution =
+          for ((weight, nodes) <- weightToNodes) yield
+            WeightedPreference(weight,
+              Preference(Array(MatchExpression("kubernetes.io/hostname", "In", nodes))))
+        )))
+      // TODO: Use non-annotation syntax when we switch to K8s version 1.6.
+      logDebug(s"Adding nodeAffinity as annotation $nodeAffinityJson")
+      basePodBuilder.editMetadata()
+        .addToAnnotations(ANNOTATION_EXECUTOR_NODE_AFFINITY, nodeAffinityJson)
+        .endMetadata()
+    } else {
+      basePodBuilder
+    }
+  }
+
+  /**
+   * Allocates a new executor pod
+   *
+   * @param nodeToLocalTaskCount A map of K8s cluster nodes to the number of tasks that could
+   *                             benefit from data locality if an executor launches on the cluster
+   *                             node.
+   * @return A tuple of the new executor name and the Pod data structure.
+   */
+  private def allocateNewExecutorPod(nodeToLocalTaskCount: Map[String, Int]): (String, Pod) = {
     val executorId = EXECUTOR_ID_COUNTER.incrementAndGet().toString
     val name = s"$executorPodNamePrefix-exec-$executorId"
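Note: to make the two new helpers in this hunk concrete, here is a minimal, self-contained sketch of the same steps: drop nodes that already host one of our executors, scale the remaining per-node task counts into the 1 to 100 weight range that Kubernetes accepts for preferred node affinity, and group nodes by weight. The node names and task counts are made up for illustration; scaleToRange is copied from the diff above.

import scala.collection.mutable

object NodeAffinityWeightSketch {
  // Same linear scaling as scaleToRange in addNodeAffinityAnnotationIfUseful above.
  def scaleToRange(value: Int, baseMin: Double, baseMax: Double,
      rangeMin: Double, rangeMax: Double): Int =
    (((rangeMax - rangeMin) * (value - baseMin) / (baseMax - baseMin)) + rangeMin).toInt

  def main(args: Array[String]): Unit = {
    // Pending tasks per candidate host, as the scheduler would report them (made-up data).
    val hostToLocalTaskCount = Map("node-a" -> 5, "node-b" -> 3, "node-c" -> 1)
    // Hosts that already run one of our executor pods (made-up data).
    val hostsWithExecutors = Set("node-b")

    // Step 1: drop nodes that already have executors, as getNodesWithLocalTaskCounts does.
    val nodeToLocalTaskCount = mutable.Map[String, Int]() ++ hostToLocalTaskCount
    hostsWithExecutors.foreach(nodeToLocalTaskCount.remove)

    // Step 2: normalize task counts to node affinity weights in the 1 to 100 range.
    val taskTotal = nodeToLocalTaskCount.values.sum // 5 + 1 = 6
    val nodeToWeight = nodeToLocalTaskCount.map { case (node, taskCount) =>
      node -> scaleToRange(taskCount, 1, taskTotal, rangeMin = 1, rangeMax = 100)
    }
    // node-a: ((100 - 1) * (5 - 1) / (6 - 1)) + 1 = 80.2, truncated to 80
    // node-c: ((100 - 1) * (1 - 1) / (6 - 1)) + 1 = 1
    println(nodeToWeight)

    // Step 3: group nodes that share a weight, mirroring weightToNodes in the diff.
    val weightToNodes = nodeToWeight.groupBy(_._2).map { case (w, group) => w -> group.keys.toList }
    println(weightToNodes)
  }
}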
@@ -393,14 +463,19 @@ private[spark] class KubernetesClusterSchedulerBackend(
           .endSpec()
       }
     }.getOrElse(basePodBuilder)
-    val resolvedExecutorPod = executorInitContainerBootstrap.map { bootstrap =>
-      bootstrap.bootstrapInitContainerAndVolumes(
-        "executor",
-        withMaybeShuffleConfigPodBuilder)
-    }.getOrElse(withMaybeShuffleConfigPodBuilder)
+
+    val executorInitContainerPodBuilder = executorInitContainerBootstrap.map {
+      bootstrap =>
+        bootstrap.bootstrapInitContainerAndVolumes(
+          "executor",
+          withMaybeShuffleConfigPodBuilder)
+    }.getOrElse(withMaybeShuffleConfigPodBuilder)
+
+    val resolvedExecutorPodBuilder = addNodeAffinityAnnotationIfUseful(
+      executorInitContainerPodBuilder, nodeToLocalTaskCount)
+
     try {
-      (executorId, kubernetesClient.pods.create(resolvedExecutorPod.build()))
+      (executorId, kubernetesClient.pods.create(resolvedExecutorPodBuilder.build()))
     } catch {
       case throwable: Throwable =>
         logError("Failed to allocate executor pod.", throwable)
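Note: for context on the builder calls used in this hunk, here is a small standalone sketch (not from the PR) of how an annotation ends up on the executor pod via the fabric8 PodBuilder. The annotation key and JSON payload below are placeholders; the real code uses the ANNOTATION_EXECUTOR_NODE_AFFINITY constant and the affinity JSON built by addNodeAffinityAnnotationIfUseful.

import io.fabric8.kubernetes.api.model.{Pod, PodBuilder}

object AnnotatedPodSketch {
  def main(args: Array[String]): Unit = {
    // Placeholder key and payload for illustration only.
    val annotationKey = "example.com/node-affinity"
    val nodeAffinityJson = """{"nodeAffinity":{}}"""

    // Base pod, standing in for the builder assembled earlier in allocateNewExecutorPod.
    val basePodBuilder = new PodBuilder()
      .withNewMetadata()
        .withName("spark-exec-1")
        .endMetadata()

    // Same pattern as the diff: edit the metadata, attach the annotation, then build.
    val pod: Pod = basePodBuilder
      .editMetadata()
        .addToAnnotations(annotationKey, nodeAffinityJson)
        .endMetadata()
      .build()

    // Prints the single annotation that was added.
    println(pod.getMetadata.getAnnotations)
  }
}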
@@ -521,3 +596,15 @@ private object KubernetesClusterSchedulerBackend {
   private val DEFAULT_STATIC_PORT = 10000
   private val EXECUTOR_ID_COUNTER = new AtomicLong(0L)
 }
+
+/**
+ * These case classes model K8s node affinity syntax for
+ * preferredDuringSchedulingIgnoredDuringExecution.
+ * @see https://kubernetes.io/docs/concepts/configuration/assign-pod-node
+ */
+case class SchedulerAffinity(nodeAffinity: NodeAffinity)
+case class NodeAffinity(preferredDuringSchedulingIgnoredDuringExecution:
+  Iterable[WeightedPreference])
+case class WeightedPreference(weight: Int, preference: Preference)
+case class Preference(matchExpressions: Array[MatchExpression])
+case class MatchExpression(key: String, operator: String, values: Iterable[String])
Review comment (Member): Is there a k8s spec that these are following? If so, please add a comment above these case classes with a link to the k8s docs on what they mean.

Reply (Author): Fixed.
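Note: for readers who want to see the shape these case classes produce, here is a standalone sketch (node name and weight are made up) that serializes one preference with the same Jackson setup the scheduler backend registers. The output follows the preferredDuringSchedulingIgnoredDuringExecution schema from the Kubernetes documentation linked in the case classes' scaladoc above.

import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule

object NodeAffinityJsonSketch {
  // Same shapes as the case classes added at the bottom of the diff.
  case class SchedulerAffinity(nodeAffinity: NodeAffinity)
  case class NodeAffinity(
      preferredDuringSchedulingIgnoredDuringExecution: Iterable[WeightedPreference])
  case class WeightedPreference(weight: Int, preference: Preference)
  case class Preference(matchExpressions: Array[MatchExpression])
  case class MatchExpression(key: String, operator: String, values: Iterable[String])

  def main(args: Array[String]): Unit = {
    val mapper = new ObjectMapper().registerModule(DefaultScalaModule)
    val affinity = SchedulerAffinity(NodeAffinity(Seq(
      WeightedPreference(80,
        Preference(Array(MatchExpression("kubernetes.io/hostname", "In", Seq("node-a"))))))))
    // Expected shape (field order may vary):
    // {"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"weight":80,
    //   "preference":{"matchExpressions":[{"key":"kubernetes.io/hostname","operator":"In",
    //   "values":["node-a"]}]}}]}}
    println(mapper.writeValueAsString(affinity))
  }
}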
Review comment: Is preferring not-previously-used nodes always a good idea? It reminds me of the spark.deploy.spreadOut flag that Spark standalone has: https://spark.apache.org/docs/latest/spark-standalone.html

If you were doing a highly iterative workload that was network bound, it may be better to allow (or even prefer!) scheduling multiple executor pods on the same k8s node so that network traffic stays on the loopback interface rather than actually crossing between servers.

Possibly we should consider adding a flag for this in the future. The typical job I would run would prefer the spread-out mode you have here, so I'm not in a rush to support the consolidated mode.
Reply (Author): That is a great question, and good to learn about the spark.deploy.spreadOut flag. I agree we should consider a flag for this in the future; I'll add a TODO here.

One interesting related fact is that the loop at lines 192-197 may have the effect of consolidated mode within the next batch of pods, because we are not changing nodes within this loop. But we don't know how much consolidation will be done by the scheduler.