@@ -36,6 +36,7 @@ import org.apache.log4j.{Level, Logger}
3636
3737import org .apache .spark .{Logging , SecurityManager , SparkConf }
3838import org .apache .spark .deploy .yarn .YarnSparkHadoopUtil ._
39+ import org .apache .spark .scheduler .cluster .CoarseGrainedSchedulerBackend
3940
4041/**
4142 * YarnAllocator is charged with requesting containers from the YARN ResourceManager and deciding
@@ -88,6 +89,10 @@ private[yarn] class YarnAllocator(
8889 // Visible for testing.
8990 private [yarn] val executorIdToContainer = new HashMap [String , Container ]
9091
92+ private var numUnexpectedContainerRelease = 0L
93+ private var backend : CoarseGrainedSchedulerBackend = _
94+ private val containerIdToExecutorId = new HashMap [ContainerId , String ]
95+
9196 // Executor memory in MB.
9297 protected val executorMemory = args.executorMemory
9398 // Additional memory overhead.
@@ -165,6 +170,7 @@ private[yarn] class YarnAllocator(
165170 def killExecutor (executorId : String ): Unit = synchronized {
166171 if (executorIdToContainer.contains(executorId)) {
167172 val container = executorIdToContainer.remove(executorId).get
173+ containerIdToExecutorId.remove(container.getId)
168174 internalReleaseContainer(container)
169175 numExecutorsRunning -= 1
170176 } else {
@@ -353,6 +359,7 @@ private[yarn] class YarnAllocator(
353359
354360 logInfo(" Launching container %s for on host %s" .format(containerId, executorHostname))
355361 executorIdToContainer(executorId) = container
362+ containerIdToExecutorId(container.getId) = executorId
356363
357364 val containerSet = allocatedHostToContainersMap.getOrElseUpdate(executorHostname,
358365 new HashSet [ContainerId ])
@@ -384,6 +391,7 @@ private[yarn] class YarnAllocator(
384391 for (completedContainer <- completedContainers) {
385392 val containerId = completedContainer.getContainerId
386393
394+ var needNotify = false
387395 if (releasedContainers.contains(containerId)) {
388396 // Already marked the container for release, so remove it from
389397 // `releasedContainers`.
@@ -415,6 +423,7 @@ private[yarn] class YarnAllocator(
415423 " . Diagnostics: " + completedContainer.getDiagnostics)
416424 numExecutorsFailed += 1
417425 }
426+ needNotify = true
418427 }
419428
420429 if (allocatedContainerToHostMap.containsKey(containerId)) {
@@ -430,6 +439,15 @@ private[yarn] class YarnAllocator(
430439
431440 allocatedContainerToHostMap.remove(containerId)
432441 }
442+
443+ val executorIdOpt = containerIdToExecutorId.remove(containerId)
444+ if (executorIdOpt.isDefined) executorIdToContainer.remove(executorIdOpt.get)
445+
446+ if (needNotify && executorIdOpt.isDefined) {
447+ // The executor could have gone away (e.g. no route to host, node failure, etc.).
448+ // Notify the backend about the failure of the executor.
449+ notifyBackend(executorIdOpt.get, containerId)
450+ }
433451 }
434452 }
435453
@@ -438,6 +456,19 @@ private[yarn] class YarnAllocator(
438456 amClient.releaseAssignedContainer(container.getId())
439457 }
440458
/**
 * Counts an unexpected container release and, when a scheduler backend has been registered
 * through `setScheduler`, asks that backend to remove the affected executor.
 *
 * @param executorId  id of the executor that was mapped to the completed container
 * @param containerId id of the container that was reported as completed
 */
private[yarn] def notifyBackend(executorId: String, containerId: ContainerId): Unit = {
  numUnexpectedContainerRelease += 1
  // `backend` is null until setScheduler is called. Reading the field once through Option
  // avoids a second dereference of the mutable field after the null check.
  Option(backend).foreach { registeredBackend =>
    registeredBackend.removeExecutor(executorId,
      s" Yarn deallocated the executor ($executorId ) container $containerId")
  }
}
466+
/**
 * Returns the current value of the unexpected-container-release counter, which is
 * incremented once per call to `notifyBackend`.
 *
 * @return the number of unexpected container releases recorded so far
 */
private[yarn] def getNumUnexpectedContainerRelease: Long = numUnexpectedContainerRelease
468+
/**
 * Registers the scheduler backend that `notifyBackend` forwards executor removals to.
 * The assignment is performed while holding this allocator's monitor.
 *
 * @param backend the scheduler backend to store in this allocator
 */
private[yarn] def setScheduler(backend: CoarseGrainedSchedulerBackend): Unit = {
  this.synchronized {
    this.backend = backend
  }
}
441472}
442473
443474private object YarnAllocator {
0 commit comments