Skip to content

Commit e00b656

Browse files
committed
In some cases, YARN does not automatically restart the container, so re-request resources for the executors that failed
1 parent 79fa8fd commit e00b656

File tree

2 files changed

+8
-0
lines changed

2 files changed

+8
-0
lines changed

yarn/stable/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,10 @@ class ApplicationMaster(args: ApplicationMasterArguments, conf: Configuration,
261261
finishApplicationMaster(FinalApplicationStatus.FAILED,
262262
"max number of executor failures reached")
263263
}
264+
val numExecutorsFailed = yarnAllocator.getNumExecutorsFailed
265+
if (numExecutorsFailed > 0) {
266+
yarnAllocator.addResourceRequests(numExecutorsFailed)
267+
}
264268
yarnAllocator.allocateResources()
265269
ApplicationMaster.incrementAllocatorLoop(1)
266270
Thread.sleep(100)

yarn/stable/src/main/scala/org/apache/spark/deploy/yarn/ExecutorLauncher.scala

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,10 @@ class ExecutorLauncher(args: ApplicationMasterArguments, conf: Configuration, sp
206206
yarnAllocator.addResourceRequests(args.numExecutors)
207207
while ((yarnAllocator.getNumExecutorsRunning < args.numExecutors) && (!driverClosed)) {
208208
yarnAllocator.allocateResources()
209+
val numExecutorsFailed = yarnAllocator.getNumExecutorsFailed
210+
if (numExecutorsFailed > 0) {
211+
yarnAllocator.addResourceRequests(numExecutorsFailed)
212+
}
209213
Thread.sleep(100)
210214
}
211215

0 commit comments

Comments
 (0)