Skip to content

Commit 33ea5ae

Browse files
committed
YARN-2846. Incorrect persisted exit code for running containers in reacquireContainer() that is interrupted by NodeManager restart. Contributed by Junping Du
1 parent 177e809 commit 33ea5ae

File tree

4 files changed

+25
-22
lines changed

4 files changed

+25
-22
lines changed

hadoop-yarn-project/CHANGES.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -955,6 +955,10 @@ Release 2.6.0 - 2014-11-15
955955
YARN-2794. Fixed log messages about distributing system-credentials. (Jian He via
956956
zjshen)
957957

958+
YARN-2846. Incorrect persisted exit code for running containers in
959+
reacquireContainer() that is interrupted by NodeManager restart. (Junping Du
960+
via jlowe)
961+
958962
Release 2.5.2 - 2014-11-10
959963

960964
INCOMPATIBLE CHANGES

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -159,9 +159,10 @@ public abstract boolean isContainerProcessAlive(String user, String pid)
159159
* @param containerId The ID of the container to reacquire
160160
* @return The exit code of the pre-existing container
161161
* @throws IOException
162+
* @throws InterruptedException
162163
*/
163164
public int reacquireContainer(String user, ContainerId containerId)
164-
throws IOException {
165+
throws IOException, InterruptedException {
165166
Path pidPath = getPidFilePath(containerId);
166167
if (pidPath == null) {
167168
LOG.warn(containerId + " is not active, returning terminated error");
@@ -175,13 +176,8 @@ public int reacquireContainer(String user, ContainerId containerId)
175176
}
176177

177178
LOG.info("Reacquiring " + containerId + " with pid " + pid);
178-
try {
179-
while(isContainerProcessAlive(user, pid)) {
180-
Thread.sleep(1000);
181-
}
182-
} catch (InterruptedException e) {
183-
throw new IOException("Interrupted while waiting for process " + pid
184-
+ " to exit", e);
179+
while(isContainerProcessAlive(user, pid)) {
180+
Thread.sleep(1000);
185181
}
186182

187183
// wait for exit code file to appear
@@ -194,12 +190,9 @@ public int reacquireContainer(String user, ContainerId containerId)
194190
LOG.info(containerId + " was deactivated");
195191
return ExitCode.TERMINATED.getExitCode();
196192
}
197-
try {
198-
Thread.sleep(sleepMsec);
199-
} catch (InterruptedException e) {
200-
throw new IOException(
201-
"Interrupted while waiting for exit code from " + containerId, e);
202-
}
193+
194+
Thread.sleep(sleepMsec);
195+
203196
msecLeft -= sleepMsec;
204197
}
205198
if (msecLeft < 0) {

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ public int launchContainer(Container container,
347347

348348
@Override
349349
public int reacquireContainer(String user, ContainerId containerId)
350-
throws IOException {
350+
throws IOException, InterruptedException {
351351
try {
352352
return super.reacquireContainer(user, containerId);
353353
} finally {

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ public Integer call() {
7373
dispatcher.getEventHandler().handle(new ContainerEvent(containerId,
7474
ContainerEventType.CONTAINER_LAUNCHED));
7575

76+
boolean notInterrupted = true;
7677
try {
7778
File pidFile = locatePidFile(appIdStr, containerIdStr);
7879
if (pidFile != null) {
@@ -85,14 +86,19 @@ public Integer call() {
8586
}
8687
} catch (IOException e) {
8788
LOG.error("Unable to recover container " + containerIdStr, e);
89+
} catch (InterruptedException e) {
90+
LOG.warn("Interrupted while waiting for exit code from " + containerId);
91+
notInterrupted = false;
8892
} finally {
89-
this.completed.set(true);
90-
exec.deactivateContainer(containerId);
91-
try {
92-
getContext().getNMStateStore().storeContainerCompleted(containerId,
93-
retCode);
94-
} catch (IOException e) {
95-
LOG.error("Unable to set exit code for container " + containerId);
93+
if (notInterrupted) {
94+
this.completed.set(true);
95+
exec.deactivateContainer(containerId);
96+
try {
97+
getContext().getNMStateStore().storeContainerCompleted(containerId,
98+
retCode);
99+
} catch (IOException e) {
100+
LOG.error("Unable to set exit code for container " + containerId);
101+
}
96102
}
97103
}
98104

0 commit comments

Comments
 (0)