Skip to content

Commit 6711df1

Browse files
szilard-nemeth authored and brumi1024 committed
YARN-6862. Nodemanager resource usage metrics sometimes are negative. Contributed by Benjamin Teke
1 parent 6114816 commit 6711df1

File tree

5 files changed

+131
-6
lines changed

5 files changed

+131
-6
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,14 @@ public void run() {
540540
pTree.updateProcessTree(); // update process-tree
541541
long currentVmemUsage = pTree.getVirtualMemorySize();
542542
long currentPmemUsage = pTree.getRssMemorySize();
543+
if (currentVmemUsage < 0 || currentPmemUsage < 0) {
544+
// YARN-6862/YARN-5021 If the container just exited or for
545+
// another reason the physical/virtual memory is UNAVAILABLE (-1)
546+
// the values shouldn't be aggregated.
547+
LOG.info("Skipping monitoring container {} because "
548+
+ "memory usage is not available.", containerId);
549+
continue;
550+
}
543551

544552
// if machine has 6 cores and 3 are used,
545553
// cpuUsagePercentPerCore should be 300%

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockCPUResourceCalculatorProcessTree.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,16 @@ public boolean checkPidPgrpidForMatch() {
5656
return true;
5757
}
5858

59+
@Override
60+
public long getVirtualMemorySize(int olderThanAge) {
61+
return 0;
62+
}
63+
64+
@Override
65+
public long getRssMemorySize(int olderThanAge) {
66+
return 0;
67+
}
68+
5969
@Override
6070
public float getCpuUsagePercent() {
6171
long cpu = this.cpuPercentage;
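
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockMemoryResourceCalculatorProcessTree.java

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change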
@@ -0,0 +1,89 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor;
20+
21+
import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
22+
23+
/**
24+
* Mock class to obtain resource usage (Memory).
25+
*/
26+
public class MockMemoryResourceCalculatorProcessTree extends ResourceCalculatorProcessTree {
27+
private final long memorySize = 500000000L;
28+
29+
private long rssMemorySize = memorySize;
30+
private long virtualMemorySize = ResourceCalculatorProcessTree.UNAVAILABLE;
31+
32+
/**
33+
* Constructor for MockMemoryResourceCalculatorProcessTree with specified root
34+
* process.
35+
* @param root
36+
*/
37+
public MockMemoryResourceCalculatorProcessTree(String root) {
38+
super(root);
39+
}
40+
41+
@Override
42+
public void updateProcessTree() {
43+
}
44+
45+
@Override
46+
public String getProcessTreeDump() {
47+
return "";
48+
}
49+
50+
@Override
51+
public long getCumulativeCpuTime() {
52+
return 0;
53+
}
54+
55+
@Override
56+
public boolean checkPidPgrpidForMatch() {
57+
return true;
58+
}
59+
60+
@Override
61+
public long getRssMemorySize(int olderThanAge) {
62+
long rssMemory = this.rssMemorySize;
63+
// First getter call will return with 500000000, and second call will
64+
// return -1, rest of the calls will return a valid value.
65+
if (rssMemory == memorySize) {
66+
this.rssMemorySize = ResourceCalculatorProcessTree.UNAVAILABLE;
67+
}
68+
if (rssMemory == ResourceCalculatorProcessTree.UNAVAILABLE) {
69+
this.rssMemorySize = 2 * memorySize;
70+
}
71+
return rssMemory;
72+
}
73+
74+
@Override
75+
public long getVirtualMemorySize(int olderThanAge) {
76+
long virtualMemory = this.virtualMemorySize;
77+
// First getter call will return with -1, and rest of the calls will
78+
// return a valid value.
79+
if (virtualMemory == ResourceCalculatorProcessTree.UNAVAILABLE) {
80+
this.virtualMemorySize = 3 * memorySize;
81+
}
82+
return virtualMemory;
83+
}
84+
85+
@Override
86+
public float getCpuUsagePercent() {
87+
return 0;
88+
}
89+
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockResourceCalculatorProcessTree.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,16 @@ public void setRssMemorySize(long rssMemorySize) {
5151
this.rssMemorySize = rssMemorySize;
5252
}
5353

54+
@Override
5455
public long getRssMemorySize() {
5556
return this.rssMemorySize;
5657
}
5758

59+
@Override
60+
public long getVirtualMemorySize() {
61+
return 0;
62+
}
63+
5864
@Override
5965
public float getCpuUsagePercent() {
6066
return 0;

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -268,13 +268,24 @@ public void testContainersResourceChangeIsTriggeredImmediately()
268268

269269
@Test
270270
public void testContainersCPUResourceForDefaultValue() throws Exception {
271+
testContainerMonitoringInvalidResources(
272+
MockCPUResourceCalculatorProcessTree.class.getCanonicalName());
273+
}
274+
275+
@Test
276+
public void testContainersMemoryResourceUnavailable() throws Exception {
277+
testContainerMonitoringInvalidResources(
278+
MockMemoryResourceCalculatorProcessTree.class.getCanonicalName());
279+
}
280+
281+
private void testContainerMonitoringInvalidResources(
282+
String processTreeClassName) throws Exception {
271283
Configuration newConf = new Configuration(conf);
272-
// set container monitor interval to be 20s
284+
// set container monitor interval to be 20ms
273285
newConf.setLong(YarnConfiguration.NM_CONTAINER_MON_INTERVAL_MS, 20L);
274286
containersMonitor = createContainersMonitor(executor, dispatcher, context);
275287
newConf.set(YarnConfiguration.NM_CONTAINER_MON_PROCESS_TREE,
276-
MockCPUResourceCalculatorProcessTree.class.getCanonicalName());
277-
// set container monitor interval to be 20ms
288+
processTreeClassName);
278289
containersMonitor.init(newConf);
279290
containersMonitor.start();
280291

@@ -291,7 +302,7 @@ public void testContainersCPUResourceForDefaultValue() throws Exception {
291302
0, containersMonitor.getContainersUtilization()
292303
.compareTo(ResourceUtilization.newInstance(0, 0, 0.0f)));
293304

294-
// Verify the container utilization value. Since atleast one round is done,
305+
// Verify the container utilization value. Since at least one round is done,
295306
// we can expect a non-zero value for container utilization as
296307
// MockCPUResourceCalculatorProcessTree#getCpuUsagePercent will return 50.
297308
waitForContainerResourceUtilizationChange(containersMonitor, 100);
@@ -310,12 +321,13 @@ public static void waitForContainerResourceUtilizationChange(
310321
}
311322

312323
LOG.info(
313-
"Monitor thread is waiting for resource utlization change.");
324+
"Monitor thread is waiting for resource utilization change.");
314325
Thread.sleep(WAIT_MS_PER_LOOP);
315326
timeWaiting += WAIT_MS_PER_LOOP;
316327
}
317328

318-
assertTrue("Resource utilization is not changed from second run onwards",
329+
assertTrue("Resource utilization is not changed after " +
330+
timeoutMsecs / WAIT_MS_PER_LOOP + " updates",
319331
0 != containersMonitor.getContainersUtilization()
320332
.compareTo(ResourceUtilization.newInstance(0, 0, 0.0f)));
321333
}

0 commit comments

Comments
 (0)