Skip to content

Commit 07a4220

Browse files
authored
HDFS-15821. Add metrics for in-service datanodes (#2690). Contributed by Zehao Chen.
1 parent bad6038 commit 07a4220

File tree

4 files changed

+121
-0
lines changed

4 files changed

+121
-0
lines changed

hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/NamenodeBeanMetrics.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -739,6 +739,11 @@ public int getNumEnteringMaintenanceDataNodes() {
739739
return 0;
740740
}
741741

742+
@Override
743+
public int getNumInServiceLiveDataNodes() {
744+
return 0;
745+
}
746+
742747
@Override
743748
public int getVolumeFailuresTotal() {
744749
return 0;

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5579,6 +5579,19 @@ public int getNumDecomDeadDataNodes() {
55795579
return deadDecommissioned;
55805580
}
55815581

5582+
@Override // FSNamesystemMBean
5583+
@Metric({"NumInServiceLiveDataNodes",
5584+
"Number of live datanodes which are currently in service"})
5585+
public int getNumInServiceLiveDataNodes() {
5586+
final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
5587+
getBlockManager().getDatanodeManager().fetchDatanodes(live, null, true);
5588+
int liveInService = live.size();
5589+
for (DatanodeDescriptor node : live) {
5590+
liveInService -= node.isInMaintenance() ? 1 : 0;
5591+
}
5592+
return liveInService;
5593+
}
5594+
55825595
@Override // FSNamesystemMBean
55835596
@Metric({"VolumeFailuresTotal",
55845597
"Total number of volume failures across all Datanodes"})

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/FSNamesystemMBean.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,12 @@ public interface FSNamesystemMBean {
150150
*/
151151
public int getNumDecomDeadDataNodes();
152152

153+
/**
154+
* @return Number of in-service live data nodes, where NumInServiceLiveDataNodes =
155+
* NumLiveDataNodes - NumDecomLiveDataNodes - NumInMaintenanceLiveDataNodes
156+
*/
157+
int getNumInServiceLiveDataNodes();
158+
153159
/**
154160
* Number of failed data volumes across all live data nodes.
155161
* @return number of failed data volumes across all live data nodes

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeMXBean.java

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,103 @@ public Boolean get() {
435435
}
436436
}
437437

438+
@Test(timeout = 120000)
439+
public void testInServiceNodes() throws Exception {
440+
Configuration conf = new Configuration();
441+
conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);
442+
conf.setInt(DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
443+
30);
444+
conf.setClass(DFSConfigKeys.DFS_NAMENODE_HOSTS_PROVIDER_CLASSNAME_KEY,
445+
CombinedHostFileManager.class, HostConfigManager.class);
446+
MiniDFSCluster cluster = null;
447+
HostsFileWriter hostsFileWriter = new HostsFileWriter();
448+
hostsFileWriter.initialize(conf, "temp/TestInServiceNodes");
449+
450+
try {
451+
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
452+
cluster.waitActive();
453+
454+
final FSNamesystem fsn = cluster.getNameNode().namesystem;
455+
final MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
456+
final ObjectName mxbeanName = new ObjectName(
457+
"Hadoop:service=NameNode,name=FSNamesystem");
458+
459+
List<String> hosts = new ArrayList<>();
460+
for (DataNode dn : cluster.getDataNodes()) {
461+
hosts.add(dn.getDisplayName());
462+
}
463+
hostsFileWriter.initIncludeHosts(hosts.toArray(
464+
new String[hosts.size()]));
465+
fsn.getBlockManager().getDatanodeManager().refreshNodes(conf);
466+
467+
GenericTestUtils.waitFor(new Supplier<Boolean>() {
468+
@Override
469+
public Boolean get() {
470+
try {
471+
int numLiveDataNodes = (int) mbs.getAttribute(mxbeanName,
472+
"NumLiveDataNodes");
473+
return numLiveDataNodes == 3;
474+
} catch (Exception e) {
475+
return false;
476+
}
477+
}
478+
}, 1000, 60000);
479+
480+
// Verify nodes
481+
int numDecomLiveDataNodes = (int) mbs.getAttribute(mxbeanName,
482+
"NumDecomLiveDataNodes");
483+
int numInMaintenanceLiveDataNodes = (int) mbs.getAttribute(mxbeanName,
484+
"NumInMaintenanceLiveDataNodes");
485+
int numInServiceLiveDataNodes = (int) mbs.getAttribute(mxbeanName,
486+
"NumInServiceLiveDataNodes");
487+
assertEquals(0, numDecomLiveDataNodes);
488+
assertEquals(0, numInMaintenanceLiveDataNodes);
489+
assertEquals(3, numInServiceLiveDataNodes);
490+
491+
// Add 2 nodes to out-of-service list
492+
ArrayList<String> decomNodes = new ArrayList<>();
493+
decomNodes.add(cluster.getDataNodes().get(0).getDisplayName());
494+
495+
Map<String, Long> maintenanceNodes = new HashMap<>();
496+
final int expirationInMs = 30 * 1000;
497+
maintenanceNodes.put(cluster.getDataNodes().get(1).getDisplayName(),
498+
Time.now() + expirationInMs);
499+
500+
hostsFileWriter.initOutOfServiceHosts(decomNodes, maintenanceNodes);
501+
fsn.getBlockManager().getDatanodeManager().refreshNodes(conf);
502+
503+
// Wait for the DatanodeAdminManager to complete check
504+
GenericTestUtils.waitFor(new Supplier<Boolean>() {
505+
@Override
506+
public Boolean get() {
507+
try {
508+
int numLiveDataNodes = (int) mbs.getAttribute(mxbeanName,
509+
"NumLiveDataNodes");
510+
int numDecomLiveDataNodes = (int) mbs.getAttribute(mxbeanName,
511+
"NumDecomLiveDataNodes");
512+
int numInMaintenanceLiveDataNodes = (int) mbs.getAttribute(
513+
mxbeanName, "NumInMaintenanceLiveDataNodes");
514+
return numLiveDataNodes == 3 &&
515+
numDecomLiveDataNodes == 1 &&
516+
numInMaintenanceLiveDataNodes == 1;
517+
} catch (Exception e) {
518+
return false;
519+
}
520+
}
521+
}, 1000, 60000);
522+
523+
// Verify nodes
524+
numInServiceLiveDataNodes = (int) mbs.getAttribute(mxbeanName,
525+
"NumInServiceLiveDataNodes");
526+
assertEquals(1, numInServiceLiveDataNodes);
527+
} finally {
528+
if (cluster != null) {
529+
cluster.shutdown();
530+
}
531+
hostsFileWriter.cleanup();
532+
}
533+
}
534+
438535
@Test (timeout = 120000)
439536
public void testMaintenanceNodes() throws Exception {
440537
LOG.info("Starting testMaintenanceNodes");

0 commit comments

Comments
 (0)