From 30a9d13682aaa66a8efca78b9079424b2b040382 Mon Sep 17 00:00:00 2001 From: "zhanghaobo@kanzhun.com" Date: Mon, 1 Jan 2024 17:43:26 +0800 Subject: [PATCH 1/3] HDFS-17314. Add a metrics to record congestion backoff counts. --- .../org/apache/hadoop/hdfs/server/datanode/DataNode.java | 6 +++++- .../hdfs/server/datanode/metrics/DataNodeMetrics.java | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java index 956f5bbe519d4..35875095b946c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java @@ -1101,7 +1101,11 @@ public PipelineAck.ECN getECN() { } double load = ManagementFactory.getOperatingSystemMXBean() .getSystemLoadAverage(); - return load > NUM_CORES * congestionRatio ? PipelineAck.ECN.CONGESTED : + double threshold = NUM_CORES * congestionRatio; + if (load > threshold) { + metrics.incrCongestedCount(); + } + return load > threshold ? PipelineAck.ECN.CONGESTED : PipelineAck.ECN.SUPPORTED; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java index 77e6dab067b9d..398af6df51186 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java @@ -210,6 +210,8 @@ public class DataNodeMetrics { private MutableCounterLong replaceBlockOpOnSameMount; @Metric("Number of replaceBlock ops to another node") private MutableCounterLong replaceBlockOpToOtherHost; + @Metric("Number of congested count") + private MutableCounterLong congestedCount; final MetricsRegistry registry = new MetricsRegistry("datanode"); @Metric("Milliseconds spent on calling NN rpc") @@ -807,4 +809,7 @@ public void incrReplaceBlockOpToOtherHost() { replaceBlockOpToOtherHost.incr(); } + public void incrCongestedCount() { + congestedCount.incr(); + } } From 31ec1b7c027ee948c057c85e2ca6adf24b029a69 Mon Sep 17 00:00:00 2001 From: "zhanghaobo@kanzhun.com" Date: Wed, 3 Jan 2024 14:23:32 +0800 Subject: [PATCH 2/3] Add an unit test. --- .../hadoop/hdfs/server/datanode/DataNode.java | 3 +- .../datanode/DataNodeFaultInjector.java | 4 +++ .../server/datanode/TestDataNodeMetrics.java | 28 +++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java index 35875095b946c..756b273c8ac1f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java @@ -1102,7 +1102,8 @@ public PipelineAck.ECN getECN() { double load = ManagementFactory.getOperatingSystemMXBean() .getSystemLoadAverage(); double threshold = NUM_CORES * congestionRatio; - if (load > threshold) { + + if (load > threshold || DataNodeFaultInjector.get().mockCongestedForTest()) { metrics.incrCongestedCount(); } return load > threshold ? PipelineAck.ECN.CONGESTED : diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java index 7b116d9e566f3..4c022535118bd 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java @@ -162,4 +162,8 @@ public void markSlow(String dnAddr, int[] replies) {} * Just delay delete replica a while. */ public void delayDeleteReplica() {} + + public boolean mockCongestedForTest() { + return false; + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMetrics.java index 3a0b5238360a2..812e7e0dc67f0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMetrics.java @@ -42,9 +42,11 @@ import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.StorageType; import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.protocol.datatransfer.PipelineAck; import org.apache.hadoop.net.unix.DomainSocket; import org.apache.hadoop.net.unix.TemporarySocketDirectory; import org.apache.hadoop.util.Lists; +import org.junit.Assert; import org.junit.Assume; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -816,4 +818,30 @@ public Boolean get() { }, 100, 10000); } } + + @Test + public void testCongestedCount() throws Exception { + Configuration conf = new Configuration(); + conf.setBoolean(DFSConfigKeys.DFS_PIPELINE_ECN_ENABLED, true); + MiniDFSCluster cluster = null; + DataNodeFaultInjector old = null; + try { + cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build(); + old = DataNodeFaultInjector.get(); + DataNodeFaultInjector.set(new DataNodeFaultInjector(){ + @Override + public boolean mockCongestedForTest() { + return true; + } + }); + PipelineAck.ECN ecn = cluster.getDataNodes().get(0).getECN(); + MetricsRecordBuilder dnMetrics = getMetrics(cluster.getDataNodes().get(0).getMetrics().name()); + Assert.assertEquals(1L, getLongCounter("CongestedCount", dnMetrics)); + } finally { + if (cluster != null) { + DataNodeFaultInjector.set(old); + cluster.shutdown(); + } + } + } } From 95958c2e178949d6a5f06aecbf402ca3a402bdcf Mon Sep 17 00:00:00 2001 From: "zhanghaobo@kanzhun.com" Date: Wed, 3 Jan 2024 21:03:02 +0800 Subject: [PATCH 3/3] fix checkstyle. --- .../hadoop/hdfs/server/datanode/TestDataNodeMetrics.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMetrics.java index 812e7e0dc67f0..9806cb7f3164d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeMetrics.java @@ -835,7 +835,8 @@ public boolean mockCongestedForTest() { } }); PipelineAck.ECN ecn = cluster.getDataNodes().get(0).getECN(); - MetricsRecordBuilder dnMetrics = getMetrics(cluster.getDataNodes().get(0).getMetrics().name()); + MetricsRecordBuilder dnMetrics = getMetrics(cluster.getDataNodes().get(0) + .getMetrics().name()); Assert.assertEquals(1L, getLongCounter("CongestedCount", dnMetrics)); } finally { if (cluster != null) {