From 8a4daadba13a74eb88e8b7e056f6927e9e0e53dd Mon Sep 17 00:00:00 2001 From: tasanuma Date: Tue, 5 Apr 2022 11:55:30 +0900 Subject: [PATCH 1/6] HDFS-16479. EC: NameNode should not send a reconstruction work when the source datanodes are insufficient --- .../server/blockmanagement/BlockManager.java | 9 ++++ .../blockmanagement/TestBlockManager.java | 43 +++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index bfa8457dd4e31..ba6b682dd8465 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -2163,6 +2163,15 @@ BlockReconstructionWork scheduleReconstruction(BlockInfo block, return null; } + // skip if source datanodes for reconstructing ec block are not enough + if (block.isStriped()) { + BlockInfoStriped stripedBlock = (BlockInfoStriped) block; + if (stripedBlock.getDataBlockNum() > srcNodes.length) { + LOG.debug("Block {} cannot be reconstructed due to shortage of source datanodes ", block); + return null; + } + } + // liveReplicaNodes can include READ_ONLY_SHARED replicas which are // not included in the numReplicas.liveReplicas() count assert liveReplicaNodes.size() >= numReplicas.liveReplicas(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java index d5e0a99fe789b..cc4f9756ed94f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java @@ -852,6 +852,49 @@ public void testChooseSrcDNWithDupECInDecommissioningNode() throws Exception { 0, numReplicas.redundantInternalBlocks()); } + @Test + public void testSkipReconstructionWithManyBusyNodes() { + long blockId = -9223372036854775776L; // real ec block id + // RS-3-2 EC policy + ErasureCodingPolicy ecPolicy = + SystemErasureCodingPolicies.getPolicies().get(1); + // striped blockInfo + Block aBlock = new Block(blockId, ecPolicy.getCellSize() * ecPolicy.getNumDataUnits(), 0); + BlockInfoStriped aBlockInfoStriped = new BlockInfoStriped(aBlock, ecPolicy); + // ec storageInfo + DatanodeStorageInfo ds1 = DFSTestUtil.createDatanodeStorageInfo( + "storage1", "1.1.1.1", "rack1", "host1"); + DatanodeStorageInfo ds2 = DFSTestUtil.createDatanodeStorageInfo( + "storage2", "2.2.2.2", "rack2", "host2"); + DatanodeStorageInfo ds3 = DFSTestUtil.createDatanodeStorageInfo( + "storage3", "3.3.3.3", "rack3", "host3"); + DatanodeStorageInfo ds4 = DFSTestUtil.createDatanodeStorageInfo( + "storage4", "4.4.4.4", "rack4", "host4"); + + // link block with storage + aBlockInfoStriped.addStorage(ds1, aBlock); + aBlockInfoStriped.addStorage(ds2, new Block(blockId + 1, 0, 0)); + aBlockInfoStriped.addStorage(ds3, new Block(blockId + 2, 0, 0)); + aBlockInfoStriped.addStorage(ds4, new Block(blockId + 3, 0, 0)); + + addEcBlockToBM(blockId, ecPolicy); + aBlockInfoStriped.setBlockCollectionId(mockINodeId); + + // reconstruction should be scheduled + BlockReconstructionWork work = bm.scheduleReconstruction(aBlockInfoStriped, 3); + assertNotNull(work); + + // simulate the 3 nodes reach maxReplicationStreams + for(int i = 0; i < bm.maxReplicationStreams; i++){ + ds3.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets(); + ds4.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets(); + } + + // reconstruction should be skipped since the number of non-busy nodes are not enough + work = bm.scheduleReconstruction(aBlockInfoStriped, 3); + assertNull(work); + } + @Test public void testFavorDecomUntilHardLimit() throws Exception { bm.maxReplicationStreams = 0; From 1b40cf594ea0cfaa72c803e7e374aa0ecd8b7cdf Mon Sep 17 00:00:00 2001 From: tasanuma Date: Wed, 6 Apr 2022 03:04:34 +0900 Subject: [PATCH 2/6] addressing short data blocks --- .../server/blockmanagement/BlockManager.java | 4 +- .../blockmanagement/TestBlockManager.java | 58 ++++++++++++++++++- 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index ba6b682dd8465..bb96b4ab4060d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -2166,7 +2166,9 @@ BlockReconstructionWork scheduleReconstruction(BlockInfo block, // skip if source datanodes for reconstructing ec block are not enough if (block.isStriped()) { BlockInfoStriped stripedBlock = (BlockInfoStriped) block; - if (stripedBlock.getDataBlockNum() > srcNodes.length) { + int cellsNum = (int) ((stripedBlock.getNumBytes() - 1) / stripedBlock.getCellSize() + 1); + int minRequiredSources = Math.min(cellsNum, stripedBlock.getDataBlockNum()); + if (minRequiredSources > srcNodes.length) { LOG.debug("Block {} cannot be reconstructed due to shortage of source datanodes ", block); return null; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java index cc4f9756ed94f..9edc6384ec501 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java @@ -858,10 +858,12 @@ public void testSkipReconstructionWithManyBusyNodes() { // RS-3-2 EC policy ErasureCodingPolicy ecPolicy = SystemErasureCodingPolicies.getPolicies().get(1); - // striped blockInfo + + // striped blockInfo: 3 data blocks + 2 parity blocks Block aBlock = new Block(blockId, ecPolicy.getCellSize() * ecPolicy.getNumDataUnits(), 0); BlockInfoStriped aBlockInfoStriped = new BlockInfoStriped(aBlock, ecPolicy); - // ec storageInfo + + // create 4 storageInfo, which means 1 block is missing DatanodeStorageInfo ds1 = DFSTestUtil.createDatanodeStorageInfo( "storage1", "1.1.1.1", "rack1", "host1"); DatanodeStorageInfo ds2 = DFSTestUtil.createDatanodeStorageInfo( @@ -884,7 +886,7 @@ public void testSkipReconstructionWithManyBusyNodes() { BlockReconstructionWork work = bm.scheduleReconstruction(aBlockInfoStriped, 3); assertNotNull(work); - // simulate the 3 nodes reach maxReplicationStreams + // simulate the 2 nodes reach maxReplicationStreams for(int i = 0; i < bm.maxReplicationStreams; i++){ ds3.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets(); ds4.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets(); @@ -895,6 +897,56 @@ public void testSkipReconstructionWithManyBusyNodes() { assertNull(work); } + @Test + public void testSkipReconstructionWithManyBusyNodes2() { + long blockId = -9223372036854775776L; // real ec block id + // RS-3-2 EC policy + ErasureCodingPolicy ecPolicy = + SystemErasureCodingPolicies.getPolicies().get(1); + + // striped blockInfo: 2 data blocks + 2 paritys + Block aBlock = new Block(blockId, ecPolicy.getCellSize() * (ecPolicy.getNumDataUnits() - 1), 0); + BlockInfoStriped aBlockInfoStriped = new BlockInfoStriped(aBlock, ecPolicy); + + // create 3 storageInfo, which means 1 block is missing + DatanodeStorageInfo ds1 = DFSTestUtil.createDatanodeStorageInfo( + "storage1", "1.1.1.1", "rack1", "host1"); + DatanodeStorageInfo ds2 = DFSTestUtil.createDatanodeStorageInfo( + "storage2", "2.2.2.2", "rack2", "host2"); + DatanodeStorageInfo ds3 = DFSTestUtil.createDatanodeStorageInfo( + "storage3", "3.3.3.3", "rack3", "host3"); + + // link block with storage + aBlockInfoStriped.addStorage(ds1, aBlock); + aBlockInfoStriped.addStorage(ds2, new Block(blockId + 1, 0, 0)); + aBlockInfoStriped.addStorage(ds3, new Block(blockId + 2, 0, 0)); + + addEcBlockToBM(blockId, ecPolicy); + aBlockInfoStriped.setBlockCollectionId(mockINodeId); + + // reconstruction should be scheduled + BlockReconstructionWork work = bm.scheduleReconstruction(aBlockInfoStriped, 3); + assertNotNull(work); + + // simulate the 1 node reaches maxReplicationStreams + for(int i = 0; i < bm.maxReplicationStreams; i++){ + ds2.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets(); + } + + // reconstruction should still be scheduled since there are 2 source nodes to create 2 blocks + work = bm.scheduleReconstruction(aBlockInfoStriped, 3); + assertNotNull(work); + + // simulate the 1 more node reaches maxReplicationStreams + for(int i = 0; i < bm.maxReplicationStreams; i++){ + ds3.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets(); + } + + // reconstruction should be skipped since the number of non-busy nodes are not enough + work = bm.scheduleReconstruction(aBlockInfoStriped, 3); + assertNull(work); + } + @Test public void testFavorDecomUntilHardLimit() throws Exception { bm.maxReplicationStreams = 0; From 7784d027eec358d0dc513b12ed108b5c3e2503eb Mon Sep 17 00:00:00 2001 From: tasanuma Date: Tue, 12 Apr 2022 10:31:25 +0900 Subject: [PATCH 3/6] use BlockInfoStriped.getRealDataBlockNum() --- .../hadoop/hdfs/server/blockmanagement/BlockManager.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index bb96b4ab4060d..c916194b6c361 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -2166,9 +2166,7 @@ BlockReconstructionWork scheduleReconstruction(BlockInfo block, // skip if source datanodes for reconstructing ec block are not enough if (block.isStriped()) { BlockInfoStriped stripedBlock = (BlockInfoStriped) block; - int cellsNum = (int) ((stripedBlock.getNumBytes() - 1) / stripedBlock.getCellSize() + 1); - int minRequiredSources = Math.min(cellsNum, stripedBlock.getDataBlockNum()); - if (minRequiredSources > srcNodes.length) { + if (stripedBlock.getRealDataBlockNum() > srcNodes.length) { LOG.debug("Block {} cannot be reconstructed due to shortage of source datanodes ", block); return null; } From f6c9d6be53b5d9bfc186d95f43fcea280d339fdc Mon Sep 17 00:00:00 2001 From: tasanuma Date: Tue, 12 Apr 2022 10:32:42 +0900 Subject: [PATCH 4/6] add NameNode.getNameNodeMetrics().incNumTimesReReplicationNotScheduled() --- .../apache/hadoop/hdfs/server/blockmanagement/BlockManager.java | 1 + 1 file changed, 1 insertion(+) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index c916194b6c361..a48d5e5480139 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -2168,6 +2168,7 @@ BlockReconstructionWork scheduleReconstruction(BlockInfo block, BlockInfoStriped stripedBlock = (BlockInfoStriped) block; if (stripedBlock.getRealDataBlockNum() > srcNodes.length) { LOG.debug("Block {} cannot be reconstructed due to shortage of source datanodes ", block); + NameNode.getNameNodeMetrics().incNumTimesReReplicationNotScheduled(); return null; } } From 494a9212166a1d192f0a91e73f1108640b8008ca Mon Sep 17 00:00:00 2001 From: tasanuma Date: Tue, 12 Apr 2022 10:35:29 +0900 Subject: [PATCH 5/6] fix typo comment --- .../hadoop/hdfs/server/blockmanagement/TestBlockManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java index 9edc6384ec501..ec3de67ab54c4 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java @@ -904,7 +904,7 @@ public void testSkipReconstructionWithManyBusyNodes2() { ErasureCodingPolicy ecPolicy = SystemErasureCodingPolicies.getPolicies().get(1); - // striped blockInfo: 2 data blocks + 2 paritys + // striped blockInfo: 2 data blocks + 2 parity blocks Block aBlock = new Block(blockId, ecPolicy.getCellSize() * (ecPolicy.getNumDataUnits() - 1), 0); BlockInfoStriped aBlockInfoStriped = new BlockInfoStriped(aBlock, ecPolicy); From a0d57569803182cf83330753661d6a5d1d7ba660 Mon Sep 17 00:00:00 2001 From: tasanuma Date: Tue, 12 Apr 2022 10:54:24 +0900 Subject: [PATCH 6/6] update comments and variable names --- .../blockmanagement/TestBlockManager.java | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java index ec3de67ab54c4..a507fce34f31a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java @@ -859,9 +859,9 @@ public void testSkipReconstructionWithManyBusyNodes() { ErasureCodingPolicy ecPolicy = SystemErasureCodingPolicies.getPolicies().get(1); - // striped blockInfo: 3 data blocks + 2 parity blocks - Block aBlock = new Block(blockId, ecPolicy.getCellSize() * ecPolicy.getNumDataUnits(), 0); - BlockInfoStriped aBlockInfoStriped = new BlockInfoStriped(aBlock, ecPolicy); + // create an EC block group: 3 data blocks + 2 parity blocks + Block aBlockGroup = new Block(blockId, ecPolicy.getCellSize() * ecPolicy.getNumDataUnits(), 0); + BlockInfoStriped aBlockInfoStriped = new BlockInfoStriped(aBlockGroup, ecPolicy); // create 4 storageInfo, which means 1 block is missing DatanodeStorageInfo ds1 = DFSTestUtil.createDatanodeStorageInfo( @@ -874,7 +874,7 @@ public void testSkipReconstructionWithManyBusyNodes() { "storage4", "4.4.4.4", "rack4", "host4"); // link block with storage - aBlockInfoStriped.addStorage(ds1, aBlock); + aBlockInfoStriped.addStorage(ds1, aBlockGroup); aBlockInfoStriped.addStorage(ds2, new Block(blockId + 1, 0, 0)); aBlockInfoStriped.addStorage(ds3, new Block(blockId + 2, 0, 0)); aBlockInfoStriped.addStorage(ds4, new Block(blockId + 3, 0, 0)); @@ -904,9 +904,10 @@ public void testSkipReconstructionWithManyBusyNodes2() { ErasureCodingPolicy ecPolicy = SystemErasureCodingPolicies.getPolicies().get(1); - // striped blockInfo: 2 data blocks + 2 parity blocks - Block aBlock = new Block(blockId, ecPolicy.getCellSize() * (ecPolicy.getNumDataUnits() - 1), 0); - BlockInfoStriped aBlockInfoStriped = new BlockInfoStriped(aBlock, ecPolicy); + // create an EC block group: 2 data blocks + 2 parity blocks + Block aBlockGroup = new Block(blockId, + ecPolicy.getCellSize() * (ecPolicy.getNumDataUnits() - 1), 0); + BlockInfoStriped aBlockInfoStriped = new BlockInfoStriped(aBlockGroup, ecPolicy); // create 3 storageInfo, which means 1 block is missing DatanodeStorageInfo ds1 = DFSTestUtil.createDatanodeStorageInfo( @@ -917,7 +918,7 @@ public void testSkipReconstructionWithManyBusyNodes2() { "storage3", "3.3.3.3", "rack3", "host3"); // link block with storage - aBlockInfoStriped.addStorage(ds1, aBlock); + aBlockInfoStriped.addStorage(ds1, aBlockGroup); aBlockInfoStriped.addStorage(ds2, new Block(blockId + 1, 0, 0)); aBlockInfoStriped.addStorage(ds3, new Block(blockId + 2, 0, 0));