From e66897ec0f5866ba5f378834e7aec96555b1c286 Mon Sep 17 00:00:00 2001 From: Kevin Wikant Date: Sat, 16 Jul 2022 11:46:22 -0400 Subject: [PATCH 1/5] HDFS-16664. Use correct GenerationStamp when invalidating corrupt block replicas --- .../server/blockmanagement/BlockManager.java | 16 +++++-- .../blockmanagement/CorruptReplicasMap.java | 47 +++++++++++++++---- 2 files changed, 50 insertions(+), 13 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index ed3a59ee7a476..1c95b6852eed2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -3730,9 +3730,19 @@ private void invalidateCorruptReplicas(BlockInfo blk, Block reported, nodes.toArray(new DatanodeDescriptor[nodes.size()]); for (DatanodeDescriptor node : nodesCopy) { try { - if (!invalidateBlock(new BlockToMarkCorrupt(reported, blk, null, - Reason.ANY), node, numberReplicas)) { - removedFromBlocksMap = false; + Long genStamp = corruptReplicas.getCorruptReplicaGenerationStamp(blk, node); + if (genStamp == null) { + LOG.warn("CorruptReplicasMap unexpectedly missing generationStamp for datanode {}", + node.getXferAddr()); + if (!invalidateBlock(new BlockToMarkCorrupt(reported, blk, null, + Reason.ANY), node, numberReplicas)) { + removedFromBlocksMap = false; + } + } else { + if (!invalidateBlock(new BlockToMarkCorrupt(reported, blk, genStamp, null, + Reason.ANY), node, numberReplicas)) { + removedFromBlocksMap = false; + } } } catch (IOException e) { if(blockLog.isDebugEnabled()) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java index 3d2c260dcaebc..76adbe90b29d1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java @@ -54,8 +54,18 @@ public enum Reason { CORRUPTION_REPORTED // client or datanode reported the corruption } - private final Map> corruptReplicasMap = - new HashMap>(); + public class CorruptBlockReplica { + public final Reason reason; + public final long generationStamp; + + CorruptBlockReplica(Reason reason, long generationStamp) { + this.reason = reason; + this.generationStamp = generationStamp; + } + } + + private final Map> corruptReplicasMap = + new HashMap>(); private final LongAdder totalCorruptBlocks = new LongAdder(); private final LongAdder totalCorruptECBlockGroups = new LongAdder(); @@ -70,9 +80,9 @@ public enum Reason { */ void addToCorruptReplicasMap(Block blk, DatanodeDescriptor dn, String reason, Reason reasonCode, boolean isStriped) { - Map nodes = corruptReplicasMap.get(blk); + Map nodes = corruptReplicasMap.get(blk); if (nodes == null) { - nodes = new HashMap(); + nodes = new HashMap(); corruptReplicasMap.put(blk, nodes); incrementBlockStat(isStriped); } @@ -96,7 +106,7 @@ void addToCorruptReplicasMap(Block blk, DatanodeDescriptor dn, Server.getRemoteIp(), reasonText); } // Add the node or update the reason. 
- nodes.put(dn, reasonCode); + nodes.put(dn, new CorruptBlockReplica(reasonCode, blk.getGenerationStamp())); } /** @@ -105,7 +115,7 @@ void addToCorruptReplicasMap(Block blk, DatanodeDescriptor dn, */ void removeFromCorruptReplicasMap(BlockInfo blk) { if (corruptReplicasMap != null) { - Map value = corruptReplicasMap.remove(blk); + Map value = corruptReplicasMap.remove(blk); if (value != null) { decrementBlockStat(blk.isStriped()); } @@ -126,13 +136,13 @@ boolean removeFromCorruptReplicasMap( boolean removeFromCorruptReplicasMap( BlockInfo blk, DatanodeDescriptor datanode, Reason reason) { - Map datanodes = corruptReplicasMap.get(blk); + Map datanodes = corruptReplicasMap.get(blk); if (datanodes == null) { return false; } // if reasons can be compared but don't match, return false. - Reason storedReason = datanodes.get(datanode); + Reason storedReason = datanodes.get(datanode).reason; if (reason != Reason.ANY && storedReason != null && reason != storedReason) { return false; @@ -172,7 +182,7 @@ private void decrementBlockStat(boolean isStriped) { * @return collection of nodes. 
Null if does not exists */ Collection getNodes(Block blk) { - Map nodes = corruptReplicasMap.get(blk); + Map nodes = corruptReplicasMap.get(blk); if (nodes == null) return null; return nodes.keySet(); @@ -247,6 +257,23 @@ Set getCorruptBlocksSet() { return corruptBlocks; } + /** + * return the generation stamp of a corrupt replica for a given block + * on a given dn + * @param block block that has corrupted replica + * @param node datanode that contains this corrupted replica + * @return generation stamp of the corrupt replica; null if no corrupt replica is recorded for this block on this datanode + */ + Long getCorruptReplicaGenerationStamp(Block block, DatanodeDescriptor node) { + Long generationStamp = null; + if(corruptReplicasMap.containsKey(block)) { + if (corruptReplicasMap.get(block).containsKey(node)) { + generationStamp = corruptReplicasMap.get(block).get(node).generationStamp; + } + } + return generationStamp; + } + /** * return the reason about corrupted replica for a given block * on a given dn @@ -258,7 +285,7 @@ String getCorruptReason(Block block, DatanodeDescriptor node) { Reason reason = null; if(corruptReplicasMap.containsKey(block)) { if (corruptReplicasMap.get(block).containsKey(node)) { - reason = corruptReplicasMap.get(block).get(node); + reason = corruptReplicasMap.get(block).get(node).reason; } } if (reason != null) { From 9884cf088d26412ee438b22dd977469f9377c63b Mon Sep 17 00:00:00 2001 From: Kevin Wikant Date: Sun, 17 Jul 2022 08:19:41 -0400 Subject: [PATCH 2/5] HDFS-16664.
Use correct GenerationStamp when invalidating corrupt block replicas rev2 --- .../hdfs/server/blockmanagement/CorruptReplicasMap.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java index 76adbe90b29d1..e766297ac274e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java @@ -54,7 +54,7 @@ public enum Reason { CORRUPTION_REPORTED // client or datanode reported the corruption } - public class CorruptBlockReplica { + private static class CorruptBlockReplica { public final Reason reason; public final long generationStamp; @@ -142,9 +142,9 @@ boolean removeFromCorruptReplicasMap( } // if reasons can be compared but don't match, return false. - Reason storedReason = datanodes.get(datanode).reason; - if (reason != Reason.ANY && storedReason != null && - reason != storedReason) { + CorruptBlockReplica corruptReplica = datanodes.get(datanode); + if (reason != Reason.ANY && corruptReplica != null && + reason != corruptReplica.reason) { return false; } From 637b3b23cff287cb67e688661a1354dd0579eb35 Mon Sep 17 00:00:00 2001 From: Kevin Wikant Date: Tue, 2 Aug 2022 16:05:58 -0400 Subject: [PATCH 3/5] HDFS-16664. 
Use correct GenerationStamp when invalidating corrupt block replicas rev3 --- .../apache/hadoop/hdfs/TestDecommission.java | 16 +++++++++++ .../TestCorruptReplicaInfo.java | 28 +++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java index 0133d3aec37b1..05a8b3cab67f9 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java @@ -1913,6 +1913,22 @@ private void createClusterWithDeadNodesDecommissionInProgress(final int numLiveN */ @Test(timeout = 60000) public void testDeleteCorruptReplicaForUnderReplicatedBlock() throws Exception { + testDeleteCorruptReplicaForUnderReplicatedBlockInternal(); + } + + /* + Same test as testDeleteCorruptReplicaForUnderReplicatedBlock except + "dfs.namenode.corrupt.block.delete.immediately.enabled = false" such that the block invalidation + gets postponed. 
+ */ + @Test(timeout = 60000) + public void testDeleteCorruptReplicaForUnderReplicatedBlockWithInvalidationPostponed() throws Exception { + getConf().setBoolean(DFSConfigKeys.DFS_NAMENODE_CORRUPT_BLOCK_DELETE_IMMEDIATELY_ENABLED, + false); + testDeleteCorruptReplicaForUnderReplicatedBlockInternal(); + } + + public void testDeleteCorruptReplicaForUnderReplicatedBlockInternal() throws Exception { // Constants final Path file = new Path("/test-file"); final int numDatanode = 3; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java index c107c73ff54e3..cfa3ea37def99 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java @@ -186,6 +186,34 @@ public void testCorruptReplicaInfo() 10, getStripedBlock(7).getBlockId()))); } + @Test + public void testGetCorruptReplicaGenerationStamp() { + final CorruptReplicasMap crm = new CorruptReplicasMap(); + final DatanodeDescriptor dn1 = DFSTestUtil.getLocalDatanodeDescriptor(); + final DatanodeDescriptor dn2 = DFSTestUtil.getLocalDatanodeDescriptor(); + final long len = 1000L; + short replFactor = 2; + + // Create block replicas with different GenStamps + final Block blk1v1 = new Block(1L, len, 1L); + final Block blk1v2 = new Block(1L, len, 2L); + final Block blk2 = new Block(2L, len, 100L); + addToCorruptReplicasMap(crm, new BlockInfoContiguous(blk1v1, replFactor), dn1); + addToCorruptReplicasMap(crm, new BlockInfoContiguous(blk1v2, replFactor), dn2); + addToCorruptReplicasMap(crm, new BlockInfoContiguous(blk2, replFactor), dn1); + + // Validate correct GenStamp is reported for each replica; GenStamp is based on the + // 
DatanodeDescriptor object being passed & not based on the Block object being passed. + assertEquals(Long.valueOf(1L), crm.getCorruptReplicaGenerationStamp(blk1v1, dn1)); + assertEquals(Long.valueOf(2L), crm.getCorruptReplicaGenerationStamp(blk1v1, dn2)); + assertEquals(Long.valueOf(1L), crm.getCorruptReplicaGenerationStamp(blk1v2, dn1)); + assertEquals(Long.valueOf(2L), crm.getCorruptReplicaGenerationStamp(blk1v2, dn2)); + + // Validate GenStamp is returned for the existing blk2 replica on dn1, and null for the non-existent blk2 replica on dn2 + assertEquals(Long.valueOf(100L), crm.getCorruptReplicaGenerationStamp(blk2, dn1)); + assertNull(crm.getCorruptReplicaGenerationStamp(blk2, dn2)); + } + private static void addToCorruptReplicasMap(CorruptReplicasMap crm, BlockInfo blk, DatanodeDescriptor dn) { crm.addToCorruptReplicasMap(blk, dn, "TEST", Reason.NONE, blk.isStriped()); From 60433bffc3a7fbb2153a78782d257534f8c7e34f Mon Sep 17 00:00:00 2001 From: Kevin Wikant Date: Wed, 3 Aug 2022 08:49:40 -0400 Subject: [PATCH 4/5] HDFS-16664. Use correct GenerationStamp when invalidating corrupt block replicas rev4 --- .../blockmanagement/CorruptReplicasMap.java | 18 +++++++++++++----- .../apache/hadoop/hdfs/TestDecommission.java | 7 ++++--- .../TestCorruptReplicaInfo.java | 8 ++++---- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java index e766297ac274e..8a46b0ba0a2c0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/CorruptReplicasMap.java @@ -55,13 +55,21 @@ public enum Reason { } private static class CorruptBlockReplica { - public final Reason reason; - public final long generationStamp; + private final Reason reason;
+ private final long generationStamp; CorruptBlockReplica(Reason reason, long generationStamp) { this.reason = reason; this.generationStamp = generationStamp; } + + public Reason getReason() { + return reason; + } + + public long getGenerationStamp() { + return generationStamp; + } } private final Map> corruptReplicasMap = @@ -144,7 +152,7 @@ boolean removeFromCorruptReplicasMap( // if reasons can be compared but don't match, return false. CorruptBlockReplica corruptReplica = datanodes.get(datanode); if (reason != Reason.ANY && corruptReplica != null && - reason != corruptReplica.reason) { + reason != corruptReplica.getReason()) { return false; } @@ -268,7 +276,7 @@ Long getCorruptReplicaGenerationStamp(Block block, DatanodeDescriptor node) { Long generationStamp = null; if(corruptReplicasMap.containsKey(block)) { if (corruptReplicasMap.get(block).containsKey(node)) { - generationStamp = corruptReplicasMap.get(block).get(node).generationStamp; + generationStamp = corruptReplicasMap.get(block).get(node).getGenerationStamp(); } } return generationStamp; @@ -285,7 +293,7 @@ String getCorruptReason(Block block, DatanodeDescriptor node) { Reason reason = null; if(corruptReplicasMap.containsKey(block)) { if (corruptReplicasMap.get(block).containsKey(node)) { - reason = corruptReplicasMap.get(block).get(node).reason; + reason = corruptReplicasMap.get(block).get(node).getReason(); } } if (reason != null) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java index 05a8b3cab67f9..e06c9a72b7e10 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java @@ -1915,19 +1915,20 @@ private void createClusterWithDeadNodesDecommissionInProgress(final int numLiveN public void 
testDeleteCorruptReplicaForUnderReplicatedBlock() throws Exception { testDeleteCorruptReplicaForUnderReplicatedBlockInternal(); } - + /* Same test as testDeleteCorruptReplicaForUnderReplicatedBlock except "dfs.namenode.corrupt.block.delete.immediately.enabled = false" such that the block invalidation gets postponed. */ @Test(timeout = 60000) - public void testDeleteCorruptReplicaForUnderReplicatedBlockWithInvalidationPostponed() throws Exception { + public void testDeleteCorruptReplicaForUnderReplicatedBlockWithInvalidationPostponed() + throws Exception { getConf().setBoolean(DFSConfigKeys.DFS_NAMENODE_CORRUPT_BLOCK_DELETE_IMMEDIATELY_ENABLED, false); testDeleteCorruptReplicaForUnderReplicatedBlockInternal(); } - + public void testDeleteCorruptReplicaForUnderReplicatedBlockInternal() throws Exception { // Constants final Path file = new Path("/test-file"); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java index cfa3ea37def99..9b643417938b2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestCorruptReplicaInfo.java @@ -185,7 +185,7 @@ public void testCorruptReplicaInfo() crm.getCorruptBlockIdsForTesting(bim, BlockType.STRIPED, 10, getStripedBlock(7).getBlockId()))); } - + @Test public void testGetCorruptReplicaGenerationStamp() { final CorruptReplicasMap crm = new CorruptReplicasMap(); @@ -193,7 +193,7 @@ public void testGetCorruptReplicaGenerationStamp() { final DatanodeDescriptor dn2 = DFSTestUtil.getLocalDatanodeDescriptor(); final long len = 1000L; short replFactor = 2; - + // Create block replicas with different GenStamps final Block blk1v1 = new Block(1L, len, 1L); final Block 
blk1v2 = new Block(1L, len, 2L); @@ -208,12 +208,12 @@ public void testGetCorruptReplicaGenerationStamp() { assertEquals(Long.valueOf(2L), crm.getCorruptReplicaGenerationStamp(blk1v1, dn2)); assertEquals(Long.valueOf(1L), crm.getCorruptReplicaGenerationStamp(blk1v2, dn1)); assertEquals(Long.valueOf(2L), crm.getCorruptReplicaGenerationStamp(blk1v2, dn2)); - + // Validate null returned for non-existent block replica assertEquals(Long.valueOf(100L), crm.getCorruptReplicaGenerationStamp(blk2, dn1)); assertNull(crm.getCorruptReplicaGenerationStamp(blk2, dn2)); } - + private static void addToCorruptReplicasMap(CorruptReplicasMap crm, BlockInfo blk, DatanodeDescriptor dn) { crm.addToCorruptReplicasMap(blk, dn, "TEST", Reason.NONE, blk.isStriped()); From 99d99144e2bae0344e3ab0484a1e5609f051acea Mon Sep 17 00:00:00 2001 From: Kevin Wikant Date: Thu, 4 Aug 2022 09:00:10 -0400 Subject: [PATCH 5/5] Empty commit to re-trigger flaky Yetus workflow