From 367d7c90a1dcdd263bb6b3c550c830eff5a904ae Mon Sep 17 00:00:00 2001 From: "Vico.Wu" <583424568@qq.com> Date: Thu, 4 Jul 2024 10:39:56 +0800 Subject: [PATCH 01/20] change repeated decommission log level to info --- .../hdfs/server/blockmanagement/DatanodeAdminManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeAdminManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeAdminManager.java index af207a843fd27..450d688f75280 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeAdminManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeAdminManager.java @@ -160,7 +160,7 @@ public void startDecommission(DatanodeDescriptor node) { monitor.startTrackingNode(node); } } else { - LOG.trace("startDecommission: Node {} in {}, nothing to do.", + LOG.info("startDecommission: Node {} in {}, nothing to do.", node, node.getAdminState()); } } From da38f3c17b74629bb85e115f4eb6790bae2e4db4 Mon Sep 17 00:00:00 2001 From: "Vico.Wu" <583424568@qq.com> Date: Thu, 4 Jul 2024 16:50:18 +0800 Subject: [PATCH 02/20] Trigger CI From 1a87b97ccdc43042e48a190d770859ad6ca122f8 Mon Sep 17 00:00:00 2001 From: "Vico.Wu" <583424568@qq.com> Date: Fri, 5 Jul 2024 18:16:22 +0800 Subject: [PATCH 03/20] centralized storage not chosen reason --- .../server/blockmanagement/BlockManager.java | 10 ++ .../StorageNotChosenReason.java | 93 +++++++++++++++++++ .../blockmanagement/TestBlockManager.java | 54 ++++++++++- 3 files changed, 152 insertions(+), 5 deletions(-) create mode 100644 hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index 41845152514fe..f97dd2f4e12b7 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -20,6 +20,7 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.*; import static org.apache.hadoop.hdfs.protocol.BlockType.CONTIGUOUS; import static org.apache.hadoop.hdfs.protocol.BlockType.STRIPED; +import static org.apache.hadoop.hdfs.server.blockmanagement.StorageNotChosenReason.*; import static org.apache.hadoop.util.ExitUtil.terminate; import static org.apache.hadoop.util.Time.now; @@ -2244,6 +2245,9 @@ BlockReconstructionWork scheduleReconstruction(BlockInfo block, final DatanodeDescriptor[] srcNodes = chooseSourceDatanodes(block, containingNodes, liveReplicaNodes, numReplicas, liveBlockIndices, liveBusyBlockIndices, excludeReconstructed, priority); + if(LOG.isDebugEnabled()){ + LOG.debug(getStorageNotChosenReason(block)); + } short requiredRedundancy = getExpectedLiveRedundancyNum(block, numReplicas); if (srcNodes == null || srcNodes.length == 0) { @@ -2571,6 +2575,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, final boolean isStriped = block.isStriped(); DatanodeDescriptor decommissionedSrc = null; + StorageNotChosenReason.start(); BitSet liveBitSet = null; BitSet decommissioningBitSet = null; if 
(isStriped) { @@ -2595,6 +2600,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, // do not select the replica if it is corrupt or excess if (state == StoredReplicaState.CORRUPT || state == StoredReplicaState.EXCESS) { + logStorageIsNotChooseForReplication(storage, StorageNotChosenReason.REPLICA_CORRUPT_OR_EXCESS); continue; } @@ -2602,6 +2608,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, // or unknown state replicas. if (state == null || state == StoredReplicaState.MAINTENANCE_NOT_FOR_READ) { + logStorageIsNotChooseForReplication(storage, StorageNotChosenReason.REPLICA_MAINTENANCE_NOT_FOR_READ); continue; } @@ -2613,6 +2620,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, ThreadLocalRandom.current().nextBoolean()) { decommissionedSrc = node; } + logStorageIsNotChooseForReplication(storage, StorageNotChosenReason.REPLICA_DECOMMISSIONED); continue; } @@ -2637,6 +2645,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, //HDFS-16566 ExcludeReconstructed won't be reconstructed. excludeReconstructed.add(blockIndex); } + logStorageIsNotChooseForReplication(storage, StorageNotChosenReason.REPLICA_ALREADY_REACH_REPLICATION_LIMIT); continue; // already reached replication limit } @@ -2648,6 +2657,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, //HDFS-16566 ExcludeReconstructed won't be reconstructed. excludeReconstructed.add(blockIndex); } + logStorageIsNotChooseForReplication(storage, StorageNotChosenReason.REPLICA_ALREADY_REACH_REPLICATION_HARD_LIMIT); continue; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java new file mode 100644 index 0000000000000..5a87550af0440 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java @@ -0,0 +1,93 @@ +package org.apache.hadoop.hdfs.server.blockmanagement; + + +import org.apache.hadoop.classification.VisibleForTesting; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashMap; + + +public enum StorageNotChosenReason { + REPLICA_CORRUPT_OR_EXCESS("stored replica state is corrupt or excess"), + REPLICA_MAINTENANCE_NOT_FOR_READ("stored replica is maintenance not for read"), + REPLICA_DECOMMISSIONED("replica is already decommissioned"), + REPLICA_ALREADY_REACH_REPLICATION_LIMIT("replica already reached replication soft limit"), + REPLICA_ALREADY_REACH_REPLICATION_HARD_LIMIT("replica already reached replication hard limit"); + + public static final Logger LOG = LoggerFactory.getLogger( + BlockManager.class); + + private static final ThreadLocal<HashMap<StorageNotChosenReason, Integer>> + REASONS_SUMMARY = ThreadLocal + .withInitial(() -> new HashMap<StorageNotChosenReason, Integer>()); + + + private static final ThreadLocal<StringBuilder> debugLoggingBuilder + = new ThreadLocal<StringBuilder>() { + @Override + protected StringBuilder initialValue() { + return new StringBuilder(); + } + }; + + private final String text; + + StorageNotChosenReason(final String logText) { + text = logText; + } + + private String getText() { + return text; + } + + public static void start(){ + REASONS_SUMMARY.get().clear(); + debugLoggingBuilder.get().setLength(0); + } + + public static void logStorageIsNotChooseForReplication(DatanodeStorageInfo storage, + StorageNotChosenReason reason, String reasonDetails) { + if(LOG.isDebugEnabled()){ + genStorageIsNotChooseForReplication(storage, reason, 
reasonDetails); } } + + + @VisibleForTesting + static void genStorageIsNotChooseForReplication(DatanodeStorageInfo storage, + StorageNotChosenReason reason, String reasonDetails){ + // build the error message for later use. + debugLoggingBuilder.get() + .append("\n Storage ").append((storage==null)?"None":storage) + .append(" is not chosen since ").append(reason.getText()); + if (reasonDetails != null) { + debugLoggingBuilder.get().append(" ").append(reasonDetails); + } + debugLoggingBuilder.get().append("."); + final HashMap<StorageNotChosenReason, Integer> reasonMap = + REASONS_SUMMARY.get(); + Integer base = reasonMap.get(reason); + if (base == null) { + base = 0; + } + reasonMap.put(reason, base + 1); + } + + @VisibleForTesting + static String getStorageNotChosenReason(BlockInfo block){ + StringBuilder blockInfoPrefix = new StringBuilder("Block ").append(block); + final HashMap<StorageNotChosenReason, Integer> reasonMap = + REASONS_SUMMARY.get(); + if(reasonMap.isEmpty()){ + return blockInfoPrefix.append(" successfully chosen storage.").toString(); + }else{ + blockInfoPrefix.append(" has no chosen storage. Reason: [\n") ; + debugLoggingBuilder.get().append("\n]"); + StringBuilder reasonMapResult = new StringBuilder(); + reasonMapResult.append("Reason statistics: ").append(reasonMap); + return blockInfoPrefix.append(debugLoggingBuilder.get()).append("\n") + .append(reasonMapResult).toString(); + } + } +} \ No newline at end of file diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java index d9d236b66468d..08b9d45349a46 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java @@ -121,11 +121,7 @@ import static org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState.UNDER_CONSTRUCTION; import static org.apache.hadoop.test.MetricsAsserts.getLongCounter; import static org.apache.hadoop.test.MetricsAsserts.getMetrics; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertTrue; +import static org.junit.Assert.*; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.mock; @@ -2329,4 +2325,52 @@ public void delayDeleteReplica() { DataNodeFaultInjector.set(oldInjector); } } + + /** + * Test that the log mechanism works as expected when a storage is not + * chosen for reconstruction. + * @throws InterruptedException if interrupted while waiting for the + * test threads to finish + */ + @Test(timeout = 6000) + public void testStorageNotChosenReason() throws InterruptedException { + String storageID = "storageID"; + DatanodeStorageInfo targetDN = BlockManagerTestUtil + .newDatanodeStorageInfo(DFSTestUtil.getLocalDatanodeDescriptor(), + new DatanodeStorage("storage_test_0")); + BlockInfo blk = new BlockInfoContiguous(new Block(0), (short) 0); + StorageNotChosenReason.start(); + String reason = StorageNotChosenReason.getStorageNotChosenReason(blk); + assertTrue(reason.contains(storageID) ); + assertFalse(reason.contains(targetDN.toString())); + assertTrue(reason.contains("successfully chosen storage")); + assertFalse(reason.contains("is not chosen since")); + 
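+ // No reason was recorded for blk above, so the summary reports a + // successfully chosen storage and, as the final check below confirms, + // carries no reason statistics either.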
assertFalse(reason.contains("Reason statistics")); + + int threadNum = 10; + Thread[] threads = new Thread[threadNum]; + for(int i = 0; i<threadNum; i++){ + final int index = i; + threads[i] = new Thread(() -> { + String newStorageID = "storageID"+index; + StorageNotChosenReason.start(); + DatanodeStorageInfo newTargetStorage = BlockManagerTestUtil + .newDatanodeStorageInfo(DFSTestUtil.getLocalDatanodeDescriptor(), + new DatanodeStorage(newStorageID)); + BlockInfo newBlk = new BlockInfoContiguous(new Block(index), (short) index); + StorageNotChosenReason.genStorageIsNotChooseForReplication(newTargetStorage, + StorageNotChosenReason.REPLICA_DECOMMISSIONED, null); + String reason1 = StorageNotChosenReason.getStorageNotChosenReason(newBlk); + assertTrue(reason1.contains(newBlk.toString())); + assertTrue(reason1.contains(newStorageID)); + assertTrue(reason1.contains(newTargetStorage.toString())); + assertTrue(reason1.contains("is not chosen since")); + assertTrue(reason1.contains("Reason statistics")); + }); + } + for(int i = 0;i<threadNum;i++) { + threads[i].start(); + threads[i].join(0); + } + } } From ... Mon Sep 17 00:00:00 2001 From: "Vico.Wu" <583424568@qq.com> Date: Sat, 6 Jul 2024 10:57:31 +0800 Subject: [PATCH 04/20] fix compile issue --- .../server/blockmanagement/StorageNotChosenReason.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java index 5a87550af0440..7ba143917fb21 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java @@ -46,6 +46,13 @@ public static void start(){ debugLoggingBuilder.get().setLength(0); } + public static void logStorageIsNotChooseForReplication(DatanodeStorageInfo storage, + StorageNotChosenReason reason) { + if(LOG.isDebugEnabled()){ + genStorageIsNotChooseForReplication(storage, reason, null); + } + } + public static void logStorageIsNotChooseForReplication(DatanodeStorageInfo storage, StorageNotChosenReason reason, String reasonDetails) { if(LOG.isDebugEnabled()){ @@ -53,7 +60,6 @@ public static void logStorageIsNotChooseForReplication(DatanodeStorageInfo stora } } - @VisibleForTesting static void genStorageIsNotChooseForReplication(DatanodeStorageInfo storage, StorageNotChosenReason reason, String reasonDetails){ From 0373db76375d400a66900b0389cf205e2883c027 Mon Sep 17 00:00:00 2001 From: "Vico.Wu" <583424568@qq.com> Date: Mon, 8 Jul 2024 10:04:01 +0800 Subject: [PATCH 05/20] add Apache license header --- .../blockmanagement/StorageNotChosenReason.java | 17 +++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java index 7ba143917fb21..056d2e4d98d5b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.hadoop.hdfs.server.blockmanagement; From df77429cda8c673acdbc9e5527b57bd810f1e059 Mon Sep 17 00:00:00 2001 From: "Vico.Wu" <583424568@qq.com> Date: Mon, 8 Jul 2024 20:15:10 +0800 Subject: [PATCH 06/20] add skip reasons for missing targets and failed validation --- .../hadoop/hdfs/server/blockmanagement/BlockManager.java | 4 ++++ .../hdfs/server/blockmanagement/StorageNotChosenReason.java | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index f97dd2f4e12b7..b2adcbc0888fb 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -2181,6 +2181,7 @@ int computeReconstructionWorkForBlocks( for (BlockReconstructionWork rw : reconWork) { final DatanodeStorageInfo[] targets = rw.getTargets(); if (targets == null || targets.length == 0) { + logStorageIsNotChooseForReplication(); rw.resetTargets(); continue; } @@ -2189,6 +2190,9 @@ int computeReconstructionWorkForBlocks( if (validateReconstructionWork(rw)) { scheduledWork++; } + else{ + + } } } } finally { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java index 056d2e4d98d5b..b5132b538b18b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java @@ -30,7 +30,9 @@ public enum StorageNotChosenReason { REPLICA_MAINTENANCE_NOT_FOR_READ("stored replica is maintenance not for read"), REPLICA_DECOMMISSIONED("replica is already decommissioned"), REPLICA_ALREADY_REACH_REPLICATION_LIMIT("replica already reached replication soft limit"), - REPLICA_ALREADY_REACH_REPLICATION_HARD_LIMIT("replica already reached replication hard limit"); + REPLICA_ALREADY_REACH_REPLICATION_HARD_LIMIT("replica already reached replication hard limit"), + NO_AVAILABLE_TARGET_FOUND("cannot find available target"), + RECONSTRUCTION_WORK_NOT_PASS_VALIDATION("validation for reconstruction work failed"); public static final Logger LOG = LoggerFactory.getLogger( BlockManager.class); From ae8a03fd40c28c88c22c3ca29cc8257e80b68061 Mon Sep 17 00:00:00 2001 From: "Vico.Wu" <583424568@qq.com> Date: Tue, 9 Jul 2024 22:26:54 +0800 Subject: [PATCH 07/20] aggregate reconstruction skip reasons per block --- .../server/blockmanagement/BlockManager.java | 42 +++--- .../BlockSkippedForReconstructionReason.java | 131 ++++++++++++++++++ .../StorageNotChosenReason.java | 118 
---------------- .../blockmanagement/TestBlockManager.java | 43 +++--- 4 files changed, 179 insertions(+), 155 deletions(-) create mode 100644 hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java delete mode 100644 hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index b2adcbc0888fb..031473a23f657 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -20,7 +20,7 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.*; import static org.apache.hadoop.hdfs.protocol.BlockType.CONTIGUOUS; import static org.apache.hadoop.hdfs.protocol.BlockType.STRIPED; -import static org.apache.hadoop.hdfs.server.blockmanagement.StorageNotChosenReason.*; +import static org.apache.hadoop.hdfs.server.blockmanagement.BlockSkippedForReconstructionReason.DetailedReason; import static org.apache.hadoop.util.ExitUtil.terminate; import static org.apache.hadoop.util.Time.now; @@ -2134,7 +2134,6 @@ int computeReconstructionWorkForBlocks( List<List<BlockInfo>> blocksToReconstruct) { int scheduledWork = 0; List<BlockReconstructionWork> reconWork = new ArrayList<>(); - // Step 1: categorize at-risk blocks into replication and EC tasks namesystem.writeLock(); try { @@ -2181,7 +2180,8 @@ int computeReconstructionWorkForBlocks( for (BlockReconstructionWork rw : reconWork) { final DatanodeStorageInfo[] targets = rw.getTargets(); if (targets == null || targets.length == 0) { - logStorageIsNotChooseForReplication(); + BlockSkippedForReconstructionReason.genSkipReconstructionReason(rw.getBlock(), null, + BlockSkippedForReconstructionReason.NO_AVAILABLE_TARGET_HOST_FOUND); rw.resetTargets(); continue; } @@ -2191,7 +2191,8 @@ int computeReconstructionWorkForBlocks( scheduledWork++; } else{ - + BlockSkippedForReconstructionReason.genSkipReconstructionReason(rw.getBlock(), null, + BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION); } } } @@ -2249,9 +2250,6 @@ BlockReconstructionWork scheduleReconstruction(BlockInfo block, final DatanodeDescriptor[] srcNodes = chooseSourceDatanodes(block, containingNodes, liveReplicaNodes, numReplicas, liveBlockIndices, liveBusyBlockIndices, excludeReconstructed, priority); - if(LOG.isDebugEnabled()){ - LOG.debug(getStorageNotChosenReason(block)); - } short requiredRedundancy = getExpectedLiveRedundancyNum(block, numReplicas); if (srcNodes == null || srcNodes.length == 0) { @@ -2578,8 +2576,6 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, liveBlockIndices.clear(); final boolean isStriped = block.isStriped(); DatanodeDescriptor decommissionedSrc = null; - - StorageNotChosenReason.start(); BitSet liveBitSet = null; BitSet decommissioningBitSet = null; if (isStriped) { @@ -2604,7 +2600,9 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, // do not select the replica if it is corrupt or excess if (state == StoredReplicaState.CORRUPT || state == StoredReplicaState.EXCESS) { - logStorageIsNotChooseForReplication(storage, StorageNotChosenReason.REPLICA_CORRUPT_OR_EXCESS); + 
BlockSkippedForReconstructionReason.genSkipReconstructionReason(block, storage, + BlockSkippedForReconstructionReason.SOURCE_NODE_UNAVAILABLE, + DetailedReason.REPLICA_CORRUPT_OR_EXCESS); continue; } @@ -2612,7 +2610,9 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, // or unknown state replicas. if (state == null || state == StoredReplicaState.MAINTENANCE_NOT_FOR_READ) { - logStorageIsNotChooseForReplication(storage, StorageNotChosenReason.REPLICA_MAINTENANCE_NOT_FOR_READ); + BlockSkippedForReconstructionReason.genSkipReconstructionReason(block, storage, + BlockSkippedForReconstructionReason.SOURCE_NODE_UNAVAILABLE, + DetailedReason.REPLICA_MAINTENANCE_NOT_FOR_READ); continue; } @@ -2624,7 +2624,9 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, ThreadLocalRandom.current().nextBoolean()) { decommissionedSrc = node; } - logStorageIsNotChooseForReplication(storage, StorageNotChosenReason.REPLICA_DECOMMISSIONED); + BlockSkippedForReconstructionReason.genSkipReconstructionReason(block, storage, + BlockSkippedForReconstructionReason.SOURCE_NODE_UNAVAILABLE, + DetailedReason.REPLICA_DECOMMISSIONED); continue; } @@ -2649,7 +2651,9 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, //HDFS-16566 ExcludeReconstructed won't be reconstructed. excludeReconstructed.add(blockIndex); } - logStorageIsNotChooseForReplication(storage, StorageNotChosenReason.REPLICA_ALREADY_REACH_REPLICATION_LIMIT); + BlockSkippedForReconstructionReason.genSkipReconstructionReason(block, storage, + BlockSkippedForReconstructionReason.SOURCE_NODE_UNAVAILABLE, + DetailedReason.REPLICA_ALREADY_REACH_REPLICATION_LIMIT); continue; // already reached replication limit } @@ -2661,7 +2665,9 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, //HDFS-16566 ExcludeReconstructed won't be reconstructed. excludeReconstructed.add(blockIndex); } - logStorageIsNotChooseForReplication(storage, StorageNotChosenReason.REPLICA_ALREADY_REACH_REPLICATION_HARD_LIMIT); + BlockSkippedForReconstructionReason.genSkipReconstructionReason(block, storage, + BlockSkippedForReconstructionReason.SOURCE_NODE_UNAVAILABLE, + DetailedReason.REPLICA_ALREADY_REACH_REPLICATION_HARD_LIMIT); continue; } @@ -5420,9 +5426,13 @@ int computeDatanodeWork() { * this.blocksReplWorkMultiplier; final int nodesToProcess = (int) Math.ceil(numlive * this.blocksInvalidateWorkPct); - + if(LOG.isDebugEnabled()){ + BlockSkippedForReconstructionReason.start(); + } int workFound = this.computeBlockReconstructionWork(blocksToProcess); - + if(LOG.isDebugEnabled()){ + BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); + } // Update counters namesystem.writeLock(); try { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java new file mode 100644 index 0000000000000..7a37545c53686 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java @@ -0,0 +1,131 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.blockmanagement; + + +import org.apache.hadoop.classification.VisibleForTesting; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashMap; +import java.util.Map; + +/** + * When scheduling ReconstructionWork for a low-redundancy block, the scheduling can fail for three overall reasons: + * 1. No source node is available + * 2. No target node is available + * 3. ReconstructionWork validation failed + * These three cases are grouped here as BlockSkippedForReconstructionReason. + * - The detailed reasons for `No source node is available` are captured in the DetailedReason enum + * - For `No target node is available`, BlockPlacementPolicyDefault already provides NodeNotChosenReason + */ +public enum BlockSkippedForReconstructionReason { + SOURCE_NODE_UNAVAILABLE("source node or storage unavailable"), + NO_AVAILABLE_TARGET_HOST_FOUND("cannot find available target host"), + RECONSTRUCTION_WORK_NOT_PASS_VALIDATION("validation for reconstruction work failed"); + + enum DetailedReason { + REPLICA_CORRUPT_OR_EXCESS("stored replica state is corrupt or excess"), + REPLICA_MAINTENANCE_NOT_FOR_READ("stored replica is maintenance not for read"), + REPLICA_DECOMMISSIONED("replica is already decommissioned"), + REPLICA_ALREADY_REACH_REPLICATION_LIMIT("replica already reached replication soft limit"), + REPLICA_ALREADY_REACH_REPLICATION_HARD_LIMIT("replica already reached replication hard limit"); + private final String text; + + DetailedReason(final String logText) { + text = logText; + } + + private String getText() { + return text; + } + } + public static final Logger LOG = LoggerFactory.getLogger( + BlockManager.class); + + private static final ThreadLocal<HashMap<BlockInfo, StringBuilder>> + blockNotChosenReasonMap = ThreadLocal + .withInitial(() -> new HashMap<BlockInfo, StringBuilder>()); + + private final String text; + + BlockSkippedForReconstructionReason(final String logText) { + text = logText; + } + + private String getText() { + return text; + } + + public static void start(){ + blockNotChosenReasonMap.get().clear(); + } + + public static void genSkipReconstructionReason(BlockInfo block, DatanodeStorageInfo storage, + BlockSkippedForReconstructionReason reason) { + if(LOG.isDebugEnabled()){ + genStorageIsNotChooseForReplication(block, storage, reason, null); + } + } + + public static void genSkipReconstructionReason(BlockInfo block, DatanodeStorageInfo storage, + BlockSkippedForReconstructionReason reason, DetailedReason reasonDetails) { + if(LOG.isDebugEnabled()){ + genStorageIsNotChooseForReplication(block, storage, reason, reasonDetails); + } + } + + @VisibleForTesting + static void genStorageIsNotChooseForReplication(BlockInfo block, DatanodeStorageInfo storage, + BlockSkippedForReconstructionReason reason, DetailedReason reasonDetails){ + // build the error message for later use. 
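+ // Reasons are accumulated per block in a thread-local map so that the + // redundancy monitor can emit one consolidated debug line per block per + // scheduling round; start() must clear the map before each round.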
+ HashMap blockReason = blockNotChosenReasonMap.get(); + StringBuilder reasonForBlock = null; + if(!blockReason.containsKey(block)){ + reasonForBlock = new StringBuilder() + .append("Block ") + .append(block) + .append(" didn't schedule ReconstructionWork for below reasons: \n ["); + }else{ + reasonForBlock = blockReason.get(block); + } + switch (reason){ + case SOURCE_NODE_UNAVAILABLE: + reasonForBlock.append(" Source node storage ").append(storage).append(" is not chosen since ").append(reason); + break; + case NO_AVAILABLE_TARGET_HOST_FOUND: + case RECONSTRUCTION_WORK_NOT_PASS_VALIDATION: + reasonForBlock.append(" ").append(reason); + } + if (reasonDetails != null) { + reasonForBlock.append(" ").append(reasonDetails.getText()); + } + reasonForBlock.append("."); + } + + @VisibleForTesting + static String summaryBlockSkippedForReconstructionReason(){ + StringBuilder finalReasonForAllBlocks = new StringBuilder(); + for(Map.Entry blockReason: blockNotChosenReasonMap.get().entrySet()){ + blockReason.getValue().append("]\n"); + finalReasonForAllBlocks.append(blockReason); + } + blockNotChosenReasonMap.get().clear(); + return finalReasonForAllBlocks.toString(); + } +} \ No newline at end of file diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java deleted file mode 100644 index b5132b538b18b..0000000000000 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/StorageNotChosenReason.java +++ /dev/null @@ -1,118 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hdfs.server.blockmanagement; - - -import org.apache.hadoop.classification.VisibleForTesting; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.HashMap; - - -public enum StorageNotChosenReason { - REPLICA_CORRUPT_OR_EXCESS("stored replica state is corrupt or excess"), - REPLICA_MAINTENANCE_NOT_FOR_READ("stored replica is maintenance not for read"), - REPLICA_DECOMMISSIONED("replica is already decommissioned"), - REPLICA_ALREADY_REACH_REPLICATION_LIMIT("replica already reached replication soft limit"), - REPLICA_ALREADY_REACH_REPLICATION_HARD_LIMIT("replica already reached replication hard limit"), - NO_AVAILABLE_TARGET_FOUND("cannot find available target"), - RECONSTRUCTION_WORK_NOT_PASS_VALIDATION("validation for reconstruction work failed"); - - public static final Logger LOG = LoggerFactory.getLogger( - BlockManager.class); - - private static final ThreadLocal> - REASONS_SUMMARY = ThreadLocal - .withInitial(() -> new HashMap()); - - - private static final ThreadLocal debugLoggingBuilder - = new ThreadLocal() { - @Override - protected StringBuilder initialValue() { - return new StringBuilder(); - } - }; - - private final String text; - - StorageNotChosenReason(final String logText) { - text = logText; - } - - private String getText() { - return text; - } - - public static void start(){ - REASONS_SUMMARY.get().clear(); - debugLoggingBuilder.get().setLength(0); - } - - public static void logStorageIsNotChooseForReplication(DatanodeStorageInfo storage, - StorageNotChosenReason reason) { - if(LOG.isDebugEnabled()){ - genStorageIsNotChooseForReplication(storage, reason, null); - } - } - - public static void logStorageIsNotChooseForReplication(DatanodeStorageInfo storage, - StorageNotChosenReason reason, String reasonDetails) { - if(LOG.isDebugEnabled()){ - genStorageIsNotChooseForReplication(storage, reason, reasonDetails); - } - } - - @VisibleForTesting - static void genStorageIsNotChooseForReplication(DatanodeStorageInfo storage, - StorageNotChosenReason reason, String reasonDetails){ - // build the error message for later use. - debugLoggingBuilder.get() - .append("\n Storage ").append((storage==null)?"None":storage) - .append(" is not chosen since ").append(reason.getText()); - if (reasonDetails != null) { - debugLoggingBuilder.get().append(" ").append(reasonDetails); - } - debugLoggingBuilder.get().append("."); - final HashMap reasonMap = - REASONS_SUMMARY.get(); - Integer base = reasonMap.get(reason); - if (base == null) { - base = 0; - } - reasonMap.put(reason, base + 1); - } - - @VisibleForTesting - static String getStorageNotChosenReason(BlockInfo block){ - StringBuilder blockInfoPrefix = new StringBuilder("Block ").append(block); - final HashMap reasonMap = - REASONS_SUMMARY.get(); - if(reasonMap.isEmpty()){ - return blockInfoPrefix.append(" successfully chosen storage.").toString(); - }else{ - blockInfoPrefix.append(" has no chosen storage. 
Reason: [\n") ; - debugLoggingBuilder.get().append("\n]"); - StringBuilder reasonMapResult = new StringBuilder(); - reasonMapResult.append("Reason statistics: ").append(reasonMap); - return blockInfoPrefix.append(debugLoggingBuilder.get()).append("\n") - .append(reasonMapResult).toString(); - } - } -} \ No newline at end of file diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java index 08b9d45349a46..f91968e2626a6 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java @@ -118,6 +118,9 @@ import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.hadoop.hdfs.server.blockmanagement.BlockSkippedForReconstructionReason; +import static org.apache.hadoop.hdfs.server.blockmanagement.BlockSkippedForReconstructionReason.DetailedReason; +import static org.apache.hadoop.hdfs.server.blockmanagement.BlockSkippedForReconstructionReason.SOURCE_NODE_UNAVAILABLE; import static org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState.UNDER_CONSTRUCTION; import static org.apache.hadoop.test.MetricsAsserts.getLongCounter; import static org.apache.hadoop.test.MetricsAsserts.getMetrics; @@ -2334,18 +2337,18 @@ public void delayDeleteReplica() { */ @Test(timeout = 6000) public void testStorageNotChosenReason() throws InterruptedException { - String storageID = "storageID"; - DatanodeStorageInfo targetDN = BlockManagerTestUtil - .newDatanodeStorageInfo(DFSTestUtil.getLocalDatanodeDescriptor(), - new DatanodeStorage("storage_test_0")); - BlockInfo blk = new BlockInfoContiguous(new Block(0), (short) 0); - StorageNotChosenReason.start(); - String reason = StorageNotChosenReason.getStorageNotChosenReason(blk); - assertTrue(reason.contains(storageID) ); - assertFalse(reason.contains(targetDN.toString())); - assertTrue(reason.contains("successfully chosen storage")); - assertFalse(reason.contains("is not chosen since")); - assertFalse(reason.contains("Reason statistics")); +// String storageID = "storageID"; +// DatanodeStorageInfo targetDN = BlockManagerTestUtil +// .newDatanodeStorageInfo(DFSTestUtil.getLocalDatanodeDescriptor(), +// new DatanodeStorage("storage_test_0")); +// BlockInfo blk = new BlockInfoContiguous(new Block(0), (short) 0); +// BlockSkippedForReconstructionReason.start(); +// String reason = BlockSkippedForReconstructionReason.genSkipReconstructionReason(blk); +// assertTrue(reason.contains(storageID) ); +// assertFalse(reason.contains(targetDN.toString())); +// assertTrue(reason.contains("successfully chosen storage")); +// assertFalse(reason.contains("is not chosen since")); +// assertFalse(reason.contains("Reason statistics")); int threadNum = 10; Thread[] threads = new Thread[threadNum]; @@ -2353,19 +2356,17 @@ public void testStorageNotChosenReason() throws InterruptedException { final int index = i; threads[i] = new Thread(() -> { String newStorageID = "storageID"+index; - StorageNotChosenReason.start(); - DatanodeStorageInfo newTargetStorage = BlockManagerTestUtil + BlockSkippedForReconstructionReason.start(); + DatanodeStorageInfo sourceStorage = BlockManagerTestUtil .newDatanodeStorageInfo(DFSTestUtil.getLocalDatanodeDescriptor(), new 
DatanodeStorage(newStorageID)); BlockInfo newBlk = new BlockInfoContiguous(new Block(index), (short) index); - StorageNotChosenReason.genStorageIsNotChooseForReplication(newTargetStorage, - StorageNotChosenReason.REPLICA_DECOMMISSIONED, null); - String reason1 = StorageNotChosenReason.getStorageNotChosenReason(newBlk); + BlockSkippedForReconstructionReason.genStorageIsNotChooseForReplication(newBlk, sourceStorage, + BlockSkippedForReconstructionReason.SOURCE_NODE_UNAVAILABLE, DetailedReason.REPLICA_DECOMMISSIONED); + String reason1 = BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); assertTrue(reason1.contains(newBlk.toString())); - assertTrue(reason1.contains(newStorageID)); - assertTrue(reason1.contains(newTargetStorage.toString())); - assertTrue(reason1.contains("is not chosen since")); - assertTrue(reason1.contains("Reason statistics")); + assertTrue(reason1.contains(sourceStorage.toString())); + assertTrue(reason1.contains(SOURCE_NODE_UNAVAILABLE.toString())); }); } for(int i = 0;i Date: Tue, 9 Jul 2024 22:35:28 +0800 Subject: [PATCH 08/20] test --- .../BlockSkippedForReconstructionReason.java | 2 +- .../blockmanagement/TestBlockManager.java | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java index 7a37545c53686..6c1268020f144 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java @@ -106,7 +106,7 @@ static void genStorageIsNotChooseForReplication(BlockInfo block, DatanodeStorage } switch (reason){ case SOURCE_NODE_UNAVAILABLE: - reasonForBlock.append(" Source node storage ").append(storage).append(" is not chosen since ").append(reason); + reasonForBlock.append(" Source node storage ").append(storage==null?"None":storage).append(" is not chosen since ").append(reason); break; case NO_AVAILABLE_TARGET_HOST_FOUND: case RECONSTRUCTION_WORK_NOT_PASS_VALIDATION: diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java index f91968e2626a6..c09398e5d3734 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java @@ -2367,6 +2367,24 @@ public void testStorageNotChosenReason() throws InterruptedException { assertTrue(reason1.contains(newBlk.toString())); assertTrue(reason1.contains(sourceStorage.toString())); assertTrue(reason1.contains(SOURCE_NODE_UNAVAILABLE.toString())); + + LOG.info("Reason1 for " + newBlk + " in storage " + newStorageID + " is " + reason1); + + BlockSkippedForReconstructionReason.start(); + BlockSkippedForReconstructionReason.genStorageIsNotChooseForReplication(newBlk, null, + BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION, null); + String reason2 = 
BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); + assertTrue(reason2.contains(newBlk.toString())); + assertTrue(reason2.contains(BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION.toString())); + LOG.info("Reason2 for " + newBlk + " in storage " + newStorageID + " is " + reason2); + + BlockSkippedForReconstructionReason.start(); + BlockSkippedForReconstructionReason.genStorageIsNotChooseForReplication(newBlk, null, + BlockSkippedForReconstructionReason.NO_AVAILABLE_TARGET_HOST_FOUND, null); + String reason3 = BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); + assertTrue(reason3.contains(newBlk.toString())); + assertTrue(reason3.contains(BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION.toString())); + LOG.info("Reason3 for " + newBlk + " in storage " + newStorageID + " is " + reason3); }); } for(int i = 0;i Date: Tue, 9 Jul 2024 22:41:40 +0800 Subject: [PATCH 09/20] test --- .../blockmanagement/TestBlockManager.java | 69 +++++++++++-------- 1 file changed, 39 insertions(+), 30 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java index c09398e5d3734..19e1a355568db 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java @@ -2337,6 +2337,7 @@ public void delayDeleteReplica() { */ @Test(timeout = 6000) public void testStorageNotChosenReason() throws InterruptedException { + final AtomicBoolean failure = new AtomicBoolean(); // String storageID = "storageID"; // DatanodeStorageInfo targetDN = BlockManagerTestUtil // .newDatanodeStorageInfo(DFSTestUtil.getLocalDatanodeDescriptor(), @@ -2355,41 +2356,49 @@ public void testStorageNotChosenReason() throws InterruptedException { for(int i = 0; i { - String newStorageID = "storageID"+index; - BlockSkippedForReconstructionReason.start(); - DatanodeStorageInfo sourceStorage = BlockManagerTestUtil - .newDatanodeStorageInfo(DFSTestUtil.getLocalDatanodeDescriptor(), - new DatanodeStorage(newStorageID)); - BlockInfo newBlk = new BlockInfoContiguous(new Block(index), (short) index); - BlockSkippedForReconstructionReason.genStorageIsNotChooseForReplication(newBlk, sourceStorage, - BlockSkippedForReconstructionReason.SOURCE_NODE_UNAVAILABLE, DetailedReason.REPLICA_DECOMMISSIONED); - String reason1 = BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); - assertTrue(reason1.contains(newBlk.toString())); - assertTrue(reason1.contains(sourceStorage.toString())); - assertTrue(reason1.contains(SOURCE_NODE_UNAVAILABLE.toString())); - - LOG.info("Reason1 for " + newBlk + " in storage " + newStorageID + " is " + reason1); - - BlockSkippedForReconstructionReason.start(); - BlockSkippedForReconstructionReason.genStorageIsNotChooseForReplication(newBlk, null, - BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION, null); - String reason2 = BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); - assertTrue(reason2.contains(newBlk.toString())); - assertTrue(reason2.contains(BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION.toString())); - LOG.info("Reason2 for " + 
newBlk + " in storage " + newStorageID + " is " + reason2); - - BlockSkippedForReconstructionReason.start(); - BlockSkippedForReconstructionReason.genStorageIsNotChooseForReplication(newBlk, null, - BlockSkippedForReconstructionReason.NO_AVAILABLE_TARGET_HOST_FOUND, null); - String reason3 = BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); - assertTrue(reason3.contains(newBlk.toString())); - assertTrue(reason3.contains(BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION.toString())); - LOG.info("Reason3 for " + newBlk + " in storage " + newStorageID + " is " + reason3); + try{ + String newStorageID = "storageID"+index; + BlockSkippedForReconstructionReason.start(); + DatanodeStorageInfo sourceStorage = BlockManagerTestUtil + .newDatanodeStorageInfo(DFSTestUtil.getLocalDatanodeDescriptor(), + new DatanodeStorage(newStorageID)); + BlockInfo newBlk = new BlockInfoContiguous(new Block(index), (short) index); + BlockSkippedForReconstructionReason.genStorageIsNotChooseForReplication(newBlk, sourceStorage, + BlockSkippedForReconstructionReason.SOURCE_NODE_UNAVAILABLE, DetailedReason.REPLICA_DECOMMISSIONED); + String reason1 = BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); + LOG.info("Reason1 for " + newBlk + " in storage " + newStorageID + " is " + reason1); + assertTrue(reason1.contains(newBlk.toString())); + assertTrue(reason1.contains(sourceStorage.toString())); + assertTrue(reason1.contains(SOURCE_NODE_UNAVAILABLE.toString())); + + + BlockSkippedForReconstructionReason.start(); + BlockSkippedForReconstructionReason.genStorageIsNotChooseForReplication(newBlk, null, + BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION, null); + String reason2 = BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); + LOG.info("Reason2 for " + newBlk + " in storage " + newStorageID + " is " + reason2); + assertTrue(reason2.contains(newBlk.toString())); + assertTrue(reason2.contains(BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION.toString())); + + + BlockSkippedForReconstructionReason.start(); + BlockSkippedForReconstructionReason.genStorageIsNotChooseForReplication(newBlk, null, + BlockSkippedForReconstructionReason.NO_AVAILABLE_TARGET_HOST_FOUND, null); + LOG.info("Reason3 for " + newBlk + " in storage " + newStorageID + " is " + reason3); + String reason3 = BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); + assertTrue(reason3.contains(newBlk.toString())); + assertTrue(reason3.contains(BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION.toString())); + + }catch (Exception e){ + e.printStackTrace(); + failure.set(true); + } }); } for(int i = 0;i Date: Tue, 9 Jul 2024 22:43:09 +0800 Subject: [PATCH 10/20] test --- .../hadoop/hdfs/server/blockmanagement/TestBlockManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java index 19e1a355568db..d198e7ad2af12 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java @@ -2384,8 +2384,8 @@ public void 
testStorageNotChosenReason() throws InterruptedException { BlockSkippedForReconstructionReason.start(); BlockSkippedForReconstructionReason.genStorageIsNotChooseForReplication(newBlk, null, BlockSkippedForReconstructionReason.NO_AVAILABLE_TARGET_HOST_FOUND, null); - LOG.info("Reason3 for " + newBlk + " in storage " + newStorageID + " is " + reason3); String reason3 = BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); + LOG.info("Reason3 for " + newBlk + " in storage " + newStorageID + " is " + reason3); assertTrue(reason3.contains(newBlk.toString())); assertTrue(reason3.contains(BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION.toString())); From 5b61d4a52b6005ec72b3c707f03820c92659fce3 Mon Sep 17 00:00:00 2001 From: "Vico.Wu" <583424568@qq.com> Date: Wed, 10 Jul 2024 09:40:10 +0800 Subject: [PATCH 11/20] dfsdfs --- .../blockmanagement/TestBlockManager.java | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java index d198e7ad2af12..9673b88e03cf7 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java @@ -2337,7 +2337,7 @@ public void delayDeleteReplica() { */ @Test(timeout = 6000) public void testStorageNotChosenReason() throws InterruptedException { - final AtomicBoolean failure = new AtomicBoolean(); + final AtomicBoolean failure = new AtomicBoolean(false); // String storageID = "storageID"; // DatanodeStorageInfo targetDN = BlockManagerTestUtil // .newDatanodeStorageInfo(DFSTestUtil.getLocalDatanodeDescriptor(), @@ -2367,9 +2367,9 @@ public void testStorageNotChosenReason() throws InterruptedException { BlockSkippedForReconstructionReason.SOURCE_NODE_UNAVAILABLE, DetailedReason.REPLICA_DECOMMISSIONED); String reason1 = BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); LOG.info("Reason1 for " + newBlk + " in storage " + newStorageID + " is " + reason1); - assertTrue(reason1.contains(newBlk.toString())); - assertTrue(reason1.contains(sourceStorage.toString())); - assertTrue(reason1.contains(SOURCE_NODE_UNAVAILABLE.toString())); + assertTrue("reason should contain block ID " + newBlk, reason1.contains(newBlk.toString())); + assertTrue("reason should contain source node", reason1.contains(sourceStorage.toString())); + assertTrue("reason should contain "+ SOURCE_NODE_UNAVAILABLE, reason1.contains(SOURCE_NODE_UNAVAILABLE.toString())); BlockSkippedForReconstructionReason.start(); @@ -2377,8 +2377,9 @@ public void testStorageNotChosenReason() throws InterruptedException { BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION, null); String reason2 = BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); LOG.info("Reason2 for " + newBlk + " in storage " + newStorageID + " is " + reason2); - assertTrue(reason2.contains(newBlk.toString())); - assertTrue(reason2.contains(BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION.toString())); + assertTrue("reason should contain block ID " + newBlk, reason2.contains(newBlk.toString())); + assertTrue("reason should contain [" + 
BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION + "]", + reason2.contains(BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION.toString())); BlockSkippedForReconstructionReason.start(); @@ -2386,10 +2387,10 @@ public void testStorageNotChosenReason() throws InterruptedException { BlockSkippedForReconstructionReason.NO_AVAILABLE_TARGET_HOST_FOUND, null); String reason3 = BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); LOG.info("Reason3 for " + newBlk + " in storage " + newStorageID + " is " + reason3); - assertTrue(reason3.contains(newBlk.toString())); - assertTrue(reason3.contains(BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION.toString())); + assertTrue("reason should contain block ID " + newBlk, reason3.contains(newBlk.toString())); + assertTrue("reason should contain [" + BlockSkippedForReconstructionReason.NO_AVAILABLE_TARGET_HOST_FOUND + "]", reason3.contains(BlockSkippedForReconstructionReason.NO_AVAILABLE_TARGET_HOST_FOUND.toString())); - }catch (Exception e){ + }catch (Throwable e){ e.printStackTrace(); failure.set(true); } From 01f1d83d685b102da263f096e441f1a9be95228d Mon Sep 17 00:00:00 2001 From: "Vico.Wu" <583424568@qq.com> Date: Wed, 10 Jul 2024 09:47:47 +0800 Subject: [PATCH 12/20] dafasdf --- .../BlockSkippedForReconstructionReason.java | 13 +++++-------- .../server/blockmanagement/TestBlockManager.java | 4 ++-- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java index 6c1268020f144..aa16067e22dd0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java @@ -96,14 +96,11 @@ static void genStorageIsNotChooseForReplication(BlockInfo block, DatanodeStorage // build the error message for later use. 
HashMap blockReason = blockNotChosenReasonMap.get(); StringBuilder reasonForBlock = null; - if(!blockReason.containsKey(block)){ - reasonForBlock = new StringBuilder() - .append("Block ") - .append(block) - .append(" didn't schedule ReconstructionWork for below reasons: \n ["); - }else{ - reasonForBlock = blockReason.get(block); - } + blockReason.putIfAbsent(block, new StringBuilder() + .append("Block ") + .append(block) + .append(" didn't schedule ReconstructionWork for below reasons: \n [")); + reasonForBlock = blockReason.get(block); switch (reason){ case SOURCE_NODE_UNAVAILABLE: reasonForBlock.append(" Source node storage ").append(storage==null?"None":storage).append(" is not chosen since ").append(reason); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java index 9673b88e03cf7..6671d3873a069 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java @@ -2357,7 +2357,7 @@ public void testStorageNotChosenReason() throws InterruptedException { final int index = i; threads[i] = new Thread(() -> { try{ - String newStorageID = "storageID"+index; + String newStorageID = "storageID_"+index; BlockSkippedForReconstructionReason.start(); DatanodeStorageInfo sourceStorage = BlockManagerTestUtil .newDatanodeStorageInfo(DFSTestUtil.getLocalDatanodeDescriptor(), @@ -2400,6 +2400,6 @@ public void testStorageNotChosenReason() throws InterruptedException { threads[i].start(); threads[i].join(0); } - assertFalse(failure.get()); + assertFalse("TestStorageNotChosenReason has error. 
Check the log for the details.", failure.get()); } } \ No newline at end of file From 63c0dd29ab4594f6ef1565529beb45593aff749c Mon Sep 17 00:00:00 2001 From: "Vico.Wu" <583424568@qq.com> Date: Wed, 10 Jul 2024 10:00:01 +0800 Subject: [PATCH 13/20] dafasdf --- .../BlockSkippedForReconstructionReason.java | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java index aa16067e22dd0..8d0e89245de12 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java @@ -39,7 +39,7 @@ public enum BlockSkippedForReconstructionReason { NO_AVAILABLE_TARGET_HOST_FOUND("cannot find available target host"), RECONSTRUCTION_WORK_NOT_PASS_VALIDATION("validation for reconstruction work failed"); - enum DetailedReason{ + enum DetailedReason { REPLICA_CORRUPT_OR_EXCESS("stored replica state is corrupt or excess"), REPLICA_MAINTENANCE_NOT_FOR_READ("stored replica is maintenance not for read"), REPLICA_DECOMMISSIONED("replica is already decommissioned"), @@ -97,20 +97,15 @@ static void genStorageIsNotChooseForReplication(BlockInfo block, DatanodeStorage HashMap blockReason = blockNotChosenReasonMap.get(); StringBuilder reasonForBlock = null; blockReason.putIfAbsent(block, new StringBuilder() - .append("Block ") + .append("Block [") .append(block) - .append(" didn't schedule ReconstructionWork for below reasons: \n [")); + .append("] is not scheduled for reconstruction since: \n [")); reasonForBlock = blockReason.get(block); - switch (reason){ - case SOURCE_NODE_UNAVAILABLE: - reasonForBlock.append(" Source node storage ").append(storage==null?"None":storage).append(" is not chosen since ").append(reason); - break; - case NO_AVAILABLE_TARGET_HOST_FOUND: - case RECONSTRUCTION_WORK_NOT_PASS_VALIDATION: - reasonForBlock.append(" ").append(reason); - } + reasonForBlock.append(" ").append(reason.getText()); + if(storage != null) + reasonForBlock.append(" on node ").append(storage); if (reasonDetails != null) { - reasonForBlock.append(" ").append(reasonDetails.getText()); + reasonForBlock.append(". 
Detail Reason: [").append(reasonDetails.getText()).append("]"); } reasonForBlock.append("."); } @@ -120,7 +115,7 @@ static String summaryBlockSkippedForReconstructionReason(){ StringBuilder finalReasonForAllBlocks = new StringBuilder(); for(Map.Entry blockReason: blockNotChosenReasonMap.get().entrySet()){ blockReason.getValue().append("]\n"); - finalReasonForAllBlocks.append(blockReason); + finalReasonForAllBlocks.append(blockReason.getValue()); } blockNotChosenReasonMap.get().clear(); return finalReasonForAllBlocks.toString(); From 16caf705951d2d8d673c925cfb6e766bc5b12bec Mon Sep 17 00:00:00 2001 From: "Vico.Wu" <583424568@qq.com> Date: Wed, 10 Jul 2024 10:24:12 +0800 Subject: [PATCH 14/20] dafasdf --- .../server/blockmanagement/BlockManager.java | 40 ++++++------ ...son.java => ReconstructionSkipReason.java} | 53 +++++++-------- .../blockmanagement/TestBlockManager.java | 64 +++++++++---------- 3 files changed, 78 insertions(+), 79 deletions(-) rename hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/{BlockSkippedForReconstructionReason.java => ReconstructionSkipReason.java} (63%) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index 031473a23f657..b19952bde7291 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -20,7 +20,7 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.*; import static org.apache.hadoop.hdfs.protocol.BlockType.CONTIGUOUS; import static org.apache.hadoop.hdfs.protocol.BlockType.STRIPED; -import static org.apache.hadoop.hdfs.server.blockmanagement.BlockSkippedForReconstructionReason.DetailedReason; +import static org.apache.hadoop.hdfs.server.blockmanagement.ReconstructionSkipReason.DetailedReason; import static org.apache.hadoop.util.ExitUtil.terminate; import static org.apache.hadoop.util.Time.now; @@ -2180,8 +2180,8 @@ int computeReconstructionWorkForBlocks( for (BlockReconstructionWork rw : reconWork) { final DatanodeStorageInfo[] targets = rw.getTargets(); if (targets == null || targets.length == 0) { - BlockSkippedForReconstructionReason.genSkipReconstructionReason(rw.getBlock(), null, - BlockSkippedForReconstructionReason.NO_AVAILABLE_TARGET_HOST_FOUND); + ReconstructionSkipReason.genReasonWithDetail(rw.getBlock(), null, + ReconstructionSkipReason.TARGET_UNAVAILABLE); rw.resetTargets(); continue; } @@ -2191,8 +2191,8 @@ int computeReconstructionWorkForBlocks( scheduledWork++; } else{ - BlockSkippedForReconstructionReason.genSkipReconstructionReason(rw.getBlock(), null, - BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION); + ReconstructionSkipReason.genReasonWithDetail(rw.getBlock(), null, + ReconstructionSkipReason.VALIDATION_FAILED); } } } @@ -2600,9 +2600,9 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, // do not select the replica if it is corrupt or excess if (state == StoredReplicaState.CORRUPT || state == StoredReplicaState.EXCESS) { - BlockSkippedForReconstructionReason.genSkipReconstructionReason(block, storage, - BlockSkippedForReconstructionReason.SOURCE_NODE_UNAVAILABLE, - DetailedReason.REPLICA_CORRUPT_OR_EXCESS); + ReconstructionSkipReason.genReasonWithDetail(block, storage, + 
ReconstructionSkipReason.SOURCE_UNAVAILABLE, + DetailedReason.CORRUPT_OR_EXCESS); continue; } @@ -2610,8 +2610,8 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, // or unknown state replicas. if (state == null || state == StoredReplicaState.MAINTENANCE_NOT_FOR_READ) { - BlockSkippedForReconstructionReason.genSkipReconstructionReason(block, storage, - BlockSkippedForReconstructionReason.SOURCE_NODE_UNAVAILABLE, + ReconstructionSkipReason.genReasonWithDetail(block, storage, + ReconstructionSkipReason.SOURCE_UNAVAILABLE, DetailedReason.REPLICA_MAINTENANCE_NOT_FOR_READ); continue; } @@ -2624,8 +2624,8 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, ThreadLocalRandom.current().nextBoolean()) { decommissionedSrc = node; } - BlockSkippedForReconstructionReason.genSkipReconstructionReason(block, storage, - BlockSkippedForReconstructionReason.SOURCE_NODE_UNAVAILABLE, + ReconstructionSkipReason.genReasonWithDetail(block, storage, + ReconstructionSkipReason.SOURCE_UNAVAILABLE, DetailedReason.REPLICA_DECOMMISSIONED); continue; } @@ -2651,9 +2651,9 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, //HDFS-16566 ExcludeReconstructed won't be reconstructed. excludeReconstructed.add(blockIndex); } - BlockSkippedForReconstructionReason.genSkipReconstructionReason(block, storage, - BlockSkippedForReconstructionReason.SOURCE_NODE_UNAVAILABLE, - DetailedReason.REPLICA_ALREADY_REACH_REPLICATION_LIMIT); + ReconstructionSkipReason.genReasonWithDetail(block, storage, + ReconstructionSkipReason.SOURCE_UNAVAILABLE, + DetailedReason.REACH_REPLICATION_SOFT_LIMIT); continue; // already reached replication limit } @@ -2665,9 +2665,9 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, //HDFS-16566 ExcludeReconstructed won't be reconstructed. 
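// A rough sketch of the two gates that produce the REACH_REPLICATION_SOFT_LIMIT /
// REACH_REPLICATION_HARD_LIMIT details above, assuming BlockManager's existing
// maxReplicationStreams and replicationStreamsHardLimit fields (configured by
// dfs.namenode.replication.max-streams and dfs.namenode.replication.max-streams-hard-limit):
//   int queued = node.getNumberOfBlocksToBeReplicated();
//   if (queued >= maxReplicationStreams
//       && priority != LowRedundancyBlocks.QUEUE_HIGHEST_PRIORITY) {
//     continue; // soft limit: skip this source for all but highest-priority blocks
//   }
//   if (queued >= replicationStreamsHardLimit) {
//     continue; // hard limit: skip this source even for highest-priority blocks
//   }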
excludeReconstructed.add(blockIndex); } - BlockSkippedForReconstructionReason.genSkipReconstructionReason(block, storage, - BlockSkippedForReconstructionReason.SOURCE_NODE_UNAVAILABLE, - DetailedReason.REPLICA_ALREADY_REACH_REPLICATION_HARD_LIMIT); + ReconstructionSkipReason.genReasonWithDetail(block, storage, + ReconstructionSkipReason.SOURCE_UNAVAILABLE, + DetailedReason.REACH_REPLICATION_HARD_LIMIT); continue; } @@ -5427,11 +5427,11 @@ int computeDatanodeWork() { final int nodesToProcess = (int) Math.ceil(numlive * this.blocksInvalidateWorkPct); if(LOG.isDebugEnabled()){ - BlockSkippedForReconstructionReason.start(); + ReconstructionSkipReason.start(); } int workFound = this.computeBlockReconstructionWork(blocksToProcess); if(LOG.isDebugEnabled()){ - BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); + ReconstructionSkipReason.summary(); } // Update counters namesystem.writeLock(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java similarity index 63% rename from hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java rename to hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java index 8d0e89245de12..d4043c393090a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockSkippedForReconstructionReason.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java @@ -34,24 +34,25 @@ * - For the detailed reason of `No source node is available`, I put it into DetailedReason enum * - For the detailed reason of `No Target node is available`, we already have NodeNotChosenReason in BlockPlacementPolicyDefault */ -public enum BlockSkippedForReconstructionReason { - SOURCE_NODE_UNAVAILABLE("source node or storage unavailable"), - NO_AVAILABLE_TARGET_HOST_FOUND("cannot find available target host"), - RECONSTRUCTION_WORK_NOT_PASS_VALIDATION("validation for reconstruction work failed"); +public enum ReconstructionSkipReason { + SOURCE_UNAVAILABLE("source node or storage unavailable"), + TARGET_UNAVAILABLE("cannot find available target host"), + VALIDATION_FAILED("validation for reconstruction work failed"); enum DetailedReason { - REPLICA_CORRUPT_OR_EXCESS("stored replica state is corrupt or excess"), - REPLICA_MAINTENANCE_NOT_FOR_READ("stored replica is maintenance not for read"), - REPLICA_DECOMMISSIONED("replica is already decommissioned"), - REPLICA_ALREADY_REACH_REPLICATION_LIMIT("replica already reached replication soft limit"), - REPLICA_ALREADY_REACH_REPLICATION_HARD_LIMIT("replica already reached replication hard limit"); + CORRUPT_OR_EXCESS("stored replica state is corrupt or excess"), + MAINTENANCE_NOT_FOR_READ("stored replica is maintenance not for read"), + DECOMMISSIONED("replica is already decommissioned"), + REACH_REPLICATION_SOFT_LIMIT("replica already reached replication soft limit"), + REACH_REPLICATION_HARD_LIMIT("replica already reached replication hard limit"); private final String text; DetailedReason(final String logText) { text = logText; } - private String getText() { + @Override + public String toString() { return text; } } @@ -64,11 +65,12 @@ private String getText() { return text; } - BlockSkippedForReconstructionReason(final String logText) { + ReconstructionSkipReason(final String logText) { text = logText; } - private String getText() { + @Override + public String toString() { return text; } @@ -76,45 +78,44 @@ public static void start(){ blockNotChosenReasonMap.get().clear(); } - public static void genSkipReconstructionReason(BlockInfo block, DatanodeStorageInfo storage, - BlockSkippedForReconstructionReason reason) { + public static void genReasonWithDetail(BlockInfo block, DatanodeStorageInfo storage, + ReconstructionSkipReason reason) { if(LOG.isDebugEnabled()){ - genStorageIsNotChooseForReplication(block, storage, reason, null); + genReasonImpl(block, storage, reason, null); } } - public static void genSkipReconstructionReason(BlockInfo block, DatanodeStorageInfo storage, - BlockSkippedForReconstructionReason reason, DetailedReason reasonDetails) { + public static void genReasonWithDetail(BlockInfo block, DatanodeStorageInfo storage, + ReconstructionSkipReason reason, DetailedReason reasonDetails) { if(LOG.isDebugEnabled()){ - genStorageIsNotChooseForReplication(block, storage, reason, reasonDetails); + genReasonImpl(block, storage, reason, reasonDetails); } } @VisibleForTesting - static void genStorageIsNotChooseForReplication(BlockInfo block, DatanodeStorageInfo storage, - BlockSkippedForReconstructionReason reason, DetailedReason reasonDetails){ + static void genReasonImpl(BlockInfo block, DatanodeStorageInfo storage, + ReconstructionSkipReason reason, DetailedReason reasonDetails){ // build the error message for later use. HashMap<BlockInfo, StringBuilder> blockReason = blockNotChosenReasonMap.get(); StringBuilder reasonForBlock = null; blockReason.putIfAbsent(block, new StringBuilder() .append("Block [") .append(block) - .append("] is not scheduled for reconstruction since: \n [")); + .append("] is not scheduled for reconstruction since: \n [ ")); reasonForBlock = blockReason.get(block); - reasonForBlock.append(" ").append(reason.getText()); + reasonForBlock.append(" ").append(reason); if(storage != null) reasonForBlock.append(" on node ").append(storage); if (reasonDetails != null) { - reasonForBlock.append(". Detail Reason: [").append(reasonDetails.getText()).append("]"); + reasonForBlock.append(". Detail Reason: [").append(reasonDetails).append("]"); } - reasonForBlock.append("."); } @VisibleForTesting - static String summaryBlockSkippedForReconstructionReason(){ + static String summary(){ StringBuilder finalReasonForAllBlocks = new StringBuilder(); for(Map.Entry<BlockInfo, StringBuilder> blockReason: blockNotChosenReasonMap.get().entrySet()){ - blockReason.getValue().append("]\n"); + blockReason.getValue().append(" ]\n"); finalReasonForAllBlocks.append(blockReason.getValue()); } blockNotChosenReasonMap.get().clear(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java index 6671d3873a069..66abe638483cb 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java @@ -118,12 +118,9 @@ import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicBoolean; -import org.apache.hadoop.hdfs.server.blockmanagement.BlockSkippedForReconstructionReason; -import static org.apache.hadoop.hdfs.server.blockmanagement.BlockSkippedForReconstructionReason.DetailedReason; -import static org.apache.hadoop.hdfs.server.blockmanagement.BlockSkippedForReconstructionReason.SOURCE_NODE_UNAVAILABLE; +import static org.apache.hadoop.hdfs.server.blockmanagement.ReconstructionSkipReason.DetailedReason; import static org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState.UNDER_CONSTRUCTION; -import static org.apache.hadoop.test.MetricsAsserts.getLongCounter; -import static org.apache.hadoop.test.MetricsAsserts.getMetrics; +import static org.apache.hadoop.test.MetricsAsserts.*; import static org.junit.Assert.*; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.doReturn; @@ -2358,38 +2355,39 @@ public void testStorageNotChosenReason() throws InterruptedException { threads[i] = new Thread(() -> { try{ String newStorageID = "storageID_"+index; - BlockSkippedForReconstructionReason.start(); + ReconstructionSkipReason.start(); DatanodeStorageInfo sourceStorage = BlockManagerTestUtil .newDatanodeStorageInfo(DFSTestUtil.getLocalDatanodeDescriptor(), new DatanodeStorage(newStorageID)); BlockInfo newBlk = new BlockInfoContiguous(new Block(index), (short) index); - BlockSkippedForReconstructionReason.genStorageIsNotChooseForReplication(newBlk, sourceStorage, - BlockSkippedForReconstructionReason.SOURCE_NODE_UNAVAILABLE, DetailedReason.REPLICA_DECOMMISSIONED); - String reason1 = BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); - LOG.info("Reason1 for " + newBlk + " in storage " + newStorageID + " is " + reason1); - assertTrue("reason should contain block ID " + newBlk, reason1.contains(newBlk.toString())); - assertTrue("reason should contain source node", reason1.contains(sourceStorage.toString())); - assertTrue("reason should contain "+ SOURCE_NODE_UNAVAILABLE, reason1.contains(SOURCE_NODE_UNAVAILABLE.toString())); - - - BlockSkippedForReconstructionReason.start(); - BlockSkippedForReconstructionReason.genStorageIsNotChooseForReplication(newBlk, null, - BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION, null); - String reason2 = BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); - LOG.info("Reason2 for " + newBlk + " in storage " + newStorageID + " is " + reason2); - assertTrue("reason should contain block ID " + newBlk, reason2.contains(newBlk.toString())); - assertTrue("reason should contain [" + BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION + "]", - reason2.contains(BlockSkippedForReconstructionReason.RECONSTRUCTION_WORK_NOT_PASS_VALIDATION.toString())); - - - BlockSkippedForReconstructionReason.start(); - BlockSkippedForReconstructionReason.genStorageIsNotChooseForReplication(newBlk, null, - BlockSkippedForReconstructionReason.NO_AVAILABLE_TARGET_HOST_FOUND, null); - String reason3 = BlockSkippedForReconstructionReason.summaryBlockSkippedForReconstructionReason(); - LOG.info("Reason3 for " + newBlk + " in storage " + newStorageID + " is " + reason3); - assertTrue("reason should contain block ID " + newBlk, reason3.contains(newBlk.toString())); - assertTrue("reason should contain [" + BlockSkippedForReconstructionReason.NO_AVAILABLE_TARGET_HOST_FOUND + "]", reason3.contains(BlockSkippedForReconstructionReason.NO_AVAILABLE_TARGET_HOST_FOUND.toString())); - + ReconstructionSkipReason.genReasonImpl(newBlk, sourceStorage, + ReconstructionSkipReason.SOURCE_UNAVAILABLE, DetailedReason.DECOMMISSIONED); + String reasonForSrcNodeUnavailable = ReconstructionSkipReason.summary(); + LOG.info("Reason for " + newBlk + " in storage " + newStorageID + " is " + reasonForSrcNodeUnavailable); + assertTrue("reason for reconstruction not-scheduled should contain block ID " + newBlk, reasonForSrcNodeUnavailable.contains(newBlk.toString())); + assertTrue("reason for reconstruction not-scheduled should contain source node ID", reasonForSrcNodeUnavailable.contains(sourceStorage.toString())); + assertTrue("reason for reconstruction not-scheduled should be correct", reasonForSrcNodeUnavailable.contains(ReconstructionSkipReason.SOURCE_UNAVAILABLE.toString())); + assertTrue("reason detail for reconstruction not-scheduled should be correct", reasonForSrcNodeUnavailable.contains(DetailedReason.DECOMMISSIONED.toString())); + assertEquals("after summary, the reason should be cleared", "", ReconstructionSkipReason.summary()); + + ReconstructionSkipReason.start(); + ReconstructionSkipReason.genReasonImpl(newBlk, null, + ReconstructionSkipReason.VALIDATION_FAILED, null); + String reasonForValidationNotPass = ReconstructionSkipReason.summary(); + LOG.info("Reason for " + newBlk + " in storage " + newStorageID + " is " + reasonForValidationNotPass); + assertTrue("reason for reconstruction not-scheduled should contain block ID" + newBlk, reasonForValidationNotPass.contains(newBlk.toString())); + assertTrue("reason for reconstruction not-scheduled should be correct", + reasonForValidationNotPass.contains(ReconstructionSkipReason.VALIDATION_FAILED.toString())); + assertEquals("after summary, the reason should be cleared", "", ReconstructionSkipReason.summary()); + + ReconstructionSkipReason.start(); + ReconstructionSkipReason.genReasonImpl(newBlk, null, + ReconstructionSkipReason.TARGET_UNAVAILABLE, null); + String reasonForTargetNotFound = ReconstructionSkipReason.summary(); + LOG.info("Reason for " + newBlk + " in storage " + newStorageID + " is " + reasonForTargetNotFound); + assertTrue("reason for reconstruction not-scheduled should contain block ID" + newBlk, reasonForTargetNotFound.contains(newBlk.toString())); + assertTrue("reason for reconstruction not-scheduled should be correct", reasonForTargetNotFound.contains(ReconstructionSkipReason.TARGET_UNAVAILABLE.toString())); + assertEquals("after 
summary, the reason should be cleared", "", ReconstructionSkipReason.summary()); }catch (Throwable e){ e.printStackTrace(); failure.set(true);
From 27b9bb5cb1b27f16a052cdc741795a3a3436a9fa Mon Sep 17 00:00:00 2001 From: "Vico.Wu" <583424568@qq.com> Date: Wed, 10 Jul 2024 10:26:51 +0800 Subject: [PATCH 15/20] fix compile: use renamed DECOMMISSIONED detail in BlockManager --- .../apache/hadoop/hdfs/server/blockmanagement/BlockManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index b19952bde7291..c48d851e698d1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -2626,7 +2626,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, } ReconstructionSkipReason.genReasonWithDetail(block, storage, ReconstructionSkipReason.SOURCE_UNAVAILABLE, - DetailedReason.REPLICA_DECOMMISSIONED); + DetailedReason.DECOMMISSIONED); continue; }
From 06a006f507ee25dd951808a7fe0416fadac7e1d3 Mon Sep 17 00:00:00 2001 From: "Vico.Wu" <583424568@qq.com> Date: Wed, 10 Jul 2024 10:44:40 +0800 Subject: [PATCH 16/20] shorten detail constant names and cover all skip reasons in the test --- .../server/blockmanagement/BlockManager.java | 6 +-- .../ReconstructionSkipReason.java | 4 +- .../blockmanagement/TestBlockManager.java | 54 ++++++++----------- 3 files changed, 27 insertions(+), 37 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index c48d851e698d1..cd7e80b7ada76 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -2612,7 +2612,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, || state == StoredReplicaState.MAINTENANCE_NOT_FOR_READ) { ReconstructionSkipReason.genReasonWithDetail(block, storage, ReconstructionSkipReason.SOURCE_UNAVAILABLE, - DetailedReason.REPLICA_MAINTENANCE_NOT_FOR_READ); + DetailedReason.MAINTENANCE_NOT_FOR_READ); continue; } @@ -2653,7 +2653,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, } ReconstructionSkipReason.genReasonWithDetail(block, storage, ReconstructionSkipReason.SOURCE_UNAVAILABLE, - DetailedReason.REACH_REPLICATION_SOFT_LIMIT); + DetailedReason.REPLICATION_SOFT_LIMIT); continue; // already reached replication limit } @@ -2667,7 +2667,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, } ReconstructionSkipReason.genReasonWithDetail(block, storage, ReconstructionSkipReason.SOURCE_UNAVAILABLE, - DetailedReason.REACH_REPLICATION_HARD_LIMIT); + DetailedReason.REPLICATION_HARD_LIMIT); continue; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java index d4043c393090a..32a16cc9d11bf 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java +++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java @@ -43,8 +43,8 @@ enum DetailedReason { CORRUPT_OR_EXCESS("stored replica state is corrupt or excess"), MAINTENANCE_NOT_FOR_READ("stored replica is maintenance not for read"), DECOMMISSIONED("replica is already decommissioned"), - REACH_REPLICATION_SOFT_LIMIT("replica already reached replication soft limit"), - REACH_REPLICATION_HARD_LIMIT("replica already reached replication hard limit"); + REPLICATION_SOFT_LIMIT("replica already reached replication soft limit"), + REPLICATION_HARD_LIMIT("replica already reached replication hard limit"); private final String text; DetailedReason(final String logText) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java index 66abe638483cb..3e5810e4bba13 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java @@ -119,6 +119,9 @@ import java.util.concurrent.atomic.AtomicBoolean; import static org.apache.hadoop.hdfs.server.blockmanagement.ReconstructionSkipReason.DetailedReason; +import static org.apache.hadoop.hdfs.server.blockmanagement.ReconstructionSkipReason.DetailedReason.CORRUPT_OR_EXCESS; +import static org.apache.hadoop.hdfs.server.blockmanagement.ReconstructionSkipReason.DetailedReason.DECOMMISSIONED; +import static org.apache.hadoop.hdfs.server.blockmanagement.ReconstructionSkipReason.SOURCE_UNAVAILABLE; import static org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState.UNDER_CONSTRUCTION; import static org.apache.hadoop.test.MetricsAsserts.*; import static org.junit.Assert.*; @@ -2354,40 +2357,27 @@ public void testStorageNotChosenReason() throws InterruptedException { final int index = i; threads[i] = new Thread(() -> { try{ - String newStorageID = "storageID_"+index; - ReconstructionSkipReason.start(); - DatanodeStorageInfo sourceStorage = BlockManagerTestUtil + String storageID0 = "storageID_0_"+index; + String storageID1 = "storageID_1_"+index; + DatanodeStorageInfo sourceStorage0 = BlockManagerTestUtil .newDatanodeStorageInfo(DFSTestUtil.getLocalDatanodeDescriptor(), - new DatanodeStorage(newStorageID)); + new DatanodeStorage(storageID0)); + DatanodeStorageInfo sourceStorage1 = BlockManagerTestUtil + .newDatanodeStorageInfo(DFSTestUtil.getLocalDatanodeDescriptor(), + new DatanodeStorage(storageID1)); BlockInfo newBlk = new BlockInfoContiguous(new Block(index), (short) index); - ReconstructionSkipReason.genReasonImpl(newBlk, sourceStorage, - ReconstructionSkipReason.SOURCE_UNAVAILABLE, DetailedReason.DECOMMISSIONED); - String reasonForSrcNodeUnavailable = ReconstructionSkipReason.summary(); - LOG.info("Reason for " + newBlk + " in storage " + newStorageID + " is " + reasonForSrcNodeUnavailable); - assertTrue("reason for reconstruction not-scheduled should contain block ID " + newBlk, reasonForSrcNodeUnavailable.contains(newBlk.toString())); - assertTrue("reason for reconstruction not-scheduled should contain source node ID", reasonForSrcNodeUnavailable.contains(sourceStorage.toString())); - assertTrue("reason for reconstruction not-scheduled should be correct", 
reasonForSrcNodeUnavailable.contains(ReconstructionSkipReason.SOURCE_UNAVAILABLE.toString())); - assertTrue("reason detail for reconstruction not-scheduled should be correct", reasonForSrcNodeUnavailable.contains(DetailedReason.DECOMMISSIONED.toString())); - assertEquals("after summary, the reason should be cleared", "", ReconstructionSkipReason.summary()); - - ReconstructionSkipReason.start(); - ReconstructionSkipReason.genReasonImpl(newBlk, null, - ReconstructionSkipReason.VALIDATION_FAILED, null); - String reasonForValidationNotPass = ReconstructionSkipReason.summary(); - LOG.info("Reason for " + newBlk + " in storage " + newStorageID + " is " + reasonForValidationNotPass); - assertTrue("reason for reconstruction not-scheduled should contain block ID" + newBlk, reasonForValidationNotPass.contains(newBlk.toString())); - assertTrue("reason for reconstruction not-scheduled should be correct", - reasonForValidationNotPass.contains(ReconstructionSkipReason.VALIDATION_FAILED.toString())); - assertEquals("after summary, the reason should be cleared", "", ReconstructionSkipReason.summary()); - - ReconstructionSkipReason.start(); - ReconstructionSkipReason.genReasonImpl(newBlk, null, - ReconstructionSkipReason.TARGET_UNAVAILABLE, null); - String reasonForTargetNotFound = ReconstructionSkipReason.summary(); - LOG.info("Reason for " + newBlk + " in storage " + newStorageID + " is " + reasonForTargetNotFound); - assertTrue("reason for reconstruction not-scheduled should contain block ID" + newBlk, reasonForTargetNotFound.contains(newBlk.toString())); - assertTrue("reason for reconstruction not-scheduled should be correct", reasonForTargetNotFound.contains(ReconstructionSkipReason.TARGET_UNAVAILABLE.toString())); - assertEquals("after summary, the reason should be cleared", "", ReconstructionSkipReason.summary()); + for(ReconstructionSkipReason reason: ReconstructionSkipReason.values()){ + ReconstructionSkipReason.start(); + ReconstructionSkipReason.genReasonImpl(newBlk,sourceStorage0,reason,CORRUPT_OR_EXCESS); + ReconstructionSkipReason.genReasonImpl(newBlk,sourceStorage1,reason,DECOMMISSIONED); + String summary = ReconstructionSkipReason.summary(); + LOG.info("Reason for " + newBlk + " in storage " + storageID0 + " storage " + storageID1 + " is " + summary); + assertEquals("after summary, the reason should be cleared", "", ReconstructionSkipReason.summary()); + assertTrue("reason content should be correct", summary.contains(reason.toString())); + assertTrue("reason should contain block ID " + newBlk, summary.contains(newBlk.toString())); + assertTrue("reason should contain storage " + sourceStorage0, summary.contains(sourceStorage0.toString())); + assertTrue("reason should contain storage " + sourceStorage1, summary.contains(sourceStorage1.toString())); + } }catch (Throwable e){ e.printStackTrace(); failure.set(true);
From a988104436782a6451c98952d1151cf08f20e249 Mon Sep 17 00:00:00 2001 From: "Vico.Wu" <583424568@qq.com> Date: Wed, 10 Jul 2024 10:57:25 +0800 Subject: [PATCH 17/20] polish skip reason message layout and remove commented-out test code --- .../ReconstructionSkipReason.java | 10 +++--- .../blockmanagement/TestBlockManager.java | 35 ++++++------------- 2 files changed, 15 insertions(+), 30 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java index 32a16cc9d11bf..2f5a28724bcf1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java @@ -99,15 +99,15 @@ static void genReasonImpl(BlockInfo block, DatanodeStorageInfo storage, HashMap<BlockInfo, StringBuilder> blockReason = blockNotChosenReasonMap.get(); StringBuilder reasonForBlock = null; blockReason.putIfAbsent(block, new StringBuilder() - .append("Block [") + .append("Block ") .append(block) - .append("] is not scheduled for reconstruction since: \n [ ")); + .append(" is not scheduled for reconstruction since: [")); reasonForBlock = blockReason.get(block); - reasonForBlock.append(" ").append(reason); + reasonForBlock.append("\n").append(reason); if(storage != null) reasonForBlock.append(" on node ").append(storage); if (reasonDetails != null) { - reasonForBlock.append(". Detail Reason: [").append(reasonDetails).append("]"); + reasonForBlock.append(". Detail : [").append(reasonDetails).append("]"); } } @@ -115,7 +115,7 @@ static void genReasonImpl(BlockInfo block, DatanodeStorageInfo storage, static String summary(){ StringBuilder finalReasonForAllBlocks = new StringBuilder(); for(Map.Entry<BlockInfo, StringBuilder> blockReason: blockNotChosenReasonMap.get().entrySet()){ - blockReason.getValue().append(" ]\n"); + blockReason.getValue().append("\n]"); finalReasonForAllBlocks.append(blockReason.getValue()); } blockNotChosenReasonMap.get().clear(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java index 3e5810e4bba13..7c2b67d056836 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java @@ -2338,19 +2338,6 @@ public void delayDeleteReplica() { @Test(timeout = 6000) public void testStorageNotChosenReason() throws InterruptedException { final AtomicBoolean failure = new AtomicBoolean(false); -// String storageID = "storageID"; -// DatanodeStorageInfo targetDN = BlockManagerTestUtil -// .newDatanodeStorageInfo(DFSTestUtil.getLocalDatanodeDescriptor(), -// new DatanodeStorage("storage_test_0")); -// BlockInfo blk = new BlockInfoContiguous(new Block(0), (short) 0); -// BlockSkippedForReconstructionReason.start(); -// String reason = BlockSkippedForReconstructionReason.genSkipReconstructionReason(blk); -// assertTrue(reason.contains(storageID) ); -// assertFalse(reason.contains(targetDN.toString())); -// assertTrue(reason.contains("successfully chosen storage")); -// assertFalse(reason.contains("is not chosen since")); -// assertFalse(reason.contains("Reason statistics")); - int threadNum = 10; Thread[] threads = new Thread[threadNum]; for(int i = 0; i Date: Wed, 10 Jul 2024 10:58:19 +0800 Subject: [PATCH 18/20] rename DetailedReason to SourceUnavailableDetail --- .../hdfs/server/blockmanagement/BlockManager.java | 12 ++++++------ .../blockmanagement/ReconstructionSkipReason.java | 8 ++++---- .../server/blockmanagement/TestBlockManager.java | 5 ++--- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index 
cd7e80b7ada76..800373f256bf8 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -20,7 +20,7 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.*; import static org.apache.hadoop.hdfs.protocol.BlockType.CONTIGUOUS; import static org.apache.hadoop.hdfs.protocol.BlockType.STRIPED; -import static org.apache.hadoop.hdfs.server.blockmanagement.ReconstructionSkipReason.DetailedReason; +import static org.apache.hadoop.hdfs.server.blockmanagement.ReconstructionSkipReason.SourceUnavailableDetail; import static org.apache.hadoop.util.ExitUtil.terminate; import static org.apache.hadoop.util.Time.now; @@ -2602,7 +2602,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, state == StoredReplicaState.EXCESS) { ReconstructionSkipReason.genReasonWithDetail(block, storage, ReconstructionSkipReason.SOURCE_UNAVAILABLE, - DetailedReason.CORRUPT_OR_EXCESS); + SourceUnavailableDetail.CORRUPT_OR_EXCESS); continue; } @@ -2612,7 +2612,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, || state == StoredReplicaState.MAINTENANCE_NOT_FOR_READ) { ReconstructionSkipReason.genReasonWithDetail(block, storage, ReconstructionSkipReason.SOURCE_UNAVAILABLE, - DetailedReason.MAINTENANCE_NOT_FOR_READ); + SourceUnavailableDetail.MAINTENANCE_NOT_FOR_READ); continue; } @@ -2626,7 +2626,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, } ReconstructionSkipReason.genReasonWithDetail(block, storage, ReconstructionSkipReason.SOURCE_UNAVAILABLE, - DetailedReason.DECOMMISSIONED); + SourceUnavailableDetail.DECOMMISSIONED); continue; } @@ -2653,7 +2653,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, } ReconstructionSkipReason.genReasonWithDetail(block, storage, ReconstructionSkipReason.SOURCE_UNAVAILABLE, - DetailedReason.REPLICATION_SOFT_LIMIT); + SourceUnavailableDetail.REPLICATION_SOFT_LIMIT); continue; // already reached replication limit } @@ -2667,7 +2667,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, } ReconstructionSkipReason.genReasonWithDetail(block, storage, ReconstructionSkipReason.SOURCE_UNAVAILABLE, - DetailedReason.REPLICATION_HARD_LIMIT); + SourceUnavailableDetail.REPLICATION_HARD_LIMIT); continue; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java index 2f5a28724bcf1..ab1cee17a2056 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java @@ -39,7 +39,7 @@ public enum ReconstructionSkipReason { TARGET_UNAVAILABLE("cannot find available target host"), VALIDATION_FAILED("validation for reconstruction work failed"); - enum DetailedReason { + enum SourceUnavailableDetail { CORRUPT_OR_EXCESS("stored replica state is corrupt or excess"), MAINTENANCE_NOT_FOR_READ("stored replica is maintenance not for read"), DECOMMISSIONED("replica is already decommissioned"), @@ -47,7 +47,7 @@ enum DetailedReason { REPLICATION_HARD_LIMIT("replica already reached replication hard limit"); private final String text; - DetailedReason(final String logText) { + 
SourceUnavailableDetail(final String logText) { text = logText; } @@ -86,7 +86,7 @@ public static void genReasonWithDetail(BlockInfo block, DatanodeStorageInfo stor } public static void genReasonWithDetail(BlockInfo block, DatanodeStorageInfo storage, - ReconstructionSkipReason reason, DetailedReason reasonDetails) { + ReconstructionSkipReason reason, SourceUnavailableDetail reasonDetails) { if(LOG.isDebugEnabled()){ genReasonImpl(block, storage, reason, reasonDetails); } @@ -94,7 +94,7 @@ public static void genReasonWithDetail(BlockInfo block, DatanodeStorageInfo stor @VisibleForTesting static void genReasonImpl(BlockInfo block, DatanodeStorageInfo storage, - ReconstructionSkipReason reason, DetailedReason reasonDetails){ + ReconstructionSkipReason reason, SourceUnavailableDetail reasonDetails){ // build the error message for later use. HashMap<BlockInfo, StringBuilder> blockReason = blockNotChosenReasonMap.get(); StringBuilder reasonForBlock = null; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java index 7c2b67d056836..607ad456ca9c7 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java @@ -118,9 +118,8 @@ import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicBoolean; -import static org.apache.hadoop.hdfs.server.blockmanagement.ReconstructionSkipReason.DetailedReason; -import static org.apache.hadoop.hdfs.server.blockmanagement.ReconstructionSkipReason.DetailedReason.CORRUPT_OR_EXCESS; -import static org.apache.hadoop.hdfs.server.blockmanagement.ReconstructionSkipReason.DetailedReason.DECOMMISSIONED; +import static org.apache.hadoop.hdfs.server.blockmanagement.ReconstructionSkipReason.SourceUnavailableDetail.CORRUPT_OR_EXCESS; +import static org.apache.hadoop.hdfs.server.blockmanagement.ReconstructionSkipReason.SourceUnavailableDetail.DECOMMISSIONED; import static org.apache.hadoop.hdfs.server.blockmanagement.ReconstructionSkipReason.SOURCE_UNAVAILABLE; import static org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState.UNDER_CONSTRUCTION; import static org.apache.hadoop.test.MetricsAsserts.*;
From 3a7828d5721d61c7866333e88378a9870d1f8a43 Mon Sep 17 00:00:00 2001 From: "Vico.Wu" <583424568@qq.com> Date: Wed, 10 Jul 2024 11:23:40 +0800 Subject: [PATCH 19/20] clarify ReconstructionSkipReason javadoc and adjust the test --- .../blockmanagement/ReconstructionSkipReason.java | 8 ++++---- .../server/blockmanagement/TestBlockManager.java | 14 ++++++-------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java index ab1cee17a2056..b8489a17c0d50 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java @@ -26,12 +26,12 @@ import java.util.Map; /** - * When scheduling ReconstructionWork for a low-redundancy block, the scheduling may fail for overall 3 reasons: + * When scheduling ReconstructionWork for a low-redundancy block, the scheduling may fail for 3 high-level reasons: * 1. No source node is available * 2. No Target node is available - * 3. ReconstructionWork validation failed - * I put above 3 cases as BlockSkippedForReconstruction. - * - For the detailed reason of `No source node is available`, I put it into DetailedReason enum + * 3. ReconstructionWork is built but validation failed + * I group the above 3 cases as ReconstructionSkipReason. + * - For the detailed reason of `No source node is available`, I put it into SourceUnavailableDetail enum * - For the detailed reason of `No Target node is available`, we already have NodeNotChosenReason in BlockPlacementPolicyDefault */ public enum ReconstructionSkipReason { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java index 607ad456ca9c7..5eafb72ed49c2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java @@ -2329,15 +2329,13 @@ public void delayDeleteReplica() { } /** - * Test the log mechasim is working as expected when storage is not chosen - * @throws IOException + * Test that the reason output works as expected even in a multi-thread environment. * @throws InterruptedException - * @throws TimeoutException */ - @Test(timeout = 6000) - public void testStorageNotChosenReason() throws InterruptedException { + @Test(timeout = 360000) + public void testReconstructionSkipReason() throws InterruptedException { final AtomicBoolean failure = new AtomicBoolean(false); - int threadNum = 10; + int threadNum = 5; Thread[] threads = new Thread[threadNum]; for(int i = 0; i Date: Wed, 10 Jul 2024 11:34:04 +0800 Subject: [PATCH 20/20] refine code --- .../server/blockmanagement/BlockManager.java | 3 ++- .../ReconstructionSkipReason.java | 27 ++++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index 800373f256bf8..7e1b2264b2f8f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -20,7 +20,6 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.*; import static org.apache.hadoop.hdfs.protocol.BlockType.CONTIGUOUS; import static org.apache.hadoop.hdfs.protocol.BlockType.STRIPED; -import static org.apache.hadoop.hdfs.server.blockmanagement.ReconstructionSkipReason.SourceUnavailableDetail; import static org.apache.hadoop.util.ExitUtil.terminate; import static org.apache.hadoop.util.Time.now; @@ -90,6 +89,7 @@ import org.apache.hadoop.hdfs.server.blockmanagement.PendingDataNodeMessages.ReportedBlockInfo; import org.apache.hadoop.hdfs.server.blockmanagement.PendingReconstructionBlocks.PendingBlockInfo; import org.apache.hadoop.hdfs.server.blockmanagement.ExcessRedundancyMap.ExcessBlockInfo; +import org.apache.hadoop.hdfs.server.blockmanagement.ReconstructionSkipReason.SourceUnavailableDetail; import 
org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; import org.apache.hadoop.hdfs.server.namenode.CachedBlock; @@ -2134,6 +2134,7 @@ int computeReconstructionWorkForBlocks( List<List<BlockInfo>> blocksToReconstruct) { int scheduledWork = 0; List<BlockReconstructionWork> reconWork = new ArrayList<>(); + // Step 1: categorize at-risk blocks into replication and EC tasks namesystem.writeLock(); try { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java index b8489a17c0d50..f48dcd7fa98eb 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + * + * http://www.apache.org/licenses/LICENSE-2.0 + *
 * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -31,8 +31,8 @@ * 2. No Target node is available * 3. ReconstructionWork is built but validation failed * I group the above 3 cases as ReconstructionSkipReason. - * - For the detailed reason of `No source node is available`, I put it into SourceUnavailableDetail enum - * - For the detailed reason of `No Target node is available`, we already have NodeNotChosenReason in BlockPlacementPolicyDefault + * - For the detailed reason of `No source node is available`, I put it into SourceUnavailableDetail enum + * - For the detailed reason of `No Target node is available`, we already have NodeNotChosenReason in BlockPlacementPolicyDefault */ public enum ReconstructionSkipReason { SOURCE_UNAVAILABLE("source node or storage unavailable"), @@ -56,6 +56,7 @@ public String toString() { return text; } } + public static final Logger LOG = LoggerFactory.getLogger( BlockManager.class); @@ -74,29 +75,29 @@ public String toString() { return text; } - public static void start(){ + public static void start() { blockNotChosenReasonMap.get().clear(); } public static void genReasonWithDetail(BlockInfo block, DatanodeStorageInfo storage, ReconstructionSkipReason reason) { - if(LOG.isDebugEnabled()){ + if (LOG.isDebugEnabled()) { genReasonImpl(block, storage, reason, null); } } public static void genReasonWithDetail(BlockInfo block, DatanodeStorageInfo storage, ReconstructionSkipReason reason, SourceUnavailableDetail reasonDetails) { - if(LOG.isDebugEnabled()){ + if (LOG.isDebugEnabled()) { genReasonImpl(block, storage, reason, reasonDetails); } } @VisibleForTesting static void genReasonImpl(BlockInfo block, DatanodeStorageInfo storage, - ReconstructionSkipReason reason, SourceUnavailableDetail reasonDetails){ + ReconstructionSkipReason reason, SourceUnavailableDetail reasonDetails) { // build the error message for later use. - HashMap<BlockInfo, StringBuilder> blockReason = blockNotChosenReasonMap.get(); + HashMap<BlockInfo, StringBuilder> blockReason = blockNotChosenReasonMap.get(); StringBuilder reasonForBlock = null; blockReason.putIfAbsent(block, new StringBuilder() .append("Block ") @@ -104,7 +105,7 @@ static void genReasonImpl(BlockInfo block, DatanodeStorageInfo storage, .append(" is not scheduled for reconstruction since: [")); reasonForBlock = blockReason.get(block); reasonForBlock.append("\n").append(reason); - if(storage != null) + if (storage != null) reasonForBlock.append(" on node ").append(storage); if (reasonDetails != null) { reasonForBlock.append(". Detail : [").append(reasonDetails).append("]"); @@ -112,9 +113,9 @@ static void genReasonImpl(BlockInfo block, DatanodeStorageInfo storage, @VisibleForTesting - static String summary(){ + static String summary() { StringBuilder finalReasonForAllBlocks = new StringBuilder(); - for(Map.Entry<BlockInfo, StringBuilder> blockReason: blockNotChosenReasonMap.get().entrySet()){ + for (Map.Entry<BlockInfo, StringBuilder> blockReason : blockNotChosenReasonMap.get().entrySet()) { blockReason.getValue().append("\n]"); finalReasonForAllBlocks.append(blockReason.getValue()); }
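// To see the finished mechanism end to end, a minimal usage sketch in the
// spirit of the TestBlockManager cases above (it assumes the caller lives in
// this package, since genReasonImpl() and summary() are package-private test
// hooks, and that blk / storage stand for any BlockInfo / DatanodeStorageInfo
// at hand):
//   ReconstructionSkipReason.start();                  // reset thread-local state
//   ReconstructionSkipReason.genReasonImpl(blk, storage,
//       ReconstructionSkipReason.SOURCE_UNAVAILABLE,
//       SourceUnavailableDetail.DECOMMISSIONED);       // record one skip reason
//   String report = ReconstructionSkipReason.summary(); // format and clear the map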