diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
index 41845152514fe..7e1b2264b2f8f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@@ -89,6 +89,7 @@
 import org.apache.hadoop.hdfs.server.blockmanagement.PendingDataNodeMessages.ReportedBlockInfo;
 import org.apache.hadoop.hdfs.server.blockmanagement.PendingReconstructionBlocks.PendingBlockInfo;
 import org.apache.hadoop.hdfs.server.blockmanagement.ExcessRedundancyMap.ExcessBlockInfo;
+import org.apache.hadoop.hdfs.server.blockmanagement.ReconstructionSkipReason.SourceUnavailableDetail;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
 import org.apache.hadoop.hdfs.server.namenode.CachedBlock;
@@ -2180,6 +2181,8 @@ int computeReconstructionWorkForBlocks(
       for (BlockReconstructionWork rw : reconWork) {
         final DatanodeStorageInfo[] targets = rw.getTargets();
         if (targets == null || targets.length == 0) {
+          ReconstructionSkipReason.genReasonWithDetail(rw.getBlock(), null,
+              ReconstructionSkipReason.TARGET_UNAVAILABLE);
           rw.resetTargets();
           continue;
         }
@@ -2188,6 +2191,10 @@ int computeReconstructionWorkForBlocks(
           if (validateReconstructionWork(rw)) {
             scheduledWork++;
           }
+          else {
+            ReconstructionSkipReason.genReasonWithDetail(rw.getBlock(), null,
+                ReconstructionSkipReason.VALIDATION_FAILED);
+          }
         }
       }
     } finally {
@@ -2570,7 +2577,6 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block,
     liveBlockIndices.clear();
     final boolean isStriped = block.isStriped();
     DatanodeDescriptor decommissionedSrc = null;

-    BitSet liveBitSet = null;
     BitSet decommissioningBitSet = null;
     if (isStriped) {
@@ -2595,6 +2601,9 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block,
       // do not select the replica if it is corrupt or excess
       if (state == StoredReplicaState.CORRUPT ||
           state == StoredReplicaState.EXCESS) {
+        ReconstructionSkipReason.genReasonWithDetail(block, storage,
+            ReconstructionSkipReason.SOURCE_UNAVAILABLE,
+            SourceUnavailableDetail.CORRUPT_OR_EXCESS);
         continue;
       }

@@ -2602,6 +2611,9 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block,
       // or unknown state replicas.
       if (state == null ||
           state == StoredReplicaState.MAINTENANCE_NOT_FOR_READ) {
+        ReconstructionSkipReason.genReasonWithDetail(block, storage,
+            ReconstructionSkipReason.SOURCE_UNAVAILABLE,
+            SourceUnavailableDetail.MAINTENANCE_NOT_FOR_READ);
         continue;
       }

@@ -2613,6 +2625,9 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block,
             ThreadLocalRandom.current().nextBoolean()) {
           decommissionedSrc = node;
         }
+        ReconstructionSkipReason.genReasonWithDetail(block, storage,
+            ReconstructionSkipReason.SOURCE_UNAVAILABLE,
+            SourceUnavailableDetail.DECOMMISSIONED);
         continue;
       }

@@ -2637,6 +2652,9 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block,
           //HDFS-16566 ExcludeReconstructed won't be reconstructed.
           excludeReconstructed.add(blockIndex);
         }
+        ReconstructionSkipReason.genReasonWithDetail(block, storage,
+            ReconstructionSkipReason.SOURCE_UNAVAILABLE,
+            SourceUnavailableDetail.REPLICATION_SOFT_LIMIT);
         continue; // already reached replication limit
       }

@@ -2648,6 +2666,9 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block,
           //HDFS-16566 ExcludeReconstructed won't be reconstructed.
           excludeReconstructed.add(blockIndex);
         }
+        ReconstructionSkipReason.genReasonWithDetail(block, storage,
+            ReconstructionSkipReason.SOURCE_UNAVAILABLE,
+            SourceUnavailableDetail.REPLICATION_HARD_LIMIT);
         continue;
       }

@@ -5406,9 +5427,13 @@ int computeDatanodeWork() {
         * this.blocksReplWorkMultiplier;
     final int nodesToProcess = (int) Math.ceil(numlive
         * this.blocksInvalidateWorkPct);
-
+    if (LOG.isDebugEnabled()) {
+      ReconstructionSkipReason.start();
+    }
     int workFound = this.computeBlockReconstructionWork(blocksToProcess);
-
+    if (LOG.isDebugEnabled()) {
+      ReconstructionSkipReason.summary();
+    }
     // Update counters
     namesystem.writeLock();
     try {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeAdminManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeAdminManager.java
index af207a843fd27..450d688f75280 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeAdminManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeAdminManager.java
@@ -160,7 +160,7 @@ public void startDecommission(DatanodeDescriptor node) {
         monitor.startTrackingNode(node);
       }
     } else {
-      LOG.trace("startDecommission: Node {} in {}, nothing to do.",
+      LOG.info("startDecommission: Node {} in {}, nothing to do.",
           node, node.getAdminState());
     }
   }
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java
new file mode 100644
index 0000000000000..f48dcd7fa98eb
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ReconstructionSkipReason.java
@@ -0,0 +1,125 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.blockmanagement;
+
+
+import org.apache.hadoop.classification.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * When scheduling ReconstructionWork for a low-redundancy block, the scheduling may be skipped for three high-level reasons:
+ * 1. No source node is available.
+ * 2. No target node is available.
+ * 3. The ReconstructionWork is built but fails validation.
+ * These three cases are modeled as ReconstructionSkipReason.
+ * - Detailed reasons for "no source node is available" are captured in the SourceUnavailableDetail enum.
+ * - Detailed reasons for "no target node is available" are already covered by NodeNotChosenReason in BlockPlacementPolicyDefault.
+ */
+public enum ReconstructionSkipReason {
+  SOURCE_UNAVAILABLE("source node or storage unavailable"),
+  TARGET_UNAVAILABLE("cannot find an available target host"),
+  VALIDATION_FAILED("validation for reconstruction work failed");
+
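+  /** Fine-grained reasons why no source node or storage could be selected. */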
+  enum SourceUnavailableDetail {
+    CORRUPT_OR_EXCESS("stored replica state is corrupt or excess"),
+    MAINTENANCE_NOT_FOR_READ("stored replica is in maintenance and not available for read"),
+    DECOMMISSIONED("stored replica is on a decommissioned node"),
+    REPLICATION_SOFT_LIMIT("source node already reached the replication soft limit"),
+    REPLICATION_HARD_LIMIT("source node already reached the replication hard limit");
+    private final String text;
+
+    SourceUnavailableDetail(final String logText) {
+      text = logText;
+    }
+
+    @Override
+    public String toString() {
+      return text;
+    }
+  }
+
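+  /** Reuses the BlockManager logger so skip-reason output follows its log level. */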
+  public static final Logger LOG = LoggerFactory.getLogger(
+      BlockManager.class);
+
+ private static final ThreadLocal