Skip to content

Commit 7bff8ca

Browse files
Kai Zheng authored and Zhe Zhang committed
HDFS-8920. Erasure Coding: when recovering lost blocks, logs can be too verbose and hurt performance. Contributed by Rui Li
1 parent 1080c37 commit 7bff8ca

File tree

3 files changed

+49
-3
lines changed

3 files changed

+49
-3
lines changed

hadoop-hdfs-project/hadoop-hdfs/CHANGES-HDFS-EC-7285.txt

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -444,3 +444,6 @@
444444

445445
HDFS-9091. Erasure Coding: Provide DistributedFilesystem API to
446446
getAllErasureCodingPolicies. (Rakesh R via zhz)
447+
448+
HDFS-8920. Erasure Coding: when recovering lost blocks, logs can be too
449+
verbose and hurt performance. (Rui Li via Kai Zheng)

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1057,9 +1057,7 @@ protected DNAddrPair getBestNodeDNAddrPair(LocatedBlock block,
10571057
}
10581058
}
10591059
if (chosenNode == null) {
1060-
DFSClient.LOG.warn("No live nodes contain block " + block.getBlock() +
1061-
" after checking nodes = " + Arrays.toString(nodes) +
1062-
", ignoredNodes = " + ignoredNodes);
1060+
reportLostBlock(block, ignoredNodes);
10631061
return null;
10641062
}
10651063
final String dnAddr =
@@ -1071,6 +1069,17 @@ protected DNAddrPair getBestNodeDNAddrPair(LocatedBlock block,
10711069
return new DNAddrPair(chosenNode, targetAddr, storageType);
10721070
}
10731071

1072+
/**
1073+
* Warn the user of a lost block
1074+
*/
1075+
protected void reportLostBlock(LocatedBlock lostBlock,
1076+
Collection<DatanodeInfo> ignoredNodes) {
1077+
DatanodeInfo[] nodes = lostBlock.getLocations();
1078+
DFSClient.LOG.warn("No live nodes contain block " + lostBlock.getBlock() +
1079+
" after checking nodes = " + Arrays.toString(nodes) +
1080+
", ignoredNodes = " + ignoredNodes);
1081+
}
1082+
10741083
private static String getBestNodeDNAddrPairErrorString(
10751084
DatanodeInfo nodes[], AbstractMap<DatanodeInfo,
10761085
DatanodeInfo> deadNodes, Collection<DatanodeInfo> ignoredNodes) {

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSStripedInputStream.java

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -45,8 +45,11 @@
4545
import java.io.IOException;
4646
import java.io.InterruptedIOException;
4747
import java.nio.ByteBuffer;
48+
import java.util.ArrayList;
4849
import java.util.Arrays;
50+
import java.util.Collections;
4951
import java.util.EnumSet;
52+
import java.util.List;
5053
import java.util.Set;
5154
import java.util.Collection;
5255
import java.util.Map;
@@ -154,6 +157,17 @@ void skip() {
154157
private StripeRange curStripeRange;
155158
private final CompletionService<Void> readingService;
156159

160+
/**
161+
* When warning the user of a lost block in striping mode, we remember the
162+
* dead nodes we've logged. All other striping blocks on these nodes can be
163+
* considered lost too, and we don't want to log a warning for each of them.
164+
* This is to prevent the log from being too verbose. Refer to HDFS-8920.
165+
*
166+
* To minimize the overhead, we only store the datanodeUuid in this set
167+
*/
168+
private final Set<String> warnedNodes = Collections.newSetFromMap(
169+
new ConcurrentHashMap<String, Boolean>());
170+
157171
DFSStripedInputStream(DFSClient dfsClient, String src,
158172
boolean verifyChecksum, ErasureCodingPolicy ecPolicy,
159173
LocatedBlocks locatedBlocks) throws IOException {
@@ -527,6 +541,26 @@ protected void fetchBlockByteRange(LocatedBlock block, long start,
527541
}
528542
}
529543

544+
@Override
545+
protected void reportLostBlock(LocatedBlock lostBlock,
546+
Collection<DatanodeInfo> ignoredNodes) {
547+
DatanodeInfo[] nodes = lostBlock.getLocations();
548+
if (nodes != null && nodes.length > 0) {
549+
List<String> dnUUIDs = new ArrayList<>();
550+
for (DatanodeInfo node : nodes) {
551+
dnUUIDs.add(node.getDatanodeUuid());
552+
}
553+
if (!warnedNodes.containsAll(dnUUIDs)) {
554+
DFSClient.LOG.warn(Arrays.toString(nodes) + " are unavailable and " +
555+
"all striping blocks on them are lost. " +
556+
"IgnoredNodes = " + ignoredNodes);
557+
warnedNodes.addAll(dnUUIDs);
558+
}
559+
} else {
560+
super.reportLostBlock(lostBlock, ignoredNodes);
561+
}
562+
}
563+
530564
/**
531565
* The reader for reading a complete {@link AlignedStripe}. Note that an
532566
* {@link AlignedStripe} may cross multiple stripes with cellSize width.

0 commit comments

Comments
 (0)