Skip to content

Commit 48cdf5f

Browse files
committed
HDFS-15519. Check inaccessible INodes in FsImageValidation.
1 parent 1071604 commit 48cdf5f

File tree

5 files changed

+258
-25
lines changed

5 files changed

+258
-25
lines changed

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FsImageValidation.java

Lines changed: 131 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
*/
1818
package org.apache.hadoop.hdfs.server.namenode;
1919

20+
import com.google.common.base.Preconditions;
2021
import org.apache.commons.logging.Log;
2122
import org.apache.commons.logging.LogFactory;
2223
import org.apache.commons.logging.impl.Log4JLogger;
@@ -28,8 +29,12 @@
2829
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
2930
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
3031
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
32+
import org.apache.hadoop.hdfs.server.common.Storage;
3133
import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
34+
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
3235
import org.apache.hadoop.hdfs.server.namenode.top.metrics.TopMetrics;
36+
import org.apache.hadoop.hdfs.server.namenode.visitor.INodeCountVisitor;
37+
import org.apache.hadoop.hdfs.server.namenode.visitor.INodeCountVisitor.Counts;
3338
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
3439
import org.apache.hadoop.util.GSet;
3540
import org.apache.hadoop.util.StringUtils;
@@ -40,15 +45,21 @@
4045
import org.slf4j.LoggerFactory;
4146

4247
import java.io.File;
48+
import java.io.FilenameFilter;
49+
import java.io.IOException;
4350
import java.util.Arrays;
51+
import java.util.Collections;
52+
import java.util.Objects;
4453
import java.util.Timer;
4554
import java.util.TimerTask;
55+
import java.util.concurrent.atomic.AtomicInteger;
4656

4757
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NAMENODES_KEY_PREFIX;
4858
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY;
4959
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY;
5060
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY;
5161
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_WRITE_LOCK_REPORTING_THRESHOLD_MS_KEY;
62+
import static org.apache.hadoop.hdfs.server.namenode.FsImageValidation.Cli.println;
5263
import static org.apache.hadoop.util.Time.now;
5364

5465
/**
@@ -134,6 +145,25 @@ static String toCommaSeparatedNumber(long n) {
134145
}
135146
return b.insert(0, n).toString();
136147
}
148+
149+
/** @return a filter for the given type. */
150+
static FilenameFilter newFilenameFilter(NameNodeFile type) {
151+
final String prefix = type.getName() + "_";
152+
return new FilenameFilter() {
153+
@Override
154+
public boolean accept(File dir, String name) {
155+
if (!name.startsWith(prefix)) {
156+
return false;
157+
}
158+
for (int i = prefix.length(); i < name.length(); i++) {
159+
if (!Character.isDigit(name.charAt(i))) {
160+
return false;
161+
}
162+
}
163+
return true;
164+
}
165+
};
166+
}
137167
}
138168

139169
private final File fsImageFile;
@@ -142,21 +172,44 @@ static String toCommaSeparatedNumber(long n) {
142172
this.fsImageFile = fsImageFile;
143173
}
144174

145-
int checkINodeReference(Configuration conf) throws Exception {
175+
int run() throws Exception {
176+
return run(new Configuration(), new AtomicInteger());
177+
}
178+
179+
int run(AtomicInteger errorCount) throws Exception {
180+
return run(new Configuration(), errorCount);
181+
}
182+
183+
int run(Configuration conf, AtomicInteger errorCount) throws Exception {
184+
final int initCount = errorCount.get();
146185
LOG.info(Util.memoryInfo());
147186
initConf(conf);
148187

188+
// check INodeReference
189+
final FSNamesystem namesystem = checkINodeReference(conf, errorCount);
190+
191+
// check INodeMap
192+
INodeMapValidation.run(namesystem.getFSDirectory(), errorCount);
193+
LOG.info(Util.memoryInfo());
194+
195+
final int d = errorCount.get() - initCount;
196+
if (d > 0) {
197+
Cli.println("Found %d error(s) in %s", d, fsImageFile.getAbsolutePath());
198+
}
199+
return d;
200+
}
201+
202+
private FSNamesystem loadImage(Configuration conf) throws IOException {
149203
final TimerTask checkProgress = new TimerTask() {
150204
@Override
151205
public void run() {
152206
final double percent = NameNode.getStartupProgress().createView()
153207
.getPercentComplete(Phase.LOADING_FSIMAGE);
154-
LOG.info(String.format("%s Progress: %.1f%%",
155-
Phase.LOADING_FSIMAGE, 100*percent));
208+
LOG.info(String.format("%s Progress: %.1f%% (%s)",
209+
Phase.LOADING_FSIMAGE, 100*percent, Util.memoryInfo()));
156210
}
157211
};
158212

159-
INodeReferenceValidation.start();
160213
final Timer t = new Timer();
161214
t.scheduleAtFixedRate(checkProgress, 0, 60_000);
162215
final long loadStart = now();
@@ -197,10 +250,33 @@ public void run() {
197250
t.cancel();
198251
Cli.println("Loaded %s %s successfully in %s",
199252
FS_IMAGE, fsImageFile, StringUtils.formatTime(now() - loadStart));
253+
return namesystem;
254+
}
255+
256+
FSNamesystem checkINodeReference(Configuration conf,
257+
AtomicInteger errorCount) throws Exception {
258+
INodeReferenceValidation.start();
259+
final FSNamesystem namesystem = loadImage(conf);
200260
LOG.info(Util.memoryInfo());
201-
final int errorCount = INodeReferenceValidation.end();
261+
INodeReferenceValidation.end(errorCount);
202262
LOG.info(Util.memoryInfo());
203-
return errorCount;
263+
return namesystem;
264+
}
265+
266+
static class INodeMapValidation {
267+
static void run(FSDirectory fsdir, AtomicInteger errorCount) {
268+
final int initErrorCount = errorCount.get();
269+
final Counts counts = INodeCountVisitor.countTree(fsdir.getRoot());
270+
for (INodeWithAdditionalFields i : fsdir.getINodeMap()) {
271+
if (counts.getCount(i) == 0) {
272+
Cli.printError(errorCount, "%s (%d) is inaccessible (%s)",
273+
i, i.getId(), i.getFullPathName());
274+
}
275+
}
276+
println("%s ended successfully: %d error(s) found.",
277+
INodeMapValidation.class.getSimpleName(),
278+
errorCount.get() - initErrorCount);
279+
}
204280
}
205281

206282
static class Cli extends Configured implements Tool {
@@ -217,9 +293,10 @@ public int run(String[] args) throws Exception {
217293
initLogLevels();
218294

219295
final FsImageValidation validation = FsImageValidation.newInstance(args);
220-
final int errorCount = validation.checkINodeReference(getConf());
296+
final AtomicInteger errorCount = new AtomicInteger();
297+
validation.run(getConf(), errorCount);
221298
println("Error Count: %s", errorCount);
222-
return errorCount == 0? 0: 1;
299+
return errorCount.get() == 0? 0: 1;
223300
}
224301

225302
static String parse(String... args) {
@@ -240,19 +317,63 @@ static String parse(String... args) {
240317
return f;
241318
}
242319

243-
static void println(String format, Object... args) {
320+
static synchronized void println(String format, Object... args) {
244321
final String s = String.format(format, args);
245322
System.out.println(s);
246323
LOG.info(s);
247324
}
248325

249-
static void printError(String message, Throwable t) {
326+
static synchronized void warn(String format, Object... args) {
327+
final String s = "WARN: " + String.format(format, args);
328+
System.out.println(s);
329+
LOG.warn(s);
330+
}
331+
332+
static synchronized void printError(String message, Throwable t) {
250333
System.out.println(message);
251334
if (t != null) {
252335
t.printStackTrace(System.out);
253336
}
254337
LOG.error(message, t);
255338
}
339+
340+
static synchronized void printError(AtomicInteger errorCount,
341+
String format, Object... args) {
342+
final int count = errorCount.incrementAndGet();
343+
final String s = "FSIMAGE_ERROR " + count + ": " + String.format(format, args);
344+
System.out.println(s);
345+
LOG.info(s);
346+
}
347+
}
348+
349+
public static int validate(FSNamesystem namesystem) throws Exception {
350+
final AtomicInteger errorCount = new AtomicInteger();
351+
final NNStorage nnStorage = namesystem.getFSImage().getStorage();
352+
for(Storage.StorageDirectory sd : nnStorage.getStorageDirs()) {
353+
validate(sd.getCurrentDir(), errorCount);
354+
}
355+
return errorCount.get();
356+
}
357+
358+
public static void validate(File path, AtomicInteger errorCount)
359+
throws Exception {
360+
if (path.isFile()) {
361+
new FsImageValidation(path).run(errorCount);
362+
} else if (path.isDirectory()) {
363+
final File[] images = path.listFiles(
364+
Util.newFilenameFilter(NameNodeFile.IMAGE));
365+
Objects.requireNonNull(images);
366+
Preconditions.checkState(images.length > 0);
367+
368+
Arrays.sort(images, Collections.reverseOrder());
369+
for (int i = 0; i < images.length; i++) {
370+
final File image = images[i];
371+
Cli.println("%d) image=%s", i, image);
372+
FsImageValidation.validate(image, errorCount);
373+
}
374+
}
375+
376+
Cli.warn("%s is neither a file nor a directory", path.getAbsolutePath());
256377
}
257378

258379
public static void main(String[] args) {

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeMap.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
* Storing all the {@link INode}s and maintaining the mapping between INode ID
3333
* and INode.
3434
*/
35-
public class INodeMap {
35+
public class INodeMap implements Iterable<INodeWithAdditionalFields> {
3636

3737
static INodeMap newInstance(INodeDirectory rootDir) {
3838
// Compute the map capacity by allocating 1% of total memory
@@ -47,6 +47,11 @@ static INodeMap newInstance(INodeDirectory rootDir) {
4747
private final GSet<INode, INodeWithAdditionalFields> map;
4848

4949
public Iterator<INodeWithAdditionalFields> getMapIterator() {
50+
return iterator();
51+
}
52+
53+
@Override
54+
public Iterator<INodeWithAdditionalFields> iterator() {
5055
return map.iterator();
5156
}
5257

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeReferenceValidation.java

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,18 +46,20 @@ public class INodeReferenceValidation {
4646

4747
public static void start() {
4848
INSTANCE.compareAndSet(null, new INodeReferenceValidation());
49-
println("Validation started");
49+
println("%s started", INodeReferenceValidation.class.getSimpleName());
5050
}
5151

52-
public static int end() {
52+
public static void end(AtomicInteger errorCount) {
5353
final INodeReferenceValidation instance = INSTANCE.getAndSet(null);
5454
if (instance == null) {
55-
return 0;
55+
return;
5656
}
5757

58-
final int errorCount = instance.assertReferences();
59-
println("Validation ended successfully: %d error(s) found.", errorCount);
60-
return errorCount;
58+
final int initCount = errorCount.get();
59+
instance.assertReferences(errorCount);
60+
println("%s ended successfully: %d error(s) found.",
61+
INodeReferenceValidation.class.getSimpleName(),
62+
errorCount.get() - initCount);
6163
}
6264

6365
static <REF extends INodeReference> void add(REF ref, Class<REF> clazz) {
@@ -153,7 +155,7 @@ <REF extends INodeReference> ReferenceSet<REF> getReferences(
153155
throw new IllegalArgumentException("References not found for " + clazz);
154156
}
155157

156-
private int assertReferences() {
158+
private void assertReferences(AtomicInteger errorCount) {
157159
final int p = Runtime.getRuntime().availableProcessors();
158160
LOG.info("Available Processors: {}", p);
159161
final ExecutorService service = Executors.newFixedThreadPool(p);
@@ -168,7 +170,6 @@ public void run() {
168170
final Timer t = new Timer();
169171
t.scheduleAtFixedRate(checkProgress, 0, 1_000);
170172

171-
final AtomicInteger errorCount = new AtomicInteger();
172173
try {
173174
dstReferences.submit(errorCount, service);
174175
withCounts.submit(errorCount, service);
@@ -183,7 +184,6 @@ public void run() {
183184
service.shutdown();
184185
t.cancel();
185186
}
186-
return errorCount.get();
187187
}
188188

189189
static <REF extends INodeReference> List<Task<REF>> createTasks(
@@ -215,7 +215,7 @@ public Integer call() throws Exception {
215215
try {
216216
ref.assertReferences();
217217
} catch (Throwable t) {
218-
println("%d: %s", errorCount.incrementAndGet(), t);
218+
printError(errorCount, "%s", t);
219219
}
220220
}
221221
return references.size();

0 commit comments

Comments
 (0)