Skip to content

Commit b93dd7c

Browse files
authored
HDFS-15519. Check inaccessible INodes in FsImageValidation. (apache#2224)
1 parent 15a76e8 commit b93dd7c

File tree

4 files changed

+263
-24
lines changed

4 files changed

+263
-24
lines changed

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FsImageValidation.java

Lines changed: 144 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,12 @@
2828
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
2929
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
3030
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
31+
import org.apache.hadoop.hdfs.server.common.Storage;
3132
import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
33+
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
3234
import org.apache.hadoop.hdfs.server.namenode.top.metrics.TopMetrics;
35+
import org.apache.hadoop.hdfs.server.namenode.visitor.INodeCountVisitor;
36+
import org.apache.hadoop.hdfs.server.namenode.visitor.INodeCountVisitor.Counts;
3337
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
3438
import org.apache.hadoop.util.GSet;
3539
import org.apache.hadoop.util.StringUtils;
@@ -40,15 +44,21 @@
4044
import org.slf4j.LoggerFactory;
4145

4246
import java.io.File;
47+
import java.io.FilenameFilter;
48+
import java.io.IOException;
4349
import java.util.Arrays;
50+
import java.util.Collections;
51+
import java.util.Iterator;
4452
import java.util.Timer;
4553
import java.util.TimerTask;
54+
import java.util.concurrent.atomic.AtomicInteger;
4655

4756
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NAMENODES_KEY_PREFIX;
4857
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY;
4958
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY;
5059
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY;
5160
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_WRITE_LOCK_REPORTING_THRESHOLD_MS_KEY;
61+
import static org.apache.hadoop.hdfs.server.namenode.FsImageValidation.Cli.println;
5262
import static org.apache.hadoop.util.Time.now;
5363

5464
/**
@@ -134,6 +144,25 @@ static String toCommaSeparatedNumber(long n) {
134144
}
135145
return b.insert(0, n).toString();
136146
}
147+
148+
/** @return a filter for the given type. */
149+
static FilenameFilter newFilenameFilter(NameNodeFile type) {
150+
final String prefix = type.getName() + "_";
151+
return new FilenameFilter() {
152+
@Override
153+
public boolean accept(File dir, String name) {
154+
if (!name.startsWith(prefix)) {
155+
return false;
156+
}
157+
for (int i = prefix.length(); i < name.length(); i++) {
158+
if (!Character.isDigit(name.charAt(i))) {
159+
return false;
160+
}
161+
}
162+
return true;
163+
}
164+
};
165+
}
137166
}
138167

139168
private final File fsImageFile;
@@ -142,21 +171,44 @@ static String toCommaSeparatedNumber(long n) {
142171
this.fsImageFile = fsImageFile;
143172
}
144173

145-
int checkINodeReference(Configuration conf) throws Exception {
174+
int run() throws Exception {
175+
return run(new Configuration(), new AtomicInteger());
176+
}
177+
178+
int run(AtomicInteger errorCount) throws Exception {
179+
return run(new Configuration(), errorCount);
180+
}
181+
182+
int run(Configuration conf, AtomicInteger errorCount) throws Exception {
183+
final int initCount = errorCount.get();
146184
LOG.info(Util.memoryInfo());
147185
initConf(conf);
148186

187+
// check INodeReference
188+
final FSNamesystem namesystem = checkINodeReference(conf, errorCount);
189+
190+
// check INodeMap
191+
INodeMapValidation.run(namesystem.getFSDirectory(), errorCount);
192+
LOG.info(Util.memoryInfo());
193+
194+
final int d = errorCount.get() - initCount;
195+
if (d > 0) {
196+
Cli.println("Found %d error(s) in %s", d, fsImageFile.getAbsolutePath());
197+
}
198+
return d;
199+
}
200+
201+
private FSNamesystem loadImage(Configuration conf) throws IOException {
149202
final TimerTask checkProgress = new TimerTask() {
150203
@Override
151204
public void run() {
152205
final double percent = NameNode.getStartupProgress().createView()
153206
.getPercentComplete(Phase.LOADING_FSIMAGE);
154-
LOG.info(String.format("%s Progress: %.1f%%",
155-
Phase.LOADING_FSIMAGE, 100*percent));
207+
LOG.info(String.format("%s Progress: %.1f%% (%s)",
208+
Phase.LOADING_FSIMAGE, 100*percent, Util.memoryInfo()));
156209
}
157210
};
158211

159-
INodeReferenceValidation.start();
160212
final Timer t = new Timer();
161213
t.scheduleAtFixedRate(checkProgress, 0, 60_000);
162214
final long loadStart = now();
@@ -197,10 +249,42 @@ public void run() {
197249
t.cancel();
198250
Cli.println("Loaded %s %s successfully in %s",
199251
FS_IMAGE, fsImageFile, StringUtils.formatTime(now() - loadStart));
252+
return namesystem;
253+
}
254+
255+
FSNamesystem checkINodeReference(Configuration conf,
256+
AtomicInteger errorCount) throws Exception {
257+
INodeReferenceValidation.start();
258+
final FSNamesystem namesystem = loadImage(conf);
200259
LOG.info(Util.memoryInfo());
201-
final int errorCount = INodeReferenceValidation.end();
260+
INodeReferenceValidation.end(errorCount);
202261
LOG.info(Util.memoryInfo());
203-
return errorCount;
262+
return namesystem;
263+
}
264+
265+
static class INodeMapValidation {
266+
static Iterable<INodeWithAdditionalFields> iterate(INodeMap map) {
267+
return new Iterable<INodeWithAdditionalFields>() {
268+
@Override
269+
public Iterator<INodeWithAdditionalFields> iterator() {
270+
return map.getMapIterator();
271+
}
272+
};
273+
}
274+
275+
static void run(FSDirectory fsdir, AtomicInteger errorCount) {
276+
final int initErrorCount = errorCount.get();
277+
final Counts counts = INodeCountVisitor.countTree(fsdir.getRoot());
278+
for (INodeWithAdditionalFields i : iterate(fsdir.getINodeMap())) {
279+
if (counts.getCount(i) == 0) {
280+
Cli.printError(errorCount, "%s (%d) is inaccessible (%s)",
281+
i, i.getId(), i.getFullPathName());
282+
}
283+
}
284+
println("%s ended successfully: %d error(s) found.",
285+
INodeMapValidation.class.getSimpleName(),
286+
errorCount.get() - initErrorCount);
287+
}
204288
}
205289

206290
static class Cli extends Configured implements Tool {
@@ -217,9 +301,10 @@ public int run(String[] args) throws Exception {
217301
initLogLevels();
218302

219303
final FsImageValidation validation = FsImageValidation.newInstance(args);
220-
final int errorCount = validation.checkINodeReference(getConf());
304+
final AtomicInteger errorCount = new AtomicInteger();
305+
validation.run(getConf(), errorCount);
221306
println("Error Count: %s", errorCount);
222-
return errorCount == 0? 0: 1;
307+
return errorCount.get() == 0? 0: 1;
223308
}
224309

225310
static String parse(String... args) {
@@ -240,19 +325,68 @@ static String parse(String... args) {
240325
return f;
241326
}
242327

243-
static void println(String format, Object... args) {
328+
static synchronized void println(String format, Object... args) {
244329
final String s = String.format(format, args);
245330
System.out.println(s);
246331
LOG.info(s);
247332
}
248333

249-
static void printError(String message, Throwable t) {
334+
static synchronized void warn(String format, Object... args) {
335+
final String s = "WARN: " + String.format(format, args);
336+
System.out.println(s);
337+
LOG.warn(s);
338+
}
339+
340+
static synchronized void printError(String message, Throwable t) {
250341
System.out.println(message);
251342
if (t != null) {
252343
t.printStackTrace(System.out);
253344
}
254345
LOG.error(message, t);
255346
}
347+
348+
static synchronized void printError(AtomicInteger errorCount,
349+
String format, Object... args) {
350+
final int count = errorCount.incrementAndGet();
351+
final String s = "FSIMAGE_ERROR " + count + ": "
352+
+ String.format(format, args);
353+
System.out.println(s);
354+
LOG.info(s);
355+
}
356+
}
357+
358+
public static int validate(FSNamesystem namesystem) throws Exception {
359+
final AtomicInteger errorCount = new AtomicInteger();
360+
final NNStorage nnStorage = namesystem.getFSImage().getStorage();
361+
for(Storage.StorageDirectory sd : nnStorage.getStorageDirs()) {
362+
validate(sd.getCurrentDir(), errorCount);
363+
}
364+
return errorCount.get();
365+
}
366+
367+
public static void validate(File path, AtomicInteger errorCount)
368+
throws Exception {
369+
if (path.isFile()) {
370+
new FsImageValidation(path).run(errorCount);
371+
} else if (path.isDirectory()) {
372+
final File[] images = path.listFiles(
373+
Util.newFilenameFilter(NameNodeFile.IMAGE));
374+
if (images == null || images.length == 0) {
375+
Cli.warn("%s not found in %s", FSImage.class.getSimpleName(),
376+
path.getAbsolutePath());
377+
return;
378+
}
379+
380+
Arrays.sort(images, Collections.reverseOrder());
381+
for (int i = 0; i < images.length; i++) {
382+
final File image = images[i];
383+
Cli.println("%s %d) %s", FSImage.class.getSimpleName(),
384+
i, image.getAbsolutePath());
385+
FsImageValidation.validate(image, errorCount);
386+
}
387+
}
388+
389+
Cli.warn("%s is neither a file nor a directory", path.getAbsolutePath());
256390
}
257391

258392
public static void main(String[] args) {

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/INodeReferenceValidation.java

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,18 +46,20 @@ public class INodeReferenceValidation {
4646

4747
public static void start() {
4848
INSTANCE.compareAndSet(null, new INodeReferenceValidation());
49-
println("Validation started");
49+
println("%s started", INodeReferenceValidation.class.getSimpleName());
5050
}
5151

52-
public static int end() {
52+
public static void end(AtomicInteger errorCount) {
5353
final INodeReferenceValidation instance = INSTANCE.getAndSet(null);
5454
if (instance == null) {
55-
return 0;
55+
return;
5656
}
5757

58-
final int errorCount = instance.assertReferences();
59-
println("Validation ended successfully: %d error(s) found.", errorCount);
60-
return errorCount;
58+
final int initCount = errorCount.get();
59+
instance.assertReferences(errorCount);
60+
println("%s ended successfully: %d error(s) found.",
61+
INodeReferenceValidation.class.getSimpleName(),
62+
errorCount.get() - initCount);
6163
}
6264

6365
static <REF extends INodeReference> void add(REF ref, Class<REF> clazz) {
@@ -153,7 +155,7 @@ <REF extends INodeReference> ReferenceSet<REF> getReferences(
153155
throw new IllegalArgumentException("References not found for " + clazz);
154156
}
155157

156-
private int assertReferences() {
158+
private void assertReferences(AtomicInteger errorCount) {
157159
final int p = Runtime.getRuntime().availableProcessors();
158160
LOG.info("Available Processors: {}", p);
159161
final ExecutorService service = Executors.newFixedThreadPool(p);
@@ -168,7 +170,6 @@ public void run() {
168170
final Timer t = new Timer();
169171
t.scheduleAtFixedRate(checkProgress, 0, 1_000);
170172

171-
final AtomicInteger errorCount = new AtomicInteger();
172173
try {
173174
dstReferences.submit(errorCount, service);
174175
withCounts.submit(errorCount, service);
@@ -183,7 +184,6 @@ public void run() {
183184
service.shutdown();
184185
t.cancel();
185186
}
186-
return errorCount.get();
187187
}
188188

189189
static <REF extends INodeReference> List<Task<REF>> createTasks(
@@ -215,7 +215,7 @@ public Integer call() throws Exception {
215215
try {
216216
ref.assertReferences();
217217
} catch (Throwable t) {
218-
println("%d: %s", errorCount.incrementAndGet(), t);
218+
printError(errorCount, "%s", t);
219219
}
220220
}
221221
return references.size();

0 commit comments

Comments
 (0)