Skip to content

Commit e1da6b5

Browse files
committed
HBASE-26267 Don't try to recover WALs from a WAL dir which doesn't exist
We currently cause an error to be thrown by trying to list a non-existent directory. We see that the master region directory exists on the filesystem, but forget to make sure that the master region's WAL directory also exists before we try to list it.
1 parent cda5a87 commit e1da6b5

File tree

3 files changed

+128
-5
lines changed

3 files changed

+128
-5
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,31 @@ private static HRegion open(Configuration conf, TableDescriptor td, FileSystem f
227227
if (!walFs.exists(replayEditsDir) && !walFs.mkdirs(replayEditsDir)) {
228228
throw new IOException("Failed to create replay directory: " + replayEditsDir);
229229
}
230+
231+
// Replay any WALs for the Master Region before opening it.
230232
Path walsDir = new Path(walRootDir, HREGION_LOGDIR_NAME);
233+
// In open(...), we expect the WAL directory for the MasterRegion to already exist.
234+
// This is in contrast to bootstrap() where we create the MasterRegion data and WAL dir.
235+
// However, it's possible that users directly remove the WAL directory. We expect walsDir
236+
// to always exist in normal situations, but we should guard against users changing the
237+
// filesystem outside of HBase's line of sight.
238+
if (walFs.exists(walsDir)) {
239+
replayWALs(conf, walFs, walRootDir, walsDir, regionInfo, serverName, replayEditsDir);
240+
} else {
241+
LOG.error("UNEXPECTED: WAL directory for MasterRegion is missing."
242+
+ " {} is unexpectedly missing.", walsDir);
243+
}
244+
245+
// Create a new WAL
246+
WAL wal = createWAL(walFactory, walRoller, serverName, walFs, walRootDir, regionInfo);
247+
conf.set(HRegion.SPECIAL_RECOVERED_EDITS_DIR,
248+
replayEditsDir.makeQualified(walFs.getUri(), walFs.getWorkingDirectory()).toString());
249+
return HRegion.openHRegionFromTableDir(conf, fs, tableDir, regionInfo, td, wal, null, null);
250+
}
251+
252+
private static void replayWALs(Configuration conf, FileSystem walFs, Path walRootDir,
253+
Path walsDir, RegionInfo regionInfo, String serverName, Path replayEditsDir)
254+
throws IOException {
231255
for (FileStatus walDir : walFs.listStatus(walsDir)) {
232256
if (!walDir.isDirectory()) {
233257
continue;
@@ -261,11 +285,6 @@ private static HRegion open(Configuration conf, TableDescriptor td, FileSystem f
261285
LOG.info("Delete empty local region wal dir {}", deadWALDir);
262286
walFs.delete(deadWALDir, true);
263287
}
264-
265-
WAL wal = createWAL(walFactory, walRoller, serverName, walFs, walRootDir, regionInfo);
266-
conf.set(HRegion.SPECIAL_RECOVERED_EDITS_DIR,
267-
replayEditsDir.makeQualified(walFs.getUri(), walFs.getWorkingDirectory()).toString());
268-
return HRegion.openHRegionFromTableDir(conf, fs, tableDir, regionInfo, td, wal, null, null);
269288
}
270289

271290
public static MasterRegion create(MasterRegionParams params) throws IOException {

hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/MasterRegionTestBase.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,14 @@ public void setUp() throws IOException {
8080
htu.getConfiguration().setBoolean(MemStoreLAB.USEMSLAB_KEY, false);
8181
// Runs on local filesystem. Test does not need sync. Turn off checks.
8282
htu.getConfiguration().setBoolean(CommonFSUtils.UNSAFE_STREAM_CAPABILITY_ENFORCE, false);
83+
84+
createMasterRegion();
85+
}
86+
87+
/**
88+
* Creates a new MasterRegion using an existing {@code htu} on this class.
89+
*/
90+
protected void createMasterRegion() throws IOException {
8391
configure(htu.getConfiguration());
8492
choreService = new ChoreService(getClass().getSimpleName());
8593
hfileCleanerPool = DirScanPool.getHFileCleanerScanPool(htu.getConfiguration());
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.master.region;
19+
20+
import static org.junit.Assert.assertArrayEquals;
21+
import static org.junit.Assert.assertEquals;
22+
import static org.junit.Assert.assertTrue;
23+
24+
import java.io.IOException;
25+
import java.util.Arrays;
26+
27+
import org.apache.hadoop.conf.Configuration;
28+
import org.apache.hadoop.fs.FileStatus;
29+
import org.apache.hadoop.fs.FileSystem;
30+
import org.apache.hadoop.fs.Path;
31+
import org.apache.hadoop.hbase.Cell;
32+
import org.apache.hadoop.hbase.CellUtil;
33+
import org.apache.hadoop.hbase.HBaseClassTestRule;
34+
import org.apache.hadoop.hbase.HConstants;
35+
import org.apache.hadoop.hbase.client.Get;
36+
import org.apache.hadoop.hbase.client.Put;
37+
import org.apache.hadoop.hbase.client.Result;
38+
import org.apache.hadoop.hbase.testclassification.MasterTests;
39+
import org.apache.hadoop.hbase.testclassification.MediumTests;
40+
import org.apache.hadoop.hbase.util.Bytes;
41+
import org.junit.ClassRule;
42+
import org.junit.Test;
43+
import org.junit.experimental.categories.Category;
44+
import org.slf4j.Logger;
45+
import org.slf4j.LoggerFactory;
46+
47+
@Category({ MasterTests.class, MediumTests.class })
48+
public class TestMasterRegionWALRecovery extends MasterRegionTestBase {
49+
private static final Logger LOG = LoggerFactory.getLogger(TestMasterRegionWALRecovery.class);
50+
51+
@ClassRule
52+
public static final HBaseClassTestRule CLASS_RULE =
53+
HBaseClassTestRule.forClass(TestMasterRegionWALRecovery.class);
54+
55+
private Path masterRegionDir;
56+
57+
@Override
58+
protected void postSetUp() throws IOException {
59+
Configuration conf = htu.getConfiguration();
60+
Path testDir = htu.getDataTestDir();
61+
FileSystem fs = testDir.getFileSystem(conf);
62+
masterRegionDir = new Path(testDir, REGION_DIR_NAME);
63+
}
64+
65+
@Test
66+
public void test() throws IOException, InterruptedException {
67+
region
68+
.update(r -> r.put(new Put(Bytes.toBytes(1)).addColumn(CF1, QUALIFIER, Bytes.toBytes(1))));
69+
region.flush(true);
70+
71+
Path testDir = htu.getDataTestDir();
72+
FileSystem fs = testDir.getFileSystem(htu.getConfiguration());
73+
region.close(false);
74+
75+
Path masterRegionWalDir = new Path(masterRegionDir, HConstants.HREGION_LOGDIR_NAME);
76+
LOG.info("WAL dir: {}", masterRegionWalDir);
77+
assertTrue(fs.exists(masterRegionWalDir));
78+
// Make sure we have the WAL for the localhost "server"
79+
FileStatus[] files = fs.listStatus(masterRegionWalDir);
80+
LOG.info("WAL files: {}", Arrays.toString(files));
81+
assertEquals(1, files.length);
82+
LOG.info("Deleting {}", masterRegionWalDir);
83+
// Delete the WAL directory
84+
fs.delete(masterRegionWalDir, true);
85+
86+
// Re-create the MasterRegion and hit the MasterRegion#open() code-path
87+
// (rather than bootstrap())
88+
createMasterRegion();
89+
90+
// Make sure we can read the same data we wrote (we flushed before nuking the WALs,
91+
// so data should be durable)
92+
Result r = region.get(new Get(Bytes.toBytes(1)));
93+
Cell c = r.getColumnLatestCell(CF1, QUALIFIER);
94+
assertArrayEquals(Bytes.toBytes(1), CellUtil.cloneValue(c));
95+
}
96+
}

0 commit comments

Comments
 (0)