Skip to content

Commit f3c20c7

Browse files
authored
HBASE-28803 HBase Master stuck due to improper handling of WALSyncTimeoutException within UncheckedIOException (#6254)
Signed-off-by: Peter Somogyi <[email protected]>
Signed-off-by: Ray Mattingly <[email protected]>
1 parent a53a368 commit f3c20c7

File tree

2 files changed

+145
-8
lines changed

2 files changed

+145
-8
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import org.apache.hadoop.hbase.client.Scan;
3939
import org.apache.hadoop.hbase.client.TableDescriptor;
4040
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
41+
import org.apache.hadoop.hbase.log.HBaseMarkers;
4142
import org.apache.hadoop.hbase.regionserver.HRegion;
4243
import org.apache.hadoop.hbase.regionserver.HRegion.FlushResult;
4344
import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
@@ -46,6 +47,7 @@
4647
import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTracker;
4748
import org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileTrackerFactory;
4849
import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
50+
import org.apache.hadoop.hbase.regionserver.wal.WALSyncTimeoutIOException;
4951
import org.apache.hadoop.hbase.util.Bytes;
5052
import org.apache.hadoop.hbase.util.CommonFSUtils;
5153
import org.apache.hadoop.hbase.util.FSTableDescriptors;
@@ -106,6 +108,8 @@ public final class MasterRegion {
106108

107109
private static final int REGION_ID = 1;
108110

111+
private final Server server;
112+
109113
private final WALFactory walFactory;
110114

111115
final HRegion region;
@@ -114,8 +118,9 @@ public final class MasterRegion {
114118

115119
private MasterRegionWALRoller walRoller;
116120

117-
private MasterRegion(HRegion region, WALFactory walFactory,
121+
private MasterRegion(Server server, HRegion region, WALFactory walFactory,
118122
MasterRegionFlusherAndCompactor flusherAndCompactor, MasterRegionWALRoller walRoller) {
123+
this.server = server;
119124
this.region = region;
120125
this.walFactory = walFactory;
121126
this.flusherAndCompactor = flusherAndCompactor;
@@ -139,8 +144,14 @@ private void shutdownWAL() {
139144
}
140145

141146
public void update(UpdateMasterRegion action) throws IOException {
142-
action.update(region);
143-
flusherAndCompactor.onUpdate();
147+
try {
148+
action.update(region);
149+
flusherAndCompactor.onUpdate();
150+
} catch (WALSyncTimeoutIOException e) {
151+
LOG.error(HBaseMarkers.FATAL, "WAL sync timeout. Aborting server.");
152+
server.abort("WAL sync timeout", e);
153+
throw e;
154+
}
144155
}
145156

146157
public Result get(Get get) throws IOException {
@@ -156,10 +167,16 @@ public RegionScanner getRegionScanner(Scan scan) throws IOException {
156167
}
157168

158169
public FlushResult flush(boolean force) throws IOException {
159-
flusherAndCompactor.resetChangesAfterLastFlush();
160-
FlushResult flushResult = region.flush(force);
161-
flusherAndCompactor.recordLastFlushTime();
162-
return flushResult;
170+
try {
171+
flusherAndCompactor.resetChangesAfterLastFlush();
172+
FlushResult flushResult = region.flush(force);
173+
flusherAndCompactor.recordLastFlushTime();
174+
return flushResult;
175+
} catch (WALSyncTimeoutIOException e) {
176+
LOG.error(HBaseMarkers.FATAL, "WAL sync timeout. Aborting server.");
177+
server.abort("WAL sync timeout", e);
178+
throw e;
179+
}
163180
}
164181

165182
@RestrictedApi(explanation = "Should only be called in tests", link = "",
@@ -446,6 +463,6 @@ public static MasterRegion create(MasterRegionParams params) throws IOException
446463
LOG.warn("Failed to create archive directory {}. Usually this should not happen but it will"
447464
+ " be created again when we actually archive the hfiles later, so continue", archiveDir);
448465
}
449-
return new MasterRegion(region, walFactory, flusherAndCompactor, walRoller);
466+
return new MasterRegion(server, region, walFactory, flusherAndCompactor, walRoller);
450467
}
451468
}
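hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/TestMasterRegionWALSyncTimeoutIOException.java

Lines changed: 120 additions & 0 deletions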
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.master.region;
19+
20+
import static org.junit.Assert.assertThrows;
21+
22+
import java.io.IOException;
23+
import java.time.Duration;
24+
import java.util.List;
25+
import org.apache.hadoop.conf.Configuration;
26+
import org.apache.hadoop.fs.FileSystem;
27+
import org.apache.hadoop.fs.Path;
28+
import org.apache.hadoop.hbase.Abortable;
29+
import org.apache.hadoop.hbase.HBaseClassTestRule;
30+
import org.apache.hadoop.hbase.client.Put;
31+
import org.apache.hadoop.hbase.io.asyncfs.monitor.StreamSlowMonitor;
32+
import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
33+
import org.apache.hadoop.hbase.regionserver.wal.AsyncFSWAL;
34+
import org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException;
35+
import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
36+
import org.apache.hadoop.hbase.regionserver.wal.WALSyncTimeoutIOException;
37+
import org.apache.hadoop.hbase.testclassification.MasterTests;
38+
import org.apache.hadoop.hbase.testclassification.MediumTests;
39+
import org.apache.hadoop.hbase.util.Bytes;
40+
import org.apache.hadoop.hbase.util.CommonFSUtils;
41+
import org.apache.hadoop.hbase.wal.AsyncFSWALProvider;
42+
import org.apache.hadoop.hbase.wal.WALFactory;
43+
import org.apache.hadoop.hbase.wal.WALProvider;
44+
import org.junit.ClassRule;
45+
import org.junit.Test;
46+
import org.junit.experimental.categories.Category;
47+
48+
import org.apache.hbase.thirdparty.io.netty.channel.Channel;
49+
import org.apache.hbase.thirdparty.io.netty.channel.EventLoopGroup;
50+
51+
@Category({ MasterTests.class, MediumTests.class })
52+
public class TestMasterRegionWALSyncTimeoutIOException extends MasterRegionTestBase {
53+
54+
@ClassRule
55+
public static final HBaseClassTestRule CLASS_RULE =
56+
HBaseClassTestRule.forClass(TestMasterRegionWALSyncTimeoutIOException.class);
57+
58+
private static final Duration WAL_SYNC_TIMEOUT = Duration.ofSeconds(3);
59+
60+
private static volatile boolean testWalTimeout = false;
61+
62+
@Override
63+
protected void configure(Configuration conf) throws IOException {
64+
conf.setClass(WALFactory.WAL_PROVIDER, SlowAsyncFSWALProvider.class, WALProvider.class);
65+
conf.setLong(AbstractFSWAL.WAL_SYNC_TIMEOUT_MS, WAL_SYNC_TIMEOUT.toMillis());
66+
}
67+
68+
@Override
69+
protected void configure(MasterRegionParams params) {
70+
params.flushIntervalMs(Duration.ofSeconds(1).toMillis());
71+
}
72+
73+
@Test
74+
public void testUpdateWalSyncWriteException() {
75+
testWalTimeout = true;
76+
assertThrows(WALSyncTimeoutIOException.class, () -> {
77+
for (int i = 0; i < 10; i++) {
78+
region.update(
79+
r -> r.put(new Put(Bytes.toBytes("0")).addColumn(CF1, QUALIFIER, Bytes.toBytes("0"))));
80+
Thread.sleep(Duration.ofSeconds(1).toMillis());
81+
}
82+
});
83+
}
84+
85+
public static class SlowAsyncFSWAL extends AsyncFSWAL {
86+
87+
public SlowAsyncFSWAL(FileSystem fs, Abortable abortable, Path rootDir, String logDir,
88+
String archiveDir, Configuration conf, List<WALActionsListener> listeners,
89+
boolean failIfWALExists, String prefix, String suffix, EventLoopGroup eventLoopGroup,
90+
Class<? extends Channel> channelClass, StreamSlowMonitor monitor)
91+
throws FailedLogCloseException, IOException {
92+
super(fs, abortable, rootDir, logDir, archiveDir, conf, listeners, failIfWALExists, prefix,
93+
suffix, eventLoopGroup, channelClass, monitor);
94+
}
95+
96+
@Override
97+
protected void atHeadOfRingBufferEventHandlerAppend() {
98+
if (testWalTimeout) {
99+
try {
100+
Thread.sleep(WAL_SYNC_TIMEOUT.plusSeconds(1).toMillis());
101+
} catch (InterruptedException e) {
102+
throw new RuntimeException(e);
103+
}
104+
}
105+
super.atHeadOfRingBufferEventHandlerAppend();
106+
}
107+
}
108+
109+
public static class SlowAsyncFSWALProvider extends AsyncFSWALProvider {
110+
111+
@Override
112+
protected AsyncFSWAL createWAL() throws IOException {
113+
return new SlowAsyncFSWAL(CommonFSUtils.getWALFileSystem(conf), this.abortable,
114+
CommonFSUtils.getWALRootDir(conf), getWALDirectoryName(factory.getFactoryId()),
115+
getWALArchiveDirectoryName(conf, factory.getFactoryId()), conf, listeners, true, logPrefix,
116+
META_WAL_PROVIDER_ID.equals(providerId) ? META_WAL_PROVIDER_ID : null, eventLoopGroup,
117+
channelClass, factory.getExcludeDatanodeManager().getStreamSlowMonitor(providerId));
118+
}
119+
}
120+
}

0 commit comments

Comments
 (0)