Skip to content

Commit 7c034d6

Browse files
comnetwork authored and petersomogyi committed
CDPD-66387: HBASE-27230 RegionServer should be aborted when WAL.sync throws TimeoutIOException (apache#4641)
Signed-off-by: Duo Zhang <[email protected]> Change-Id: I4a413143681376235a250d164fec36ff9eb377b4
1 parent 62ef3aa commit 7c034d6

File tree

9 files changed

+290
-13
lines changed

9 files changed

+290
-13
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegion.java

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@
164164
import org.apache.hadoop.hbase.regionserver.throttle.NoLimitThroughputController;
165165
import org.apache.hadoop.hbase.regionserver.throttle.StoreHotnessProtector;
166166
import org.apache.hadoop.hbase.regionserver.throttle.ThroughputController;
167+
import org.apache.hadoop.hbase.regionserver.wal.WALSyncTimeoutIOException;
167168
import org.apache.hadoop.hbase.regionserver.wal.WALUtil;
168169
import org.apache.hadoop.hbase.replication.ReplicationUtils;
169170
import org.apache.hadoop.hbase.replication.regionserver.ReplicationObserver;
@@ -1338,7 +1339,9 @@ public RegionInfo getRegionInfo() {
13381339
return this.fs.getRegionInfo();
13391340
}
13401341

1341-
/** Returns Instance of {@link RegionServerServices} used by this HRegion. Can be null. */
1342+
/**
1343+
* Returns Instance of {@link RegionServerServices} used by this HRegion. Can be null.
1344+
*/
13421345
RegionServerServices getRegionServerServices() {
13431346
return this.rsServices;
13441347
}
@@ -3631,7 +3634,7 @@ public void doPostOpCleanupForMiniBatch(
36313634
* @param familyMap Map of Cells by family
36323635
*/
36333636
protected void applyFamilyMapToMemStore(Map<byte[], List<Cell>> familyMap,
3634-
MemStoreSizing memstoreAccounting) throws IOException {
3637+
MemStoreSizing memstoreAccounting) {
36353638
for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
36363639
byte[] family = e.getKey();
36373640
List<Cell> cells = e.getValue();
@@ -5049,7 +5052,7 @@ void put(final byte[] row, byte[] family, List<Cell> edits) throws IOException {
50495052
* @see #applyToMemStore(HStore, Cell, MemStoreSizing)
50505053
*/
50515054
private void applyToMemStore(HStore store, List<Cell> cells, boolean delta,
5052-
MemStoreSizing memstoreAccounting) throws IOException {
5055+
MemStoreSizing memstoreAccounting) {
50535056
// Any change in how we update Store/MemStore needs to also be done in other applyToMemStore!!!!
50545057
boolean upsert = delta && store.getColumnFamilyDescriptor().getMaxVersions() == 1;
50555058
if (upsert) {
@@ -8514,6 +8517,19 @@ private WriteEntry doWALAppend(WALEdit walEdit, Durability durability, List<UUID
85148517
if (walKey != null && walKey.getWriteEntry() != null) {
85158518
mvcc.complete(walKey.getWriteEntry());
85168519
}
8520+
8521+
/**
8522+
* If {@link WAL#sync} gets a timeout exception, the only correct way is to abort the region
8523+
* server, as the design of {@link WAL#sync} is to succeed or die; there is no 'failure'. It
8524+
* is usually not a big deal because we set a very large default value (5 minutes) for
8525+
* {@link AbstractFSWAL#WAL_SYNC_TIMEOUT_MS}, usually the WAL system will abort the region
8526+
* server if it can not finish the sync within 5 minutes.
8527+
*/
8528+
if (ioe instanceof WALSyncTimeoutIOException) {
8529+
if (rsServices != null) {
8530+
rsServices.abort("WAL sync timeout,forcing server shutdown", ioe);
8531+
}
8532+
}
85178533
throw ioe;
85188534
}
85198535
return writeEntry;

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HStore.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1975,8 +1975,7 @@ public long getSmallestReadPoint() {
19751975
* across all of them.
19761976
* @param readpoint readpoint below which we can safely remove duplicate KVs
19771977
*/
1978-
public void upsert(Iterable<Cell> cells, long readpoint, MemStoreSizing memstoreSizing)
1979-
throws IOException {
1978+
public void upsert(Iterable<Cell> cells, long readpoint, MemStoreSizing memstoreSizing) {
19801979
this.storeEngine.readLock();
19811980
try {
19821981
this.memstore.upsert(cells, readpoint, memstoreSizing);

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
138138
"hbase.regionserver.wal.slowsync.roll.interval.ms";
139139
protected static final int DEFAULT_SLOW_SYNC_ROLL_INTERVAL_MS = 60 * 1000; // in ms, 1 minute
140140

141-
protected static final String WAL_SYNC_TIMEOUT_MS = "hbase.regionserver.wal.sync.timeout";
141+
public static final String WAL_SYNC_TIMEOUT_MS = "hbase.regionserver.wal.sync.timeout";
142142
protected static final int DEFAULT_WAL_SYNC_TIMEOUT_MS = 5 * 60 * 1000; // in ms, 5min
143143

144144
public static final String WAL_ROLL_MULTIPLIER = "hbase.regionserver.logroll.multiplier";
@@ -845,7 +845,7 @@ protected final void blockOnSync(SyncFuture syncFuture) throws IOException {
845845
}
846846
}
847847
} catch (TimeoutIOException tioe) {
848-
throw tioe;
848+
throw new WALSyncTimeoutIOException(tioe);
849849
} catch (InterruptedException ie) {
850850
LOG.warn("Interrupted", ie);
851851
throw convertInterruptedExceptionToIOException(ie);
hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/WALSyncTimeoutIOException.java

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.regionserver.wal;
19+
20+
import org.apache.hadoop.hbase.HBaseIOException;
21+
import org.apache.hadoop.hbase.wal.WAL;
22+
import org.apache.yetus.audience.InterfaceAudience;
23+
24+
/**
25+
* Thrown when {@link WAL#sync} times out.
26+
*/
27+
@InterfaceAudience.Private
28+
public class WALSyncTimeoutIOException extends HBaseIOException {
29+
30+
private static final long serialVersionUID = 5067699288291906985L;
31+
32+
public WALSyncTimeoutIOException() {
33+
super();
34+
}
35+
36+
public WALSyncTimeoutIOException(String message, Throwable cause) {
37+
super(message, cause);
38+
}
39+
40+
public WALSyncTimeoutIOException(String message) {
41+
super(message);
42+
}
43+
44+
public WALSyncTimeoutIOException(Throwable cause) {
45+
super(cause);
46+
}
47+
48+
}

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/WALUtil.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,12 @@ private static WALKeyImpl doFullMarkerAppendTransaction(WAL wal,
168168
if (walKey.getWriteEntry() != null) {
169169
mvcc.complete(walKey.getWriteEntry());
170170
}
171+
/**
172+
* Here we do not abort the RegionServer for {@link WALSyncTimeoutIOException} as
173+
* {@link HRegion#doWALAppend} does, because a WAL marker just records internal state and
174+
* there is no need to always abort the RegionServer when {@link WAL#sync} times out; it is
175+
* the internal state transition that determines whether the RegionServer is aborted or not.
176+
*/
171177
throw ioe;
172178
}
173179
return walKey;

hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AsyncFSWALProvider.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,15 @@ void init(FileSystem fs, Path path, Configuration c, boolean overwritable, long
6464
throws IOException, CommonFSUtils.StreamLacksCapabilityException;
6565
}
6666

67-
private EventLoopGroup eventLoopGroup;
67+
/**
68+
* Protected visibility for use in tests.
69+
*/
70+
protected EventLoopGroup eventLoopGroup;
6871

69-
private Class<? extends Channel> channelClass;
72+
/**
73+
* Protected visibility for use in tests.
74+
*/
75+
protected Class<? extends Channel> channelClass;
7076

7177
@Override
7278
protected AsyncFSWAL createWAL() throws IOException {

hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WAL.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException;
3131
import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
3232
import org.apache.hadoop.hbase.regionserver.wal.WALCoprocessorHost;
33+
import org.apache.hadoop.hbase.regionserver.wal.WALSyncTimeoutIOException;
3334
import org.apache.hadoop.hbase.replication.regionserver.WALFileLengthProvider;
3435
import org.apache.yetus.audience.InterfaceAudience;
3536
import org.apache.yetus.audience.InterfaceStability;
@@ -136,18 +137,21 @@ void updateStore(byte[] encodedRegionName, byte[] familyName, Long sequenceid,
136137

137138
/**
138139
* Sync what we have in the WAL.
140+
* @throws WALSyncTimeoutIOException if the sync times out.
139141
*/
140142
void sync() throws IOException;
141143

142144
/**
143145
* Sync the WAL if the txId was not already sync'd.
144146
* @param txid Transaction id to sync to.
147+
* @throws WALSyncTimeoutIOException if the sync times out.
145148
*/
146149
void sync(long txid) throws IOException;
147150

148151
/**
149152
* @param forceSync Flag to force sync rather than flushing to the buffer. Example - Hadoop hflush
150153
* vs hsync.
154+
* @throws WALSyncTimeoutIOException if the sync times out.
151155
*/
152156
default void sync(boolean forceSync) throws IOException {
153157
sync();
@@ -157,6 +161,7 @@ default void sync(boolean forceSync) throws IOException {
157161
* @param txid Transaction id to sync to.
158162
* @param forceSync Flag to force sync rather than flushing to the buffer. Example - Hadoop hflush
159163
* vs hsync.
164+
* @throws WALSyncTimeoutIOException if the sync times out.
160165
*/
161166
default void sync(long txid, boolean forceSync) throws IOException {
162167
sync(txid);

hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestWarmupRegion.java

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
import static org.junit.Assert.assertTrue;
2222

2323
import java.io.IOException;
24-
import org.apache.hadoop.conf.Configuration;
2524
import org.apache.hadoop.hbase.HBaseClassTestRule;
2625
import org.apache.hadoop.hbase.HBaseTestingUtility;
2726
import org.apache.hadoop.hbase.HTableDescriptor;
@@ -54,7 +53,6 @@
5453
* named for the method and does its stuff against that.
5554
*/
5655
@Category({ MasterTests.class, LargeTests.class })
57-
@SuppressWarnings("deprecation")
5856
public class TestWarmupRegion {
5957

6058
@ClassRule
@@ -66,7 +64,6 @@ public class TestWarmupRegion {
6664
protected final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
6765
private static byte[] ROW = Bytes.toBytes("testRow");
6866
private static byte[] FAMILY = Bytes.toBytes("testFamily");
69-
private static byte[] QUALIFIER = Bytes.toBytes("testQualifier");
7067
private static byte[] VALUE = Bytes.toBytes("testValue");
7168
private static byte[] COLUMN = Bytes.toBytes("column");
7269
private static int numRows = 10000;
@@ -79,7 +76,6 @@ public class TestWarmupRegion {
7976
*/
8077
@BeforeClass
8178
public static void setUpBeforeClass() throws Exception {
82-
Configuration conf = TEST_UTIL.getConfiguration();
8379
TEST_UTIL.startMiniCluster(SLAVES);
8480
}
8581

0 commit comments

Comments
 (0)