Skip to content

Commit 32125e9

Browse files
d-c-manningvirajjasani
authored andcommitted
HBASE-28663 Graceful shutdown of CanaryTool timeouts (#5991)
Signed-off-by: Viraj Jasani <[email protected]> Signed-off-by: Mihir Monani <[email protected]>
1 parent 2ad7348 commit 32125e9

File tree

2 files changed

+86
-3
lines changed

2 files changed

+86
-3
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,10 @@ public interface Sink {
198198
long getWriteSuccessCount();
199199

200200
long incWriteSuccessCount();
201+
202+
void stop();
203+
204+
boolean isStopped();
201205
}
202206

203207
/**
@@ -208,6 +212,7 @@ public static class StdOutSink implements Sink {
208212
readSuccessCount = new AtomicLong(0), writeSuccessCount = new AtomicLong(0);
209213
private Map<String, String> readFailures = new ConcurrentHashMap<>();
210214
private Map<String, String> writeFailures = new ConcurrentHashMap<>();
215+
private volatile boolean stopped = false;
211216

212217
@Override
213218
public long getReadFailureCount() {
@@ -268,6 +273,15 @@ public long getWriteSuccessCount() {
268273
public long incWriteSuccessCount() {
269274
return writeSuccessCount.incrementAndGet();
270275
}
276+
277+
public void stop() {
278+
stopped = true;
279+
}
280+
281+
@Override
282+
public boolean isStopped() {
283+
return stopped;
284+
}
271285
}
272286

273287
/**
@@ -444,6 +458,9 @@ public ZookeeperTask(Connection connection, String host, String znode, int timeo
444458

445459
@Override
446460
public Void call() throws Exception {
461+
if (this.sink.isStopped()) {
462+
return null;
463+
}
447464
ZooKeeper zooKeeper = null;
448465
try {
449466
zooKeeper = new ZooKeeper(host, timeout, EmptyWatcher.instance);
@@ -498,6 +515,9 @@ public enum TaskType {
498515

499516
@Override
500517
public Void call() {
518+
if (this.sink.isStopped()) {
519+
return null;
520+
}
501521
switch (taskType) {
502522
case READ:
503523
return read();
@@ -685,6 +705,9 @@ static class RegionServerTask implements Callable<Void> {
685705

686706
@Override
687707
public Void call() {
708+
if (this.sink.isStopped()) {
709+
return null;
710+
}
688711
TableName tableName = null;
689712
Table table = null;
690713
Get get = null;
@@ -1075,6 +1098,7 @@ private int runMonitor(String[] monitorTargets) throws Exception {
10751098
if (currentTimeLength > timeout) {
10761099
LOG.error("The monitor is running too long (" + currentTimeLength
10771100
+ ") after timeout limit:" + timeout + " will be killed itself !!");
1101+
monitorThread.interrupt();
10781102
if (monitor.initialized) {
10791103
return TIMEOUT_ERROR_EXIT_CODE;
10801104
} else {
@@ -1113,6 +1137,15 @@ public Map<String, String> getWriteFailures() {
11131137
return sink.getWriteFailures();
11141138
}
11151139

1140+
/**
1141+
* Return a CanaryTool.Sink object containing the detailed results of the canary run. The Sink may
1142+
* not have been created if a Monitor thread is not yet running.
1143+
* @return the active Sink if one exists, null otherwise.
1144+
*/
1145+
public Sink getActiveSink() {
1146+
return sink;
1147+
}
1148+
11161149
private void printUsageAndExit() {
11171150
System.err.println(
11181151
"Usage: canary [OPTIONS] [<TABLE1> [<TABLE2]...] | [<REGIONSERVER1> [<REGIONSERVER2]..]");
@@ -1159,10 +1192,11 @@ private void printUsageAndExit() {
11591192

11601193
Sink getSink(Configuration configuration, Class clazz) {
11611194
// In test context, this.sink might be set. Use it if non-null. For testing.
1162-
return this.sink != null
1163-
? this.sink
1164-
: (Sink) ReflectionUtils
1195+
if (this.sink == null) {
1196+
this.sink = (Sink) ReflectionUtils
11651197
.newInstance(configuration.getClass("hbase.canary.sink.class", clazz, Sink.class));
1198+
}
1199+
return this.sink;
11661200
}
11671201

11681202
/**
@@ -1366,6 +1400,7 @@ public boolean finalCheckForErrors() {
13661400

13671401
@Override
13681402
public void close() throws IOException {
1403+
this.sink.stop();
13691404
if (this.admin != null) {
13701405
this.admin.close();
13711406
}

hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
*/
1818
package org.apache.hadoop.hbase.tool;
1919

20+
import static org.apache.hadoop.hbase.regionserver.TestRegionServerNoMaster.closeRegion;
2021
import static org.junit.Assert.assertEquals;
2122
import static org.junit.Assert.assertFalse;
2223
import static org.junit.Assert.assertNotEquals;
@@ -38,6 +39,7 @@
3839
import java.util.Map;
3940
import java.util.concurrent.ExecutorService;
4041
import java.util.concurrent.ScheduledThreadPoolExecutor;
42+
import java.util.concurrent.TimeUnit;
4143
import org.apache.hadoop.conf.Configuration;
4244
import org.apache.hadoop.hbase.HBaseClassTestRule;
4345
import org.apache.hadoop.hbase.HBaseConfiguration;
@@ -49,6 +51,7 @@
4951
import org.apache.hadoop.hbase.client.Put;
5052
import org.apache.hadoop.hbase.client.RegionInfo;
5153
import org.apache.hadoop.hbase.client.Table;
54+
import org.apache.hadoop.hbase.regionserver.HRegionServer;
5255
import org.apache.hadoop.hbase.testclassification.LargeTests;
5356
import org.apache.hadoop.hbase.util.Bytes;
5457
import org.apache.hadoop.util.ToolRunner;
@@ -130,6 +133,51 @@ public void testBasicCanaryWorks() throws Exception {
130133
isA(ColumnFamilyDescriptor.class), anyLong());
131134
}
132135

136+
/**
137+
* When CanaryTool times out, it should stop scanning and shutdown quickly and gracefully. This
138+
* test helps to confirm that threadpools do not continue executing work after the canary
139+
* finishes. It also verifies sink behavior and measures correct failure counts in the sink.
140+
* @throws Exception if it can't create a table, communicate with minicluster, or run the canary.
141+
*/
142+
@Test
143+
public void testCanaryStopsScanningAfterTimeout() throws Exception {
144+
// Prepare a table with multiple regions, and close those regions on the regionserver.
145+
// Do not notify HMaster or META. CanaryTool will scan and receive NotServingRegionExceptions.
146+
final TableName tableName = TableName.valueOf(name.getMethodName());
147+
// Close the unused Table reference returned by createMultiRegionTable.
148+
testingUtility.createMultiRegionTable(tableName, new byte[][] { FAMILY }).close();
149+
List<RegionInfo> regions = testingUtility.getAdmin().getRegions(tableName);
150+
assertTrue("verify table has multiple regions", regions.size() > 1);
151+
HRegionServer regionserver = testingUtility.getMiniHBaseCluster().getRegionServer(0);
152+
for (RegionInfo region : regions) {
153+
closeRegion(testingUtility, regionserver, region);
154+
}
155+
156+
// Run CanaryTool with 1 thread. This thread will attempt to scan the first region.
157+
// It will use default rpc retries and receive NotServingRegionExceptions for many seconds
158+
// according to HConstants.RETRY_BACKOFF. The CanaryTool timeout is set to 4 seconds, so it
159+
// will time out before the first region scan is complete.
160+
ExecutorService executor = new ScheduledThreadPoolExecutor(1);
161+
CanaryTool canary = new CanaryTool(executor);
162+
String[] args = { "-t", "4000", tableName.getNameAsString() };
163+
int retCode = ToolRunner.run(testingUtility.getConfiguration(), canary, args);
164+
executor.shutdown();
165+
try {
166+
if (!executor.awaitTermination(3, TimeUnit.SECONDS)) {
167+
executor.shutdownNow();
168+
}
169+
} catch (InterruptedException e) {
170+
executor.shutdownNow();
171+
}
172+
173+
CanaryTool.Sink sink = canary.getActiveSink();
174+
assertEquals("verify canary timed out with TIMEOUT_ERROR_EXIT_CODE", 3, retCode);
175+
assertEquals("verify only the first region failed", 1, sink.getReadFailureCount());
176+
assertEquals("verify no successful reads", 0, sink.getReadSuccessCount());
177+
assertEquals("verify we were attempting to scan all regions", regions.size(),
178+
((CanaryTool.RegionStdOutSink) sink).getTotalExpectedRegions());
179+
}
180+
133181
@Test
134182
public void testCanaryRegionTaskReadAllCF() throws Exception {
135183
final TableName tableName = TableName.valueOf(name.getMethodName());

0 commit comments

Comments
 (0)