Skip to content

Commit ed8549b

Browse files
committed
HBASE-27490 Locating regions for all actions of batch requests can exceed operation timeout (#4908)
Signed-off-by: Xiaolin Ha <[email protected]>
1 parent 3e3c8fa commit ed8549b

File tree

3 files changed

+121
-7
lines changed

3 files changed

+121
-7
lines changed

hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncRequestFutureImpl.java

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,28 @@ SingleServerRequestRunnable createSingleServerRequest(MultiAction multiAction, i
409409
return new SingleServerRequestRunnable(multiAction, numAttempt, server, callsInProgress);
410410
}
411411

412+
/**
413+
* Some checked calls send a callable with their own tracker. This method checks the operation
414+
* timeout against the appropriate tracker, or returns false if no tracker.
415+
*/
416+
private boolean isOperationTimeoutExceeded() {
417+
RetryingTimeTracker currentTracker;
418+
if (tracker != null) {
419+
currentTracker = tracker;
420+
} else if (currentCallable != null && currentCallable.getTracker() != null) {
421+
currentTracker = currentCallable.getTracker();
422+
} else {
423+
return false;
424+
}
425+
426+
// no-op if already started, this is just to ensure it was initialized (usually true)
427+
currentTracker.start();
428+
429+
// return value of 1 is special to mean exceeded, to differentiate from 0
430+
// which is no timeout. see implementation of getRemainingTime
431+
return currentTracker.getRemainingTime(operationTimeout) == 1;
432+
}
433+
412434
/**
413435
* Group a list of actions per region servers, and send them.
414436
* @param currentActions - the list of row to submit
@@ -420,6 +442,27 @@ void groupAndSendMultiAction(List<Action> currentActions, int numAttempt) {
420442
boolean isReplica = false;
421443
List<Action> unknownReplicaActions = null;
422444
for (Action action : currentActions) {
445+
if (isOperationTimeoutExceeded()) {
446+
String message = numAttempt == 1
447+
? "Operation timeout exceeded during resolution of region locations, "
448+
+ "prior to executing any actions."
449+
: "Operation timeout exceeded during re-resolution of region locations on retry "
450+
+ (numAttempt - 1) + ".";
451+
452+
message += " Meta may be slow or operation timeout too short for batch size or retries.";
453+
OperationTimeoutExceededException exception =
454+
new OperationTimeoutExceededException(message);
455+
456+
// Clear any actions we already resolved, because none will have been executed yet
457+
// We are going to fail all passed actions because there's no way we can execute any
458+
// if operation timeout is exceeded.
459+
actionsByServer.clear();
460+
for (Action actionToFail : currentActions) {
461+
manageLocationError(actionToFail, exception);
462+
}
463+
return;
464+
}
465+
423466
RegionLocations locs = findAllLocationsOrFail(action, true);
424467
if (locs == null) continue;
425468
boolean isReplicaAction = !RegionReplicaUtil.isDefaultReplica(action.getReplicaId());

hbase-client/src/main/java/org/apache/hadoop/hbase/client/CancellableRegionServerCallable.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,4 +122,8 @@ protected ClientProtos.ScanResponse doScan(ClientProtos.ScanRequest request)
122122
throws org.apache.hbase.thirdparty.com.google.protobuf.ServiceException {
123123
return getStub().cleanupBulkLoad(getRpcController(), request);
124124
}
125+
126+
RetryingTimeTracker getTracker() {
127+
return tracker;
128+
}
125129
}

hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestClientOperationTimeout.java

Lines changed: 74 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import org.apache.hadoop.hbase.HBaseTestingUtility;
2929
import org.apache.hadoop.hbase.HConstants;
3030
import org.apache.hadoop.hbase.MiniHBaseCluster;
31+
import org.apache.hadoop.hbase.NotServingRegionException;
3132
import org.apache.hadoop.hbase.StartMiniClusterOption;
3233
import org.apache.hadoop.hbase.TableName;
3334
import org.apache.hadoop.hbase.ipc.CallTimeoutException;
@@ -79,6 +80,8 @@ public class TestClientOperationTimeout {
7980
private static int DELAY_BATCH;
8081
private static int DELAY_META_SCAN;
8182

83+
private static boolean FAIL_BATCH = false;
84+
8285
private static final TableName TABLE_NAME = TableName.valueOf("Timeout");
8386
private static final byte[] FAMILY = Bytes.toBytes("family");
8487
private static final byte[] ROW = Bytes.toBytes("row");
@@ -113,6 +116,7 @@ public void setUp() throws Exception {
113116
DELAY_MUTATE = 0;
114117
DELAY_BATCH = 0;
115118
DELAY_META_SCAN = 0;
119+
FAIL_BATCH = false;
116120
}
117121

118122
@AfterClass
@@ -196,17 +200,16 @@ public void testMultiTimeout() {
196200
/**
197201
* Tests that a batch get on a table throws
198202
* {@link org.apache.hadoop.hbase.client.OperationTimeoutExceededException} when the region lookup
199-
* takes longer than the 'hbase.client.operation.timeout'
203+
* takes longer than the 'hbase.client.operation.timeout'. This specifically tests that when meta
204+
* is slow, the fetching of region locations for a batch is not allowed to itself exceed the
205+
* operation timeout. In a batch size of 100, it's possible to need to make 100 meta calls in
206+
* sequence. If meta is slow, we should abort the request once the operation timeout is exceeded,
207+
* even if we haven't finished locating all regions. See HBASE-27490
200208
*/
201209
@Test
202210
public void testMultiGetMetaTimeout() throws IOException {
203-
204211
Configuration conf = new Configuration(UTIL.getConfiguration());
205212

206-
// the operation timeout must be lower than the delay from a meta scan to etch region locations
207-
// of the get requests. Simply increasing the meta scan timeout to greater than the
208-
// HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD will result in SocketTimeoutException on the scans thus
209-
// avoiding the simulation of load on meta. See: HBASE-27487
210213
conf.setLong(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 400);
211214
conf.setBoolean(CLIENT_SIDE_METRICS_ENABLED_KEY, true);
212215
try (Connection specialConnection = ConnectionFactory.createConnection(conf);
@@ -216,7 +219,9 @@ public void testMultiGetMetaTimeout() throws IOException {
216219
((ConnectionImplementation) specialConnection).getConnectionMetrics();
217220
long metaCacheNumClearServerPreFailure = metrics.getMetaCacheNumClearServer().getCount();
218221

222+
// delay and timeout are the same, so we should see a timeout after the first region lookup
219223
DELAY_META_SCAN = 400;
224+
220225
List<Get> gets = new ArrayList<>();
221226
// we need to ensure the region look-ups eat up more time than the operation timeout without
222227
// exceeding the scan timeout.
@@ -237,11 +242,70 @@ public void testMultiGetMetaTimeout() throws IOException {
237242

238243
for (Throwable cause : expected.getCauses()) {
239244
Assert.assertTrue(cause instanceof OperationTimeoutExceededException);
245+
// Check that this is the timeout thrown by AsyncRequestFutureImpl during region lookup
246+
Assert.assertTrue(cause.getMessage().contains("Operation timeout exceeded during"));
240247
}
241-
242248
}
243249
}
250+
}
244251

252+
/**
253+
* Tests that a batch get on a table throws
254+
* {@link org.apache.hadoop.hbase.client.OperationTimeoutExceededException} when retries are tuned
255+
* too high to be able to be processed within the operation timeout. In this case, the final
256+
* OperationTimeoutExceededException should not trigger a cache clear (but the individual failures
257+
* may, if appropriate). This test skirts around the timeout checks during meta lookups from
258+
* HBASE-27490, because we want to test for the case where meta lookups were able to succeed in
259+
* time but did not leave enough time for the actual calls to occur. See HBASE-27487
260+
*/
261+
@Test
262+
public void testMultiGetRetryTimeout() {
263+
Configuration conf = new Configuration(UTIL.getConfiguration());
264+
265+
// allow 1 retry, and 0 backoff
266+
conf.setLong(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 500);
267+
conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 1);
268+
conf.setLong(HConstants.HBASE_CLIENT_PAUSE, 0);
269+
conf.setBoolean(CLIENT_SIDE_METRICS_ENABLED_KEY, true);
270+
271+
try (Connection specialConnection = ConnectionFactory.createConnection(conf);
272+
Table specialTable = specialConnection.getTable(TABLE_NAME)) {
273+
274+
MetricsConnection metrics =
275+
((ConnectionImplementation) specialConnection).getConnectionMetrics();
276+
long metaCacheNumClearServerPreFailure = metrics.getMetaCacheNumClearServer().getCount();
277+
278+
// meta scan should take up most of the timeout but not all
279+
DELAY_META_SCAN = 300;
280+
// fail the batch call, causing a retry
281+
FAIL_BATCH = true;
282+
283+
// Use a batch size of 1 so that we only make 1 meta call per attempt
284+
List<Get> gets = new ArrayList<>();
285+
gets.add(new Get(Bytes.toBytes(0)).addColumn(FAMILY, QUALIFIER));
286+
287+
try {
288+
specialTable.batch(gets, new Object[1]);
289+
Assert.fail("should not reach here");
290+
} catch (Exception e) {
291+
RetriesExhaustedWithDetailsException expected = (RetriesExhaustedWithDetailsException) e;
292+
Assert.assertEquals(1, expected.getNumExceptions());
293+
294+
// We expect that the error caused by FAIL_BATCH would clear the meta cache but
295+
// the OperationTimeoutExceededException should not. So only allow new cache clear here
296+
long metaCacheNumClearServerPostFailure = metrics.getMetaCacheNumClearServer().getCount();
297+
Assert.assertEquals(metaCacheNumClearServerPreFailure + 1,
298+
metaCacheNumClearServerPostFailure);
299+
300+
for (Throwable cause : expected.getCauses()) {
301+
Assert.assertTrue(cause instanceof OperationTimeoutExceededException);
302+
// Check that this is the timeout thrown by CancellableRegionServerCallable
303+
Assert.assertTrue(cause.getMessage().contains("Timeout exceeded before call began"));
304+
}
305+
}
306+
} catch (IOException e) {
307+
throw new RuntimeException(e);
308+
}
245309
}
246310

247311
/**
@@ -322,6 +386,9 @@ public ClientProtos.ScanResponse scan(RpcController controller,
322386
public ClientProtos.MultiResponse multi(RpcController rpcc, ClientProtos.MultiRequest request)
323387
throws ServiceException {
324388
try {
389+
if (FAIL_BATCH) {
390+
throw new ServiceException(new NotServingRegionException("simulated failure"));
391+
}
325392
Thread.sleep(DELAY_BATCH);
326393
} catch (InterruptedException e) {
327394
LOG.error("Sleep interrupted during multi operation", e);

0 commit comments

Comments
 (0)