2828import org .apache .hadoop .hbase .HBaseTestingUtility ;
2929import org .apache .hadoop .hbase .HConstants ;
3030import org .apache .hadoop .hbase .MiniHBaseCluster ;
31+ import org .apache .hadoop .hbase .NotServingRegionException ;
3132import org .apache .hadoop .hbase .StartMiniClusterOption ;
3233import org .apache .hadoop .hbase .TableName ;
3334import org .apache .hadoop .hbase .ipc .CallTimeoutException ;
@@ -79,6 +80,8 @@ public class TestClientOperationTimeout {
7980 private static int DELAY_BATCH ;
8081 private static int DELAY_META_SCAN ;
8182
83+ private static boolean FAIL_BATCH = false ;
84+
8285 private static final TableName TABLE_NAME = TableName .valueOf ("Timeout" );
8386 private static final byte [] FAMILY = Bytes .toBytes ("family" );
8487 private static final byte [] ROW = Bytes .toBytes ("row" );
@@ -113,6 +116,7 @@ public void setUp() throws Exception {
113116 DELAY_MUTATE = 0 ;
114117 DELAY_BATCH = 0 ;
115118 DELAY_META_SCAN = 0 ;
119+ FAIL_BATCH = false ;
116120 }
117121
118122 @ AfterClass
@@ -196,17 +200,16 @@ public void testMultiTimeout() {
196200 /**
197201 * Tests that a batch get on a table throws
198202 * {@link org.apache.hadoop.hbase.client.OperationTimeoutExceededException} when the region lookup
199- * takes longer than the 'hbase.client.operation.timeout'
203+ * takes longer than the 'hbase.client.operation.timeout'. This specifically tests that when meta
204+ * is slow, the fetching of region locations for a batch is not allowed to itself exceed the
205+ * operation timeout. In a batch size of 100, it's possible to need to make 100 meta calls in
206+ * sequence. If meta is slow, we should abort the request once the operation timeout is exceeded,
207+ * even if we haven't finished locating all regions. See HBASE-27490
200208 */
201209 @ Test
202210 public void testMultiGetMetaTimeout () throws IOException {
203-
204211 Configuration conf = new Configuration (UTIL .getConfiguration ());
205212
206- // the operation timeout must be lower than the delay from a meta scan to etch region locations
207- // of the get requests. Simply increasing the meta scan timeout to greater than the
208- // HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD will result in SocketTimeoutException on the scans thus
209- // avoiding the simulation of load on meta. See: HBASE-27487
210213 conf .setLong (HConstants .HBASE_CLIENT_OPERATION_TIMEOUT , 400 );
211214 conf .setBoolean (CLIENT_SIDE_METRICS_ENABLED_KEY , true );
212215 try (Connection specialConnection = ConnectionFactory .createConnection (conf );
@@ -216,7 +219,9 @@ public void testMultiGetMetaTimeout() throws IOException {
216219 ((ConnectionImplementation ) specialConnection ).getConnectionMetrics ();
217220 long metaCacheNumClearServerPreFailure = metrics .getMetaCacheNumClearServer ().getCount ();
218221
222+ // delay and timeout are the same, so we should see a timeout after the first region lookup
219223 DELAY_META_SCAN = 400 ;
224+
220225 List <Get > gets = new ArrayList <>();
221226 // we need to ensure the region look-ups eat up more time than the operation timeout without
222227 // exceeding the scan timeout.
@@ -237,11 +242,70 @@ public void testMultiGetMetaTimeout() throws IOException {
237242
238243 for (Throwable cause : expected .getCauses ()) {
239244 Assert .assertTrue (cause instanceof OperationTimeoutExceededException );
245+ // Check that this is the timeout thrown by AsyncRequestFutureImpl during region lookup
246+ Assert .assertTrue (cause .getMessage ().contains ("Operation timeout exceeded during" ));
240247 }
241-
242248 }
243249 }
250+ }
244251
252+ /**
253+ * Tests that a batch get on a table throws
254+ * {@link org.apache.hadoop.hbase.client.OperationTimeoutExceededException} when retries are tuned
255+ * too high to be able to be processed within the operation timeout. In this case, the final
256+ * OperationTimeoutExceededException should not trigger a cache clear (but the individual failures
257+ * may, if appropriate). This test skirts around the timeout checks during meta lookups from
258+ * HBASE-27490, because we want to test for the case where meta lookups were able to succeed in
259+ * time but did not leave enough time for the actual calls to occur. See HBASE-27487
260+ */
261+ @ Test
262+ public void testMultiGetRetryTimeout () {
263+ Configuration conf = new Configuration (UTIL .getConfiguration ());
264+
265+ // allow 1 retry, and 0 backoff
266+ conf .setLong (HConstants .HBASE_CLIENT_OPERATION_TIMEOUT , 500 );
267+ conf .setInt (HConstants .HBASE_CLIENT_RETRIES_NUMBER , 1 );
268+ conf .setLong (HConstants .HBASE_CLIENT_PAUSE , 0 );
269+ conf .setBoolean (CLIENT_SIDE_METRICS_ENABLED_KEY , true );
270+
271+ try (Connection specialConnection = ConnectionFactory .createConnection (conf );
272+ Table specialTable = specialConnection .getTable (TABLE_NAME )) {
273+
274+ MetricsConnection metrics =
275+ ((ConnectionImplementation ) specialConnection ).getConnectionMetrics ();
276+ long metaCacheNumClearServerPreFailure = metrics .getMetaCacheNumClearServer ().getCount ();
277+
278+ // meta scan should take up most of the timeout but not all
279+ DELAY_META_SCAN = 300 ;
280+ // fail the batch call, causing a retry
281+ FAIL_BATCH = true ;
282+
283+ // Use a batch size of 1 so that we only make 1 meta call per attempt
284+ List <Get > gets = new ArrayList <>();
285+ gets .add (new Get (Bytes .toBytes (0 )).addColumn (FAMILY , QUALIFIER ));
286+
287+ try {
288+ specialTable .batch (gets , new Object [1 ]);
289+ Assert .fail ("should not reach here" );
290+ } catch (Exception e ) {
291+ RetriesExhaustedWithDetailsException expected = (RetriesExhaustedWithDetailsException ) e ;
292+ Assert .assertEquals (1 , expected .getNumExceptions ());
293+
294+ // We expect that the error caused by FAIL_BATCH would clear the meta cache but
295+ // the OperationTimeoutExceededException should not. So only allow new cache clear here
296+ long metaCacheNumClearServerPostFailure = metrics .getMetaCacheNumClearServer ().getCount ();
297+ Assert .assertEquals (metaCacheNumClearServerPreFailure + 1 ,
298+ metaCacheNumClearServerPostFailure );
299+
300+ for (Throwable cause : expected .getCauses ()) {
301+ Assert .assertTrue (cause instanceof OperationTimeoutExceededException );
302+ // Check that this is the timeout thrown by CancellableRegionServerCallable
303+ Assert .assertTrue (cause .getMessage ().contains ("Timeout exceeded before call began" ));
304+ }
305+ }
306+ } catch (IOException e ) {
307+ throw new RuntimeException (e );
308+ }
245309 }
246310
247311 /**
@@ -322,6 +386,9 @@ public ClientProtos.ScanResponse scan(RpcController controller,
322386 public ClientProtos .MultiResponse multi (RpcController rpcc , ClientProtos .MultiRequest request )
323387 throws ServiceException {
324388 try {
389+ if (FAIL_BATCH ) {
390+ throw new ServiceException (new NotServingRegionException ("simulated failure" ));
391+ }
325392 Thread .sleep (DELAY_BATCH );
326393 } catch (InterruptedException e ) {
327394 LOG .error ("Sleep interrupted during multi operation" , e );
0 commit comments