Skip to content

Commit 06c7548

Browse files
authored
HBASE-27531 AsyncRequestFutureImpl unnecessarily clears meta cache for full server (#4930)
Signed-off-by: Duo Zhang <[email protected]>
1 parent 26b0221 commit 06c7548

File tree

2 files changed

+98
-16
lines changed

2 files changed

+98
-16
lines changed

hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncRequestFutureImpl.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -989,7 +989,7 @@ private void invokeCallBack(byte[] regionName, byte[] row, CResult result) {
989989
}
990990

991991
private void cleanServerCache(ServerName server, Throwable regionException) {
992-
if (ClientExceptionsUtil.isMetaClearingException(regionException)) {
992+
if (tableName == null && ClientExceptionsUtil.isMetaClearingException(regionException)) {
993993
// We want to make sure to clear the cache in case there were location-related exceptions.
994994
// We don't to clear the cache for every possible exception that comes through, however.
995995
MetricsConnection metrics = asyncProcess.connection.getConnectionMetrics();

hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestAsyncProcess.java

Lines changed: 97 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import java.util.Iterator;
3131
import java.util.List;
3232
import java.util.Map;
33+
import java.util.Map.Entry;
3334
import java.util.Random;
3435
import java.util.Set;
3536
import java.util.TreeSet;
@@ -319,19 +320,66 @@ public AbstractResponse callWithoutRetries(RetryingCallable<AbstractResponse> ca
319320
}
320321
}
321322

323+
/**
324+
* Used to simulate the case where a RegionServer responds to a multi request, but some or all of
325+
* the actions have an Exception instead of Result. These responses go through receiveMultiAction,
326+
* which has handling for individual action failures.
327+
*/
328+
static class CallerWithRegionException extends RpcRetryingCallerImpl<AbstractResponse> {
329+
330+
private final IOException e;
331+
private MultiAction multi;
332+
333+
public CallerWithRegionException(IOException e, MultiAction multi) {
334+
super(100, 500, 100, RetryingCallerInterceptorFactory.NO_OP_INTERCEPTOR, 9, 0, null);
335+
this.e = e;
336+
this.multi = multi;
337+
}
338+
339+
@Override
340+
public AbstractResponse callWithoutRetries(RetryingCallable<AbstractResponse> callable,
341+
int callTimeout) throws IOException, RuntimeException {
342+
MultiResponse response = new MultiResponse();
343+
for (Entry<byte[], List<Action>> entry : multi.actions.entrySet()) {
344+
response.addException(entry.getKey(), e);
345+
}
346+
return response;
347+
}
348+
}
349+
322350
static class AsyncProcessWithFailure extends MyAsyncProcess {
323351

324352
private final IOException ioe;
353+
private final ServerName failingServer;
354+
private final boolean returnAsRegionException;
325355

326-
public AsyncProcessWithFailure(ClusterConnection hc, Configuration conf, IOException ioe) {
327-
super(hc, conf);
356+
public AsyncProcessWithFailure(ClusterConnection hc, Configuration myConf, IOException ioe,
357+
ServerName failingServer, boolean returnAsRegionException) {
358+
super(hc, myConf);
328359
this.ioe = ioe;
360+
this.failingServer = failingServer;
361+
this.returnAsRegionException = returnAsRegionException;
329362
serverTrackerTimeout = 1L;
330363
}
331364

365+
public AsyncProcessWithFailure(ClusterConnection hc, Configuration myConf, IOException ioe) {
366+
this(hc, myConf, ioe, null, false);
367+
}
368+
332369
@Override
333370
protected RpcRetryingCaller<AbstractResponse>
334371
createCaller(CancellableRegionServerCallable callable, int rpcTimeout) {
372+
MultiServerCallable msc = (MultiServerCallable) callable;
373+
if (failingServer != null) {
374+
if (!msc.getServerName().equals(failingServer)) {
375+
return super.createCaller(callable, rpcTimeout);
376+
}
377+
}
378+
379+
if (returnAsRegionException) {
380+
return new CallerWithRegionException(ioe, msc.getMulti());
381+
}
382+
335383
callsCt.incrementAndGet();
336384
return new CallerWithFailure(ioe);
337385
}
@@ -1754,14 +1802,35 @@ private void testRetryPauseWhenServerIsOverloaded(HBaseServerException exception
17541802
Assert.assertTrue("Slept for too long: " + actualSleep + "ms", actualSleep <= expectedSleep);
17551803
}
17561804

1805+
/**
1806+
* Tests that we properly recover from exceptions that DO NOT go through receiveGlobalFailure, due
1807+
* to updating the meta cache for the region which failed. Successful multigets can include region
1808+
* exceptions in the MultiResponse. In that case, it skips receiveGlobalFailure and instead
1809+
* handles in receiveMultiAction
1810+
*/
17571811
@Test
1758-
public void testRetryWithExceptionClearsMetaCache() throws Exception {
1812+
public void testRetryWithExceptionClearsMetaCacheUsingRegionException() throws Exception {
1813+
testRetryWithExceptionClearsMetaCache(true);
1814+
}
1815+
1816+
/**
1817+
* Tests that we properly recover from exceptions that go through receiveGlobalFailure, due to
1818+
* updating the meta cache for the region which failed.
1819+
*/
1820+
@Test
1821+
public void testRetryWithExceptionClearsMetaCacheUsingServerException() throws Exception {
1822+
testRetryWithExceptionClearsMetaCache(false);
1823+
}
1824+
1825+
private void testRetryWithExceptionClearsMetaCache(boolean useRegionException)
1826+
throws IOException {
17591827
Configuration myConf = new Configuration(CONF);
1760-
myConf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 0);
1828+
myConf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 1);
17611829
ClusterConnection conn = createHConnection(new ConnectionConfiguration(myConf));
17621830

1763-
AsyncProcessWithFailure ap =
1764-
new AsyncProcessWithFailure(conn, myConf, new RegionOpeningException("test"));
1831+
// we pass in loc1.getServerName here so that only calls to that server will fail
1832+
AsyncProcessWithFailure ap = new AsyncProcessWithFailure(conn, myConf,
1833+
new RegionOpeningException("test"), loc1.getServerName(), useRegionException);
17651834
BufferedMutatorParams bufferParam = createBufferedMutatorParams(ap, DUMMY_TABLE);
17661835
BufferedMutatorImpl mutator = new BufferedMutatorImpl(conn, bufferParam, ap);
17671836

@@ -1770,20 +1839,33 @@ public void testRetryWithExceptionClearsMetaCache() throws Exception {
17701839
Assert.assertEquals(conn.locateRegion(DUMMY_TABLE, DUMMY_BYTES_1, true, true).toString(),
17711840
new RegionLocations(loc1).toString());
17721841

1773-
Mockito.verify(conn, Mockito.times(0)).clearCaches(Mockito.any());
1842+
// simulate updateCachedLocations, by changing the loc for this row to loc3. only loc1 fails,
1843+
// so this means retry will succeed
1844+
Mockito.doAnswer(invocation -> {
1845+
setMockLocation(conn, DUMMY_BYTES_1, new RegionLocations(loc3));
1846+
return null;
1847+
}).when(conn).updateCachedLocations(Mockito.eq(DUMMY_TABLE),
1848+
Mockito.eq(loc1.getRegion().getRegionName()), Mockito.eq(DUMMY_BYTES_1), Mockito.any(),
1849+
Mockito.eq(loc1.getServerName()));
1850+
1851+
// Ensure we haven't called updateCachedLocations yet
1852+
Mockito.verify(conn, Mockito.times(0)).updateCachedLocations(Mockito.any(), Mockito.any(),
1853+
Mockito.any(), Mockito.any(), Mockito.any());
17741854

17751855
Put p = createPut(1, true);
17761856
mutator.mutate(p);
17771857

1778-
try {
1779-
mutator.flush();
1780-
Assert.fail();
1781-
} catch (RetriesExhaustedWithDetailsException expected) {
1782-
assertEquals(1, expected.getNumExceptions());
1783-
assertTrue(expected.getRow(0) == p);
1784-
}
1858+
// we expect this to succeed because the bad region location should be updated upon
1859+
// the initial failure causing retries to succeed.
1860+
mutator.flush();
17851861

1786-
Mockito.verify(conn, Mockito.times(1)).clearCaches(loc1.getServerName());
1862+
// validate that we updated the location, as we expected
1863+
Assert.assertEquals(conn.locateRegion(DUMMY_TABLE, DUMMY_BYTES_1, true, true).toString(),
1864+
new RegionLocations(loc3).toString());
1865+
// this is a given since the location updated, but validate that we called updateCachedLocations
1866+
Mockito.verify(conn, Mockito.atLeastOnce()).updateCachedLocations(Mockito.eq(DUMMY_TABLE),
1867+
Mockito.eq(loc1.getRegion().getRegionName()), Mockito.eq(DUMMY_BYTES_1), Mockito.any(),
1868+
Mockito.eq(loc1.getServerName()));
17871869
}
17881870

17891871
@Test

0 commit comments

Comments
 (0)