From 6c4c3e92e834914529fe9b9a461d96cddac3ef17 Mon Sep 17 00:00:00 2001 From: Nick Dimiduk Date: Fri, 1 Mar 2024 07:48:47 +0100 Subject: [PATCH 1/2] HBASE-28413 Fix race condition in TestCleanerChore.retriesIOExceptionInStatus We occasionally get a test failure in TestCleanerChore.retriesIOExceptionInStatus. For example, from a recent PR build [0] on branch-2.6, ``` java.util.concurrent.ExecutionException: java.io.IOException: whomp whomp. at java.base/java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:395) at java.base/java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1999) at org.apache.hadoop.hbase.master.cleaner.TestCleanerChore.retriesIOExceptionInStatus(TestCleanerChore.java:163) ... Caused by: java.io.IOException: whomp whomp. at org.apache.hadoop.hbase.master.cleaner.TestCleanerChore$1.listStatus(TestCleanerChore.java:134) at org.apache.hadoop.hbase.master.cleaner.CleanerChore.traverseAndDelete(CleanerChore.java:475) at org.apache.hadoop.hbase.master.cleaner.CleanerChore.lambda$chore$0(CleanerChore.java:258) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) ... 1 more ``` This looks like a race condition where the chore manages an entire execution between when the flag is flipped and when the test thread gets back around to continuing execution. Make the test a little more pessimistic about its view of the world. [0]: https://ci-hbase.apache.org/job/HBase-PreCommit-GitHub-PR/job/PR-5725/1/testReport/org.apache.hadoop.hbase.master.cleaner/TestCleanerChore/precommit_checks___yetus_jdk11_hadoop3_checks___retriesIOExceptionInStatus/ --- .../master/cleaner/TestCleanerChore.java | 33 ++++++++++++++----- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestCleanerChore.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestCleanerChore.java index d1a4f6dea055..d82da85b4310 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestCleanerChore.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestCleanerChore.java @@ -22,14 +22,17 @@ import static org.hamcrest.Matchers.instanceOf; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertThrows; import static org.junit.Assert.assertTrue; import java.io.IOException; import java.util.Map; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import org.apache.hadoop.conf.Configuration; @@ -128,12 +131,15 @@ public void retriesIOExceptionInStatus() throws Exception { fs.create(file).close(); assertTrue("test file didn't get created.", fs.exists(file)); final AtomicBoolean fails = new AtomicBoolean(true); + CountDownLatch successSignal = new CountDownLatch(1); FilterFileSystem filtered = new FilterFileSystem(fs) { public FileStatus[] listStatus(Path f) throws IOException { if (fails.get()) { throw new IOException("whomp whomp."); } - return fs.listStatus(f); + FileStatus[] ret = fs.listStatus(f); + successSignal.countDown(); + return ret; } }; @@ -144,7 +150,7 @@ public FileStatus[] listStatus(Path f) throws IOException { // trouble talking to the filesystem // and verify that it accurately reported the failure. CompletableFuture errorFuture = chore.triggerCleanerNow(); - ExecutionException e = assertThrows(ExecutionException.class, () -> errorFuture.get()); + ExecutionException e = assertThrows(ExecutionException.class, errorFuture::get); assertThat(e.getCause(), instanceOf(IOException.class)); assertThat(e.getCause().getMessage(), containsString("whomp")); @@ -154,16 +160,25 @@ public FileStatus[] listStatus(Path f) throws IOException { // filesystem is back fails.set(false); - for (;;) { - CompletableFuture succFuture = chore.triggerCleanerNow(); - // the reset of the future is async, so it is possible that we get the previous future - // again. - if (succFuture != errorFuture) { - // verify that it accurately reported success. - assertTrue("chore should claim it succeeded.", succFuture.get()); + CompletableFuture succFuture = chore.triggerCleanerNow(); + // all invocations after the signal are expected to succeed. the future handle we have + // currently may or may succeed. an entire run might be scheduled between now and when we're + // signaled, so we cannot naively check equality of succFuture vs. errorFuture. + successSignal.await(); + for (int i = 0; i < 5; i++) { + if (succFuture == null) { + succFuture = chore.triggerCleanerNow(); + } + if (succFuture.isCompletedExceptionally()) { + succFuture = null; + Thread.sleep(TimeUnit.SECONDS.toMillis(5)); + } else { break; } } + assertNotNull("chore future was null.", succFuture); + // verify that it accurately reported success. + assertTrue("chore should claim it succeeded.", succFuture.get()); // verify everything is gone. assertFalse("file should have been destroyed.", fs.exists(file)); assertFalse("directory should have been destroyed.", fs.exists(child)); From d906f43978d4199fc83e4479a398d9ad080a665e Mon Sep 17 00:00:00 2001 From: Nick Dimiduk Date: Fri, 1 Mar 2024 16:10:21 +0100 Subject: [PATCH 2/2] pr feedback --- .../master/cleaner/TestCleanerChore.java | 61 +++++++++++++++---- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestCleanerChore.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestCleanerChore.java index d82da85b4310..cb0190d12d56 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestCleanerChore.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/cleaner/TestCleanerChore.java @@ -26,7 +26,9 @@ import static org.junit.Assert.assertThrows; import static org.junit.Assert.assertTrue; +import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.io.PrintStream; import java.util.Map; import java.util.concurrent.CompletableFuture; import java.util.concurrent.CountDownLatch; @@ -45,10 +47,12 @@ import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtil; import org.apache.hadoop.hbase.Stoppable; +import org.apache.hadoop.hbase.Waiter; import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.SmallTests; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.CommonFSUtils; +import org.apache.hadoop.hbase.util.FutureUtils; import org.apache.hadoop.hbase.util.StoppableImplementation; import org.apache.hadoop.hbase.util.Threads; import org.junit.AfterClass; @@ -165,17 +169,52 @@ public FileStatus[] listStatus(Path f) throws IOException { // currently may or may succeed. an entire run might be scheduled between now and when we're // signaled, so we cannot naively check equality of succFuture vs. errorFuture. successSignal.await(); - for (int i = 0; i < 5; i++) { - if (succFuture == null) { - succFuture = chore.triggerCleanerNow(); - } - if (succFuture.isCompletedExceptionally()) { - succFuture = null; - Thread.sleep(TimeUnit.SECONDS.toMillis(5)); - } else { - break; - } - } + UTIL.waitFor(TimeUnit.MINUTES.toMillis(3), TimeUnit.SECONDS.toMillis(5), + new Waiter.ExplainingPredicate() { + private int attemptCount = 1; + private CompletableFuture fut = succFuture; + + @Override + public boolean evaluate() throws InterruptedException { + if (fut == null) { + fut = chore.triggerCleanerNow(); + attemptCount++; + } + if (fut.isCompletedExceptionally()) { + fut = null; + return false; + } else { + return true; + } + } + + @Override + public String explainFailure() throws Exception { + StringBuilder sb = new StringBuilder().append(String + .format("Unable to achieve a successful chore run after %s attempts.", attemptCount)); + if (fut != null && fut.isCompletedExceptionally()) { + // call fut.exceptionNow() when we have JDK19+ + Throwable cause = null; + try { + fut.get(); + } catch (ExecutionException e) { + cause = e.getCause() != null ? e.getCause() : null; + } catch (RuntimeException e) { + cause = FutureUtils.unwrapCompletionException(e); + } + if (cause != null) { + sb.append(" Most recent failure: "); + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(1024); + PrintStream stream = new PrintStream(baos)) { + cause.printStackTrace(stream); + sb.append(baos); + } + } + } + return sb.toString(); + } + }); + assertNotNull("chore future was null.", succFuture); // verify that it accurately reported success. assertTrue("chore should claim it succeeded.", succFuture.get());