Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
Expand All @@ -56,11 +58,23 @@ public class LongGCDisruption extends SingleNodeDisruption {
private Set<Thread> suspendedThreads;
private Thread blockDetectionThread;

private final AtomicBoolean sawSlowSuspendBug = new AtomicBoolean(false);

public LongGCDisruption(Random random, String disruptedNode) {
super(random);
this.disruptedNode = disruptedNode;
}

/**
* Checks if during disruption we ran into a known JVM issue that makes {@link Thread#suspend()} calls block for multiple seconds
* was observed.
* @see <a href=https://bugs.openjdk.java.net/browse/JDK-8218446>JDK-8218446</a>
* @return true if during thread suspending a call to {@link Thread#suspend()} took more than 3s
*/
public boolean sawSlowSuspendBug() {
return sawSlowSuspendBug.get();
}

@Override
public synchronized void startDisrupting() {
if (suspendedThreads == null) {
Expand Down Expand Up @@ -251,7 +265,11 @@ protected boolean suspendThreads(Set<Thread> nodeThreads) {
* assuming that it is safe.
*/
boolean definitelySafe = true;
final long startTime = System.nanoTime();
thread.suspend();
if (System.nanoTime() - startTime > TimeUnit.SECONDS.toNanos(3L)) {
sawSlowSuspendBug.set(true);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we make this conditional on the JVM version so that we are sure to keep tracking that JVM bug and to drop this fix when it's no longer needed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically we could, but right now we're seeing this failure on all versions. This is a little unexpected on 8 so I figured we'd apply the fix for 8 as well to identify whether it's the same issue (slow suspend) there or if it's blocked somewhere else.

}
// double check the thread is not in a shared resource like logging; if so, let it go and come back
safe:
for (StackTraceElement stackElement : thread.getStackTrace()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
*/
package org.elasticsearch.test.disruption;

import org.elasticsearch.bootstrap.JavaVersion;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.test.ESTestCase;

Expand Down Expand Up @@ -115,8 +114,6 @@ protected long getSuspendingTimeoutInMillis() {
* but does keep retrying until all threads can be safely paused
*/
public void testNotBlockingUnsafeStackTraces() throws Exception {
assumeFalse("https://github.com/elastic/elasticsearch/issues/50047",
JavaVersion.current().equals(JavaVersion.parse("11")) || JavaVersion.current().equals(JavaVersion.parse("12")));
final String nodeName = "test_node";
LongGCDisruption disruption = new LongGCDisruption(random(), nodeName) {
@Override
Expand Down Expand Up @@ -149,14 +146,22 @@ protected Pattern[] getUnsafeClasses() {
threads[i].start();
}
// make sure some threads are under lock
disruption.startDisrupting();
try {
disruption.startDisrupting();
} catch (RuntimeException e) {
if (e.getMessage().contains("suspending node threads took too long") && disruption.sawSlowSuspendBug()) {
return;
}
throw new AssertionError(e);
}
long first = ops.get();
assertThat(lockedExecutor.lock.isLocked(), equalTo(false)); // no threads should own the lock
Thread.sleep(100);
assertThat(ops.get(), equalTo(first));
disruption.stopDisrupting();
assertBusy(() -> assertThat(ops.get(), greaterThan(first)));
} finally {
disruption.stopDisrupting();
stop.set(true);
for (final Thread thread : threads) {
thread.join();
Expand Down