From 6bee96195a77b7d7f9f70e857381bcb9d0cfe326 Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Wed, 22 Jan 2020 18:22:12 +0100 Subject: [PATCH 1/4] Fix TransportMasterNodeAction not Retrying NodeClosedException Added node closed exception to the retryable remote exceptions. --- .../action/support/master/TransportMasterNodeAction.java | 2 +- .../action/support/master/TransportMasterNodeActionTests.java | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java b/server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java index d4ac88c8b723c..c1f1f30ddae1c 100644 --- a/server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java +++ b/server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java @@ -178,7 +178,7 @@ protected void doStart(ClusterState clusterState) { @Override public void handleException(final TransportException exp) { Throwable cause = exp.unwrapCause(); - if (cause instanceof ConnectTransportException) { + if (cause instanceof ConnectTransportException || cause instanceof NodeClosedException) { // we want to retry here a bit to see if a new master is elected logger.debug("connection exception while trying to forward request with action name [{}] to " + "master node [{}], scheduling a retry. Error: [{}]", diff --git a/server/src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java b/server/src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java index cea57fa8f4e21..77f617ca3f9fd 100644 --- a/server/src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java +++ b/server/src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java @@ -45,6 +45,7 @@ import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.discovery.MasterNotDiscoveredException; +import org.elasticsearch.node.NodeClosedException; import org.elasticsearch.rest.RestStatus; import org.elasticsearch.tasks.Task; import org.elasticsearch.test.ESTestCase; @@ -392,7 +393,8 @@ public void testDelegateToFailingMaster() throws ExecutionException, Interrupted assertThat(capturedRequest.action, equalTo("internal:testAction")); if (rejoinSameMaster) { - transport.handleRemoteError(capturedRequest.requestId, new ConnectTransportException(masterNode, "Fake error")); + transport.handleRemoteError(capturedRequest.requestId, + randomBoolean() ? new ConnectTransportException(masterNode, "Fake error") : new NodeClosedException(masterNode)); assertFalse(listener.isDone()); if (randomBoolean()) { // simulate master node removal From 3378d3de2a248e87e30c4f67315842322c90f432 Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Thu, 23 Jan 2020 16:55:32 +0100 Subject: [PATCH 2/4] test via healthcheck --- .../elasticsearch/cluster/ClusterHealthIT.java | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java b/server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java index 656961411ea5e..1d5a892790f8e 100644 --- a/server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java +++ b/server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java @@ -28,7 +28,10 @@ import org.elasticsearch.common.Priority; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.test.ESIntegTestCase; +import org.elasticsearch.test.InternalTestCluster; +import java.util.ArrayList; +import java.util.List; import java.util.concurrent.atomic.AtomicBoolean; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; @@ -294,4 +297,17 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS assertFalse(healthResponseFuture.get().isTimedOut()); } + public void testHealthOnMasterFailover() throws Exception { + final String node = internalCluster().startDataOnlyNode(); + final List> responseFutures = new ArrayList<>(); + // Run a few health requests concurrent to master fail-overs against a data-node to make sure master failover is handled + // without exceptions + for (int i = 0; i < 20; ++i) { + responseFutures.add(client(node).admin().cluster().prepareHealth().execute()); + internalCluster().restartNode(internalCluster().getMasterName(), InternalTestCluster.EMPTY_CALLBACK); + } + for (ActionFuture responseFuture : responseFutures) { + assertSame(responseFuture.get().getStatus(), ClusterHealthStatus.GREEN); + } + } } From 1d07a269a00d6899f129c79f9a9c63bbdfbdb5d7 Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Fri, 24 Jan 2020 09:58:56 +0100 Subject: [PATCH 3/4] only retry remote node closed --- .../action/support/master/TransportMasterNodeAction.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java b/server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java index c1f1f30ddae1c..55051a6105d7a 100644 --- a/server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java +++ b/server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java @@ -46,6 +46,7 @@ import org.elasticsearch.tasks.Task; import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.transport.ConnectTransportException; +import org.elasticsearch.transport.RemoteTransportException; import org.elasticsearch.transport.TransportException; import org.elasticsearch.transport.TransportService; @@ -178,7 +179,8 @@ protected void doStart(ClusterState clusterState) { @Override public void handleException(final TransportException exp) { Throwable cause = exp.unwrapCause(); - if (cause instanceof ConnectTransportException || cause instanceof NodeClosedException) { + if (cause instanceof ConnectTransportException || + (exp instanceof RemoteTransportException && cause instanceof NodeClosedException)) { // we want to retry here a bit to see if a new master is elected logger.debug("connection exception while trying to forward request with action name [{}] to " + "master node [{}], scheduling a retry. Error: [{}]", From 646823d17d0f1cd972d9dd2b6c316674ba23a572 Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Fri, 24 Jan 2020 12:02:56 +0100 Subject: [PATCH 4/4] better repro --- .../test/java/org/elasticsearch/cluster/ClusterHealthIT.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java b/server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java index 1d5a892790f8e..1d264a31fbdb3 100644 --- a/server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java +++ b/server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java @@ -303,7 +303,7 @@ public void testHealthOnMasterFailover() throws Exception { // Run a few health requests concurrent to master fail-overs against a data-node to make sure master failover is handled // without exceptions for (int i = 0; i < 20; ++i) { - responseFutures.add(client(node).admin().cluster().prepareHealth().execute()); + responseFutures.add(client(node).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).execute()); internalCluster().restartNode(internalCluster().getMasterName(), InternalTestCluster.EMPTY_CALLBACK); } for (ActionFuture responseFuture : responseFutures) {