From c7275b7054ed69d8bf75b2dc8024c850befc16a4 Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Fri, 24 Jan 2020 18:19:54 +0100 Subject: [PATCH] Fix TransportMasterNodeAction not Retrying NodeClosedException (#51325) Added node closed exception to the retryable remote exceptions as it's possible to run into this exception instead of a connect exception when the master node is just shutting down but still responding to requests. --- .../master/TransportMasterNodeAction.java | 4 +++- .../master/TransportMasterNodeActionTests.java | 4 +++- .../elasticsearch/cluster/ClusterHealthIT.java | 16 ++++++++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java b/server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java index 154359ff99392..e3beeeafc58f9 100644 --- a/server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java +++ b/server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java @@ -46,6 +46,7 @@ import org.elasticsearch.tasks.Task; import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.transport.ConnectTransportException; +import org.elasticsearch.transport.RemoteTransportException; import org.elasticsearch.transport.TransportException; import org.elasticsearch.transport.TransportService; @@ -180,7 +181,8 @@ protected void doStart(ClusterState clusterState) { @Override public void handleException(final TransportException exp) { Throwable cause = exp.unwrapCause(); - if (cause instanceof ConnectTransportException) { + if (cause instanceof ConnectTransportException || + (exp instanceof RemoteTransportException && cause instanceof NodeClosedException)) { // we want to retry here a bit to see if a new master is elected logger.debug("connection exception while trying to forward request with action name [{}] to " + "master node [{}], scheduling a retry. Error: [{}]", diff --git a/server/src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java b/server/src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java index 7e8efa4b55224..b4910ccdb8830 100644 --- a/server/src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java +++ b/server/src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java @@ -44,6 +44,7 @@ import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.discovery.MasterNotDiscoveredException; +import org.elasticsearch.node.NodeClosedException; import org.elasticsearch.rest.RestStatus; import org.elasticsearch.tasks.Task; import org.elasticsearch.test.ESTestCase; @@ -391,7 +392,8 @@ public void testDelegateToFailingMaster() throws ExecutionException, Interrupted assertThat(capturedRequest.action, equalTo("internal:testAction")); if (rejoinSameMaster) { - transport.handleRemoteError(capturedRequest.requestId, new ConnectTransportException(masterNode, "Fake error")); + transport.handleRemoteError(capturedRequest.requestId, + randomBoolean() ? new ConnectTransportException(masterNode, "Fake error") : new NodeClosedException(masterNode)); assertFalse(listener.isDone()); if (randomBoolean()) { // simulate master node removal diff --git a/server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java b/server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java index 656961411ea5e..1d264a31fbdb3 100644 --- a/server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java +++ b/server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java @@ -28,7 +28,10 @@ import org.elasticsearch.common.Priority; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.test.ESIntegTestCase; +import org.elasticsearch.test.InternalTestCluster; +import java.util.ArrayList; +import java.util.List; import java.util.concurrent.atomic.AtomicBoolean; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; @@ -294,4 +297,17 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS assertFalse(healthResponseFuture.get().isTimedOut()); } + public void testHealthOnMasterFailover() throws Exception { + final String node = internalCluster().startDataOnlyNode(); + final List> responseFutures = new ArrayList<>(); + // Run a few health requests concurrent to master fail-overs against a data-node to make sure master failover is handled + // without exceptions + for (int i = 0; i < 20; ++i) { + responseFutures.add(client(node).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).execute()); + internalCluster().restartNode(internalCluster().getMasterName(), InternalTestCluster.EMPTY_CALLBACK); + } + for (ActionFuture responseFuture : responseFutures) { + assertSame(responseFuture.get().getStatus(), ClusterHealthStatus.GREEN); + } + } }