From 809c2d6772b3daf78fc2d87e817e97fc13db3255 Mon Sep 17 00:00:00 2001 From: Henning Andersen Date: Mon, 31 Aug 2020 13:03:07 +0200 Subject: [PATCH 1/3] Fix cluster health when closing When master shuts down it's cluster service, a waiting health request would fail rather than fail over to a new master. --- .../org/elasticsearch/cluster/ClusterHealthIT.java | 11 +++++++++-- .../cluster/health/TransportClusterHealthAction.java | 3 ++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java index 7488e24b8814a..b47a4c46b1ab2 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java @@ -20,6 +20,8 @@ package org.elasticsearch.cluster; import org.elasticsearch.action.ActionFuture; +import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest; +import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequestBuilder; import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; import org.elasticsearch.action.support.IndicesOptions; import org.elasticsearch.action.support.PlainActionFuture; @@ -313,8 +315,13 @@ public void testHealthOnMasterFailover() throws Exception { // Run a few health requests concurrent to master fail-overs against a data-node to make sure master failover is handled // without exceptions for (int i = 0; i < 20; ++i) { - responseFutures.add(client(node).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID) - .setWaitForGreenStatus().execute()); + ClusterHealthRequestBuilder healthRequestBuilder = + client(node).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).setWaitForGreenStatus(); + if (randomBoolean()) { + healthRequestBuilder.setWaitForNodes(">100") + .setTimeout(TimeValue.timeValueMillis(100)).setMasterNodeTimeout(ClusterHealthRequest.DEFAULT_MASTER_NODE_TIMEOUT); + } + responseFutures.add(healthRequestBuilder.execute()); internalCluster().restartNode(internalCluster().getMasterName(), InternalTestCluster.EMPTY_CALLBACK); } for (ActionFuture responseFuture : responseFutures) { diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/health/TransportClusterHealthAction.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/health/TransportClusterHealthAction.java index 329cb0a307efd..99e60b3bca287 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/health/TransportClusterHealthAction.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/health/TransportClusterHealthAction.java @@ -45,6 +45,7 @@ import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.util.CollectionUtils; import org.elasticsearch.index.IndexNotFoundException; +import org.elasticsearch.node.NodeClosedException; import org.elasticsearch.tasks.Task; import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.transport.TransportService; @@ -204,7 +205,7 @@ public void onNewClusterState(ClusterState newState) { @Override public void onClusterServiceClose() { - listener.onFailure(new IllegalStateException("ClusterService was close during health call")); + listener.onFailure(new NodeClosedException(clusterService.localNode())); } @Override From cf465437c28e235958e11d5973943d715f170e22 Mon Sep 17 00:00:00 2001 From: Henning Andersen Date: Tue, 8 Sep 2020 20:57:28 +0200 Subject: [PATCH 2/3] Better way to provoke wait in test case --- .../cluster/ClusterHealthIT.java | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java index b47a4c46b1ab2..9479541525e8d 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java @@ -20,9 +20,8 @@ package org.elasticsearch.cluster; import org.elasticsearch.action.ActionFuture; -import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest; -import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequestBuilder; import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; +import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest; import org.elasticsearch.action.support.IndicesOptions; import org.elasticsearch.action.support.PlainActionFuture; import org.elasticsearch.cluster.health.ClusterHealthStatus; @@ -311,19 +310,26 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS public void testHealthOnMasterFailover() throws Exception { final String node = internalCluster().startDataOnlyNode(); + boolean withIndex = randomBoolean(); + if (withIndex) { + // create index with many shards to provoke the health request to wait (for green) while master is being shut down. + createIndex("test", Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, randomIntBetween(0, 10)).build()); + } final List> responseFutures = new ArrayList<>(); // Run a few health requests concurrent to master fail-overs against a data-node to make sure master failover is handled // without exceptions for (int i = 0; i < 20; ++i) { - ClusterHealthRequestBuilder healthRequestBuilder = - client(node).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).setWaitForGreenStatus(); - if (randomBoolean()) { - healthRequestBuilder.setWaitForNodes(">100") - .setTimeout(TimeValue.timeValueMillis(100)).setMasterNodeTimeout(ClusterHealthRequest.DEFAULT_MASTER_NODE_TIMEOUT); - } - responseFutures.add(healthRequestBuilder.execute()); + responseFutures.add(client(node).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID) + .setWaitForGreenStatus().setMasterNodeTimeout(TimeValue.timeValueMinutes(1)).execute()); internalCluster().restartNode(internalCluster().getMasterName(), InternalTestCluster.EMPTY_CALLBACK); } + if (withIndex) { + assertAcked( + client().admin().indices() + .updateSettings(new UpdateSettingsRequest("test") + .settings(Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0))).get() + ); + } for (ActionFuture responseFuture : responseFutures) { assertSame(responseFuture.get().getStatus(), ClusterHealthStatus.GREEN); } From 89cc97f0933aeeb47d2e6f24241917fb6cbdc261 Mon Sep 17 00:00:00 2001 From: Henning Andersen Date: Fri, 18 Sep 2020 16:20:37 +0200 Subject: [PATCH 3/3] Add comment --- .../java/org/elasticsearch/cluster/ClusterHealthIT.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java index 9479541525e8d..db343b0bfbff5 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java @@ -312,7 +312,10 @@ public void testHealthOnMasterFailover() throws Exception { final String node = internalCluster().startDataOnlyNode(); boolean withIndex = randomBoolean(); if (withIndex) { - // create index with many shards to provoke the health request to wait (for green) while master is being shut down. + // Create index with many shards to provoke the health request to wait (for green) while master is being shut down. + // Notice that this is set to 0 after the test completed starting a number of health requests and master restarts. + // This ensures that the cluster is yellow when the health request is made, making the health request wait on the observer, + // triggering a call to observer.onClusterServiceClose when master is shutdown. createIndex("test", Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, randomIntBetween(0, 10)).build()); } final List> responseFutures = new ArrayList<>();