From 809c2d6772b3daf78fc2d87e817e97fc13db3255 Mon Sep 17 00:00:00 2001
From: Henning Andersen <henning.andersen@elastic.co>
Date: Mon, 31 Aug 2020 13:03:07 +0200
Subject: [PATCH 1/3] Fix cluster health when closing

When master shuts down it's cluster service, a waiting health request
would fail rather than fail over to a new master.
---
 .../org/elasticsearch/cluster/ClusterHealthIT.java    | 11 +++++++++--
 .../cluster/health/TransportClusterHealthAction.java  |  3 ++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java
index 7488e24b8814a..b47a4c46b1ab2 100644
--- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java
+++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java
@@ -20,6 +20,8 @@
 package org.elasticsearch.cluster;
 
 import org.elasticsearch.action.ActionFuture;
+import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest;
+import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequestBuilder;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
 import org.elasticsearch.action.support.IndicesOptions;
 import org.elasticsearch.action.support.PlainActionFuture;
@@ -313,8 +315,13 @@ public void testHealthOnMasterFailover() throws Exception {
         // Run a few health requests concurrent to master fail-overs against a data-node to make sure master failover is handled
         // without exceptions
         for (int i = 0; i < 20; ++i) {
-            responseFutures.add(client(node).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID)
-                .setWaitForGreenStatus().execute());
+            ClusterHealthRequestBuilder healthRequestBuilder =
+                client(node).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).setWaitForGreenStatus();
+            if (randomBoolean()) {
+                healthRequestBuilder.setWaitForNodes(">100")
+                    .setTimeout(TimeValue.timeValueMillis(100)).setMasterNodeTimeout(ClusterHealthRequest.DEFAULT_MASTER_NODE_TIMEOUT);
+            }
+            responseFutures.add(healthRequestBuilder.execute());
             internalCluster().restartNode(internalCluster().getMasterName(), InternalTestCluster.EMPTY_CALLBACK);
         }
         for (ActionFuture<ClusterHealthResponse> responseFuture : responseFutures) {
diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/health/TransportClusterHealthAction.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/health/TransportClusterHealthAction.java
index 329cb0a307efd..99e60b3bca287 100644
--- a/server/src/main/java/org/elasticsearch/action/admin/cluster/health/TransportClusterHealthAction.java
+++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/health/TransportClusterHealthAction.java
@@ -45,6 +45,7 @@
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.common.util.CollectionUtils;
 import org.elasticsearch.index.IndexNotFoundException;
+import org.elasticsearch.node.NodeClosedException;
 import org.elasticsearch.tasks.Task;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.transport.TransportService;
@@ -204,7 +205,7 @@ public void onNewClusterState(ClusterState newState) {
 
                 @Override
                 public void onClusterServiceClose() {
-                    listener.onFailure(new IllegalStateException("ClusterService was close during health call"));
+                    listener.onFailure(new NodeClosedException(clusterService.localNode()));
                 }
 
                 @Override

From cf465437c28e235958e11d5973943d715f170e22 Mon Sep 17 00:00:00 2001
From: Henning Andersen <henning.andersen@elastic.co>
Date: Tue, 8 Sep 2020 20:57:28 +0200
Subject: [PATCH 2/3] Better way to provoke wait in test case

---
 .../cluster/ClusterHealthIT.java              | 24 ++++++++++++-------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java
index b47a4c46b1ab2..9479541525e8d 100644
--- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java
+++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java
@@ -20,9 +20,8 @@
 package org.elasticsearch.cluster;
 
 import org.elasticsearch.action.ActionFuture;
-import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest;
-import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequestBuilder;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
+import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest;
 import org.elasticsearch.action.support.IndicesOptions;
 import org.elasticsearch.action.support.PlainActionFuture;
 import org.elasticsearch.cluster.health.ClusterHealthStatus;
@@ -311,19 +310,26 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS
 
     public void testHealthOnMasterFailover() throws Exception {
         final String node = internalCluster().startDataOnlyNode();
+        boolean withIndex = randomBoolean();
+        if (withIndex) {
+            // create index with many shards to provoke the health request to wait (for green) while master is being shut down.
+            createIndex("test", Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, randomIntBetween(0, 10)).build());
+        }
         final List<ActionFuture<ClusterHealthResponse>> responseFutures = new ArrayList<>();
         // Run a few health requests concurrent to master fail-overs against a data-node to make sure master failover is handled
         // without exceptions
         for (int i = 0; i < 20; ++i) {
-            ClusterHealthRequestBuilder healthRequestBuilder =
-                client(node).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).setWaitForGreenStatus();
-            if (randomBoolean()) {
-                healthRequestBuilder.setWaitForNodes(">100")
-                    .setTimeout(TimeValue.timeValueMillis(100)).setMasterNodeTimeout(ClusterHealthRequest.DEFAULT_MASTER_NODE_TIMEOUT);
-            }
-            responseFutures.add(healthRequestBuilder.execute());
+            responseFutures.add(client(node).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID)
+                .setWaitForGreenStatus().setMasterNodeTimeout(TimeValue.timeValueMinutes(1)).execute());
             internalCluster().restartNode(internalCluster().getMasterName(), InternalTestCluster.EMPTY_CALLBACK);
         }
+        if (withIndex) {
+            assertAcked(
+                client().admin().indices()
+                    .updateSettings(new UpdateSettingsRequest("test")
+                        .settings(Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0))).get()
+            );
+        }
         for (ActionFuture<ClusterHealthResponse> responseFuture : responseFutures) {
             assertSame(responseFuture.get().getStatus(), ClusterHealthStatus.GREEN);
         }

From 89cc97f0933aeeb47d2e6f24241917fb6cbdc261 Mon Sep 17 00:00:00 2001
From: Henning Andersen <henning.andersen@elastic.co>
Date: Fri, 18 Sep 2020 16:20:37 +0200
Subject: [PATCH 3/3] Add comment

---
 .../java/org/elasticsearch/cluster/ClusterHealthIT.java      | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java
index 9479541525e8d..db343b0bfbff5 100644
--- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java
+++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java
@@ -312,7 +312,10 @@ public void testHealthOnMasterFailover() throws Exception {
         final String node = internalCluster().startDataOnlyNode();
         boolean withIndex = randomBoolean();
         if (withIndex) {
-            // create index with many shards to provoke the health request to wait (for green) while master is being shut down.
+            // Create index with many shards to provoke the health request to wait (for green) while master is being shut down.
+            // Notice that this is set to 0 after the test completed starting a number of health requests and master restarts.
+            // This ensures that the cluster is yellow when the health request is made, making the health request wait on the observer,
+            // triggering a call to observer.onClusterServiceClose when master is shutdown.
             createIndex("test", Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, randomIntBetween(0, 10)).build());
         }
         final List<ActionFuture<ClusterHealthResponse>> responseFutures = new ArrayList<>();