Fix ClusterInfoServiceIT timeouts (#36758)

ywelsch · web-flow · commit 8f141b8a4155 · 2018-12-19T13:59:58.000+01:00
The test testClusterInfoServiceInformationClearOnError relies on timing behavior. It sets InternalClusterInfoService.INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING to 1s and relies on the fact that the stats request completes within that timeframe (which our ever-so-slow CI seems to violate at times). Unfortunately the logging has been misimplemented in InternalClusterInfoService, so the corresponding log messages showing that the requests have timed out are missing for this. The issue can be locally reproduced by reducing the timeout to something lower. Closes #36554
diff --git a/server/src/main/java/org/elasticsearch/cluster/InternalClusterInfoService.java b/server/src/main/java/org/elasticsearch/cluster/InternalClusterInfoService.java
@@ -345,17 +345,19 @@ public void onFailure(Exception e) {
         });
 
         try {
-            nodeLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS);
+            if (nodeLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS) == false) {
+                logger.warn("Failed to update node information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
+            }
         } catch (InterruptedException e) {
             Thread.currentThread().interrupt(); // restore interrupt status
-            logger.warn("Failed to update node information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
         }
 
         try {
-            indicesLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS);
+            if (indicesLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS) == false) {
+                logger.warn("Failed to update shard information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
+            }
         } catch (InterruptedException e) {
             Thread.currentThread().interrupt(); // restore interrupt status
-            logger.warn("Failed to update shard information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
         }
         ClusterInfo clusterInfo = getClusterInfo();
         try {
diff --git a/server/src/test/java/org/elasticsearch/cluster/ClusterInfoServiceIT.java b/server/src/test/java/org/elasticsearch/cluster/ClusterInfoServiceIT.java
@@ -111,7 +111,6 @@ protected Settings nodeSettings(int nodeOrdinal) {
             .put(super.nodeSettings(nodeOrdinal))
             // manual collection or upon cluster forming.
             .put(NodeEnvironment.MAX_LOCAL_STORAGE_NODES_SETTING.getKey(), 2)
-            .put(InternalClusterInfoService.INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING.getKey(), "1s")
             .build();
     }
 
@@ -120,6 +119,11 @@ protected Collection<Class<? extends Plugin>> nodePlugins() {
         return Arrays.asList(TestPlugin.class, MockTransportService.TestPlugin.class);
     }
 
+    private void setClusterInfoTimeout(String timeValue) {
+        assertAcked(client().admin().cluster().prepareUpdateSettings().setTransientSettings(Settings.builder()
+            .put(InternalClusterInfoService.INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING.getKey(), timeValue).build()));
+    }
+
     public void testClusterInfoServiceCollectsInformation() throws Exception {
         internalCluster().startNodes(2);
         assertAcked(prepareCreate("test").setSettings(Settings.builder()
@@ -204,6 +208,7 @@ public void testClusterInfoServiceInformationClearOnError() {
                 });
         }
 
+        setClusterInfoTimeout("1s");
         // timeouts shouldn't clear the info
         timeout.set(true);
         info = infoService.refresh();
@@ -237,6 +242,7 @@ public void testClusterInfoServiceInformationClearOnError() {
 
         // check we recover
         blockingActionFilter.blockActions();
+        setClusterInfoTimeout("15s");
         info = infoService.refresh();
         assertNotNull("info should not be null", info);
         assertThat(info.getNodeLeastAvailableDiskUsages().size(), equalTo(2));