From 63d0406b67998fdbf1702a6761060dfd935d0c92 Mon Sep 17 00:00:00 2001
From: Shay Banon <kimchy@gmail.com>
Date: Thu, 10 Apr 2014 18:46:23 +0200
Subject: [PATCH 01/74] [Discovery] lightweight minimum master node recovery:
 don't perform full recovery when minimum master nodes are not met; keep the
 state around and use it once elected as master

---
 .../org/elasticsearch/discovery/zen/ZenDiscovery.java  | 10 +---------
 .../java/org/elasticsearch/gateway/GatewayService.java |  6 ------
 2 files changed, 1 insertion(+), 15 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 421e1adeb716f..e1396aa695f9c 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -846,24 +846,16 @@ private ClusterState rejoin(ClusterState clusterState, String reason) {
 
         ClusterBlocks clusterBlocks = ClusterBlocks.builder().blocks(clusterState.blocks())
                 .addGlobalBlock(NO_MASTER_BLOCK)
-                .addGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK)
                 .build();
 
-        // clear the routing table, we have no master, so we need to recreate the routing when we reform the cluster
-        RoutingTable routingTable = RoutingTable.builder().build();
-        // we also clean the metadata, since we are going to recover it if we become master
-        MetaData metaData = MetaData.builder().build();
-
         // clean the nodes, we are now not connected to anybody, since we try and reform the cluster
-        latestDiscoNodes = new DiscoveryNodes.Builder().put(localNode).localNodeId(localNode.id()).build();
+        latestDiscoNodes = new DiscoveryNodes.Builder(latestDiscoNodes).masterNodeId(null).build();
 
         asyncJoinCluster();
 
         return ClusterState.builder(clusterState)
                 .blocks(clusterBlocks)
                 .nodes(latestDiscoNodes)
-                .routingTable(routingTable)
-                .metaData(metaData)
                 .build();
     }
 
diff --git a/src/main/java/org/elasticsearch/gateway/GatewayService.java b/src/main/java/org/elasticsearch/gateway/GatewayService.java
index 5f5eaa8e3e5cb..940bf50fa9507 100644
--- a/src/main/java/org/elasticsearch/gateway/GatewayService.java
+++ b/src/main/java/org/elasticsearch/gateway/GatewayService.java
@@ -134,12 +134,6 @@ public void clusterChanged(final ClusterChangedEvent event) {
         if (lifecycle.stoppedOrClosed()) {
             return;
         }
-        if (event.state().blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK)) {
-            // we need to clear those flags, since we might need to recover again in case we disconnect
-            // from the cluster and then reconnect
-            recovered.set(false);
-            scheduledRecovery.set(false);
-        }
         if (event.localNodeMaster() && event.state().blocks().hasGlobalBlock(STATE_NOT_RECOVERED_BLOCK)) {
             checkStateMeetsSettingsAndMaybeRecover(event.state(), true);
         }

From 4824f05369e7445cc25de3c72e799a8fbbe34a40 Mon Sep 17 00:00:00 2001
From: Shay Banon <kimchy@gmail.com>
Date: Fri, 11 Apr 2014 12:01:44 +0200
Subject: [PATCH 02/74] [Internal] make no master block an instance var so it
 can be configured

---
 .../cluster/block/ClusterBlocks.java          |  9 +++++
 .../service/InternalClusterService.java       |  6 ++--
 .../elasticsearch/discovery/Discovery.java    |  2 --
 .../discovery/DiscoveryService.java           |  9 ++++-
 .../discovery/DiscoverySettings.java          | 14 ++++++--
 .../discovery/local/LocalDiscovery.java       |  2 +-
 .../discovery/zen/ZenDiscovery.java           |  6 ++--
 .../elasticsearch/gateway/GatewayService.java |  2 +-
 .../org/elasticsearch/tribe/TribeService.java |  5 +--
 .../cluster/MinimumMasterNodesTests.java      | 33 ++++++++++---------
 .../cluster/NoMasterNodeTests.java            |  3 +-
 11 files changed, 61 insertions(+), 30 deletions(-)

diff --git a/src/main/java/org/elasticsearch/cluster/block/ClusterBlocks.java b/src/main/java/org/elasticsearch/cluster/block/ClusterBlocks.java
index 957bd4062638a..e53cd24af8adb 100644
--- a/src/main/java/org/elasticsearch/cluster/block/ClusterBlocks.java
+++ b/src/main/java/org/elasticsearch/cluster/block/ClusterBlocks.java
@@ -108,6 +108,15 @@ public boolean hasGlobalBlock(ClusterBlock block) {
         return global.contains(block);
     }
 
+    public boolean hasGlobalBlock(int blockId) {
+        for (ClusterBlock clusterBlock : global) {
+            if (clusterBlock.id() == blockId) {
+                return true;
+            }
+        }
+        return false;
+    }
+
     /**
      * Is there a global block with the provided status?
      */
diff --git a/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java b/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java
index fad94ba194485..7cb52df3b6b57 100644
--- a/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java
+++ b/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java
@@ -84,7 +84,7 @@ public class InternalClusterService extends AbstractLifecycleComponent<ClusterSe
 
     private volatile ClusterState clusterState;
 
-    private final ClusterBlocks.Builder initialBlocks = ClusterBlocks.builder().addGlobalBlock(Discovery.NO_MASTER_BLOCK);
+    private final ClusterBlocks.Builder initialBlocks;
 
     private volatile ScheduledFuture reconnectToNodes;
 
@@ -104,6 +104,8 @@ public InternalClusterService(Settings settings, DiscoveryService discoveryServi
         this.reconnectInterval = componentSettings.getAsTime("reconnect_interval", TimeValue.timeValueSeconds(10));
 
         localNodeMasterListeners = new LocalNodeMasterListeners(threadPool);
+
+        initialBlocks = ClusterBlocks.builder().addGlobalBlock(discoveryService.getNoMasterBlock());
     }
 
     public NodeSettingsService settingsService() {
@@ -380,7 +382,7 @@ public void run() {
                         }
                     }
                 } else {
-                    if (previousClusterState.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK) && !newClusterState.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK)) {
+                    if (previousClusterState.blocks().hasGlobalBlock(discoveryService.getNoMasterBlock()) && !newClusterState.blocks().hasGlobalBlock(discoveryService.getNoMasterBlock())) {
                         // force an update, its a fresh update from the master as we transition from a start of not having a master to having one
                         // have a fresh instances of routing and metadata to remove the chance that version might be the same
                         Builder builder = ClusterState.builder(newClusterState);
diff --git a/src/main/java/org/elasticsearch/discovery/Discovery.java b/src/main/java/org/elasticsearch/discovery/Discovery.java
index b66e90c8c1c60..ea41cee2dc655 100644
--- a/src/main/java/org/elasticsearch/discovery/Discovery.java
+++ b/src/main/java/org/elasticsearch/discovery/Discovery.java
@@ -36,8 +36,6 @@
  */
 public interface Discovery extends LifecycleComponent<Discovery> {
 
-    final ClusterBlock NO_MASTER_BLOCK = new ClusterBlock(2, "no master", true, true, RestStatus.SERVICE_UNAVAILABLE, ClusterBlockLevel.ALL);
-
     DiscoveryNode localNode();
 
     void addListener(InitialStateDiscoveryListener listener);
diff --git a/src/main/java/org/elasticsearch/discovery/DiscoveryService.java b/src/main/java/org/elasticsearch/discovery/DiscoveryService.java
index 0108db12a1990..f5a555417cc96 100644
--- a/src/main/java/org/elasticsearch/discovery/DiscoveryService.java
+++ b/src/main/java/org/elasticsearch/discovery/DiscoveryService.java
@@ -22,6 +22,7 @@
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ElasticsearchTimeoutException;
 import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.cluster.block.ClusterBlock;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.component.AbstractLifecycleComponent;
@@ -60,14 +61,20 @@ public boolean waitForInitialState(TimeValue timeValue) throws InterruptedExcept
     private final TimeValue initialStateTimeout;
     private final Discovery discovery;
     private InitialStateListener initialStateListener;
+    private final DiscoverySettings discoverySettings;
 
     @Inject
-    public DiscoveryService(Settings settings, Discovery discovery) {
+    public DiscoveryService(Settings settings, DiscoverySettings discoverySettings, Discovery discovery) {
         super(settings);
+        this.discoverySettings = discoverySettings;
         this.discovery = discovery;
         this.initialStateTimeout = componentSettings.getAsTime("initial_state_timeout", TimeValue.timeValueSeconds(30));
     }
 
+    public ClusterBlock getNoMasterBlock() {
+        return discoverySettings.getNoMasterBlock();
+    }
+
     @Override
     protected void doStart() throws ElasticsearchException {
         initialStateListener = new InitialStateListener();
diff --git a/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java b/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
index 5a56e2d3a1b94..8f061eeb7e282 100644
--- a/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
+++ b/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
@@ -19,11 +19,14 @@
 
 package org.elasticsearch.discovery;
 
+import org.elasticsearch.cluster.block.ClusterBlock;
+import org.elasticsearch.cluster.block.ClusterBlockLevel;
 import org.elasticsearch.common.component.AbstractComponent;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.node.settings.NodeSettingsService;
+import org.elasticsearch.rest.RestStatus;
 
 /**
  * Exposes common discovery settings that may be supported by all the different discovery implementations
@@ -31,15 +34,18 @@
 public class DiscoverySettings extends AbstractComponent {
 
     public static final String PUBLISH_TIMEOUT = "discovery.zen.publish_timeout";
-
     public static final TimeValue DEFAULT_PUBLISH_TIMEOUT = TimeValue.timeValueSeconds(30);
-
     private volatile TimeValue publishTimeout = DEFAULT_PUBLISH_TIMEOUT;
 
+    public final static int NO_MASTER_BLOCK_ID = 2;
+
+    private final ClusterBlock noMasterBlock;
+
     @Inject
     public DiscoverySettings(Settings settings, NodeSettingsService nodeSettingsService) {
         super(settings);
         nodeSettingsService.addListener(new ApplySettings());
+        this.noMasterBlock = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, true, RestStatus.SERVICE_UNAVAILABLE, ClusterBlockLevel.ALL);
     }
 
     /**
@@ -49,6 +55,10 @@ public TimeValue getPublishTimeout() {
         return publishTimeout;
     }
 
+    public ClusterBlock getNoMasterBlock() {
+        return noMasterBlock;
+    }
+
     private class ApplySettings implements NodeSettingsService.Listener {
         @Override
         public void onRefreshSettings(Settings settings) {
diff --git a/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java b/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java
index f052ffef51282..1a6ffd3a66a2e 100644
--- a/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java
@@ -132,7 +132,7 @@ public ClusterState execute(ClusterState currentState) {
                         }
                         nodesBuilder.localNodeId(master.localNode().id()).masterNodeId(master.localNode().id());
                         // remove the NO_MASTER block in this case
-                        ClusterBlocks.Builder blocks = ClusterBlocks.builder().blocks(currentState.blocks()).removeGlobalBlock(Discovery.NO_MASTER_BLOCK);
+                        ClusterBlocks.Builder blocks = ClusterBlocks.builder().blocks(currentState.blocks()).removeGlobalBlock(discoverySettings.getNoMasterBlock());
                         return ClusterState.builder(currentState).nodes(nodesBuilder).blocks(blocks).build();
                     }
 
diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index e1396aa695f9c..97223d73f6224 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -86,6 +86,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
     private AllocationService allocationService;
     private final ClusterName clusterName;
     private final DiscoveryNodeService discoveryNodeService;
+    private final DiscoverySettings discoverySettings;
     private final ZenPingService pingService;
     private final MasterFaultDetection masterFD;
     private final NodesFaultDetection nodesFD;
@@ -132,6 +133,7 @@ public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threa
         this.clusterService = clusterService;
         this.transportService = transportService;
         this.discoveryNodeService = discoveryNodeService;
+        this.discoverySettings = discoverySettings;
         this.pingService = pingService;
         this.version = version;
 
@@ -321,7 +323,7 @@ public ClusterState execute(ClusterState currentState) {
                                 .put(localNode);
                         // update the fact that we are the master...
                         latestDiscoNodes = builder.build();
-                        ClusterBlocks clusterBlocks = ClusterBlocks.builder().blocks(currentState.blocks()).removeGlobalBlock(NO_MASTER_BLOCK).build();
+                        ClusterBlocks clusterBlocks = ClusterBlocks.builder().blocks(currentState.blocks()).removeGlobalBlock(discoverySettings.getNoMasterBlock()).build();
                         return ClusterState.builder(currentState).nodes(latestDiscoNodes).blocks(clusterBlocks).build();
                     }
 
@@ -845,7 +847,7 @@ private ClusterState rejoin(ClusterState clusterState, String reason) {
         master = false;
 
         ClusterBlocks clusterBlocks = ClusterBlocks.builder().blocks(clusterState.blocks())
-                .addGlobalBlock(NO_MASTER_BLOCK)
+                .addGlobalBlock(discoverySettings.getNoMasterBlock())
                 .build();
 
         // clean the nodes, we are now not connected to anybody, since we try and reform the cluster
diff --git a/src/main/java/org/elasticsearch/gateway/GatewayService.java b/src/main/java/org/elasticsearch/gateway/GatewayService.java
index 940bf50fa9507..5f63f00732c5a 100644
--- a/src/main/java/org/elasticsearch/gateway/GatewayService.java
+++ b/src/main/java/org/elasticsearch/gateway/GatewayService.java
@@ -141,7 +141,7 @@ public void clusterChanged(final ClusterChangedEvent event) {
 
     protected void checkStateMeetsSettingsAndMaybeRecover(ClusterState state, boolean asyncRecovery) {
         DiscoveryNodes nodes = state.nodes();
-        if (state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK)) {
+        if (state.blocks().hasGlobalBlock(discoveryService.getNoMasterBlock())) {
             logger.debug("not recovering from gateway, no master elected yet");
         } else if (recoverAfterNodes != -1 && (nodes.masterAndDataNodes().size()) < recoverAfterNodes) {
             logger.debug("not recovering from gateway, nodes_size (data+master) [" + nodes.masterAndDataNodes().size() + "] < recover_after_nodes [" + recoverAfterNodes + "]");
diff --git a/src/main/java/org/elasticsearch/tribe/TribeService.java b/src/main/java/org/elasticsearch/tribe/TribeService.java
index e706e40065898..0894edccd8c2c 100644
--- a/src/main/java/org/elasticsearch/tribe/TribeService.java
+++ b/src/main/java/org/elasticsearch/tribe/TribeService.java
@@ -44,6 +44,7 @@
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
 import org.elasticsearch.discovery.Discovery;
+import org.elasticsearch.discovery.DiscoveryService;
 import org.elasticsearch.gateway.GatewayService;
 import org.elasticsearch.node.NodeBuilder;
 import org.elasticsearch.node.internal.InternalNode;
@@ -121,7 +122,7 @@ public static Settings processSettings(Settings settings) {
     private final List<InternalNode> nodes = Lists.newCopyOnWriteArrayList();
 
     @Inject
-    public TribeService(Settings settings, ClusterService clusterService) {
+    public TribeService(Settings settings, ClusterService clusterService, DiscoveryService discoveryService) {
         super(settings);
         this.clusterService = clusterService;
         Map<String, Settings> nodesSettings = Maps.newHashMap(settings.getGroups("tribe", true));
@@ -143,7 +144,7 @@ public TribeService(Settings settings, ClusterService clusterService) {
         if (!nodes.isEmpty()) {
             // remove the initial election / recovery blocks since we are not going to have a
             // master elected in this single tribe  node local "cluster"
-            clusterService.removeInitialStateBlock(Discovery.NO_MASTER_BLOCK);
+            clusterService.removeInitialStateBlock(discoveryService.getNoMasterBlock());
             clusterService.removeInitialStateBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK);
             if (settings.getAsBoolean("tribe.blocks.write", false)) {
                 clusterService.addInitialStateBlock(TRIBE_WRITE_BLOCK);
diff --git a/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesTests.java b/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesTests.java
index 5445214ec2049..c797969a06195 100644
--- a/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesTests.java
+++ b/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesTests.java
@@ -26,6 +26,7 @@
 import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.discovery.Discovery;
+import org.elasticsearch.discovery.DiscoverySettings;
 import org.elasticsearch.discovery.zen.elect.ElectMasterService;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
@@ -60,7 +61,7 @@ public void simpleMinimumMasterNodes() throws Exception {
 
         logger.info("--> should be blocked, no master...");
         ClusterState state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-        assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(true));
+        assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(true));
         assertThat(state.nodes().size(), equalTo(1)); // verify that we still see the local node in the cluster state
 
         logger.info("--> start second node, cluster should be formed");
@@ -70,9 +71,9 @@ public void simpleMinimumMasterNodes() throws Exception {
         assertThat(clusterHealthResponse.isTimedOut(), equalTo(false));
 
         state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-        assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(false));
+        assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(false));
         state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-        assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(false));
+        assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(false));
 
         state = client().admin().cluster().prepareState().execute().actionGet().getState();
         assertThat(state.nodes().size(), equalTo(2));
@@ -98,11 +99,11 @@ public void simpleMinimumMasterNodes() throws Exception {
         awaitBusy(new Predicate<Object>() {
             public boolean apply(Object obj) {
                 ClusterState  state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-                return state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK);
+                return state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID);
             }
         });
         state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-        assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(true));
+        assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(true));
         assertThat(state.nodes().size(), equalTo(1)); // verify that we still see the local node in the cluster state
 
         logger.info("--> starting the previous master node again...");
@@ -112,9 +113,9 @@ public boolean apply(Object obj) {
         assertThat(clusterHealthResponse.isTimedOut(), equalTo(false));
 
         state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-        assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(false));
+        assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(false));
         state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-        assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(false));
+        assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(false));
 
         state = client().admin().cluster().prepareState().execute().actionGet().getState();
         assertThat(state.nodes().size(), equalTo(2));
@@ -135,7 +136,7 @@ public boolean apply(Object obj) {
         assertThat(awaitBusy(new Predicate<Object>() {
             public boolean apply(Object obj) {
                 ClusterState state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-                return state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK);
+                return state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID);
             }
         }), equalTo(true));
 
@@ -146,9 +147,9 @@ public boolean apply(Object obj) {
         assertThat(clusterHealthResponse.isTimedOut(), equalTo(false));
 
         state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-        assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(false));
+        assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(false));
         state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-        assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(false));
+        assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(false));
 
         state = client().admin().cluster().prepareState().execute().actionGet().getState();
         assertThat(state.nodes().size(), equalTo(2));
@@ -183,21 +184,21 @@ public void multipleNodesShutdownNonMasterNodes() throws Exception {
         awaitBusy(new Predicate<Object>() {
             public boolean apply(Object obj) {
                 ClusterState state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-                return state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK);
+                return state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID);
             }
         });
         
         awaitBusy(new Predicate<Object>() {
             public boolean apply(Object obj) {
                 ClusterState state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-                return state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK);
+                return state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID);
             }
         });
 
         state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-        assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(true));
+        assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(true));
         state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-        assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(true));
+        assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(true));
 
         logger.info("--> start two more nodes");
         internalCluster().startNode(settings);
@@ -298,9 +299,9 @@ public boolean apply(Object obj) {
                 boolean success = true;
                 for (Client client : internalCluster()) {
                     ClusterState state = client.admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-                    success &= state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK);
+                    success &= state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID);
                     if (logger.isDebugEnabled()) {
-                        logger.debug("Checking for NO_MASTER_BLOCK on client: {} NO_MASTER_BLOCK: [{}]", client, state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK));
+                        logger.debug("Checking for NO_MASTER_BLOCK on client: {} NO_MASTER_BLOCK: [{}]", client, state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID));
                     }
                 }
                 return success;
diff --git a/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java b/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
index 1689dc20a1828..96a8f29e1f043 100644
--- a/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
+++ b/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
@@ -27,6 +27,7 @@
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.common.xcontent.XContentFactory;
 import org.elasticsearch.discovery.Discovery;
+import org.elasticsearch.discovery.DiscoverySettings;
 import org.elasticsearch.discovery.MasterNotDiscoveredException;
 import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.script.ScriptService;
@@ -74,7 +75,7 @@ public void testNoMasterActions() throws Exception {
             @Override
             public void run() {
                 ClusterState state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-                assertTrue(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK));
+                assertTrue(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID));
             }
         });
 

From 6ede83ab45831b69b2627e1ad05acb3a1f55d6d9 Mon Sep 17 00:00:00 2001
From: Shay Banon <kimchy@gmail.com>
Date: Fri, 11 Apr 2014 17:24:32 +0200
Subject: [PATCH 03/74] [Discovery] add rejoin on master gone flag, defaults to
 false

defaults to false since there is still work left to properly make it work
---
 .../org/elasticsearch/discovery/zen/ZenDiscovery.java     | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 97223d73f6224..2aecce6337ae3 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -119,6 +119,8 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
 
     private final AtomicBoolean initialStateSent = new AtomicBoolean();
 
+    private final boolean rejoinOnMasterGone;
+
 
     @Nullable
     private NodeService nodeService;
@@ -144,6 +146,7 @@ public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threa
 
         this.masterElectionFilterClientNodes = settings.getAsBoolean("discovery.zen.master_election.filter_client", true);
         this.masterElectionFilterDataNodes = settings.getAsBoolean("discovery.zen.master_election.filter_data", false);
+        this.rejoinOnMasterGone = settings.getAsBoolean("discovery.zen.rejoin_on_master_gone", false);
 
         logger.debug("using ping.timeout [{}], join.timeout [{}], master_election.filter_client [{}], master_election.filter_data [{}]", pingTimeout, joinTimeout, masterElectionFilterClientNodes, masterElectionFilterDataNodes);
 
@@ -495,6 +498,11 @@ public ClusterState execute(ClusterState currentState) {
                         // make sure the old master node, which has failed, is not part of the nodes we publish
                         .remove(masterNode.id())
                         .masterNodeId(null).build();
+                latestDiscoNodes = discoveryNodes;
+
+                if (rejoinOnMasterGone) {
+                    return rejoin(ClusterState.builder(currentState).nodes(discoveryNodes).build(), "master left (reason = " + reason + ")");
+                }
 
                 if (!electMaster.hasEnoughMasterNodes(discoveryNodes)) {
                     return rejoin(ClusterState.builder(currentState).nodes(discoveryNodes).build(), "not enough master nodes after master left (reason = " + reason + ")");

From 97bdc8f5a2d83d6166d630540c48f0f43fc71bd4 Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Thu, 17 Apr 2014 12:09:16 +0700
Subject: [PATCH 04/74] [Discovery] Make noMasterBlock configurable and add a
 simple test that shows reads do execute (partially) when
 minimum_master_nodes isn't met

---
 .../discovery/DiscoverySettings.java          | 34 ++++++++-
 .../cluster/NoMasterNodeTests.java            | 75 ++++++++++++++++++-
 2 files changed, 104 insertions(+), 5 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java b/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
index 8f061eeb7e282..8903e73922b11 100644
--- a/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
+++ b/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
@@ -19,6 +19,7 @@
 
 package org.elasticsearch.discovery;
 
+import org.elasticsearch.ElasticsearchIllegalArgumentException;
 import org.elasticsearch.cluster.block.ClusterBlock;
 import org.elasticsearch.cluster.block.ClusterBlockLevel;
 import org.elasticsearch.common.component.AbstractComponent;
@@ -34,18 +35,24 @@
 public class DiscoverySettings extends AbstractComponent {
 
     public static final String PUBLISH_TIMEOUT = "discovery.zen.publish_timeout";
-    public static final TimeValue DEFAULT_PUBLISH_TIMEOUT = TimeValue.timeValueSeconds(30);
-    private volatile TimeValue publishTimeout = DEFAULT_PUBLISH_TIMEOUT;
+    public static final String NO_MASTER_BLOCK = "discovery.zen.no_master_block";
 
+    public static final TimeValue DEFAULT_PUBLISH_TIMEOUT = TimeValue.timeValueSeconds(30);
+    public static final String DEFAULT_NO_MASTER_BLOCK = "write";
     public final static int NO_MASTER_BLOCK_ID = 2;
 
-    private final ClusterBlock noMasterBlock;
+    private final static ClusterBlock ALL = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, true, RestStatus.SERVICE_UNAVAILABLE, ClusterBlockLevel.ALL);
+    private final static ClusterBlock WRITE = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, false, RestStatus.SERVICE_UNAVAILABLE, ClusterBlockLevel.WRITE, ClusterBlockLevel.METADATA);
+    private final static ClusterBlock METADATA = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, false, RestStatus.SERVICE_UNAVAILABLE, ClusterBlockLevel.METADATA);
+
+    private volatile ClusterBlock noMasterBlock;
+    private volatile TimeValue publishTimeout = DEFAULT_PUBLISH_TIMEOUT;
 
     @Inject
     public DiscoverySettings(Settings settings, NodeSettingsService nodeSettingsService) {
         super(settings);
         nodeSettingsService.addListener(new ApplySettings());
-        this.noMasterBlock = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, true, RestStatus.SERVICE_UNAVAILABLE, ClusterBlockLevel.ALL);
+        this.noMasterBlock = parseNoMasterBlock(settings.get(NO_MASTER_BLOCK, DEFAULT_NO_MASTER_BLOCK));
     }
 
     /**
@@ -69,6 +76,25 @@ public void onRefreshSettings(Settings settings) {
                     publishTimeout = newPublishTimeout;
                 }
             }
+            String newNoMasterBlockValue = settings.get(NO_MASTER_BLOCK);
+            if (newNoMasterBlockValue != null) {
+                ClusterBlock newNoMasterBlock = parseNoMasterBlock(newNoMasterBlockValue);
+                if (newNoMasterBlock != noMasterBlock) {
+                    noMasterBlock = newNoMasterBlock;
+                }
+            }
+        }
+    }
+
+    private ClusterBlock parseNoMasterBlock(String value) {
+        if ("all".equals(value)) {
+            return ALL;
+        } else if ("write".equals(value)) {
+            return WRITE;
+        } else if ("metadata".equals(value)) {
+            return METADATA;
+        } else {
+            throw new ElasticsearchIllegalArgumentException("invalid master block [" + value + "]");
         }
     }
 }
diff --git a/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java b/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
index 96a8f29e1f043..fdf27d7057258 100644
--- a/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
+++ b/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
@@ -20,13 +20,17 @@
 package org.elasticsearch.cluster;
 
 import org.elasticsearch.action.ActionRequestBuilder;
+import com.google.common.base.Predicate;
+import org.elasticsearch.action.count.CountResponse;
+import org.elasticsearch.action.get.GetResponse;
 import org.elasticsearch.action.bulk.BulkRequestBuilder;
 import org.elasticsearch.action.percolate.PercolateSourceBuilder;
+import org.elasticsearch.action.search.SearchResponse;
 import org.elasticsearch.cluster.block.ClusterBlockException;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.common.xcontent.XContentFactory;
-import org.elasticsearch.discovery.Discovery;
 import org.elasticsearch.discovery.DiscoverySettings;
 import org.elasticsearch.discovery.MasterNotDiscoveredException;
 import org.elasticsearch.rest.RestStatus;
@@ -40,6 +44,8 @@
 
 import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
 import static org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertExists;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertThrows;
 import static org.hamcrest.Matchers.*;
 
@@ -61,6 +67,7 @@ public void testNoMasterActions() throws Exception {
                 .put("discovery.zen.minimum_master_nodes", 2)
                 .put("discovery.zen.ping_timeout", "200ms")
                 .put("discovery.initial_state_timeout", "500ms")
+                .put(DiscoverySettings.NO_MASTER_BLOCK, "all")
                 .build();
 
         TimeValue timeout = TimeValue.timeValueMillis(200);
@@ -196,4 +203,70 @@ void checkBulkAction(boolean autoCreateIndex, TimeValue timeout, BulkRequestBuil
             }
         }
     }
+
+    @Test
+    public void testNoMasterActions_writeMasterBlock() throws Exception {
+        Settings settings = settingsBuilder()
+                .put("discovery.type", "zen")
+                .put("action.auto_create_index", false)
+                .put("discovery.zen.minimum_master_nodes", 2)
+                .put("discovery.zen.ping_timeout", "200ms")
+                .put("discovery.initial_state_timeout", "500ms")
+                .put(DiscoverySettings.NO_MASTER_BLOCK, "write")
+                .build();
+
+        internalCluster().startNode(settings);
+        // start a second node, create an index, and then shut it down so we have no master block
+        internalCluster().startNode(settings);
+        prepareCreate("test1").setSettings(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1).get();
+        prepareCreate("test2").setSettings(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 2, IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0).get();
+        client().admin().cluster().prepareHealth("_all").setWaitForGreenStatus().get();
+        client().prepareIndex("test1", "type1", "1").setSource("field", "value1").get();
+        client().prepareIndex("test2", "type1", "1").setSource("field", "value1").get();
+        refresh();
+
+        internalCluster().stopRandomDataNode();
+        assertThat(awaitBusy(new Predicate<Object>() {
+            public boolean apply(Object o) {
+                ClusterState state = client().admin().cluster().prepareState().setLocal(true).get().getState();
+                return state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID);
+            }
+        }), equalTo(true));
+
+
+        GetResponse getResponse = client().prepareGet("test1", "type1", "1").get();
+        assertExists(getResponse);
+
+        CountResponse countResponse = client().prepareCount("test1").get();
+        assertHitCount(countResponse, 1l);
+
+        SearchResponse searchResponse = client().prepareSearch("test1").get();
+        assertHitCount(searchResponse, 1l);
+
+        countResponse = client().prepareCount("test2").get();
+        assertThat(countResponse.getTotalShards(), equalTo(2));
+        assertThat(countResponse.getSuccessfulShards(), equalTo(1));
+
+        TimeValue timeout = TimeValue.timeValueMillis(200);
+        long now = System.currentTimeMillis();
+        try {
+            client().prepareUpdate("test1", "type1", "1").setDoc("field", "value2").setTimeout(timeout).get();
+            fail("Expected ClusterBlockException");
+        } catch (ClusterBlockException e) {
+            assertThat(System.currentTimeMillis() - now, greaterThan(timeout.millis() - 50));
+            assertThat(e.status(), equalTo(RestStatus.SERVICE_UNAVAILABLE));
+        }
+
+        now = System.currentTimeMillis();
+        try {
+            client().prepareIndex("test1", "type1", "1").setSource(XContentFactory.jsonBuilder().startObject().endObject()).setTimeout(timeout).get();
+            fail("Expected ClusterBlockException");
+        } catch (ClusterBlockException e) {
+            assertThat(System.currentTimeMillis() - now, greaterThan(timeout.millis() - 50));
+            assertThat(e.status(), equalTo(RestStatus.SERVICE_UNAVAILABLE));
+        }
+
+        internalCluster().startNode(settings);
+        client().admin().cluster().prepareHealth().setWaitForGreenStatus().setWaitForNodes("2").get();
+    }
 }

From 3cdbb1a79d78c1209de197607c9122d037b0edef Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Thu, 17 Apr 2014 15:36:13 +0700
Subject: [PATCH 05/74] [Discovery] Enable
 `discovery.zen.rejoin_on_master_gone` setting in
 DiscoveryWithNetworkFailuresTests only.

---
 .../DiscoveryWithNetworkFailuresTests.java    | 28 +++++++++++--------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index d2987f77ad0fa..a785a4e5d2ed3 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -20,7 +20,6 @@
 package org.elasticsearch.discovery;
 
 import com.google.common.base.Predicate;
-import org.apache.lucene.util.LuceneTestCase;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.ClusterState;
@@ -30,13 +29,14 @@
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
+import org.elasticsearch.test.junit.annotations.TestLogging;
 import org.elasticsearch.test.transport.MockTransportService;
 import org.elasticsearch.transport.TransportModule;
 import org.elasticsearch.transport.TransportService;
 import org.junit.Test;
 
-import java.util.Arrays;
 import java.util.List;
+import java.util.concurrent.TimeUnit;
 
 import static org.elasticsearch.test.ElasticsearchIntegrationTest.ClusterScope;
 import static org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
@@ -44,21 +44,23 @@
 
 /**
  */
-@ClusterScope(scope= Scope.SUITE, numDataNodes =0)
+@ClusterScope(scope= Scope.TEST, numDataNodes =0)
 public class DiscoveryWithNetworkFailuresTests extends ElasticsearchIntegrationTest {
 
     @Test
-    @LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/elasticsearch/elasticsearch/issues/2488")
+    @TestLogging("discovery.zen:TRACE")
     public void failWithMinimumMasterNodesConfigured() throws Exception {
         final Settings settings = ImmutableSettings.settingsBuilder()
-                .put("discovery.zen.minimum_master_nodes", 2)
+                .put("discovery.type", "zen") // <-- To override the local setting if set externally
                 .put("discovery.zen.fd.ping_timeout", "1s") // <-- for hitting simulated network failures quickly
+                .put("discovery.zen.minimum_master_nodes", 2)
                 .put(TransportModule.TRANSPORT_SERVICE_TYPE_KEY, MockTransportService.class.getName())
+                .put("discovery.zen.rejoin_on_master_gone", true)
                 .build();
-        List<String>nodes = internalCluster().startNodesAsync(3, settings).get();
+
+        List<String> nodes = internalCluster().startNodesAsync(3, settings).get();
 
         // Wait until a green status has been reaches and 3 nodes are part of the cluster
-        List<String> nodesList = Arrays.asList(nodes.toArray(new String[3]));
         ClusterHealthResponse clusterHealthResponse = client().admin().cluster().prepareHealth()
                 .setWaitForEvents(Priority.LANGUID)
                 .setWaitForNodes("3")
@@ -67,7 +69,8 @@ public void failWithMinimumMasterNodesConfigured() throws Exception {
 
         // Figure out what is the elected master node
         DiscoveryNode masterDiscoNode = null;
-        for (String node : nodesList) {
+
+        for (String node : nodes) {
             ClusterState state = internalCluster().client(node).admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
             assertThat(state.nodes().size(), equalTo(3));
             if (masterDiscoNode == null) {
@@ -84,7 +87,7 @@ public void failWithMinimumMasterNodesConfigured() throws Exception {
 
         // Pick a node that isn't the elected master.
         String unluckyNode = null;
-        for (String node : nodesList) {
+        for (String node : nodes) {
             if (!node.equals(masterDiscoNode.getName())) {
                 unluckyNode = node;
             }
@@ -96,12 +99,13 @@ public void failWithMinimumMasterNodesConfigured() throws Exception {
         addFailToSendNoConnectRule(unluckyNode, masterDiscoNode.getName());
         try {
             // Wait until elected master has removed that the unlucky node...
-            awaitBusy(new Predicate<Object>() {
+            boolean applied = awaitBusy(new Predicate<Object>() {
                 @Override
                 public boolean apply(Object input) {
                     return masterClient.admin().cluster().prepareState().setLocal(true).get().getState().nodes().size() == 2;
                 }
-            });
+            }, 1, TimeUnit.MINUTES);
+            assertThat(applied, is(true));
 
             // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
             // continuously ping until network failures have been resolved.
@@ -123,7 +127,7 @@ public boolean apply(Object input) {
                 .get();
         assertThat(clusterHealthResponse.isTimedOut(), is(false));
 
-        for (String node : nodesList) {
+        for (String node : nodes) {
             ClusterState state = internalCluster().client(node).admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
             assertThat(state.nodes().size(), equalTo(3));
             // The elected master shouldn't have changed, since the unlucky node never could have elected himself as

From 549076eb4cdbd848d6b52e3ac4e1b045ef7432cf Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Wed, 23 Apr 2014 17:09:44 +0700
Subject: [PATCH 06/74] [Discovery] Changed the default for the
 'rejoin_on_master_gone' option from false to true in zen discovery.

Added AwaitsFix for the FullRollingRestartTests.
---
 .../java/org/elasticsearch/discovery/zen/ZenDiscovery.java    | 4 +---
 .../discovery/DiscoveryWithNetworkFailuresTests.java          | 1 -
 .../org/elasticsearch/recovery/FullRollingRestartTests.java   | 2 ++
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 2aecce6337ae3..1dde6fdd75383 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -32,7 +32,6 @@
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodeService;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
-import org.elasticsearch.cluster.routing.RoutingTable;
 import org.elasticsearch.cluster.routing.allocation.AllocationService;
 import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
 import org.elasticsearch.common.Priority;
@@ -56,7 +55,6 @@
 import org.elasticsearch.discovery.zen.ping.ZenPing;
 import org.elasticsearch.discovery.zen.ping.ZenPingService;
 import org.elasticsearch.discovery.zen.publish.PublishClusterStateAction;
-import org.elasticsearch.gateway.GatewayService;
 import org.elasticsearch.node.service.NodeService;
 import org.elasticsearch.node.settings.NodeSettingsService;
 import org.elasticsearch.threadpool.ThreadPool;
@@ -146,7 +144,7 @@ public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threa
 
         this.masterElectionFilterClientNodes = settings.getAsBoolean("discovery.zen.master_election.filter_client", true);
         this.masterElectionFilterDataNodes = settings.getAsBoolean("discovery.zen.master_election.filter_data", false);
-        this.rejoinOnMasterGone = settings.getAsBoolean("discovery.zen.rejoin_on_master_gone", false);
+        this.rejoinOnMasterGone = settings.getAsBoolean("discovery.zen.rejoin_on_master_gone", true);
 
         logger.debug("using ping.timeout [{}], join.timeout [{}], master_election.filter_client [{}], master_election.filter_data [{}]", pingTimeout, joinTimeout, masterElectionFilterClientNodes, masterElectionFilterDataNodes);
 
diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index a785a4e5d2ed3..39bf6b9d62401 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -55,7 +55,6 @@ public void failWithMinimumMasterNodesConfigured() throws Exception {
                 .put("discovery.zen.fd.ping_timeout", "1s") // <-- for hitting simulated network failures quickly
                 .put("discovery.zen.minimum_master_nodes", 2)
                 .put(TransportModule.TRANSPORT_SERVICE_TYPE_KEY, MockTransportService.class.getName())
-                .put("discovery.zen.rejoin_on_master_gone", true)
                 .build();
 
         List<String> nodes = internalCluster().startNodesAsync(3, settings).get();
diff --git a/src/test/java/org/elasticsearch/recovery/FullRollingRestartTests.java b/src/test/java/org/elasticsearch/recovery/FullRollingRestartTests.java
index 26bf890f85bf0..46d82250e9198 100644
--- a/src/test/java/org/elasticsearch/recovery/FullRollingRestartTests.java
+++ b/src/test/java/org/elasticsearch/recovery/FullRollingRestartTests.java
@@ -19,6 +19,7 @@
 
 package org.elasticsearch.recovery;
 
+import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.LuceneTestCase.Slow;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequestBuilder;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
@@ -55,6 +56,7 @@ protected int numberOfReplicas() {
     @Test
     @Slow
     @TestLogging("indices.cluster:TRACE,cluster.service:TRACE")
+    @LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/elasticsearch/elasticsearch/tree/feature/improve_zen")
     public void testFullRollingRestart() throws Exception {
         internalCluster().startNode();
         createIndex("test");

From 89a50f60136a2d9c435de5ada920774848404cff Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Mon, 28 Apr 2014 15:57:17 +0700
Subject: [PATCH 07/74] [Discovery] If available newly elected master node
 should take over previous known nodes.

---
 .../java/org/elasticsearch/discovery/zen/ZenDiscovery.java   | 4 +++-
 .../org/elasticsearch/cluster/MinimumMasterNodesTests.java   | 1 -
 .../java/org/elasticsearch/cluster/NoMasterNodeTests.java    | 5 +++++
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 1dde6fdd75383..44f3926cf6707 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -317,7 +317,9 @@ private void innerJoinCluster() {
                 clusterService.submitStateUpdateTask("zen-disco-join (elected_as_master)", Priority.URGENT, new ProcessedClusterStateUpdateTask() {
                     @Override
                     public ClusterState execute(ClusterState currentState) {
-                        DiscoveryNodes.Builder builder = new DiscoveryNodes.Builder()
+                        // Take into account the previous known nodes, if they happen not to be available
+                        // then fault detection will remove these nodes.
+                        DiscoveryNodes.Builder builder = new DiscoveryNodes.Builder(latestDiscoNodes)
                                 .localNodeId(localNode.id())
                                 .masterNodeId(localNode.id())
                                         // put our local node
diff --git a/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesTests.java b/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesTests.java
index c797969a06195..b7c7e59180472 100644
--- a/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesTests.java
+++ b/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesTests.java
@@ -25,7 +25,6 @@
 import org.elasticsearch.client.Client;
 import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.discovery.Discovery;
 import org.elasticsearch.discovery.DiscoverySettings;
 import org.elasticsearch.discovery.zen.elect.ElectMasterService;
 import org.elasticsearch.index.query.QueryBuilders;
diff --git a/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java b/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
index fdf27d7057258..fa1ca5e9e8005 100644
--- a/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
+++ b/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
@@ -48,6 +48,11 @@
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertThrows;
 import static org.hamcrest.Matchers.*;
+import static org.elasticsearch.test.ElasticsearchIntegrationTest.*;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertExists;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.greaterThan;
 
 /**
  */

From 2220c66535240c2e1d96827d8b864a1ea56036de Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Fri, 2 May 2014 12:50:30 +0700
Subject: [PATCH 08/74] [Discovery] Eagerly clean the routing table of shards
 that exist on nodes that are not in the latestDiscoNodes list.

Only the previous master node has been removed, so only shards allocated to that node will get failed.
This would have happened anyhow later on when AllocationService#reroute is invoked (for example when a cluster setting changes or another cluster event occurs),
but by cleaning the routing table pro-actively, the stale routing table is fixed sooner and therefore the shards
that are not accessible anyhow (because the node these shards were on has left the cluster) will get re-assigned sooner.
---
 .../java/org/elasticsearch/discovery/zen/ZenDiscovery.java  | 6 +++++-
 .../org/elasticsearch/recovery/FullRollingRestartTests.java | 6 ++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 44f3926cf6707..dc1d0df5ac53c 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -327,7 +327,11 @@ public ClusterState execute(ClusterState currentState) {
                         // update the fact that we are the master...
                         latestDiscoNodes = builder.build();
                         ClusterBlocks clusterBlocks = ClusterBlocks.builder().blocks(currentState.blocks()).removeGlobalBlock(discoverySettings.getNoMasterBlock()).build();
-                        return ClusterState.builder(currentState).nodes(latestDiscoNodes).blocks(clusterBlocks).build();
+                        currentState = ClusterState.builder(currentState).nodes(latestDiscoNodes).blocks(clusterBlocks).build();
+
+                        // eagerly run reroute to remove dead nodes from routing table
+                        RoutingAllocation.Result result = allocationService.reroute(currentState);
+                        return ClusterState.builder(currentState).routingResult(result).build();
                     }
 
                     @Override
diff --git a/src/test/java/org/elasticsearch/recovery/FullRollingRestartTests.java b/src/test/java/org/elasticsearch/recovery/FullRollingRestartTests.java
index 46d82250e9198..94121d71a639e 100644
--- a/src/test/java/org/elasticsearch/recovery/FullRollingRestartTests.java
+++ b/src/test/java/org/elasticsearch/recovery/FullRollingRestartTests.java
@@ -19,7 +19,6 @@
 
 package org.elasticsearch.recovery;
 
-import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.LuceneTestCase.Slow;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequestBuilder;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
@@ -31,7 +30,7 @@
 import org.junit.Test;
 
 import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
-import static org.elasticsearch.test.ElasticsearchIntegrationTest.*;
+import static org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
 
 /**
@@ -55,8 +54,7 @@ protected int numberOfReplicas() {
 
     @Test
     @Slow
-    @TestLogging("indices.cluster:TRACE,cluster.service:TRACE")
-    @LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/elasticsearch/elasticsearch/tree/feature/improve_zen")
+    @TestLogging("indices.cluster:TRACE,cluster.service:TRACE,action.search:TRACE,indices.recovery:TRACE")
     public void testFullRollingRestart() throws Exception {
         internalCluster().startNode();
         createIndex("test");

From a9aa10ade07754ebef118fafa5c6b7f0a7d0e976 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Sun, 18 May 2014 20:32:06 +0200
Subject: [PATCH 09/74] Updated to use ClusterBlocks new constructor signature

Introduced with: 11a3201a092ed6c5d31516ae4b30dbb618ba348c
---
 .../java/org/elasticsearch/discovery/DiscoverySettings.java | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java b/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
index 8903e73922b11..8c1ba757d9bd0 100644
--- a/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
+++ b/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
@@ -29,6 +29,8 @@
 import org.elasticsearch.node.settings.NodeSettingsService;
 import org.elasticsearch.rest.RestStatus;
 
+import java.util.EnumSet;
+
 /**
  * Exposes common discovery settings that may be supported by all the different discovery implementations
  */
@@ -42,8 +44,8 @@ public class DiscoverySettings extends AbstractComponent {
     public final static int NO_MASTER_BLOCK_ID = 2;
 
     private final static ClusterBlock ALL = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, true, RestStatus.SERVICE_UNAVAILABLE, ClusterBlockLevel.ALL);
-    private final static ClusterBlock WRITE = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, false, RestStatus.SERVICE_UNAVAILABLE, ClusterBlockLevel.WRITE, ClusterBlockLevel.METADATA);
-    private final static ClusterBlock METADATA = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, false, RestStatus.SERVICE_UNAVAILABLE, ClusterBlockLevel.METADATA);
+    private final static ClusterBlock WRITE = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, false, RestStatus.SERVICE_UNAVAILABLE, EnumSet.of(ClusterBlockLevel.WRITE, ClusterBlockLevel.METADATA));
+    private final static ClusterBlock METADATA = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, false, RestStatus.SERVICE_UNAVAILABLE, EnumSet.of(ClusterBlockLevel.METADATA));
 
     private volatile ClusterBlock noMasterBlock;
     private volatile TimeValue publishTimeout = DEFAULT_PUBLISH_TIMEOUT;

From d44bed5f483779d9b1d148d6637d943312d68c5a Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Fri, 16 May 2014 22:09:39 +0200
Subject: [PATCH 10/74] [Internal] Do not execute cluster state changes if
 current node is no longer master

When a node steps down from being a master (because, for example, min_master_node is breached), it may still have
cluster state update tasks queued up. Most (but not all) are tasks that should no longer be executed as the node
no longer has authority to do so. Other cluster state updates, like electing the current node as master, should be
executed even if the current node is no longer master.

This commit makes sure that, by default, `ClusterStateUpdateTask` is not executed if the node is no longer master. Tasks
that should run on non-masters are changed to implement a new interface called `ClusterStateNonMasterUpdateTask`.

Closes #6230
---
 .../TransportClusterUpdateSettingsAction.java |  9 +++-
 .../elasticsearch/cluster/ClusterService.java | 12 +++++
 .../ClusterStateNonMasterUpdateTask.java      | 27 ++++++++++
 ...cessedClusterStateNonMasterUpdateTask.java | 26 +++++++++
 .../service/InternalClusterService.java       |  6 ++-
 .../discovery/local/LocalDiscovery.java       | 10 ++--
 .../discovery/zen/ZenDiscovery.java           | 38 +++++++++----
 .../org/elasticsearch/tribe/TribeService.java |  3 +-
 .../cluster/ClusterServiceTests.java          | 53 +++++++++++++++++++
 9 files changed, 165 insertions(+), 19 deletions(-)
 create mode 100644 src/main/java/org/elasticsearch/cluster/ClusterStateNonMasterUpdateTask.java
 create mode 100644 src/main/java/org/elasticsearch/cluster/ProcessedClusterStateNonMasterUpdateTask.java

diff --git a/src/main/java/org/elasticsearch/action/admin/cluster/settings/TransportClusterUpdateSettingsAction.java b/src/main/java/org/elasticsearch/action/admin/cluster/settings/TransportClusterUpdateSettingsAction.java
index fa77ae8847874..a94b322ceb376 100644
--- a/src/main/java/org/elasticsearch/action/admin/cluster/settings/TransportClusterUpdateSettingsAction.java
+++ b/src/main/java/org/elasticsearch/action/admin/cluster/settings/TransportClusterUpdateSettingsAction.java
@@ -140,8 +140,13 @@ protected ClusterUpdateSettingsResponse newResponse(boolean acknowledged) {
                     @Override
                     public void onFailure(String source, Throwable t) {
                         //if the reroute fails we only log
-                        logger.debug("failed to perform [{}]", t, source);
-                        listener.onFailure(new ElasticsearchException("reroute after update settings failed", t));
+                        if (t instanceof ClusterService.NoLongerMasterException) {
+                            logger.debug("failed to preform reroute after cluster settings were updated - current node is no longer a master");
+                            listener.onResponse(new ClusterUpdateSettingsResponse(updateSettingsAcked, transientUpdates.build(), persistentUpdates.build()));
+                        } else {
+                            logger.debug("failed to perform [{}]", t, source);
+                            listener.onFailure(new ElasticsearchException("reroute after update settings failed", t));
+                        }
                     }
 
                     @Override
diff --git a/src/main/java/org/elasticsearch/cluster/ClusterService.java b/src/main/java/org/elasticsearch/cluster/ClusterService.java
index 6204599f57d33..f032a0cd06454 100644
--- a/src/main/java/org/elasticsearch/cluster/ClusterService.java
+++ b/src/main/java/org/elasticsearch/cluster/ClusterService.java
@@ -110,4 +110,16 @@ public interface ClusterService extends LifecycleComponent<ClusterService> {
      * Returns the tasks that are pending.
      */
     List<PendingClusterTask> pendingTasks();
+
+    /**
+     * an exception to indicate a {@link org.elasticsearch.cluster.ClusterStateUpdateTask} was not executed as
+     * the current node is no longer master
+     */
+    public static class NoLongerMasterException extends ElasticsearchIllegalStateException {
+
+        public NoLongerMasterException(String msg) {
+            super(msg);
+        }
+
+    }
 }
diff --git a/src/main/java/org/elasticsearch/cluster/ClusterStateNonMasterUpdateTask.java b/src/main/java/org/elasticsearch/cluster/ClusterStateNonMasterUpdateTask.java
new file mode 100644
index 0000000000000..2fac718ae2de2
--- /dev/null
+++ b/src/main/java/org/elasticsearch/cluster/ClusterStateNonMasterUpdateTask.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.cluster;
+
+/**
+ * This is a marker interface to indicate that the task should be executed
+ * even if the current node is not a master.
+ */
+public interface ClusterStateNonMasterUpdateTask extends ClusterStateUpdateTask {
+}
diff --git a/src/main/java/org/elasticsearch/cluster/ProcessedClusterStateNonMasterUpdateTask.java b/src/main/java/org/elasticsearch/cluster/ProcessedClusterStateNonMasterUpdateTask.java
new file mode 100644
index 0000000000000..e46a2edc79245
--- /dev/null
+++ b/src/main/java/org/elasticsearch/cluster/ProcessedClusterStateNonMasterUpdateTask.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.cluster;
+
+/**
+ * Combines {@link ProcessedClusterStateUpdateTask} and {@link ClusterStateNonMasterUpdateTask} into one type so that
+ * a single anonymous class can implement both; it declares no methods of its own.
+ */
+public interface ProcessedClusterStateNonMasterUpdateTask extends ProcessedClusterStateUpdateTask, ClusterStateNonMasterUpdateTask {
+}
diff --git a/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java b/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java
index 7cb52df3b6b57..dbe0b4c7ad085 100644
--- a/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java
+++ b/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java
@@ -325,6 +325,11 @@ public void run() {
             }
             logger.debug("processing [{}]: execute", source);
             ClusterState previousClusterState = clusterState;
+            if (!previousClusterState.nodes().localNodeMaster() && !(updateTask instanceof ClusterStateNonMasterUpdateTask)) {
+                logger.debug("failing [{}]: local node is no longer master", source);
+                updateTask.onFailure(source, new NoLongerMasterException("source: " + source));
+                return;
+            }
             ClusterState newClusterState;
             try {
                 newClusterState = updateTask.execute(previousClusterState);
@@ -722,5 +727,4 @@ public void onTimeout() {
             }
         }
     }
-
 }
\ No newline at end of file
diff --git a/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java b/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java
index 1a6ffd3a66a2e..065f3b6e45fed 100644
--- a/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java
@@ -123,7 +123,7 @@ protected void doStart() throws ElasticsearchException {
                 // we are the first master (and the master)
                 master = true;
                 final LocalDiscovery master = firstMaster;
-                clusterService.submitStateUpdateTask("local-disco-initial_connect(master)", new ProcessedClusterStateUpdateTask() {
+                clusterService.submitStateUpdateTask("local-disco-initial_connect(master)", new ProcessedClusterStateNonMasterUpdateTask() {
                     @Override
                     public ClusterState execute(ClusterState currentState) {
                         DiscoveryNodes.Builder nodesBuilder = DiscoveryNodes.builder();
@@ -149,7 +149,7 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS
             } else if (firstMaster != null) {
                 // update as fast as we can the local node state with the new metadata (so we create indices for example)
                 final ClusterState masterState = firstMaster.clusterService.state();
-                clusterService.submitStateUpdateTask("local-disco(detected_master)", new ClusterStateUpdateTask() {
+                clusterService.submitStateUpdateTask("local-disco(detected_master)", new ClusterStateNonMasterUpdateTask() {
                     @Override
                     public ClusterState execute(ClusterState currentState) {
                         // make sure we have the local node id set, we might need it as a result of the new metadata
@@ -165,7 +165,7 @@ public void onFailure(String source, Throwable t) {
 
                 // tell the master to send the fact that we are here
                 final LocalDiscovery master = firstMaster;
-                firstMaster.clusterService.submitStateUpdateTask("local-disco-receive(from node[" + localNode + "])", new ProcessedClusterStateUpdateTask() {
+                firstMaster.clusterService.submitStateUpdateTask("local-disco-receive(from node[" + localNode + "])", new ProcessedClusterStateNonMasterUpdateTask() {
                     @Override
                     public ClusterState execute(ClusterState currentState) {
                         DiscoveryNodes.Builder nodesBuilder = DiscoveryNodes.builder();
@@ -225,7 +225,7 @@ protected void doStop() throws ElasticsearchException {
                 }
 
                 final LocalDiscovery master = firstMaster;
-                master.clusterService.submitStateUpdateTask("local-disco-update", new ClusterStateUpdateTask() {
+                master.clusterService.submitStateUpdateTask("local-disco-update", new ClusterStateNonMasterUpdateTask() {
                     @Override
                     public ClusterState execute(ClusterState currentState) {
                         DiscoveryNodes newNodes = currentState.nodes().removeDeadMembers(newMembers, master.localNode.id());
@@ -305,7 +305,7 @@ private void publish(LocalDiscovery[] members, ClusterState clusterState, final
                 nodeSpecificClusterState.status(ClusterState.ClusterStateStatus.RECEIVED);
                 // ignore cluster state messages that do not include "me", not in the game yet...
                 if (nodeSpecificClusterState.nodes().localNode() != null) {
-                    discovery.clusterService.submitStateUpdateTask("local-disco-receive(from master)", new ProcessedClusterStateUpdateTask() {
+                    discovery.clusterService.submitStateUpdateTask("local-disco-receive(from master)", new ProcessedClusterStateNonMasterUpdateTask() {
                         @Override
                         public ClusterState execute(ClusterState currentState) {
                             if (nodeSpecificClusterState.version() < currentState.version() && Objects.equal(nodeSpecificClusterState.nodes().masterNodeId(), currentState.nodes().masterNodeId())) {
diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index dc1d0df5ac53c..50f5747243a2d 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -314,7 +314,7 @@ private void innerJoinCluster() {
             if (localNode.equals(masterNode)) {
                 this.master = true;
                 nodesFD.start(); // start the nodes FD
-                clusterService.submitStateUpdateTask("zen-disco-join (elected_as_master)", Priority.URGENT, new ProcessedClusterStateUpdateTask() {
+                clusterService.submitStateUpdateTask("zen-disco-join (elected_as_master)", Priority.URGENT, new ProcessedClusterStateNonMasterUpdateTask() {
                     @Override
                     public ClusterState execute(ClusterState currentState) {
                         // Take into account the previous known nodes, if they happen not to be available
@@ -336,7 +336,7 @@ public ClusterState execute(ClusterState currentState) {
 
                     @Override
                     public void onFailure(String source, Throwable t) {
-                        logger.error("unexpected failure during [{}]", t, source);
+                        logger.error("unexpected failure during [{}]", t, source);
                     }
 
                     @Override
@@ -400,7 +400,12 @@ public ClusterState execute(ClusterState currentState) {
 
                 @Override
                 public void onFailure(String source, Throwable t) {
-                    logger.error("unexpected failure during [{}]", t, source);
+                    if (t instanceof ClusterService.NoLongerMasterException) {
+                        logger.debug("not processing {} leave request as we are no longer master", node);
+                    }
+                    else {
+                        logger.error("unexpected failure during [{}]", t, source);
+                    }
                 }
             });
         } else {
@@ -435,7 +440,12 @@ public ClusterState execute(ClusterState currentState) {
 
             @Override
             public void onFailure(String source, Throwable t) {
-                logger.error("unexpected failure during [{}]", t, source);
+                if (t instanceof ClusterService.NoLongerMasterException) {
+                    logger.debug("not processing [{}] as we are no longer master", source);
+                }
+                else {
+                    logger.error("unexpected failure during [{}]", t, source);
+                }
             }
 
             @Override
@@ -468,7 +478,12 @@ public ClusterState execute(ClusterState currentState) {
 
             @Override
             public void onFailure(String source, Throwable t) {
-                logger.error("unexpected failure during [{}]", t, source);
+                if (t instanceof ClusterService.NoLongerMasterException) {
+                    logger.debug("not processing [{}] as we are no longer master", source);
+                }
+                else {
+                    logger.error("unexpected failure during [{}]", t, source);
+                }
             }
 
             @Override
@@ -490,7 +505,7 @@ private void handleMasterGone(final DiscoveryNode masterNode, final String reaso
 
         logger.info("master_left [{}], reason [{}]", masterNode, reason);
 
-        clusterService.submitStateUpdateTask("zen-disco-master_failed (" + masterNode + ")", Priority.IMMEDIATE, new ProcessedClusterStateUpdateTask() {
+        clusterService.submitStateUpdateTask("zen-disco-master_failed (" + masterNode + ")", Priority.IMMEDIATE, new ProcessedClusterStateNonMasterUpdateTask() {
             @Override
             public ClusterState execute(ClusterState currentState) {
                 if (!masterNode.id().equals(currentState.nodes().masterNodeId())) {
@@ -624,7 +639,7 @@ public void onFailure(String source, Throwable t) {
                 final ProcessClusterState processClusterState = new ProcessClusterState(newClusterState, newStateProcessed);
                 processNewClusterStates.add(processClusterState);
 
-                clusterService.submitStateUpdateTask("zen-disco-receive(from master [" + newClusterState.nodes().masterNode() + "])", Priority.URGENT, new ProcessedClusterStateUpdateTask() {
+                clusterService.submitStateUpdateTask("zen-disco-receive(from master [" + newClusterState.nodes().masterNode() + "])", Priority.URGENT, new ProcessedClusterStateNonMasterUpdateTask() {
                     @Override
                     public ClusterState execute(ClusterState currentState) {
                         // we already processed it in a previous event
@@ -961,7 +976,7 @@ public RejoinClusterRequest newInstance() {
 
         @Override
         public void messageReceived(final RejoinClusterRequest request, final TransportChannel channel) throws Exception {
-            clusterService.submitStateUpdateTask("received a request to rejoin the cluster from [" + request.fromNodeId + "]", Priority.URGENT, new ClusterStateUpdateTask() {
+            clusterService.submitStateUpdateTask("received a request to rejoin the cluster from [" + request.fromNodeId + "]", Priority.URGENT, new ClusterStateNonMasterUpdateTask() {
                 @Override
                 public ClusterState execute(ClusterState currentState) {
                     try {
@@ -974,7 +989,12 @@ public ClusterState execute(ClusterState currentState) {
 
                 @Override
                 public void onFailure(String source, Throwable t) {
-                    logger.error("unexpected failure during [{}]", t, source);
+                    if (t instanceof ClusterService.NoLongerMasterException) {
+                        logger.debug("not processing [{}] as we are no longer master", source);
+                    }
+                    else {
+                        logger.error("unexpected failure during [{}]", t, source);
+                    }
                 }
             });
         }
diff --git a/src/main/java/org/elasticsearch/tribe/TribeService.java b/src/main/java/org/elasticsearch/tribe/TribeService.java
index 0894edccd8c2c..9c1607900a78c 100644
--- a/src/main/java/org/elasticsearch/tribe/TribeService.java
+++ b/src/main/java/org/elasticsearch/tribe/TribeService.java
@@ -43,7 +43,6 @@
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
-import org.elasticsearch.discovery.Discovery;
 import org.elasticsearch.discovery.DiscoveryService;
 import org.elasticsearch.gateway.GatewayService;
 import org.elasticsearch.node.NodeBuilder;
@@ -223,7 +222,7 @@ class TribeClusterStateListener implements ClusterStateListener {
         @Override
         public void clusterChanged(final ClusterChangedEvent event) {
             logger.debug("[{}] received cluster event, [{}]", tribeName, event.source());
-            clusterService.submitStateUpdateTask("cluster event from " + tribeName + ", " + event.source(), new ClusterStateUpdateTask() {
+            clusterService.submitStateUpdateTask("cluster event from " + tribeName + ", " + event.source(), new ClusterStateNonMasterUpdateTask() {
                 @Override
                 public ClusterState execute(ClusterState currentState) throws Exception {
                     ClusterState tribeState = event.state();
diff --git a/src/test/java/org/elasticsearch/cluster/ClusterServiceTests.java b/src/test/java/org/elasticsearch/cluster/ClusterServiceTests.java
index dde9eedc4e1ed..1286c62d1668a 100644
--- a/src/test/java/org/elasticsearch/cluster/ClusterServiceTests.java
+++ b/src/test/java/org/elasticsearch/cluster/ClusterServiceTests.java
@@ -19,6 +19,7 @@
 package org.elasticsearch.cluster;
 
 import com.google.common.base.Predicate;
+import com.google.common.util.concurrent.ListenableFuture;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
 import org.elasticsearch.action.admin.cluster.tasks.PendingClusterTasksResponse;
@@ -256,6 +257,58 @@ public void onFailure(String source, Throwable t) {
         assertThat(processedLatch.await(1, TimeUnit.SECONDS), equalTo(true));
     }
 
+    @Test
+    public void testMasterAwareExecution() throws Exception {
+        Settings settings = settingsBuilder()
+                .put("discovery.type", "local")
+                .build();
+        // start two nodes; the second one is started with node.master=false
+        ListenableFuture<String> master = cluster().startNodeAsync(settings);
+        ListenableFuture<String> nonMaster = cluster().startNodeAsync(settingsBuilder().put(settings).put("node.master", false).build());
+        master.get();
+        ensureGreen(); // make sure we have a cluster
+        // every task below is submitted to the cluster service of the node.master=false node
+        ClusterService clusterService = cluster().getInstance(ClusterService.class, nonMaster.get());
+
+        final boolean[] taskFailed = {false};
+        final CountDownLatch latch1 = new CountDownLatch(1);
+        clusterService.submitStateUpdateTask("test", new ClusterStateUpdateTask() {
+            @Override
+            public ClusterState execute(ClusterState currentState) throws Exception {
+                latch1.countDown();
+                return currentState;
+            }
+
+            @Override
+            public void onFailure(String source, Throwable t) {
+                taskFailed[0] = true;
+                latch1.countDown();
+            }
+        });
+        // bounded wait: a task that is neither executed nor failed must not hang the test forever
+        assertTrue("timed out waiting for the master-only task", latch1.await(10, TimeUnit.SECONDS));
+        assertTrue("cluster state update task was executed on a non-master", taskFailed[0]);
+
+        taskFailed[0] = true; // pessimistic default, only a successful execute() clears it
+        final CountDownLatch latch2 = new CountDownLatch(1);
+        clusterService.submitStateUpdateTask("test", new ClusterStateNonMasterUpdateTask() {
+            @Override
+            public ClusterState execute(ClusterState currentState) throws Exception {
+                taskFailed[0] = false;
+                latch2.countDown();
+                return currentState;
+            }
+
+            @Override
+            public void onFailure(String source, Throwable t) {
+                taskFailed[0] = true;
+                latch2.countDown();
+            }
+        });
+        assertTrue("timed out waiting for the non-master task", latch2.await(10, TimeUnit.SECONDS));
+        assertFalse("non-master cluster state update task was not executed", taskFailed[0]);
+    }
+
     @Test
     public void testAckedUpdateTaskNoAckExpected() throws Exception {
         Settings settings = settingsBuilder()

From 2c9ef63676614f142c323c34972e1d171435c236 Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Thu, 5 Jun 2014 22:34:11 +0200
Subject: [PATCH 11/74] [TEST] It may take a little bit before the unlucky node
 deals with the fact the master left

---
 .../DiscoveryWithNetworkFailuresTests.java    | 22 ++++++++++++++-----
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index 39bf6b9d62401..1a2c01ccbbbc8 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -40,7 +40,8 @@
 
 import static org.elasticsearch.test.ElasticsearchIntegrationTest.ClusterScope;
 import static org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
-import static org.hamcrest.Matchers.*;
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.is;
 
 /**
  */
@@ -53,6 +54,7 @@ public void failWithMinimumMasterNodesConfigured() throws Exception {
         final Settings settings = ImmutableSettings.settingsBuilder()
                 .put("discovery.type", "zen") // <-- To override the local setting if set externally
                 .put("discovery.zen.fd.ping_timeout", "1s") // <-- for hitting simulated network failures quickly
+                .put("discovery.zen.fd.ping_retries", "1") // <-- for hitting simulated network failures quickly
                 .put("discovery.zen.minimum_master_nodes", 2)
                 .put(TransportModule.TRANSPORT_SERVICE_TYPE_KEY, MockTransportService.class.getName())
                 .build();
@@ -107,11 +109,19 @@ public boolean apply(Object input) {
             assertThat(applied, is(true));
 
             // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
-            // continuously ping until network failures have been resolved.
-            Client isolatedNodeClient = internalCluster().client(unluckyNode);
-            ClusterState localClusterState = isolatedNodeClient.admin().cluster().prepareState().setLocal(true).get().getState();
-            DiscoveryNodes localDiscoveryNodes = localClusterState.nodes();
-            assertThat(localDiscoveryNodes.masterNode(), nullValue());
+            // continuously ping until network failures have been resolved.
+            final Client isolatedNodeClient = internalCluster().client(unluckyNode);
+            // It may take a bit before the node detects it has been cut off from the elected master
+            applied = awaitBusy(new Predicate<Object>() {
+                @Override
+                public boolean apply(Object input) {
+                    ClusterState localClusterState = isolatedNodeClient.admin().cluster().prepareState().setLocal(true).get().getState();
+                    DiscoveryNodes localDiscoveryNodes = localClusterState.nodes();
+                    logger.info("localDiscoveryNodes=" + localDiscoveryNodes.prettyPrint());
+                    return localDiscoveryNodes.masterNode() == null;
+                }
+            }, 10, TimeUnit.SECONDS);
+            assertThat(applied, is(true));
         } finally {
             // stop simulating network failures, from this point on the unlucky node is able to rejoin
             // We also need to do this even if assertions fail, since otherwise the test framework can't work properly

From fc8ae4d30dee9b255f7caa7da76a9793b1cc6c25 Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Fri, 6 Jun 2014 12:09:11 +0200
Subject: [PATCH 12/74] [TEST] Added test that verifies data integrity during
 and after a simulated network split.

---
 .../DiscoveryWithNetworkFailuresTests.java    | 231 +++++++++++++++++-
 1 file changed, 221 insertions(+), 10 deletions(-)

diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index 1a2c01ccbbbc8..7c0824cc605ba 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -21,13 +21,25 @@
 
 import com.google.common.base.Predicate;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
+import org.elasticsearch.action.admin.cluster.health.ClusterHealthStatus;
+import org.elasticsearch.action.get.GetResponse;
+import org.elasticsearch.action.index.IndexRequestBuilder;
+import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.action.update.UpdateResponse;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.cluster.block.ClusterBlock;
+import org.elasticsearch.cluster.block.ClusterBlockException;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
+import org.elasticsearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider;
 import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.rest.RestStatus;
+import org.elasticsearch.search.SearchHit;
+import org.elasticsearch.search.sort.SortOrder;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
 import org.elasticsearch.test.junit.annotations.TestLogging;
 import org.elasticsearch.test.transport.MockTransportService;
@@ -40,26 +52,46 @@
 
 import static org.elasticsearch.test.ElasticsearchIntegrationTest.ClusterScope;
 import static org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
-import static org.hamcrest.Matchers.equalTo;
-import static org.hamcrest.Matchers.is;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.*;
+import static org.hamcrest.Matchers.*;
 
 /**
  */
 @ClusterScope(scope= Scope.TEST, numDataNodes =0)
 public class DiscoveryWithNetworkFailuresTests extends ElasticsearchIntegrationTest {
 
+    private static final Settings nodeSettings = ImmutableSettings.settingsBuilder()
+            .put("discovery.type", "zen") // <-- To override the local setting if set externally
+            .put("discovery.zen.fd.ping_timeout", "1s") // <-- for hitting simulated network failures quickly
+            .put("discovery.zen.fd.ping_retries", "1") // <-- for hitting simulated network failures quickly
+            .put("discovery.zen.minimum_master_nodes", 2)
+            .put(TransportModule.TRANSPORT_SERVICE_TYPE_KEY, MockTransportService.class.getName())
+            .build();
+
+    @Override
+    protected int numberOfShards() {
+        return 3;
+    }
+
+    @Override
+    protected int numberOfReplicas() {
+        return 1;
+    }
+
+    @Override
+    public Settings indexSettings() {
+        Settings settings = super.indexSettings();
+        return ImmutableSettings.builder()
+                .put(settings)
+                .put(ShardsLimitAllocationDecider.INDEX_TOTAL_SHARDS_PER_NODE, 2)
+                .build();
+    }
+
     @Test
     @TestLogging("discovery.zen:TRACE")
     public void failWithMinimumMasterNodesConfigured() throws Exception {
-        final Settings settings = ImmutableSettings.settingsBuilder()
-                .put("discovery.type", "zen") // <-- To override the local setting if set externally
-                .put("discovery.zen.fd.ping_timeout", "1s") // <-- for hitting simulated network failures quickly
-                .put("discovery.zen.fd.ping_retries", "1") // <-- for hitting simulated network failures quickly
-                .put("discovery.zen.minimum_master_nodes", 2)
-                .put(TransportModule.TRANSPORT_SERVICE_TYPE_KEY, MockTransportService.class.getName())
-                .build();
 
-        List<String> nodes = internalCluster().startNodesAsync(3, settings).get();
+        List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
 
         // Wait until a green status has been reaches and 3 nodes are part of the cluster
         ClusterHealthResponse clusterHealthResponse = client().admin().cluster().prepareHealth()
@@ -145,6 +177,185 @@ public boolean apply(Object input) {
         }
     }
 
+    @Test
+    @TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE")
+    public void testDataConsistency() throws Exception {
+        List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
+
+        // Wait until a green status has been reached and 3 nodes are part of the cluster
+        ClusterHealthResponse clusterHealthResponse = client().admin().cluster().prepareHealth()
+                .setWaitForEvents(Priority.LANGUID)
+                .setWaitForNodes("3")
+                .get();
+        assertThat(clusterHealthResponse.isTimedOut(), is(false));
+
+        assertAcked(prepareCreate("test")
+                .addMapping("type", "field", "type=long")
+                .get());
+
+        IndexRequestBuilder[] indexRequests = new IndexRequestBuilder[1 + randomInt(1000)];
+        for (int i = 0; i < indexRequests.length; i++) {
+            indexRequests[i] = client().prepareIndex("test", "type", String.valueOf(i)).setSource("field", i);
+        }
+        indexRandom(true, indexRequests);
+
+
+        for (int i = 0; i < indexRequests.length; i++) {
+            GetResponse getResponse = client().prepareGet("test", "type", String.valueOf(i)).get();
+            assertThat(getResponse.isExists(), is(true));
+            assertThat(getResponse.getVersion(), equalTo(1l));
+            assertThat(getResponse.getId(), equalTo(String.valueOf(i)));
+        }
+        SearchResponse searchResponse = client().prepareSearch("test").setTypes("type")
+                .addSort("field", SortOrder.ASC)
+                .get();
+        assertHitCount(searchResponse, indexRequests.length);
+        for (int i = 0; i < searchResponse.getHits().getHits().length; i++) {
+            SearchHit searchHit = searchResponse.getHits().getAt(i);
+            assertThat(searchHit.id(), equalTo(String.valueOf(i)));
+            assertThat((long) searchHit.sortValues()[0], equalTo((long) i));
+        }
+
+        // Figure out what is the elected master node
+        DiscoveryNode masterDiscoNode = null;
+        for (String node : nodes) {
+            ClusterState state = internalCluster().client(node).admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
+            assertThat(state.nodes().size(), equalTo(3));
+            if (masterDiscoNode == null) {
+                masterDiscoNode = state.nodes().masterNode();
+            } else {
+                assertThat(state.nodes().masterNode(), equalTo(masterDiscoNode));
+            }
+        }
+        assert masterDiscoNode != null;
+        logger.info("---> legit elected master node=" + masterDiscoNode);
+        final Client masterClient = internalCluster().masterClient();
+
+        // Everything is stable now, it is now time to simulate evil...
+
+        // Pick a node that isn't the elected master.
+        String unluckyNode = null;
+        for (String node : nodes) {
+            if (!node.equals(masterDiscoNode.getName())) {
+                unluckyNode = node;
+            }
+        }
+        assert unluckyNode != null;
+
+        // Simulate a network issue between the unlucky node and the rest of the cluster.
+        for (String nodeId : nodes) {
+            if (!nodeId.equals(unluckyNode)) {
+                addFailToSendNoConnectRule(nodeId, unluckyNode);
+                addFailToSendNoConnectRule(unluckyNode, nodeId);
+            }
+        }
+        try {
+            // Wait until the elected master has removed the unlucky node...
+            boolean applied = awaitBusy(new Predicate<Object>() {
+                @Override
+                public boolean apply(Object input) {
+                    return masterClient.admin().cluster().prepareState().setLocal(true).get().getState().nodes().size() == 2;
+                }
+            }, 1, TimeUnit.MINUTES);
+            assertThat(applied, is(true));
+
+            // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
+            // continuously ping until network failures have been resolved.
+            final Client isolatedNodeClient = internalCluster().client(unluckyNode);
+            // It may take a bit before the node detects it has been cut off from the elected master
+            applied = awaitBusy(new Predicate<Object>() {
+                @Override
+                public boolean apply(Object input) {
+                    ClusterState localClusterState = isolatedNodeClient.admin().cluster().prepareState().setLocal(true).get().getState();
+                    DiscoveryNodes localDiscoveryNodes = localClusterState.nodes();
+                    logger.info("localDiscoveryNodes=" + localDiscoveryNodes.prettyPrint());
+                    return localDiscoveryNodes.masterNode() == null;
+                }
+            }, 10, TimeUnit.SECONDS);
+            assertThat(applied, is(true));
+
+            ClusterHealthResponse healthResponse = masterClient.admin().cluster().prepareHealth("test")
+                    .setWaitForYellowStatus().get();
+            assertThat(healthResponse.isTimedOut(), is(false));
+            assertThat(healthResponse.getStatus(), equalTo(ClusterHealthStatus.YELLOW));
+
+            // Reads on the right side of the split must work
+            searchResponse = masterClient.prepareSearch("test").setTypes("type")
+                    .addSort("field", SortOrder.ASC)
+                    .get();
+            assertHitCount(searchResponse, indexRequests.length);
+            for (int i = 0; i < searchResponse.getHits().getHits().length; i++) {
+                SearchHit searchHit = searchResponse.getHits().getAt(i);
+                assertThat(searchHit.id(), equalTo(String.valueOf(i)));
+                assertThat((long) searchHit.sortValues()[0], equalTo((long) i));
+            }
+
+            // Reads on the wrong side of the split are partial
+            searchResponse = isolatedNodeClient.prepareSearch("test").setTypes("type")
+                    .addSort("field", SortOrder.ASC)
+                    .get();
+            assertThat(searchResponse.getSuccessfulShards(), lessThan(searchResponse.getTotalShards()));
+            assertThat(searchResponse.getHits().totalHits(), lessThan((long) indexRequests.length));
+
+            // Writes on the right side of the split must work
+            UpdateResponse updateResponse = masterClient.prepareUpdate("test", "type", "0").setDoc("field2", 2).get();
+            assertThat(updateResponse.getVersion(), equalTo(2l));
+
+            // Writes on the wrong side of the split fail
+            try {
+                isolatedNodeClient.prepareUpdate("test", "type", "0").setDoc("field2", 2)
+                        .setTimeout(TimeValue.timeValueSeconds(5)) // Fail quick, otherwise we wait 60 seconds.
+                        .get();
+            } catch (ClusterBlockException exception) {
+                assertThat(exception.status(), equalTo(RestStatus.SERVICE_UNAVAILABLE));
+                assertThat(exception.blocks().size(), equalTo(1));
+                ClusterBlock clusterBlock = exception.blocks().iterator().next();
+                assertThat(clusterBlock.id(), equalTo(DiscoverySettings.NO_MASTER_BLOCK_ID));
+            }
+        } finally {
+            // stop simulating network failures, from this point on the unlucky node is able to rejoin
+            // We also need to do this even if assertions fail, since otherwise the test framework can't work properly
+            for (String nodeId : nodes) {
+                if (!nodeId.equals(unluckyNode)) {
+                    clearNoConnectRule(nodeId, unluckyNode);
+                    clearNoConnectRule(unluckyNode, nodeId);
+                }
+            }
+        }
+
+        // Wait until the master node sees all 3 nodes again.
+        clusterHealthResponse = masterClient.admin().cluster().prepareHealth()
+                .setWaitForGreenStatus()
+                .setWaitForEvents(Priority.LANGUID)
+                .setWaitForNodes("3")
+                .get();
+        assertThat(clusterHealthResponse.isTimedOut(), is(false));
+
+        for (String node : nodes) {
+            Client client = internalCluster().client(node);
+            searchResponse = client.prepareSearch("test").setTypes("type")
+                    .addSort("field", SortOrder.ASC)
+                    .get();
+            for (int i = 0; i < searchResponse.getHits().getHits().length; i++) {
+                SearchHit searchHit = searchResponse.getHits().getAt(i);
+                assertThat(searchHit.id(), equalTo(String.valueOf(i)));
+                assertThat((long) searchHit.sortValues()[0], equalTo((long) i));
+            }
+
+
+            GetResponse getResponse = client().prepareGet("test", "type", "0").get();
+            assertThat(getResponse.isExists(), is(true));
+            assertThat(getResponse.getVersion(), equalTo(2l));
+            assertThat(getResponse.getId(), equalTo("0"));
+            for (int i = 1; i < indexRequests.length; i++) {
+                getResponse = client().prepareGet("test", "type", String.valueOf(i)).get();
+                assertThat(getResponse.isExists(), is(true));
+                assertThat(getResponse.getVersion(), equalTo(1l));
+                assertThat(getResponse.getId(), equalTo(String.valueOf(i)));
+            }
+        }
+    }
+
     private void addFailToSendNoConnectRule(String fromNode, String toNode) {
         TransportService mockTransportService = internalCluster().getInstance(TransportService.class, fromNode);
         ((MockTransportService) mockTransportService).addFailToSendNoConnectRule(internalCluster().getInstance(Discovery.class, toNode).localNode());

From e7d24ecdd034933cc56a5134d2e7e145450bd714 Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Fri, 6 Jun 2014 17:13:47 +0200
Subject: [PATCH 13/74] [TEST] Make sure there are no initializing shards when
 network partition is simulated

---
 .../DiscoveryWithNetworkFailuresTests.java    | 70 +++++++++----------
 1 file changed, 34 insertions(+), 36 deletions(-)

diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index 7c0824cc605ba..4e16160dc90f9 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -101,18 +101,7 @@ public void failWithMinimumMasterNodesConfigured() throws Exception {
         assertThat(clusterHealthResponse.isTimedOut(), is(false));
 
         // Figure out what is the elected master node
-        DiscoveryNode masterDiscoNode = null;
-
-        for (String node : nodes) {
-            ClusterState state = internalCluster().client(node).admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-            assertThat(state.nodes().size(), equalTo(3));
-            if (masterDiscoNode == null) {
-                masterDiscoNode = state.nodes().masterNode();
-            } else {
-                assertThat(state.nodes().masterNode(), equalTo(masterDiscoNode));
-            }
-        }
-        assert masterDiscoNode != null;
+        DiscoveryNode masterDiscoNode = findMasterNode(nodes);
         logger.info("---> legit elected master node=" + masterDiscoNode);
         final Client masterClient = internalCluster().masterClient();
 
@@ -193,7 +182,7 @@ public void testDataConsistency() throws Exception {
                 .addMapping("type", "field", "type=long")
                 .get());
 
-        IndexRequestBuilder[] indexRequests = new IndexRequestBuilder[1 + randomInt(1000)];
+        IndexRequestBuilder[] indexRequests = new IndexRequestBuilder[scaledRandomIntBetween(1, 1000)];
         for (int i = 0; i < indexRequests.length; i++) {
             indexRequests[i] = client().prepareIndex("test", "type", String.valueOf(i)).setSource("field", i);
         }
@@ -217,36 +206,30 @@ public void testDataConsistency() throws Exception {
         }
 
         // Figure out what is the elected master node
-        DiscoveryNode masterDiscoNode = null;
-        for (String node : nodes) {
-            ClusterState state = internalCluster().client(node).admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-            assertThat(state.nodes().size(), equalTo(3));
-            if (masterDiscoNode == null) {
-                masterDiscoNode = state.nodes().masterNode();
-            } else {
-                assertThat(state.nodes().masterNode(), equalTo(masterDiscoNode));
-            }
-        }
-        assert masterDiscoNode != null;
+        DiscoveryNode masterDiscoNode = findMasterNode(nodes);
+
         logger.info("---> legit elected master node=" + masterDiscoNode);
         final Client masterClient = internalCluster().masterClient();
 
         // Everything is stable now, it is now time to simulate evil...
+        // but first make sure we have no initializing shards and all is green
+        // (waiting for green here, because indexing / search in a yellow index is fine as long as no other nodes go down)
+        ensureGreen("test");
 
         // Pick a node that isn't the elected master.
-        String unluckyNode = null;
+        String isolatedNode = null;
         for (String node : nodes) {
             if (!node.equals(masterDiscoNode.getName())) {
-                unluckyNode = node;
+                isolatedNode = node;
             }
         }
-        assert unluckyNode != null;
+        assert isolatedNode != null;
 
         // Simulate a network issue between the unlucky node and the rest of the cluster.
         for (String nodeId : nodes) {
-            if (!nodeId.equals(unluckyNode)) {
-                addFailToSendNoConnectRule(nodeId, unluckyNode);
-                addFailToSendNoConnectRule(unluckyNode, nodeId);
+            if (!nodeId.equals(isolatedNode)) {
+                addFailToSendNoConnectRule(nodeId, isolatedNode);
+                addFailToSendNoConnectRule(isolatedNode, nodeId);
             }
         }
         try {
@@ -261,7 +244,7 @@ public boolean apply(Object input) {
 
             // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
             // continuously ping until network failures have been resolved. However
-            final Client isolatedNodeClient = internalCluster().client(unluckyNode);
+            final Client isolatedNodeClient = internalCluster().client(isolatedNode);
             // It may a take a bit before the node detects it has been cut off from the elected master
             applied = awaitBusy(new Predicate<Object>() {
                 @Override
@@ -316,9 +299,9 @@ public boolean apply(Object input) {
             // stop simulating network failures, from this point on the unlucky node is able to rejoin
             // We also need to do this even if assertions fail, since otherwise the test framework can't work properly
             for (String nodeId : nodes) {
-                if (!nodeId.equals(unluckyNode)) {
-                    clearNoConnectRule(nodeId, unluckyNode);
-                    clearNoConnectRule(unluckyNode, nodeId);
+                if (!nodeId.equals(isolatedNode)) {
+                    clearNoConnectRule(nodeId, isolatedNode);
+                    clearNoConnectRule(isolatedNode, nodeId);
                 }
             }
         }
@@ -343,12 +326,12 @@ public boolean apply(Object input) {
             }
 
 
-            GetResponse getResponse = client().prepareGet("test", "type", "0").get();
+            GetResponse getResponse = client.prepareGet("test", "type", "0").get();
             assertThat(getResponse.isExists(), is(true));
             assertThat(getResponse.getVersion(), equalTo(2l));
             assertThat(getResponse.getId(), equalTo("0"));
             for (int i = 1; i < indexRequests.length; i++) {
-                getResponse = client().prepareGet("test", "type", String.valueOf(i)).get();
+                getResponse = client.prepareGet("test", "type", String.valueOf(i)).get();
                 assertThat(getResponse.isExists(), is(true));
                 assertThat(getResponse.getVersion(), equalTo(1l));
                 assertThat(getResponse.getId(), equalTo(String.valueOf(i)));
@@ -356,6 +339,21 @@ public boolean apply(Object input) {
         }
     }
 
+    private DiscoveryNode findMasterNode(List<String> nodes) {
+        DiscoveryNode masterDiscoNode = null;
+        for (String node : nodes) {
+            ClusterState state = internalCluster().client(node).admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
+            assertThat(state.nodes().size(), equalTo(3));
+            if (masterDiscoNode == null) {
+                masterDiscoNode = state.nodes().masterNode();
+            } else {
+                assertThat(state.nodes().masterNode(), equalTo(masterDiscoNode));
+            }
+        }
+        assert masterDiscoNode != null;
+        return masterDiscoNode;
+    }
+
     private void addFailToSendNoConnectRule(String fromNode, String toNode) {
         TransportService mockTransportService = internalCluster().getInstance(TransportService.class, fromNode);
         ((MockTransportService) mockTransportService).addFailToSendNoConnectRule(internalCluster().getInstance(Discovery.class, toNode).localNode());

From 4828e78637e23fc0f6c63092d47afe19dfbbecd2 Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Wed, 11 Jun 2014 10:06:43 +0200
Subject: [PATCH 14/74] [TEST] Added test that exposes a shard consistency
 problem when isolated node(s) rejoin the cluster after network segmentation
 and when the elected master node ended up on the lesser side of the network
 segmentation.

---
 .../DiscoveryWithNetworkFailuresTests.java    | 114 ++++++++++++++----
 1 file changed, 88 insertions(+), 26 deletions(-)

diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index 4e16160dc90f9..1d6a346dbd9fa 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -24,12 +24,14 @@
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthStatus;
 import org.elasticsearch.action.get.GetResponse;
 import org.elasticsearch.action.index.IndexRequestBuilder;
+import org.elasticsearch.action.index.IndexResponse;
 import org.elasticsearch.action.search.SearchResponse;
 import org.elasticsearch.action.update.UpdateResponse;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.block.ClusterBlock;
 import org.elasticsearch.cluster.block.ClusterBlockException;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider;
@@ -45,6 +47,7 @@
 import org.elasticsearch.test.transport.MockTransportService;
 import org.elasticsearch.transport.TransportModule;
 import org.elasticsearch.transport.TransportService;
+import org.junit.Ignore;
 import org.junit.Test;
 
 import java.util.List;
@@ -52,7 +55,8 @@
 
 import static org.elasticsearch.test.ElasticsearchIntegrationTest.ClusterScope;
 import static org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
-import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.*;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
 import static org.hamcrest.Matchers.*;
 
 /**
@@ -167,7 +171,8 @@ public boolean apply(Object input) {
     }
 
     @Test
-    @TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE")
+    @Ignore
+    @TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
     public void testDataConsistency() throws Exception {
         List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
 
@@ -205,25 +210,15 @@ public void testDataConsistency() throws Exception {
             assertThat((long) searchHit.sortValues()[0], equalTo((long) i));
         }
 
-        // Figure out what is the elected master node
-        DiscoveryNode masterDiscoNode = findMasterNode(nodes);
-
-        logger.info("---> legit elected master node=" + masterDiscoNode);
-        final Client masterClient = internalCluster().masterClient();
-
         // Everything is stable now, it is now time to simulate evil...
         // but first make sure we have no initializing shards and all is green
         // (waiting for green here, because indexing / search in a yellow index is fine as long as no other nodes go down)
         ensureGreen("test");
 
         // Pick a node that isn't the elected master.
-        String isolatedNode = null;
-        for (String node : nodes) {
-            if (!node.equals(masterDiscoNode.getName())) {
-                isolatedNode = node;
-            }
-        }
-        assert isolatedNode != null;
+        String isolatedNode = nodes.get(0);
+        String nonIsolatedNode = nodes.get(1);
+        final Client nonIsolatedNodeClient = internalCluster().client(nonIsolatedNode);
 
         // Simulate a network issue between the unlucky node and the rest of the cluster.
         for (String nodeId : nodes) {
@@ -237,7 +232,7 @@ public void testDataConsistency() throws Exception {
             boolean applied = awaitBusy(new Predicate<Object>() {
                 @Override
                 public boolean apply(Object input) {
-                    return masterClient.admin().cluster().prepareState().setLocal(true).get().getState().nodes().size() == 2;
+                    return nonIsolatedNodeClient.admin().cluster().prepareState().setLocal(true).get().getState().nodes().size() == 2;
                 }
             }, 1, TimeUnit.MINUTES);
             assertThat(applied, is(true));
@@ -257,13 +252,13 @@ public boolean apply(Object input) {
             }, 10, TimeUnit.SECONDS);
             assertThat(applied, is(true));
 
-            ClusterHealthResponse healthResponse = masterClient.admin().cluster().prepareHealth("test")
+            ClusterHealthResponse healthResponse = nonIsolatedNodeClient.admin().cluster().prepareHealth("test")
                     .setWaitForYellowStatus().get();
             assertThat(healthResponse.isTimedOut(), is(false));
             assertThat(healthResponse.getStatus(), equalTo(ClusterHealthStatus.YELLOW));
 
             // Reads on the right side of the split must work
-            searchResponse = masterClient.prepareSearch("test").setTypes("type")
+            searchResponse = nonIsolatedNodeClient.prepareSearch("test").setTypes("type")
                     .addSort("field", SortOrder.ASC)
                     .get();
             assertHitCount(searchResponse, indexRequests.length);
@@ -281,7 +276,7 @@ public boolean apply(Object input) {
             assertThat(searchResponse.getHits().totalHits(), lessThan((long) indexRequests.length));
 
             // Writes on the right side of the split must work
-            UpdateResponse updateResponse = masterClient.prepareUpdate("test", "type", "0").setDoc("field2", 2).get();
+            UpdateResponse updateResponse = nonIsolatedNodeClient.prepareUpdate("test", "type", "0").setDoc("field2", 2).get();
             assertThat(updateResponse.getVersion(), equalTo(2l));
 
             // Writes on the wrong side of the split fail
@@ -289,6 +284,7 @@ public boolean apply(Object input) {
                 isolatedNodeClient.prepareUpdate("test", "type", "0").setDoc("field2", 2)
                         .setTimeout(TimeValue.timeValueSeconds(5)) // Fail quick, otherwise we wait 60 seconds.
                         .get();
+                fail();
             } catch (ClusterBlockException exception) {
                 assertThat(exception.status(), equalTo(RestStatus.SERVICE_UNAVAILABLE));
                 assertThat(exception.blocks().size(), equalTo(1));
@@ -307,15 +303,15 @@ public boolean apply(Object input) {
         }
 
         // Wait until the master node sees all 3 nodes again.
-        clusterHealthResponse = masterClient.admin().cluster().prepareHealth()
+        clusterHealthResponse = nonIsolatedNodeClient.admin().cluster().prepareHealth()
                 .setWaitForGreenStatus()
                 .setWaitForEvents(Priority.LANGUID)
                 .setWaitForNodes("3")
                 .get();
+        assertThat(clusterHealthResponse.getStatus(), equalTo(ClusterHealthStatus.GREEN));
         assertThat(clusterHealthResponse.isTimedOut(), is(false));
 
-        for (String node : nodes) {
-            Client client = internalCluster().client(node);
+        for (Client client : clients()) {
             searchResponse = client.prepareSearch("test").setTypes("type")
                     .addSort("field", SortOrder.ASC)
                     .get();
@@ -325,13 +321,12 @@ public boolean apply(Object input) {
                 assertThat((long) searchHit.sortValues()[0], equalTo((long) i));
             }
 
-
-            GetResponse getResponse = client.prepareGet("test", "type", "0").get();
+            GetResponse getResponse = client.prepareGet("test", "type", "0").setPreference("_local").get();
             assertThat(getResponse.isExists(), is(true));
-            assertThat(getResponse.getVersion(), equalTo(2l));
             assertThat(getResponse.getId(), equalTo("0"));
+            assertThat(getResponse.getVersion(), equalTo(2l));
             for (int i = 1; i < indexRequests.length; i++) {
-                getResponse = client.prepareGet("test", "type", String.valueOf(i)).get();
+                getResponse = client.prepareGet("test", "type", String.valueOf(i)).setPreference("_local").get();
                 assertThat(getResponse.isExists(), is(true));
                 assertThat(getResponse.getVersion(), equalTo(1l));
                 assertThat(getResponse.getId(), equalTo(String.valueOf(i)));
@@ -339,6 +334,73 @@ public boolean apply(Object input) {
         }
     }
 
+    @Test
+    @Ignore
+    @TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
+    public void testRejoinDocumentExistsInAllShardCopies() throws Exception {
+        final List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
+        ClusterHealthResponse clusterHealthResponse = client().admin().cluster().prepareHealth()
+                .setWaitForEvents(Priority.LANGUID)
+                .setWaitForNodes("3")
+                .get();
+        assertThat(clusterHealthResponse.isTimedOut(), is(false));
+        assertAcked(prepareCreate("test")
+                .setSettings(ImmutableSettings.builder()
+                                .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
+                                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 2)
+                )
+                .get());
+        ensureGreen("test");
+
+        String isolatedNode = findMasterNode(nodes).getName();
+        String notIsolatedNode = null;
+        for (String node : nodes) {
+            if (!node.equals(isolatedNode)) {
+                notIsolatedNode = node;
+                break;
+            }
+        }
+
+        logger.info("Isolating node[" + isolatedNode + "]");
+        for (String nodeId : nodes) {
+            if (!nodeId.equals(isolatedNode)) {
+                addFailToSendNoConnectRule(nodeId, isolatedNode);
+                addFailToSendNoConnectRule(isolatedNode, nodeId);
+            }
+        }
+        ensureYellow("test");
+
+        IndexResponse indexResponse = internalCluster().client(notIsolatedNode).prepareIndex("test", "type").setSource("field", "value").get();
+        assertThat(indexResponse.getVersion(), equalTo(1l));
+
+        logger.info("Verifying if document exists via node[" + notIsolatedNode + "]");
+        GetResponse getResponse = internalCluster().client(notIsolatedNode).prepareGet("test", "type", indexResponse.getId())
+                .setPreference("_local")
+                .get();
+        assertThat(getResponse.isExists(), is(true));
+        assertThat(getResponse.getVersion(), equalTo(1l));
+        assertThat(getResponse.getId(), equalTo(indexResponse.getId()));
+
+        for (String nodeId : nodes) {
+            if (!nodeId.equals(isolatedNode)) {
+                clearNoConnectRule(nodeId, isolatedNode);
+                clearNoConnectRule(isolatedNode, nodeId);
+            }
+        }
+
+        ensureGreen("test");
+
+        for (String node : nodes) {
+            logger.info("Verifying if document exists after isolating node[" + isolatedNode + "] via node[" + node + "]");
+            getResponse = internalCluster().client(node).prepareGet("test", "type", indexResponse.getId())
+                    .setPreference("_local")
+                    .get();
+            assertThat(getResponse.isExists(), is(true));
+            assertThat(getResponse.getVersion(), equalTo(1l));
+            assertThat(getResponse.getId(), equalTo(indexResponse.getId()));
+        }
+    }
+
     private DiscoveryNode findMasterNode(List<String> nodes) {
         DiscoveryNode masterDiscoNode = null;
         for (String node : nodes) {

From 424a2f68c6589e8a4452c238e2801a45f862959f Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Wed, 11 Jun 2014 20:43:09 +0200
Subject: [PATCH 15/74] [Discovery] Removed METADATA block

---
 .../java/org/elasticsearch/discovery/DiscoverySettings.java    | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java b/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
index 8c1ba757d9bd0..c18bd4984671c 100644
--- a/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
+++ b/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
@@ -45,7 +45,6 @@ public class DiscoverySettings extends AbstractComponent {
 
     private final static ClusterBlock ALL = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, true, RestStatus.SERVICE_UNAVAILABLE, ClusterBlockLevel.ALL);
     private final static ClusterBlock WRITE = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, false, RestStatus.SERVICE_UNAVAILABLE, EnumSet.of(ClusterBlockLevel.WRITE, ClusterBlockLevel.METADATA));
-    private final static ClusterBlock METADATA = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, false, RestStatus.SERVICE_UNAVAILABLE, EnumSet.of(ClusterBlockLevel.METADATA));
 
     private volatile ClusterBlock noMasterBlock;
     private volatile TimeValue publishTimeout = DEFAULT_PUBLISH_TIMEOUT;
@@ -93,8 +92,6 @@ private ClusterBlock parseNoMasterBlock(String value) {
             return ALL;
         } else if ("write".equals(value)) {
             return WRITE;
-        } else if ("metadata".equals(value)) {
-            return METADATA;
         } else {
             throw new ElasticsearchIllegalArgumentException("invalid master block [" + value + "]");
         }

From 1849d0966c2e6a23db339d5a02aa975377e882ba Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Wed, 11 Jun 2014 20:48:35 +0200
Subject: [PATCH 16/74] [Discovery] Made 'discovery.zen.rejoin_on_master_gone'
 setting updatable at runtime.

---
 .../elasticsearch/discovery/zen/ZenDiscovery.java    | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 50f5747243a2d..8ccd5046e24ba 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -76,6 +76,8 @@
  */
 public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implements Discovery, DiscoveryNodesProvider {
 
+    private final static String REJOIN_ON_MASTER_GONE = "discovery.zen.rejoin_on_master_gone";
+
     public static final String DISCOVERY_REJOIN_ACTION_NAME = "internal:discovery/zen/rejoin";
 
     private final ThreadPool threadPool;
@@ -117,7 +119,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
 
     private final AtomicBoolean initialStateSent = new AtomicBoolean();
 
-    private final boolean rejoinOnMasterGone;
+    private volatile boolean rejoinOnMasterGone;
 
 
     @Nullable
@@ -144,7 +146,7 @@ public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threa
 
         this.masterElectionFilterClientNodes = settings.getAsBoolean("discovery.zen.master_election.filter_client", true);
         this.masterElectionFilterDataNodes = settings.getAsBoolean("discovery.zen.master_election.filter_data", false);
-        this.rejoinOnMasterGone = settings.getAsBoolean("discovery.zen.rejoin_on_master_gone", true);
+        this.rejoinOnMasterGone = settings.getAsBoolean(REJOIN_ON_MASTER_GONE, true);
 
         logger.debug("using ping.timeout [{}], join.timeout [{}], master_election.filter_client [{}], master_election.filter_data [{}]", pingTimeout, joinTimeout, masterElectionFilterClientNodes, masterElectionFilterDataNodes);
 
@@ -1015,6 +1017,12 @@ public void onRefreshSettings(Settings settings) {
                         ZenDiscovery.this.electMaster.minimumMasterNodes(), minimumMasterNodes);
                 handleMinimumMasterNodesChanged(minimumMasterNodes);
             }
+
+            boolean rejoinOnMasterGone = settings.getAsBoolean(REJOIN_ON_MASTER_GONE, ZenDiscovery.this.rejoinOnMasterGone);
+            if (rejoinOnMasterGone != ZenDiscovery.this.rejoinOnMasterGone) {
+                logger.info("updating {} from [{}] to [{}]", REJOIN_ON_MASTER_GONE, ZenDiscovery.this.rejoinOnMasterGone, rejoinOnMasterGone);
+                ZenDiscovery.this.rejoinOnMasterGone = rejoinOnMasterGone;
+            }
         }
     }
 }

From 58f8774fa25ed749490a3cdef63fc6c5e15ce8b6 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Wed, 11 Jun 2014 15:54:47 +0200
Subject: [PATCH 17/74] [Discovery] do not use versions to optimize cluster
 state copying for a first update from a new master

We have an optimization which compares the routing/meta data versions of cluster states and tries to reuse the current object if the versions are equal. This can cause rare failures during recovery from a minimum_master_node breach when using the "new light rejoin" mechanism and simulated network disconnects. This happens when the current master updates its state, doesn't manage to broadcast it to other nodes due to the disconnect, and then steps down. The new master will start with a previous version and continue to update it. When the old master rejoins, the versions of its state can be equal but the content is different.

Also improved DiscoveryWithNetworkFailuresTests to simulate this failure (and other improvements)

Closes #6466
---
 .../service/InternalClusterService.java       |  14 --
 .../discovery/DiscoverySettings.java          |   1 +
 .../discovery/local/LocalDiscovery.java       |  13 +-
 .../discovery/zen/ZenDiscovery.java           |  18 +-
 .../DiscoveryWithNetworkFailuresTests.java    | 237 ++++++++++++------
 .../test/ElasticsearchIntegrationTest.java    |   7 +
 6 files changed, 191 insertions(+), 99 deletions(-)

diff --git a/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java b/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java
index dbe0b4c7ad085..ff6f392425340 100644
--- a/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java
+++ b/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java
@@ -386,20 +386,6 @@ public void run() {
                             }
                         }
                     }
-                } else {
-                    if (previousClusterState.blocks().hasGlobalBlock(discoveryService.getNoMasterBlock()) && !newClusterState.blocks().hasGlobalBlock(discoveryService.getNoMasterBlock())) {
-                        // force an update, its a fresh update from the master as we transition from a start of not having a master to having one
-                        // have a fresh instances of routing and metadata to remove the chance that version might be the same
-                        Builder builder = ClusterState.builder(newClusterState);
-                        builder.routingTable(RoutingTable.builder(newClusterState.routingTable()));
-                        builder.metaData(MetaData.builder(newClusterState.metaData()));
-                        newClusterState = builder.build();
-                        logger.debug("got first state from fresh master [{}]", newClusterState.nodes().masterNodeId());
-                    } else if (newClusterState.version() < previousClusterState.version()) {
-                        // we got a cluster state with older version, when we are *not* the master, let it in since it might be valid
-                        // we check on version where applicable, like at ZenDiscovery#handleNewClusterStateFromMaster
-                        logger.debug("got smaller cluster state when not master [" + newClusterState.version() + "<" + previousClusterState.version() + "] from source [" + source + "]");
-                    }
                 }
 
                 newClusterState.status(ClusterState.ClusterStateStatus.BEING_APPLIED);
diff --git a/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java b/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
index c18bd4984671c..8304893f0ba0a 100644
--- a/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
+++ b/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
@@ -54,6 +54,7 @@ public DiscoverySettings(Settings settings, NodeSettingsService nodeSettingsServ
         super(settings);
         nodeSettingsService.addListener(new ApplySettings());
         this.noMasterBlock = parseNoMasterBlock(settings.get(NO_MASTER_BLOCK, DEFAULT_NO_MASTER_BLOCK));
+        this.publishTimeout = settings.getAsTime(PUBLISH_TIMEOUT, publishTimeout);
     }
 
     /**
diff --git a/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java b/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java
index 065f3b6e45fed..af91c3608b17d 100644
--- a/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java
@@ -58,6 +58,7 @@ public class LocalDiscovery extends AbstractLifecycleComponent<Discovery> implem
 
     private final TransportService transportService;
     private final ClusterService clusterService;
+    private final DiscoveryService discoveryService;
     private final DiscoveryNodeService discoveryNodeService;
     private AllocationService allocationService;
     private final ClusterName clusterName;
@@ -77,7 +78,7 @@ public class LocalDiscovery extends AbstractLifecycleComponent<Discovery> implem
 
     @Inject
     public LocalDiscovery(Settings settings, ClusterName clusterName, TransportService transportService, ClusterService clusterService,
-                          DiscoveryNodeService discoveryNodeService, Version version, DiscoverySettings discoverySettings) {
+                          DiscoveryNodeService discoveryNodeService, Version version, DiscoverySettings discoverySettings, DiscoveryService discoveryService) {
         super(settings);
         this.clusterName = clusterName;
         this.clusterService = clusterService;
@@ -85,6 +86,7 @@ public LocalDiscovery(Settings settings, ClusterName clusterName, TransportServi
         this.discoveryNodeService = discoveryNodeService;
         this.version = version;
         this.discoverySettings = discoverySettings;
+        this.discoveryService = discoveryService;
     }
 
     @Override
@@ -305,6 +307,9 @@ private void publish(LocalDiscovery[] members, ClusterState clusterState, final
                 nodeSpecificClusterState.status(ClusterState.ClusterStateStatus.RECEIVED);
                 // ignore cluster state messages that do not include "me", not in the game yet...
                 if (nodeSpecificClusterState.nodes().localNode() != null) {
+                    assert nodeSpecificClusterState.nodes().masterNode() != null : "received a cluster state without a master";
+                    assert !nodeSpecificClusterState.blocks().hasGlobalBlock(discoveryService.getNoMasterBlock()) : "received a cluster state with a master block";
+
                     discovery.clusterService.submitStateUpdateTask("local-disco-receive(from master)", new ProcessedClusterStateNonMasterUpdateTask() {
                         @Override
                         public ClusterState execute(ClusterState currentState) {
@@ -312,6 +317,12 @@ public ClusterState execute(ClusterState currentState) {
                                 return currentState;
                             }
 
+                            if (currentState.blocks().hasGlobalBlock(discoveryService.getNoMasterBlock())) {
+                                // its a fresh update from the master as we transition from a start of not having a master to having one
+                                logger.debug("got first state from fresh master [{}]", nodeSpecificClusterState.nodes().masterNodeId());
+                                return nodeSpecificClusterState;
+                            }
+
                             ClusterState.Builder builder = ClusterState.builder(nodeSpecificClusterState);
                             // if the routing table did not change, use the original one
                             if (nodeSpecificClusterState.routingTable().version() == currentState.routingTable().version()) {
diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 8ccd5046e24ba..1f0b365aee3e3 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -85,6 +85,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
     private final ClusterService clusterService;
     private AllocationService allocationService;
     private final ClusterName clusterName;
+    private final DiscoveryService discoveryService;
     private final DiscoveryNodeService discoveryNodeService;
     private final DiscoverySettings discoverySettings;
     private final ZenPingService pingService;
@@ -128,12 +129,14 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
     @Inject
     public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threadPool,
                         TransportService transportService, ClusterService clusterService, NodeSettingsService nodeSettingsService,
-                        DiscoveryNodeService discoveryNodeService, ZenPingService pingService, Version version, DiscoverySettings discoverySettings) {
+                        DiscoveryNodeService discoveryNodeService, ZenPingService pingService, Version version, DiscoverySettings discoverySettings,
+                        DiscoveryService discoveryService) {
         super(settings);
         this.clusterName = clusterName;
         this.threadPool = threadPool;
         this.clusterService = clusterService;
         this.transportService = transportService;
+        this.discoveryService = discoveryService;
         this.discoveryNodeService = discoveryNodeService;
         this.discoverySettings = discoverySettings;
         this.pingService = pingService;
@@ -641,6 +644,10 @@ public void onFailure(String source, Throwable t) {
                 final ProcessClusterState processClusterState = new ProcessClusterState(newClusterState, newStateProcessed);
                 processNewClusterStates.add(processClusterState);
 
+
+                assert newClusterState.nodes().masterNode() != null : "received a cluster state without a master";
+                assert !newClusterState.blocks().hasGlobalBlock(discoveryService.getNoMasterBlock()) : "received a cluster state with a master block";
+
                 clusterService.submitStateUpdateTask("zen-disco-receive(from master [" + newClusterState.nodes().masterNode() + "])", Priority.URGENT, new ProcessedClusterStateNonMasterUpdateTask() {
                     @Override
                     public ClusterState execute(ClusterState currentState) {
@@ -701,7 +708,16 @@ public ClusterState execute(ClusterState currentState) {
                             masterFD.restart(latestDiscoNodes.masterNode(), "new cluster state received and we are monitoring the wrong master [" + masterFD.masterNode() + "]");
                         }
 
+                        if (currentState.blocks().hasGlobalBlock(discoveryService.getNoMasterBlock())) {
+                            // its a fresh update from the master as we transition from a start of not having a master to having one
+                            logger.debug("got first state from fresh master [{}]", updatedState.nodes().masterNodeId());
+                            return updatedState;
+                        }
+
+
+                        // some optimizations to make sure we keep old objects where possible
                         ClusterState.Builder builder = ClusterState.builder(updatedState);
+
                         // if the routing table did not change, use the original one
                         if (updatedState.routingTable().version() == currentState.routingTable().version()) {
                             builder.routingTable(currentState.routingTable());
diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index 1d6a346dbd9fa..905b45a65952a 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -35,10 +35,10 @@
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider;
+import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.search.SearchHit;
 import org.elasticsearch.search.sort.SortOrder;
@@ -47,9 +47,10 @@
 import org.elasticsearch.test.transport.MockTransportService;
 import org.elasticsearch.transport.TransportModule;
 import org.elasticsearch.transport.TransportService;
-import org.junit.Ignore;
 import org.junit.Test;
 
+import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import java.util.concurrent.TimeUnit;
 
@@ -61,13 +62,14 @@
 
 /**
  */
-@ClusterScope(scope= Scope.TEST, numDataNodes =0)
+@ClusterScope(scope = Scope.TEST, numDataNodes = 0)
 public class DiscoveryWithNetworkFailuresTests extends ElasticsearchIntegrationTest {
 
     private static final Settings nodeSettings = ImmutableSettings.settingsBuilder()
             .put("discovery.type", "zen") // <-- To override the local setting if set externally
             .put("discovery.zen.fd.ping_timeout", "1s") // <-- for hitting simulated network failures quickly
             .put("discovery.zen.fd.ping_retries", "1") // <-- for hitting simulated network failures quickly
+            .put(DiscoverySettings.PUBLISH_TIMEOUT, "1s") // <-- for hitting simulated network failures quickly
             .put("discovery.zen.minimum_master_nodes", 2)
             .put(TransportModule.TRANSPORT_SERVICE_TYPE_KEY, MockTransportService.class.getName())
             .build();
@@ -97,12 +99,8 @@ public void failWithMinimumMasterNodesConfigured() throws Exception {
 
         List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
 
-        // Wait until a green status has been reaches and 3 nodes are part of the cluster
-        ClusterHealthResponse clusterHealthResponse = client().admin().cluster().prepareHealth()
-                .setWaitForEvents(Priority.LANGUID)
-                .setWaitForNodes("3")
-                .get();
-        assertThat(clusterHealthResponse.isTimedOut(), is(false));
+        // Wait until 3 nodes are part of the cluster
+        ensureStableCluster(3);
 
         // Figure out what is the elected master node
         DiscoveryNode masterDiscoNode = findMasterNode(nodes);
@@ -155,11 +153,7 @@ public boolean apply(Object input) {
         }
 
         // Wait until the master node sees all 3 nodes again.
-        clusterHealthResponse = masterClient.admin().cluster().prepareHealth()
-                .setWaitForEvents(Priority.LANGUID)
-                .setWaitForNodes("3")
-                .get();
-        assertThat(clusterHealthResponse.isTimedOut(), is(false));
+        ensureStableCluster(3);
 
         for (String node : nodes) {
             ClusterState state = internalCluster().client(node).admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
@@ -171,17 +165,12 @@ public boolean apply(Object input) {
     }
 
     @Test
-    @Ignore
     @TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
     public void testDataConsistency() throws Exception {
         List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
 
-        // Wait until a green status has been reaches and 3 nodes are part of the cluster
-        ClusterHealthResponse clusterHealthResponse = client().admin().cluster().prepareHealth()
-                .setWaitForEvents(Priority.LANGUID)
-                .setWaitForNodes("3")
-                .get();
-        assertThat(clusterHealthResponse.isTimedOut(), is(false));
+        // Wait until a 3 nodes are part of the cluster
+        ensureStableCluster(3);
 
         assertAcked(prepareCreate("test")
                 .addMapping("type", "field", "type=long")
@@ -216,35 +205,29 @@ public void testDataConsistency() throws Exception {
         ensureGreen("test");
 
         // Pick a node that isn't the elected master.
-        String isolatedNode = nodes.get(0);
-        String nonIsolatedNode = nodes.get(1);
-        final Client nonIsolatedNodeClient = internalCluster().client(nonIsolatedNode);
+        final String isolatedNode = nodes.get(0);
+        final String nonIsolatedNode = nodes.get(1);
 
         // Simulate a network issue between the unlucky node and the rest of the cluster.
-        for (String nodeId : nodes) {
-            if (!nodeId.equals(isolatedNode)) {
-                addFailToSendNoConnectRule(nodeId, isolatedNode);
-                addFailToSendNoConnectRule(isolatedNode, nodeId);
-            }
-        }
+        randomIsolateNode(isolatedNode, nodes);
         try {
-            // Wait until elected master has removed that the unlucky node...
+            logger.info("wait until elected master has removed [{}]", isolatedNode);
             boolean applied = awaitBusy(new Predicate<Object>() {
                 @Override
                 public boolean apply(Object input) {
-                    return nonIsolatedNodeClient.admin().cluster().prepareState().setLocal(true).get().getState().nodes().size() == 2;
+                    return client(nonIsolatedNode).admin().cluster().prepareState().setLocal(true).get().getState().nodes().size() == 2;
                 }
             }, 1, TimeUnit.MINUTES);
             assertThat(applied, is(true));
 
             // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
             // continuously ping until network failures have been resolved. However
-            final Client isolatedNodeClient = internalCluster().client(isolatedNode);
             // It may a take a bit before the node detects it has been cut off from the elected master
+            logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
             applied = awaitBusy(new Predicate<Object>() {
                 @Override
                 public boolean apply(Object input) {
-                    ClusterState localClusterState = isolatedNodeClient.admin().cluster().prepareState().setLocal(true).get().getState();
+                    ClusterState localClusterState = client(isolatedNode).admin().cluster().prepareState().setLocal(true).get().getState();
                     DiscoveryNodes localDiscoveryNodes = localClusterState.nodes();
                     logger.info("localDiscoveryNodes=" + localDiscoveryNodes.prettyPrint());
                     return localDiscoveryNodes.masterNode() == null;
@@ -252,13 +235,14 @@ public boolean apply(Object input) {
             }, 10, TimeUnit.SECONDS);
             assertThat(applied, is(true));
 
-            ClusterHealthResponse healthResponse = nonIsolatedNodeClient.admin().cluster().prepareHealth("test")
+            ClusterHealthResponse healthResponse = client(nonIsolatedNode).admin().cluster().prepareHealth("test")
                     .setWaitForYellowStatus().get();
             assertThat(healthResponse.isTimedOut(), is(false));
             assertThat(healthResponse.getStatus(), equalTo(ClusterHealthStatus.YELLOW));
 
             // Reads on the right side of the split must work
-            searchResponse = nonIsolatedNodeClient.prepareSearch("test").setTypes("type")
+            logger.info("verifying healthy part of cluster returns data");
+            searchResponse = client(nonIsolatedNode).prepareSearch("test").setTypes("type")
                     .addSort("field", SortOrder.ASC)
                     .get();
             assertHitCount(searchResponse, indexRequests.length);
@@ -269,20 +253,21 @@ public boolean apply(Object input) {
             }
 
             // Reads on the wrong side of the split are partial
-            searchResponse = isolatedNodeClient.prepareSearch("test").setTypes("type")
-                    .addSort("field", SortOrder.ASC)
+            logger.info("verifying isolated node [{}] returns partial data", isolatedNode);
+            searchResponse = client(isolatedNode).prepareSearch("test").setTypes("type")
+                    .addSort("field", SortOrder.ASC).setPreference("_only_local")
                     .get();
             assertThat(searchResponse.getSuccessfulShards(), lessThan(searchResponse.getTotalShards()));
             assertThat(searchResponse.getHits().totalHits(), lessThan((long) indexRequests.length));
 
-            // Writes on the right side of the split must work
-            UpdateResponse updateResponse = nonIsolatedNodeClient.prepareUpdate("test", "type", "0").setDoc("field2", 2).get();
+            logger.info("verifying writes on healthy cluster");
+            UpdateResponse updateResponse = client(nonIsolatedNode).prepareUpdate("test", "type", "0").setDoc("field2", 2).get();
             assertThat(updateResponse.getVersion(), equalTo(2l));
 
-            // Writes on the wrong side of the split fail
             try {
-                isolatedNodeClient.prepareUpdate("test", "type", "0").setDoc("field2", 2)
-                        .setTimeout(TimeValue.timeValueSeconds(5)) // Fail quick, otherwise we wait 60 seconds.
+                logger.info("verifying writes on isolated [{}] fail", isolatedNode);
+                client(isolatedNode).prepareUpdate("test", "type", "0").setDoc("field2", 2)
+                        .setTimeout("1s") // Fail quick, otherwise we wait 60 seconds.
                         .get();
                 fail();
             } catch (ClusterBlockException exception) {
@@ -294,23 +279,13 @@ public boolean apply(Object input) {
         } finally {
             // stop simulating network failures, from this point on the unlucky node is able to rejoin
             // We also need to do this even if assertions fail, since otherwise the test framework can't work properly
-            for (String nodeId : nodes) {
-                if (!nodeId.equals(isolatedNode)) {
-                    clearNoConnectRule(nodeId, isolatedNode);
-                    clearNoConnectRule(isolatedNode, nodeId);
-                }
-            }
+            restoreIsolation(isolatedNode, nodes);
         }
 
         // Wait until the master node sees all 3 nodes again.
-        clusterHealthResponse = nonIsolatedNodeClient.admin().cluster().prepareHealth()
-                .setWaitForGreenStatus()
-                .setWaitForEvents(Priority.LANGUID)
-                .setWaitForNodes("3")
-                .get();
-        assertThat(clusterHealthResponse.getStatus(), equalTo(ClusterHealthStatus.GREEN));
-        assertThat(clusterHealthResponse.isTimedOut(), is(false));
+        ensureStableCluster(3);
 
+        logger.info("verifying all nodes return all data");
         for (Client client : clients()) {
             searchResponse = client.prepareSearch("test").setTypes("type")
                     .addSort("field", SortOrder.ASC)
@@ -334,41 +309,96 @@ public boolean apply(Object input) {
         }
     }
 
+
     @Test
-    @Ignore
     @TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
-    public void testRejoinDocumentExistsInAllShardCopies() throws Exception {
+    public void voidIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
         final List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
-        ClusterHealthResponse clusterHealthResponse = client().admin().cluster().prepareHealth()
-                .setWaitForEvents(Priority.LANGUID)
-                .setWaitForNodes("3")
-                .get();
-        assertThat(clusterHealthResponse.isTimedOut(), is(false));
+        ensureStableCluster(3);
+
         assertAcked(prepareCreate("test")
                 .setSettings(ImmutableSettings.builder()
-                                .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
-                                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 2)
-                )
-                .get());
-        ensureGreen("test");
+                                .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1 + randomInt(2))
+                                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, randomInt(2))
+                ));
 
-        String isolatedNode = findMasterNode(nodes).getName();
-        String notIsolatedNode = null;
+        ensureGreen();
+        String isolatedNode = findMasterNode(nodes).name();
+        String nonIsolatedNode = null;
         for (String node : nodes) {
             if (!node.equals(isolatedNode)) {
-                notIsolatedNode = node;
+                nonIsolatedNode = node;
                 break;
             }
         }
+        randomIsolateNode(isolatedNode, nodes);
 
-        logger.info("Isolating node[" + isolatedNode + "]");
-        for (String nodeId : nodes) {
-            if (!nodeId.equals(isolatedNode)) {
-                addFailToSendNoConnectRule(nodeId, isolatedNode);
-                addFailToSendNoConnectRule(isolatedNode, nodeId);
+        // make sure cluster reforms
+        ensureStableCluster(2, nonIsolatedNode);
+
+        // restore isolation
+        restoreIsolation(isolatedNode, nodes);
+
+        ensureStableCluster(3);
+
+        logger.info("issue a reroute");
+        // trigger a reroute now, instead of waiting for the background reroute of RerouteService
+        assertAcked(client().admin().cluster().prepareReroute());
+        // and wait for it to finish.
+        assertFalse(client().admin().cluster().prepareHealth().setWaitForRelocatingShards(0).get().isTimedOut());
+
+
+        // verify all cluster states are the same
+        ClusterState state = null;
+        for (String node : nodes) {
+            ClusterState nodeState = client(node).admin().cluster().prepareState().setLocal(true).get().getState();
+            if (state == null) {
+                state = nodeState;
+                continue;
+            }
+            // assert nodes are identical
+            try {
+                assertEquals("unequal versions", state.version(), nodeState.version());
+                assertEquals("unequal node count", state.nodes().size(), nodeState.nodes().size());
+                assertEquals("different masters ", state.nodes().masterNodeId(), nodeState.nodes().masterNodeId());
+                assertEquals("different meta data version", state.metaData().version(), nodeState.metaData().version());
+                if (!state.routingTable().prettyPrint().equals(nodeState.routingTable().prettyPrint())) {
+                    fail("different routing");
+                }
+            } catch (AssertionError t) {
+                fail("failed comparing cluster state: " + t.getMessage() + "\n" +
+                        "--- cluster state of node [" + nodes.get(0) + "]: ---\n" + state.prettyPrint() +
+                        "\n--- cluster state [" + node + "]: ---\n" + nodeState.prettyPrint());
             }
+
         }
-        ensureYellow("test");
+
+    }
+
+
+    @Test
+    @TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
+    public void testRejoinDocumentExistsInAllShardCopies() throws Exception {
+        List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
+        ensureStableCluster(3);
+
+        assertAcked(prepareCreate("test")
+                .setSettings(ImmutableSettings.builder()
+                                .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
+                                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 2)
+                )
+                .get());
+        ensureGreen("test");
+
+        nodes = new ArrayList<>(nodes);
+        Collections.shuffle(nodes, getRandom());
+        String isolatedNode = nodes.get(0);
+        String notIsolatedNode = nodes.get(1);
+
+        randomIsolateNode(isolatedNode, nodes);
+        ensureStableCluster(2, notIsolatedNode);
+        assertFalse(client(notIsolatedNode).admin().cluster().prepareHealth("test").setWaitForYellowStatus().get().isTimedOut());
+
 
         IndexResponse indexResponse = internalCluster().client(notIsolatedNode).prepareIndex("test", "type").setSource("field", "value").get();
         assertThat(indexResponse.getVersion(), equalTo(1l));
@@ -381,13 +411,9 @@ public void testRejoinDocumentExistsInAllShardCopies() throws Exception {
         assertThat(getResponse.getVersion(), equalTo(1l));
         assertThat(getResponse.getId(), equalTo(indexResponse.getId()));
 
-        for (String nodeId : nodes) {
-            if (!nodeId.equals(isolatedNode)) {
-                clearNoConnectRule(nodeId, isolatedNode);
-                clearNoConnectRule(isolatedNode, nodeId);
-            }
-        }
+        restoreIsolation(isolatedNode, nodes);
 
+        ensureStableCluster(3);
         ensureGreen("test");
 
         for (String node : nodes) {
@@ -401,6 +427,32 @@ public void testRejoinDocumentExistsInAllShardCopies() throws Exception {
         }
     }
 
+    protected void restoreIsolation(String isolatedNode, List<String> nodes) {
+        logger.info("restoring isolation of [{}]", isolatedNode);
+        for (String nodeId : nodes) {
+            if (!nodeId.equals(isolatedNode)) {
+                clearNoConnectRule(nodeId, isolatedNode);
+                clearNoConnectRule(isolatedNode, nodeId);
+            }
+        }
+    }
+
+    protected void randomIsolateNode(String isolatedNode, List<String> nodes) {
+        boolean unresponsive = randomBoolean();
+        logger.info("isolating [{}] with unresponsive: [{}]", isolatedNode, unresponsive);
+        for (String nodeId : nodes) {
+            if (!nodeId.equals(isolatedNode)) {
+                if (unresponsive) {
+                    addUnresponsiveRule(nodeId, isolatedNode);
+                    addUnresponsiveRule(isolatedNode, nodeId);
+                } else {
+                    addFailToSendNoConnectRule(nodeId, isolatedNode);
+                    addFailToSendNoConnectRule(isolatedNode, nodeId);
+                }
+            }
+        }
+    }
+
     private DiscoveryNode findMasterNode(List<String> nodes) {
         DiscoveryNode masterDiscoNode = null;
         for (String node : nodes) {
@@ -421,9 +473,28 @@ private void addFailToSendNoConnectRule(String fromNode, String toNode) {
         ((MockTransportService) mockTransportService).addFailToSendNoConnectRule(internalCluster().getInstance(Discovery.class, toNode).localNode());
     }
 
+    private void addUnresponsiveRule(String fromNode, String toNode) {
+        TransportService mockTransportService = internalCluster().getInstance(TransportService.class, fromNode);
+        ((MockTransportService) mockTransportService).addUnresponsiveRule(internalCluster().getInstance(Discovery.class, toNode).localNode());
+    }
+
     private void clearNoConnectRule(String fromNode, String toNode) {
         TransportService mockTransportService = internalCluster().getInstance(TransportService.class, fromNode);
         ((MockTransportService) mockTransportService).clearRule(internalCluster().getInstance(Discovery.class, toNode).localNode());
     }
 
+
+    private void ensureStableCluster(int nodeCount) {
+        ensureStableCluster(nodeCount, null);
+    }
+
+    private void ensureStableCluster(int nodeCount, @Nullable String viaNode) {
+        ClusterHealthResponse clusterHealthResponse = client(viaNode).admin().cluster().prepareHealth()
+                .setWaitForEvents(Priority.LANGUID)
+                .setWaitForNodes(Integer.toString(nodeCount))
+                .setWaitForRelocatingShards(0)
+                .get();
+        assertThat(clusterHealthResponse.isTimedOut(), is(false));
+    }
+
 }
diff --git a/src/test/java/org/elasticsearch/test/ElasticsearchIntegrationTest.java b/src/test/java/org/elasticsearch/test/ElasticsearchIntegrationTest.java
index 5cfceafa220cc..20789924ac936 100644
--- a/src/test/java/org/elasticsearch/test/ElasticsearchIntegrationTest.java
+++ b/src/test/java/org/elasticsearch/test/ElasticsearchIntegrationTest.java
@@ -646,6 +646,13 @@ public ClusterService clusterService() {
     }
 
     public static Client client() {
+        return client(null);
+    }
+
+    public static Client client(@Nullable String node) {
+        if (node != null) {
+            return internalCluster().client(node);
+        }
         Client client = cluster().client();
         if (frequently()) {
             client = new RandomizingClient(client, getRandom());

From f3d90cdb170e14e81483ab82876375558b1e138d Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Thu, 12 Jun 2014 21:26:58 +0200
Subject: [PATCH 18/74] [TEST] Remove
 'index.routing.allocation.total_shards_per_node' setting in data consistency
 test

---
 .../DiscoveryWithNetworkFailuresTests.java      | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index 905b45a65952a..493c18e45bffa 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -21,7 +21,6 @@
 
 import com.google.common.base.Predicate;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
-import org.elasticsearch.action.admin.cluster.health.ClusterHealthStatus;
 import org.elasticsearch.action.get.GetResponse;
 import org.elasticsearch.action.index.IndexRequestBuilder;
 import org.elasticsearch.action.index.IndexResponse;
@@ -34,7 +33,6 @@
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
-import org.elasticsearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.settings.ImmutableSettings;
@@ -84,15 +82,6 @@ protected int numberOfReplicas() {
         return 1;
     }
 
-    @Override
-    public Settings indexSettings() {
-        Settings settings = super.indexSettings();
-        return ImmutableSettings.builder()
-                .put(settings)
-                .put(ShardsLimitAllocationDecider.INDEX_TOTAL_SHARDS_PER_NODE, 2)
-                .build();
-    }
-
     @Test
     @TestLogging("discovery.zen:TRACE")
     public void failWithMinimumMasterNodesConfigured() throws Exception {
@@ -234,11 +223,7 @@ public boolean apply(Object input) {
                 }
             }, 10, TimeUnit.SECONDS);
             assertThat(applied, is(true));
-
-            ClusterHealthResponse healthResponse = client(nonIsolatedNode).admin().cluster().prepareHealth("test")
-                    .setWaitForYellowStatus().get();
-            assertThat(healthResponse.isTimedOut(), is(false));
-            assertThat(healthResponse.getStatus(), equalTo(ClusterHealthStatus.YELLOW));
+            ensureStableCluster(2, nonIsolatedNode);
 
             // Reads on the right side of the split must work
             logger.info("verifying healthy part of cluster returns data");

From e39ac7eef45cf797149fad64e537f818d8720f75 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Fri, 13 Jun 2014 11:35:01 +0200
Subject: [PATCH 19/74] [Test]  testIsolateMasterAndVerifyClusterStateConsensus
 didn't wait on initializing shards before comparing cluster states

---
 .../discovery/DiscoveryWithNetworkFailuresTests.java       | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index 493c18e45bffa..07371274cad7d 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -297,7 +297,7 @@ public boolean apply(Object input) {
 
     @Test
     @TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
-    public void voidIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
+    public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
         final List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
         ensureStableCluster(3);
 
@@ -329,9 +329,8 @@ public void voidIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
         logger.info("issue a reroute");
         // trigger a reroute now, instead of waiting for the background reroute of RerouteService
         assertAcked(client().admin().cluster().prepareReroute());
-        // and wait for it to finish.
-        assertFalse(client().admin().cluster().prepareHealth().setWaitForRelocatingShards(0).get().isTimedOut());
-
+        // and wait for it to finish and for the cluster to stabilize
+        ensureGreen("test");
 
         // verify all cluster states are the same
         ClusterState state = null;

From 7db9e98ee77e50c9e0b8b8f859a00263903a1980 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Sat, 14 Jun 2014 20:08:16 +0200
Subject: [PATCH 20/74] [Discovery] Change (Master|Nodes)FaultDetection's
 connect_on_network_disconnect default to false

The previous default was true, which means that after a node disconnected event we try to connect to it as an extra validation. This can result in slow detection of network partitions if the extra reconnect times out before failure.

Also added tests to verify the settings' behaviour
---
 .../zen/fd/MasterFaultDetection.java          |   2 +-
 .../discovery/zen/fd/NodesFaultDetection.java |   2 +-
 .../discovery/ZenFaultDetectionTests.java     | 213 ++++++++++++++++++
 3 files changed, 215 insertions(+), 2 deletions(-)
 create mode 100644 src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java

diff --git a/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java b/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
index 26fd2b00e9497..1a1fe2cecee4e 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
@@ -91,7 +91,7 @@ public MasterFaultDetection(Settings settings, ThreadPool threadPool, TransportS
         this.transportService = transportService;
         this.nodesProvider = nodesProvider;
 
-        this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", true);
+        this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", false);
         this.pingInterval = componentSettings.getAsTime("ping_interval", timeValueSeconds(1));
         this.pingRetryTimeout = componentSettings.getAsTime("ping_timeout", timeValueSeconds(30));
         this.pingRetryCount = componentSettings.getAsInt("ping_retries", 3);
diff --git a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
index 6f4e403610c1f..877cd2fa941ee 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
@@ -83,7 +83,7 @@ public NodesFaultDetection(Settings settings, ThreadPool threadPool, TransportSe
         this.threadPool = threadPool;
         this.transportService = transportService;
 
-        this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", true);
+        this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", false);
         this.pingInterval = componentSettings.getAsTime("ping_interval", timeValueSeconds(1));
         this.pingRetryTimeout = componentSettings.getAsTime("ping_timeout", timeValueSeconds(30));
         this.pingRetryCount = componentSettings.getAsInt("ping_retries", 3);
diff --git a/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java b/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java
new file mode 100644
index 0000000000000..fc1634ddf7beb
--- /dev/null
+++ b/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java
@@ -0,0 +1,213 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.discovery;
+
+import com.google.common.collect.ImmutableMap;
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.node.DiscoveryNodes;
+import org.elasticsearch.common.settings.ImmutableSettings;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.discovery.zen.DiscoveryNodesProvider;
+import org.elasticsearch.discovery.zen.fd.MasterFaultDetection;
+import org.elasticsearch.discovery.zen.fd.NodesFaultDetection;
+import org.elasticsearch.node.service.NodeService;
+import org.elasticsearch.test.ElasticsearchTestCase;
+import org.elasticsearch.test.transport.MockTransportService;
+import org.elasticsearch.threadpool.ThreadPool;
+import org.elasticsearch.transport.TransportConnectionListener;
+import org.elasticsearch.transport.local.LocalTransport;
+import org.hamcrest.Matcher;
+import org.hamcrest.Matchers;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+
+import static org.hamcrest.Matchers.equalTo;
+
+public class ZenFaultDetectionTests extends ElasticsearchTestCase {
+
+    protected ThreadPool threadPool;
+
+    protected static final Version version0 = Version.fromId(/*0*/99);
+    protected DiscoveryNode nodeA;
+    protected MockTransportService serviceA;
+
+    protected static final Version version1 = Version.fromId(199);
+    protected DiscoveryNode nodeB;
+    protected MockTransportService serviceB;
+
+    @Before
+    public void setUp() throws Exception {
+        super.setUp();
+        threadPool = new ThreadPool();
+        serviceA = build(ImmutableSettings.builder().put("name", "TS_A").build(), version0);
+        nodeA = new DiscoveryNode("TS_A", "TS_A", serviceA.boundAddress().publishAddress(), ImmutableMap.<String, String>of(), version0);
+        serviceB = build(ImmutableSettings.builder().put("name", "TS_B").build(), version1);
+        nodeB = new DiscoveryNode("TS_B", "TS_B", serviceB.boundAddress().publishAddress(), ImmutableMap.<String, String>of(), version1);
+
+        // wait till all nodes are properly connected and the event has been sent, so tests in this class
+        // will not get this callback called on the connections done in this setup
+        final CountDownLatch latch = new CountDownLatch(4);
+        TransportConnectionListener waitForConnection = new TransportConnectionListener() {
+            @Override
+            public void onNodeConnected(DiscoveryNode node) {
+                latch.countDown();
+            }
+
+            @Override
+            public void onNodeDisconnected(DiscoveryNode node) {
+                fail("disconnect should not be called " + node);
+            }
+        };
+        serviceA.addConnectionListener(waitForConnection);
+        serviceB.addConnectionListener(waitForConnection);
+
+        serviceA.connectToNode(nodeB);
+        serviceA.connectToNode(nodeA);
+        serviceB.connectToNode(nodeA);
+        serviceB.connectToNode(nodeB);
+
+        assertThat("failed to wait for all nodes to connect", latch.await(5, TimeUnit.SECONDS), equalTo(true));
+        serviceA.removeConnectionListener(waitForConnection);
+        serviceB.removeConnectionListener(waitForConnection);
+    }
+
+    @After
+    public void tearDown() throws Exception {
+        super.tearDown();
+        serviceA.close();
+        serviceB.close();
+        threadPool.shutdown();
+    }
+
+    protected MockTransportService build(Settings settings, Version version) {
+        MockTransportService transportService = new MockTransportService(ImmutableSettings.EMPTY, new LocalTransport(settings, threadPool, version), threadPool);
+        transportService.start();
+        return transportService;
+    }
+
+    private DiscoveryNodes buildNodesForA(boolean master) {
+        DiscoveryNodes.Builder builder = DiscoveryNodes.builder();
+        builder.put(nodeA);
+        builder.put(nodeB);
+        builder.localNodeId(nodeA.id());
+        builder.masterNodeId(master ? nodeA.id() : nodeB.id());
+        return builder.build();
+    }
+
+    private DiscoveryNodes buildNodesForB(boolean master) {
+        DiscoveryNodes.Builder builder = DiscoveryNodes.builder();
+        builder.put(nodeA);
+        builder.put(nodeB);
+        builder.localNodeId(nodeB.id());
+        builder.masterNodeId(master ? nodeB.id() : nodeA.id());
+        return builder.build();
+    }
+
+    @Test
+    public void testNodesFaultDetectionConnectOnDisconnect() throws InterruptedException {
+        ImmutableSettings.Builder settings = ImmutableSettings.builder();
+        boolean shouldRetry = randomBoolean();
+        // make sure we don't ping
+        settings.put("discovery.zen.fd.connect_on_network_disconnect", shouldRetry).put("discovery.zen.fd.ping_interval", "5m");
+        NodesFaultDetection nodesFD = new NodesFaultDetection(settings.build(), threadPool, serviceA);
+        nodesFD.start();
+        nodesFD.updateNodes(buildNodesForA(true));
+        final String[] failureReason = new String[1];
+        final DiscoveryNode[] failureNode = new DiscoveryNode[1];
+        final CountDownLatch notified = new CountDownLatch(1);
+        nodesFD.addListener(new NodesFaultDetection.Listener() {
+            @Override
+            public void onNodeFailure(DiscoveryNode node, String reason) {
+                failureNode[0] = node;
+                failureReason[0] = reason;
+                notified.countDown();
+            }
+        });
+        // will raise a disconnect on A
+        serviceB.stop();
+        notified.await(30, TimeUnit.SECONDS);
+
+        assertEquals(nodeB, failureNode[0]);
+        Matcher<String> matcher = Matchers.containsString("verified");
+        if (!shouldRetry) {
+            matcher = Matchers.not(matcher);
+        }
+
+        assertThat(failureReason[0], matcher);
+    }
+
+    @Test
+    public void testMasterFaultDetectionConnectOnDisconnect() throws InterruptedException {
+
+        ImmutableSettings.Builder settings = ImmutableSettings.builder();
+        boolean shouldRetry = randomBoolean();
+        // make sure we don't ping
+        settings.put("discovery.zen.fd.connect_on_network_disconnect", shouldRetry).put("discovery.zen.fd.ping_interval", "5m");
+        final DiscoveryNodes nodes = buildNodesForA(false);
+        MasterFaultDetection masterFD = new MasterFaultDetection(settings.build(), threadPool, serviceA,
+                new DiscoveryNodesProvider() {
+                    @Override
+                    public DiscoveryNodes nodes() {
+                        return nodes;
+                    }
+
+                    @Override
+                    public NodeService nodeService() {
+                        return null;
+                    }
+                }
+        );
+        masterFD.start(nodeB, "test");
+
+        final String[] failureReason = new String[1];
+        final DiscoveryNode[] failureNode = new DiscoveryNode[1];
+        final CountDownLatch notified = new CountDownLatch(1);
+        masterFD.addListener(new MasterFaultDetection.Listener() {
+
+            @Override
+            public void onMasterFailure(DiscoveryNode masterNode, String reason) {
+                failureNode[0] = masterNode;
+                failureReason[0] = reason;
+                notified.countDown();
+            }
+
+            @Override
+            public void onDisconnectedFromMaster() {
+
+            }
+        });
+        // will raise a disconnect on A
+        serviceB.stop();
+        notified.await(30, TimeUnit.SECONDS);
+
+        assertEquals(nodeB, failureNode[0]);
+        Matcher<String> matcher = Matchers.containsString("verified");
+        if (!shouldRetry) {
+            matcher = Matchers.not(matcher);
+        }
+
+        assertThat(failureReason[0], matcher);
+    }
+}
\ No newline at end of file

From 8b85d97ea60c6a4eadcda439b74eef1dbf55abcc Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Sun, 15 Jun 2014 23:57:19 +0200
Subject: [PATCH 21/74] [Discovery] Improved logging when a join request is not
 executed because local node is no longer master

---
 .../java/org/elasticsearch/discovery/zen/ZenDiscovery.java  | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 1f0b365aee3e3..9a46aeb8b76ed 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -797,7 +797,11 @@ public ClusterState execute(ClusterState currentState) {
 
                 @Override
                 public void onFailure(String source, Throwable t) {
-                    logger.error("unexpected failure during [{}]", t, source);
+                    if (t instanceof ClusterService.NoLongerMasterException) {
+                        logger.debug("not processing [{}] as we are no longer master", source);
+                    } else {
+                        logger.error("unexpected failure during [{}]", t, source);
+                    }
                     callback.onFailure(t);
                 }
 

From 5d13571dbec1af5ee8b62493775149be8404c0b0 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Tue, 17 Jun 2014 10:05:36 +0200
Subject: [PATCH 22/74] [Discovery] when master is gone, flush all pending
 cluster states

If the master FD flags master as gone while there are still pending cluster states, the processing of those cluster states will re-instate that node as master again.

Closes #6526
---
 .../org/elasticsearch/discovery/zen/ZenDiscovery.java | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 9a46aeb8b76ed..c1ca890df893c 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -61,6 +61,7 @@
 import org.elasticsearch.transport.*;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -524,6 +525,11 @@ public ClusterState execute(ClusterState currentState) {
                         .masterNodeId(null).build();
                 latestDiscoNodes = discoveryNodes;
 
+                // flush any pending cluster states from old master, so it will not be set as master again
+                ArrayList<ProcessClusterState> pendingNewClusterStates = new ArrayList<>();
+                processNewClusterStates.drainTo(pendingNewClusterStates);
+                logger.trace("removed [{}] pending cluster states", pendingNewClusterStates.size());
+
                 if (rejoinOnMasterGone) {
                     return rejoin(ClusterState.builder(currentState).nodes(discoveryNodes).build(), "master left (reason = " + reason + ")");
                 }
@@ -680,6 +686,11 @@ public ClusterState execute(ClusterState currentState) {
 
                             // we are going to use it for sure, poll (remove) it
                             potentialState = processNewClusterStates.poll();
+                            if (potentialState == null) {
+                                // might happen if the queue is drained
+                                break;
+                            }
+
                             potentialState.processed = true;
 
                             if (potentialState.clusterState.version() > stateToProcess.clusterState.version()) {

From 28489cee45370eaee0c0151e406b2ae157e47634 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Fri, 16 May 2014 22:09:39 +0200
Subject: [PATCH 23/74] [Tests] Added ServiceDisruptionScheme(s) and
 testAckedIndexing

This commit adds the notion of ServiceDisruptionScheme allowing for introducing disruptions in our test cluster. This
abstraction is used in a couple of wrappers around the functionality offered by MockTransportService to simulate various
network partitions. There is also one implementation for causing a node to be slow in processing cluster state updates.

This new mechanism is integrated into the existing DiscoveryWithNetworkFailuresTests.

A new test called testAckedIndexing is added to verify retrieval of documents whose indexing was acked during various disruptions.

Closes #6505
---
 .../discovery/zen/ZenDiscovery.java           |  11 +-
 .../transport/TransportService.java           |   4 +
 .../cluster/ClusterServiceTests.java          |   6 +-
 .../cluster/NoMasterNodeTests.java            |   7 +-
 .../DiscoveryWithNetworkFailuresTests.java    | 399 ++++++++++++------
 .../recovery/RecoveryWhileUnderLoadTests.java |   1 -
 .../elasticsearch/test/BackgroundIndexer.java |   2 +-
 .../test/ElasticsearchIntegrationTest.java    |  11 +
 .../test/InternalTestCluster.java             |  63 +++
 .../org/elasticsearch/test/TestCluster.java   |   1 +
 .../disruption/NetworkDelaysPartition.java    |  88 ++++
 .../NetworkDisconnectPartition.java           |  53 +++
 .../test/disruption/NetworkPartition.java     | 199 +++++++++
 .../NetworkUnresponsivePartition.java         |  52 +++
 .../test/disruption/NoOpDisruptionScheme.java |  60 +++
 .../disruption/ServiceDisruptionScheme.java   |  40 ++
 .../test/disruption/SingleNodeDisruption.java |  83 ++++
 .../SlowClusterStateProcessing.java           | 130 ++++++
 .../test/transport/MockTransportService.java  | 100 ++++-
 19 files changed, 1149 insertions(+), 161 deletions(-)
 create mode 100644 src/test/java/org/elasticsearch/test/disruption/NetworkDelaysPartition.java
 create mode 100644 src/test/java/org/elasticsearch/test/disruption/NetworkDisconnectPartition.java
 create mode 100644 src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java
 create mode 100644 src/test/java/org/elasticsearch/test/disruption/NetworkUnresponsivePartition.java
 create mode 100644 src/test/java/org/elasticsearch/test/disruption/NoOpDisruptionScheme.java
 create mode 100644 src/test/java/org/elasticsearch/test/disruption/ServiceDisruptionScheme.java
 create mode 100644 src/test/java/org/elasticsearch/test/disruption/SingleNodeDisruption.java
 create mode 100644 src/test/java/org/elasticsearch/test/disruption/SlowClusterStateProcessing.java

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index c1ca890df893c..bccc274965617 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -342,7 +342,7 @@ public ClusterState execute(ClusterState currentState) {
 
                     @Override
                     public void onFailure(String source, Throwable t) {
-                            logger.error("unexpected failure during [{}]", t, source);
+                        logger.error("unexpected failure during [{}]", t, source);
                     }
 
                     @Override
@@ -408,8 +408,7 @@ public ClusterState execute(ClusterState currentState) {
                 public void onFailure(String source, Throwable t) {
                     if (t instanceof ClusterService.NoLongerMasterException) {
                         logger.debug("not processing {} leave request as we are no longer master", node);
-                    }
-                    else {
+                    } else {
                         logger.error("unexpected failure during [{}]", t, source);
                     }
                 }
@@ -448,8 +447,7 @@ public ClusterState execute(ClusterState currentState) {
             public void onFailure(String source, Throwable t) {
                 if (t instanceof ClusterService.NoLongerMasterException) {
                     logger.debug("not processing [{}] as we are no longer master", source);
-                }
-                else {
+                } else {
                     logger.error("unexpected failure during [{}]", t, source);
                 }
             }
@@ -486,8 +484,7 @@ public ClusterState execute(ClusterState currentState) {
             public void onFailure(String source, Throwable t) {
                 if (t instanceof ClusterService.NoLongerMasterException) {
                     logger.debug("not processing [{}] as we are no longer master", source);
-                }
-                else {
+                } else {
                     logger.error("unexpected failure during [{}]", t, source);
                 }
             }
diff --git a/src/main/java/org/elasticsearch/transport/TransportService.java b/src/main/java/org/elasticsearch/transport/TransportService.java
index e922f1b4932f0..e2e6f502e89d8 100644
--- a/src/main/java/org/elasticsearch/transport/TransportService.java
+++ b/src/main/java/org/elasticsearch/transport/TransportService.java
@@ -245,6 +245,10 @@ public void removeHandler(String action) {
         }
     }
 
+    protected TransportRequestHandler getHandler(String action) {
+        return serverHandlers.get(action);
+    }
+
     class Adapter implements TransportServiceAdapter {
 
         final MeanMetric rxMetric = new MeanMetric();
diff --git a/src/test/java/org/elasticsearch/cluster/ClusterServiceTests.java b/src/test/java/org/elasticsearch/cluster/ClusterServiceTests.java
index 1286c62d1668a..52f00035c4ab8 100644
--- a/src/test/java/org/elasticsearch/cluster/ClusterServiceTests.java
+++ b/src/test/java/org/elasticsearch/cluster/ClusterServiceTests.java
@@ -263,12 +263,12 @@ public void testMasterAwareExecution() throws Exception {
                 .put("discovery.type", "local")
                 .build();
 
-        ListenableFuture<String> master = cluster().startNodeAsync(settings);
-        ListenableFuture<String> nonMaster = cluster().startNodeAsync(settingsBuilder().put(settings).put("node.master", false).build());
+        ListenableFuture<String> master = internalCluster().startNodeAsync(settings);
+        ListenableFuture<String> nonMaster = internalCluster().startNodeAsync(settingsBuilder().put(settings).put("node.master", false).build());
         master.get();
         ensureGreen(); // make sure we have a cluster
 
-        ClusterService clusterService = cluster().getInstance(ClusterService.class, nonMaster.get());
+        ClusterService clusterService = internalCluster().getInstance(ClusterService.class, nonMaster.get());
 
         final boolean[] taskFailed = {false};
         final CountDownLatch latch1 = new CountDownLatch(1);
diff --git a/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java b/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
index fa1ca5e9e8005..94c0268cdd6d2 100644
--- a/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
+++ b/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
@@ -21,9 +21,9 @@
 
 import org.elasticsearch.action.ActionRequestBuilder;
 import com.google.common.base.Predicate;
+import org.elasticsearch.action.bulk.BulkRequestBuilder;
 import org.elasticsearch.action.count.CountResponse;
 import org.elasticsearch.action.get.GetResponse;
-import org.elasticsearch.action.bulk.BulkRequestBuilder;
 import org.elasticsearch.action.percolate.PercolateSourceBuilder;
 import org.elasticsearch.action.search.SearchResponse;
 import org.elasticsearch.cluster.block.ClusterBlockException;
@@ -48,11 +48,6 @@
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertThrows;
 import static org.hamcrest.Matchers.*;
-import static org.elasticsearch.test.ElasticsearchIntegrationTest.*;
-import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertExists;
-import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
-import static org.hamcrest.Matchers.equalTo;
-import static org.hamcrest.Matchers.greaterThan;
 
 /**
  */
diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index 07371274cad7d..a0abf9fdd91ba 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -20,6 +20,8 @@
 package org.elasticsearch.discovery;
 
 import com.google.common.base.Predicate;
+import org.apache.lucene.util.LuceneTestCase;
+import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
 import org.elasticsearch.action.get.GetResponse;
 import org.elasticsearch.action.index.IndexRequestBuilder;
@@ -41,16 +43,20 @@
 import org.elasticsearch.search.SearchHit;
 import org.elasticsearch.search.sort.SortOrder;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
+import org.elasticsearch.test.disruption.*;
 import org.elasticsearch.test.junit.annotations.TestLogging;
 import org.elasticsearch.test.transport.MockTransportService;
 import org.elasticsearch.transport.TransportModule;
-import org.elasticsearch.transport.TransportService;
 import org.junit.Test;
 
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.Semaphore;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
 
 import static org.elasticsearch.test.ElasticsearchIntegrationTest.ClusterScope;
 import static org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
@@ -108,38 +114,36 @@ public void failWithMinimumMasterNodesConfigured() throws Exception {
         assert unluckyNode != null;
 
         // Simulate a network issue between the unlucky node and elected master node in both directions.
-        addFailToSendNoConnectRule(masterDiscoNode.getName(), unluckyNode);
-        addFailToSendNoConnectRule(unluckyNode, masterDiscoNode.getName());
-        try {
-            // Wait until elected master has removed that the unlucky node...
-            boolean applied = awaitBusy(new Predicate<Object>() {
-                @Override
-                public boolean apply(Object input) {
-                    return masterClient.admin().cluster().prepareState().setLocal(true).get().getState().nodes().size() == 2;
-                }
-            }, 1, TimeUnit.MINUTES);
-            assertThat(applied, is(true));
-
-            // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
-            // continuously ping until network failures have been resolved. However
-            final Client isolatedNodeClient = internalCluster().client(unluckyNode);
-            // It may a take a bit before the node detects it has been cut off from the elected master
-            applied = awaitBusy(new Predicate<Object>() {
-                @Override
-                public boolean apply(Object input) {
-                    ClusterState localClusterState = isolatedNodeClient.admin().cluster().prepareState().setLocal(true).get().getState();
-                    DiscoveryNodes localDiscoveryNodes = localClusterState.nodes();
-                    logger.info("localDiscoveryNodes=" + localDiscoveryNodes.prettyPrint());
-                    return localDiscoveryNodes.masterNode() == null;
-                }
-            }, 10, TimeUnit.SECONDS);
-            assertThat(applied, is(true));
-        } finally {
-            // stop simulating network failures, from this point on the unlucky node is able to rejoin
-            // We also need to do this even if assertions fail, since otherwise the test framework can't work properly
-            clearNoConnectRule(masterDiscoNode.getName(), unluckyNode);
-            clearNoConnectRule(unluckyNode, masterDiscoNode.getName());
-        }
+
+        NetworkDisconnectPartition networkDisconnect = new NetworkDisconnectPartition(masterDiscoNode.name(), unluckyNode, getRandom());
+        setDisruptionScheme(networkDisconnect);
+        networkDisconnect.startDisrupting();
+
+        // Wait until elected master has removed that the unlucky node...
+        boolean applied = awaitBusy(new Predicate<Object>() {
+            @Override
+            public boolean apply(Object input) {
+                return masterClient.admin().cluster().prepareState().setLocal(true).get().getState().nodes().size() == 2;
+            }
+        }, 1, TimeUnit.MINUTES);
+        assertThat(applied, is(true));
+
+        // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
+        // continuously ping until network failures have been resolved. However
+        final Client isolatedNodeClient = internalCluster().client(unluckyNode);
+        // It may a take a bit before the node detects it has been cut off from the elected master
+        applied = awaitBusy(new Predicate<Object>() {
+            @Override
+            public boolean apply(Object input) {
+                ClusterState localClusterState = isolatedNodeClient.admin().cluster().prepareState().setLocal(true).get().getState();
+                DiscoveryNodes localDiscoveryNodes = localClusterState.nodes();
+                logger.info("localDiscoveryNodes=" + localDiscoveryNodes.prettyPrint());
+                return localDiscoveryNodes.masterNode() == null;
+            }
+        }, 10, TimeUnit.SECONDS);
+        assertThat(applied, is(true));
+
+        networkDisconnect.stopDisrupting();
 
         // Wait until the master node sees all 3 nodes again.
         ensureStableCluster(3);
@@ -193,80 +197,78 @@ public void testDataConsistency() throws Exception {
         // (waiting for green here, because indexing / search in a yellow index is fine as long as no other nodes go down)
         ensureGreen("test");
 
-        // Pick a node that isn't the elected master.
-        final String isolatedNode = nodes.get(0);
-        final String nonIsolatedNode = nodes.get(1);
+        NetworkPartition networkPartition = addRandomPartition();
+
+        final String isolatedNode = networkPartition.getMinoritySide().get(0);
+        final String nonIsolatedNode = networkPartition.getMjaoritySide().get(0);
 
         // Simulate a network issue between the unlucky node and the rest of the cluster.
-        randomIsolateNode(isolatedNode, nodes);
-        try {
-            logger.info("wait until elected master has removed [{}]", isolatedNode);
-            boolean applied = awaitBusy(new Predicate<Object>() {
-                @Override
-                public boolean apply(Object input) {
-                    return client(nonIsolatedNode).admin().cluster().prepareState().setLocal(true).get().getState().nodes().size() == 2;
-                }
-            }, 1, TimeUnit.MINUTES);
-            assertThat(applied, is(true));
-
-            // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
-            // continuously ping until network failures have been resolved. However
-            // It may a take a bit before the node detects it has been cut off from the elected master
-            logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
-            applied = awaitBusy(new Predicate<Object>() {
-                @Override
-                public boolean apply(Object input) {
-                    ClusterState localClusterState = client(isolatedNode).admin().cluster().prepareState().setLocal(true).get().getState();
-                    DiscoveryNodes localDiscoveryNodes = localClusterState.nodes();
-                    logger.info("localDiscoveryNodes=" + localDiscoveryNodes.prettyPrint());
-                    return localDiscoveryNodes.masterNode() == null;
-                }
-            }, 10, TimeUnit.SECONDS);
-            assertThat(applied, is(true));
-            ensureStableCluster(2, nonIsolatedNode);
+        networkPartition.startDisrupting();
 
-            // Reads on the right side of the split must work
-            logger.info("verifying healthy part of cluster returns data");
-            searchResponse = client(nonIsolatedNode).prepareSearch("test").setTypes("type")
-                    .addSort("field", SortOrder.ASC)
-                    .get();
-            assertHitCount(searchResponse, indexRequests.length);
-            for (int i = 0; i < searchResponse.getHits().getHits().length; i++) {
-                SearchHit searchHit = searchResponse.getHits().getAt(i);
-                assertThat(searchHit.id(), equalTo(String.valueOf(i)));
-                assertThat((long) searchHit.sortValues()[0], equalTo((long) i));
+        logger.info("wait until elected master has removed [{}]", isolatedNode);
+        boolean applied = awaitBusy(new Predicate<Object>() {
+            @Override
+            public boolean apply(Object input) {
+                return client(nonIsolatedNode).admin().cluster().prepareState().setLocal(true).get().getState().nodes().size() == 2;
+            }
+        }, 1, TimeUnit.MINUTES);
+        assertThat(applied, is(true));
+
+        // The unlucky node must report *no* master node, since it can't connect to the master; in fact it should
+        // continuously ping until the network failures have been resolved. However,
+        // it may take a bit before the node detects it has been cut off from the elected master
+        logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
+        applied = awaitBusy(new Predicate<Object>() {
+            @Override
+            public boolean apply(Object input) {
+                ClusterState localClusterState = client(isolatedNode).admin().cluster().prepareState().setLocal(true).get().getState();
+                DiscoveryNodes localDiscoveryNodes = localClusterState.nodes();
+                logger.info("localDiscoveryNodes=" + localDiscoveryNodes.prettyPrint());
+                return localDiscoveryNodes.masterNode() == null;
             }
+        }, 10, TimeUnit.SECONDS);
+        assertThat(applied, is(true));
+        ensureStableCluster(2, nonIsolatedNode);
 
-            // Reads on the wrong side of the split are partial
-            logger.info("verifying isolated node [{}] returns partial data", isolatedNode);
-            searchResponse = client(isolatedNode).prepareSearch("test").setTypes("type")
-                    .addSort("field", SortOrder.ASC).setPreference("_only_local")
-                    .get();
-            assertThat(searchResponse.getSuccessfulShards(), lessThan(searchResponse.getTotalShards()));
-            assertThat(searchResponse.getHits().totalHits(), lessThan((long) indexRequests.length));
+        // Reads on the right side of the split must work
+        logger.info("verifying healthy part of cluster returns data");
+        searchResponse = client(nonIsolatedNode).prepareSearch("test").setTypes("type")
+                .addSort("field", SortOrder.ASC)
+                .get();
+        assertHitCount(searchResponse, indexRequests.length);
+        for (int i = 0; i < searchResponse.getHits().getHits().length; i++) {
+            SearchHit searchHit = searchResponse.getHits().getAt(i);
+            assertThat(searchHit.id(), equalTo(String.valueOf(i)));
+            assertThat((long) searchHit.sortValues()[0], equalTo((long) i));
+        }
 
-            logger.info("verifying writes on healthy cluster");
-            UpdateResponse updateResponse = client(nonIsolatedNode).prepareUpdate("test", "type", "0").setDoc("field2", 2).get();
-            assertThat(updateResponse.getVersion(), equalTo(2l));
+        // Reads on the wrong side of the split are partial
+        logger.info("verifying isolated node [{}] returns partial data", isolatedNode);
+        searchResponse = client(isolatedNode).prepareSearch("test").setTypes("type")
+                .addSort("field", SortOrder.ASC).setPreference("_only_local")
+                .get();
+        assertThat(searchResponse.getSuccessfulShards(), lessThan(searchResponse.getTotalShards()));
+        assertThat(searchResponse.getHits().totalHits(), lessThan((long) indexRequests.length));
 
-            try {
-                logger.info("verifying writes on isolated [{}] fail", isolatedNode);
-                client(isolatedNode).prepareUpdate("test", "type", "0").setDoc("field2", 2)
-                        .setTimeout("1s") // Fail quick, otherwise we wait 60 seconds.
-                        .get();
-                fail();
-            } catch (ClusterBlockException exception) {
-                assertThat(exception.status(), equalTo(RestStatus.SERVICE_UNAVAILABLE));
-                assertThat(exception.blocks().size(), equalTo(1));
-                ClusterBlock clusterBlock = exception.blocks().iterator().next();
-                assertThat(clusterBlock.id(), equalTo(DiscoverySettings.NO_MASTER_BLOCK_ID));
-            }
-        } finally {
-            // stop simulating network failures, from this point on the unlucky node is able to rejoin
-            // We also need to do this even if assertions fail, since otherwise the test framework can't work properly
-            restoreIsolation(isolatedNode, nodes);
+        logger.info("verifying writes on healthy cluster");
+        UpdateResponse updateResponse = client(nonIsolatedNode).prepareUpdate("test", "type", "0").setDoc("field2", 2).get();
+        assertThat(updateResponse.getVersion(), equalTo(2l));
+
+        try {
+            logger.info("verifying writes on isolated [{}] fail", isolatedNode);
+            client(isolatedNode).prepareUpdate("test", "type", "0").setDoc("field2", 2)
+                    .setTimeout("1s") // Fail quick, otherwise we wait 60 seconds.
+                    .get();
+            fail();
+        } catch (ClusterBlockException exception) {
+            assertThat(exception.status(), equalTo(RestStatus.SERVICE_UNAVAILABLE));
+            assertThat(exception.blocks().size(), equalTo(1));
+            ClusterBlock clusterBlock = exception.blocks().iterator().next();
+            assertThat(clusterBlock.id(), equalTo(DiscoverySettings.NO_MASTER_BLOCK_ID));
         }
 
+        networkPartition.stopDisrupting();
+
         // Wait until the master node sees all 3 nodes again.
         ensureStableCluster(3);
 
@@ -316,13 +318,14 @@ public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
                 break;
             }
         }
-        randomIsolateNode(isolatedNode, nodes);
+        ServiceDisruptionScheme scheme = addRandomIsolation(isolatedNode);
+        scheme.startDisrupting();
 
         // make sure cluster reforms
         ensureStableCluster(2, nonIsolatedNode);
 
         // restore isolation
-        restoreIsolation(isolatedNode, nodes);
+        scheme.stopDisrupting();
 
         ensureStableCluster(3);
 
@@ -356,7 +359,120 @@ public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
             }
 
         }
+    }
 
+    @Test
+    @LuceneTestCase.AwaitsFix(bugUrl = "MvG will fix")
+    public void testAckedIndexing() throws Exception {
+        final List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
+        ensureStableCluster(3);
+        // Goal: every doc whose index request was acknowledged must still be readable once the disruption is healed.
+        assertAcked(prepareCreate("test")
+                .setSettings(ImmutableSettings.builder()
+                                .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1 + randomInt(2))
+                                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, randomInt(2))
+                ));
+
+        ensureGreen();
+
+        ServiceDisruptionScheme disruptionScheme = addRandomDisruptionScheme();
+        logger.info("disruption scheme [{}] added", disruptionScheme);
+
+        final ConcurrentHashMap<String, String> ackedDocs = new ConcurrentHashMap<>(); // doc id -> node the acked request was sent through
+
+        final AtomicBoolean stop = new AtomicBoolean(false);
+        List<Thread> indexers = new ArrayList<>(nodes.size());
+        List<Semaphore> semaphores = new ArrayList<>(nodes.size());
+        final AtomicInteger idGenerator = new AtomicInteger(0);
+        final AtomicReference<CountDownLatch> countDownLatch = new AtomicReference<>();
+        logger.info("starting indexers");
+        // one indexer thread per node; each thread indexes one doc for every permit released on its semaphore
+        for (final String node : nodes) {
+            final Semaphore semaphore = new Semaphore(0);
+            semaphores.add(semaphore);
+            final Client client = client(node);
+            final String name = "indexer_" + indexers.size();
+            Thread thread = new Thread(new Runnable() {
+                @Override
+                public void run() {
+                    while (!stop.get()) {
+                        try {
+                            if (!semaphore.tryAcquire(10, TimeUnit.SECONDS)) {
+                                continue; // no permit yet: loop around so the stop flag is re-checked
+                            }
+                            try {
+                                String id = Integer.toString(idGenerator.incrementAndGet());
+                                logger.trace("[{}] indexing id [{}] through node [{}]", name, id, node);
+                                IndexResponse response = client.prepareIndex("test", "type", id).setSource("{}").setTimeout("1s").get();
+                                ackedDocs.put(id, node); // only reached when get() returned, i.e. the request was acked
+                            } finally {
+                                countDownLatch.get().countDown(); // every attempt counts, whether it succeeded or not
+                                logger.trace("[{}] decreased counter : {}", name, countDownLatch.get().getCount());
+                            }
+                        } catch (ElasticsearchException | InterruptedException e) {
+                            // expected: requests may fail while disrupted, and the test interrupts this thread on shutdown
+                        } catch (Throwable t) {
+                            logger.info("unexpected exception in background thread of [{}]", t, node);
+                        }
+                    }
+                }
+            });
+
+            thread.setName(name);
+            thread.setDaemon(true);
+            thread.start();
+            indexers.add(thread);
+        }
+
+        logger.info("indexing some docs before partition");
+        int docsPerIndexer = randomInt(3);
+        countDownLatch.set(new CountDownLatch(docsPerIndexer * indexers.size()));
+        for (Semaphore semaphore : semaphores) {
+            semaphore.release(docsPerIndexer);
+        }
+        assertTrue(countDownLatch.get().await(1, TimeUnit.MINUTES));
+
+        for (int iter = 1 + randomInt(2); iter > 0; iter--) {
+
+            logger.info("starting disruptions & indexing (iteration [{}])", iter);
+            disruptionScheme.startDisrupting();
+
+            docsPerIndexer = 1 + randomInt(5);
+            countDownLatch.set(new CountDownLatch(docsPerIndexer * indexers.size()));
+            Collections.shuffle(semaphores, getRandom()); // seeded shuffle keeps the run reproducible from the test seed
+            for (Semaphore semaphore : semaphores) {
+                semaphore.release(docsPerIndexer);
+            }
+            assertTrue(countDownLatch.get().await(1, TimeUnit.MINUTES));
+
+            logger.info("stopping disruption");
+            disruptionScheme.stopDisrupting();
+
+            ensureStableCluster(3);
+            ensureGreen("test");
+
+            logger.info("validating successful docs");
+            for (String node : nodes) {
+                try {
+                    logger.debug("validating through node [{}]", node);
+                    for (String id : ackedDocs.keySet()) {
+                        assertTrue("doc [" + id + "] indexed via node [" + ackedDocs.get(id) + "] not found",
+                                client(node).prepareGet("test", "type", id).setPreference("_local").get().isExists());
+                    }
+                } catch (AssertionError e) {
+                    throw new AssertionError(e.getMessage() + " (checked via node [" + node + "])", e);
+                }
+            }
+
+            logger.info("done validating (iteration [{}])", iter);
+        }
+
+        logger.info("shutting down indexers");
+        stop.set(true);
+        for (Thread indexer : indexers) {
+            indexer.interrupt();
+            indexer.join(60000);
+        }
+    }
 
 
@@ -379,7 +495,8 @@ public void testRejoinDocumentExistsInAllShardCopies() throws Exception {
         String isolatedNode = nodes.get(0);
         String notIsolatedNode = nodes.get(1);
 
-        randomIsolateNode(isolatedNode, nodes);
+        ServiceDisruptionScheme scheme = addRandomIsolation(isolatedNode);
+        scheme.startDisrupting();
         ensureStableCluster(2, notIsolatedNode);
         assertFalse(client(notIsolatedNode).admin().cluster().prepareHealth("test").setWaitForYellowStatus().get().isTimedOut());
 
@@ -395,7 +512,7 @@ public void testRejoinDocumentExistsInAllShardCopies() throws Exception {
         assertThat(getResponse.getVersion(), equalTo(1l));
         assertThat(getResponse.getId(), equalTo(indexResponse.getId()));
 
-        restoreIsolation(isolatedNode, nodes);
+        scheme.stopDisrupting();
 
         ensureStableCluster(3);
         ensureGreen("test");
@@ -411,30 +528,47 @@ public void testRejoinDocumentExistsInAllShardCopies() throws Exception {
         }
     }
 
-    protected void restoreIsolation(String isolatedNode, List<String> nodes) {
-        logger.info("restoring isolation of [{}]", isolatedNode);
-        for (String nodeId : nodes) {
-            if (!nodeId.equals(isolatedNode)) {
-                clearNoConnectRule(nodeId, isolatedNode);
-                clearNoConnectRule(isolatedNode, nodeId);
-            }
+    protected NetworkPartition addRandomPartition() {
+        NetworkPartition partition; // flavour (unresponsive vs. disconnect) is decided by a coin flip
+        if (randomBoolean()) {
+            partition = new NetworkUnresponsivePartition(getRandom());
+        } else {
+            partition = new NetworkDisconnectPartition(getRandom());
         }
+
+        setDisruptionScheme(partition); // registers the scheme only; callers still invoke startDisrupting() themselves
+
+        return partition;
     }
 
-    protected void randomIsolateNode(String isolatedNode, List<String> nodes) {
-        boolean unresponsive = randomBoolean();
-        logger.info("isolating [{}] with unresponsive: [{}]", isolatedNode, unresponsive);
-        for (String nodeId : nodes) {
-            if (!nodeId.equals(isolatedNode)) {
-                if (unresponsive) {
-                    addUnresponsiveRule(nodeId, isolatedNode);
-                    addUnresponsiveRule(isolatedNode, nodeId);
-                } else {
-                    addFailToSendNoConnectRule(nodeId, isolatedNode);
-                    addFailToSendNoConnectRule(isolatedNode, nodeId);
-                }
-            }
+    protected NetworkPartition addRandomIsolation(String isolatedNode) {
+        Set<String> side1 = new HashSet<>(); // ends up holding only the isolated node
+        Set<String> side2 = new HashSet<>(Arrays.asList(internalCluster().getNodeNames())); // ends up holding every other node
+        side1.add(isolatedNode);
+        side2.remove(isolatedNode);
+
+        NetworkPartition partition; // flavour (unresponsive vs. disconnect) is decided by a coin flip
+        if (randomBoolean()) {
+            partition = new NetworkUnresponsivePartition(side1, side2, getRandom());
+        } else {
+            partition = new NetworkDisconnectPartition(side1, side2, getRandom());
         }
+
+        internalCluster().setDisruptionScheme(partition); // registers the scheme only; callers still invoke startDisrupting()
+
+        return partition;
+    }
+
+    /** Picks one of the known disruption schemes at random, registers it on the cluster and returns it (not started). */
+    private ServiceDisruptionScheme addRandomDisruptionScheme() {
+        List<ServiceDisruptionScheme> candidates = Arrays.asList(
+                new NetworkUnresponsivePartition(getRandom()), new NetworkDelaysPartition(getRandom()),
+                new NetworkDisconnectPartition(getRandom()), new SlowClusterStateProcessing(getRandom()));
+        // shuffle with the test's seeded random so the chosen scheme can be reproduced from a failing seed
+        Collections.shuffle(candidates, getRandom());
+        ServiceDisruptionScheme chosen = candidates.get(0);
+        setDisruptionScheme(chosen);
+        return chosen;
     }
 
     private DiscoveryNode findMasterNode(List<String> nodes) {
@@ -452,21 +586,6 @@ private DiscoveryNode findMasterNode(List<String> nodes) {
         return masterDiscoNode;
     }
 
-    private void addFailToSendNoConnectRule(String fromNode, String toNode) {
-        TransportService mockTransportService = internalCluster().getInstance(TransportService.class, fromNode);
-        ((MockTransportService) mockTransportService).addFailToSendNoConnectRule(internalCluster().getInstance(Discovery.class, toNode).localNode());
-    }
-
-    private void addUnresponsiveRule(String fromNode, String toNode) {
-        TransportService mockTransportService = internalCluster().getInstance(TransportService.class, fromNode);
-        ((MockTransportService) mockTransportService).addUnresponsiveRule(internalCluster().getInstance(Discovery.class, toNode).localNode());
-    }
-
-    private void clearNoConnectRule(String fromNode, String toNode) {
-        TransportService mockTransportService = internalCluster().getInstance(TransportService.class, fromNode);
-        ((MockTransportService) mockTransportService).clearRule(internalCluster().getInstance(Discovery.class, toNode).localNode());
-    }
-
 
     private void ensureStableCluster(int nodeCount) {
         ensureStableCluster(nodeCount, null);
diff --git a/src/test/java/org/elasticsearch/recovery/RecoveryWhileUnderLoadTests.java b/src/test/java/org/elasticsearch/recovery/RecoveryWhileUnderLoadTests.java
index ca2f8a5b05044..ff4512b4113dd 100644
--- a/src/test/java/org/elasticsearch/recovery/RecoveryWhileUnderLoadTests.java
+++ b/src/test/java/org/elasticsearch/recovery/RecoveryWhileUnderLoadTests.java
@@ -43,7 +43,6 @@
 import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
 import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.*;
-import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoTimeout;
 import static org.hamcrest.Matchers.equalTo;
 
 public class RecoveryWhileUnderLoadTests extends ElasticsearchIntegrationTest {
diff --git a/src/test/java/org/elasticsearch/test/BackgroundIndexer.java b/src/test/java/org/elasticsearch/test/BackgroundIndexer.java
index 29184b897687d..2cafcef5d9f92 100644
--- a/src/test/java/org/elasticsearch/test/BackgroundIndexer.java
+++ b/src/test/java/org/elasticsearch/test/BackgroundIndexer.java
@@ -217,7 +217,7 @@ public void continueIndexing(int numOfDocs) {
         setBudget(numOfDocs);
     }
 
-    /** Stop all background threads **/
+    /** Stop all background threads. */
     public void stop() throws InterruptedException {
         if (stop.get()) {
             return;
diff --git a/src/test/java/org/elasticsearch/test/ElasticsearchIntegrationTest.java b/src/test/java/org/elasticsearch/test/ElasticsearchIntegrationTest.java
index 20789924ac936..5a59036ff41b3 100644
--- a/src/test/java/org/elasticsearch/test/ElasticsearchIntegrationTest.java
+++ b/src/test/java/org/elasticsearch/test/ElasticsearchIntegrationTest.java
@@ -97,6 +97,7 @@
 import org.elasticsearch.search.SearchService;
 import org.elasticsearch.test.client.RandomizingClient;
 import org.hamcrest.Matchers;
+import org.elasticsearch.test.disruption.ServiceDisruptionScheme;
 import org.junit.*;
 
 import java.io.IOException;
@@ -583,6 +584,7 @@ protected final void afterInternal() throws IOException {
         boolean success = false;
         try {
             logger.info("[{}#{}]: cleaning up after test", getTestClass().getSimpleName(), getTestName());
+            clearDisruptionScheme();
             final Scope currentClusterScope = getCurrentClusterScope();
             try {
                 if (currentClusterScope != Scope.TEST) {
@@ -696,6 +698,15 @@ protected int numberOfReplicas() {
         return between(minimumNumberOfReplicas(), maximumNumberOfReplicas());
     }
 
+
+    public void setDisruptionScheme(ServiceDisruptionScheme scheme) {
+        internalCluster().setDisruptionScheme(scheme); // forwards to the internal test cluster
+    }
+
+    public void clearDisruptionScheme() {
+        internalCluster().clearDisruptionScheme(); // also invoked from afterInternal() during per-test cleanup
+    }
+
     /**
      * Returns a settings object used in {@link #createIndex(String...)} and {@link #prepareCreate(String)} and friends.
      * This method can be overwritten by subclasses to set defaults for the indices that are created by the test.
diff --git a/src/test/java/org/elasticsearch/test/InternalTestCluster.java b/src/test/java/org/elasticsearch/test/InternalTestCluster.java
index fdd345d1ab194..2ec3df2297252 100644
--- a/src/test/java/org/elasticsearch/test/InternalTestCluster.java
+++ b/src/test/java/org/elasticsearch/test/InternalTestCluster.java
@@ -76,6 +76,7 @@
 import org.elasticsearch.search.SearchService;
 import org.elasticsearch.test.cache.recycler.MockBigArraysModule;
 import org.elasticsearch.test.cache.recycler.MockPageCacheRecyclerModule;
+import org.elasticsearch.test.disruption.ServiceDisruptionScheme;
 import org.elasticsearch.test.engine.MockEngineModule;
 import org.elasticsearch.test.store.MockFSIndexStoreModule;
 import org.elasticsearch.test.transport.AssertingLocalTransport;
@@ -185,6 +186,8 @@ public final class InternalTestCluster extends TestCluster {
 
     private final boolean hasFilterCache;
 
+    private ServiceDisruptionScheme activeDisruptionScheme;
+
     public InternalTestCluster(long clusterSeed, String clusterName) {
         this(clusterSeed, DEFAULT_MIN_NUM_DATA_NODES, DEFAULT_MAX_NUM_DATA_NODES, clusterName, SettingsSource.EMPTY, DEFAULT_NUM_CLIENT_NODES, DEFAULT_ENABLE_RANDOM_BENCH_NODES);
     }
@@ -288,6 +291,10 @@ public String getClusterName() {
         return clusterName;
     }
 
+    public String[] getNodeNames() {
+        return nodes.keySet().toArray(Strings.EMPTY_ARRAY); // node names are the keys of the nodes map
+    }
+
     private static boolean isLocalTransportConfigured() {
         if ("local".equals(System.getProperty("es.node.mode", "network"))) {
             return true;
@@ -486,6 +493,7 @@ public synchronized void ensureAtMostNumDataNodes(int n) throws IOException {
         while (limit.hasNext()) {
             NodeAndClient next = limit.next();
             nodesToRemove.add(next);
+            removeDistruptionSchemeFromNode(next);
             next.close();
         }
         for (NodeAndClient toRemove : nodesToRemove) {
@@ -660,6 +668,10 @@ public boolean apply(NodeAndClient nodeAndClient) {
     @Override
     public void close() {
         if (this.open.compareAndSet(true, false)) {
+            if (activeDisruptionScheme != null) {
+                activeDisruptionScheme.testClusterClosed();
+                activeDisruptionScheme = null;
+            }
             IOUtils.closeWhileHandlingException(nodes.values());
             nodes.clear();
             executor.shutdownNow();
@@ -824,6 +836,7 @@ public synchronized void beforeTest(Random random, double transportClientRatio)
     }
 
     private synchronized void reset(boolean wipeData) throws IOException {
+        clearDisruptionScheme();
         randomlyResetClients();
         if (wipeData) {
             wipeDataDirectories();
@@ -1023,6 +1036,7 @@ public synchronized void stopRandomDataNode() throws IOException {
         NodeAndClient nodeAndClient = getRandomNodeAndClient(new DataNodePredicate());
         if (nodeAndClient != null) {
             logger.info("Closing random node [{}] ", nodeAndClient.name);
+            removeDistruptionSchemeFromNode(nodeAndClient);
             nodes.remove(nodeAndClient.name);
             nodeAndClient.close();
         }
@@ -1042,6 +1056,7 @@ public boolean apply(NodeAndClient nodeAndClient) {
         });
         if (nodeAndClient != null) {
             logger.info("Closing filtered random node [{}] ", nodeAndClient.name);
+            removeDistruptionSchemeFromNode(nodeAndClient);
             nodes.remove(nodeAndClient.name);
             nodeAndClient.close();
         }
@@ -1056,6 +1071,7 @@ public synchronized void stopCurrentMasterNode() throws IOException {
         String masterNodeName = getMasterName();
         assert nodes.containsKey(masterNodeName);
         logger.info("Closing master node [{}] ", masterNodeName);
+        removeDistruptionSchemeFromNode(nodes.get(masterNodeName));
         NodeAndClient remove = nodes.remove(masterNodeName);
         remove.close();
     }
@@ -1067,6 +1083,7 @@ public void stopRandomNonMasterNode() throws IOException {
         NodeAndClient nodeAndClient = getRandomNodeAndClient(Predicates.not(new MasterNodePredicate(getMasterName())));
         if (nodeAndClient != null) {
             logger.info("Closing random non master node [{}] current master [{}] ", nodeAndClient.name, getMasterName());
+            removeDistruptionSchemeFromNode(nodeAndClient);
             nodes.remove(nodeAndClient.name);
             nodeAndClient.close();
         }
@@ -1120,6 +1137,9 @@ private void restartAllNodes(boolean rollingRestart, RestartCallback callback) t
                 if (!callback.doRestart(nodeAndClient.name)) {
                     logger.info("Closing node [{}] during restart", nodeAndClient.name);
                     toRemove.add(nodeAndClient);
+                    if (activeDisruptionScheme != null) {
+                        activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
+                    }
                     nodeAndClient.close();
                 }
             }
@@ -1134,18 +1154,33 @@ private void restartAllNodes(boolean rollingRestart, RestartCallback callback) t
             for (NodeAndClient nodeAndClient : nodes.values()) {
                 callback.doAfterNodes(numNodesRestarted++, nodeAndClient.nodeClient());
                 logger.info("Restarting node [{}] ", nodeAndClient.name);
+                if (activeDisruptionScheme != null) {
+                    activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
+                }
                 nodeAndClient.restart(callback);
+                if (activeDisruptionScheme != null) {
+                    activeDisruptionScheme.applyToNode(nodeAndClient.name, this);
+                }
             }
         } else {
             int numNodesRestarted = 0;
             for (NodeAndClient nodeAndClient : nodes.values()) {
                 callback.doAfterNodes(numNodesRestarted++, nodeAndClient.nodeClient());
                 logger.info("Stopping node [{}] ", nodeAndClient.name);
+                if (activeDisruptionScheme != null) {
+                    activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
+                }
                 nodeAndClient.node.close();
             }
             for (NodeAndClient nodeAndClient : nodes.values()) {
                 logger.info("Starting node [{}] ", nodeAndClient.name);
+                if (activeDisruptionScheme != null) {
+                    activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
+                }
                 nodeAndClient.restart(callback);
+                if (activeDisruptionScheme != null) {
+                    activeDisruptionScheme.applyToNode(nodeAndClient.name, this);
+                }
             }
         }
     }
@@ -1343,6 +1378,7 @@ private synchronized void publishNode(NodeAndClient nodeAndClient) {
             dataDirToClean.addAll(Arrays.asList(nodeEnv.nodeDataLocations()));
         }
         nodes.put(nodeAndClient.name, nodeAndClient);
+        applyDisruptionSchemeToNode(nodeAndClient);
     }
 
     public void closeNonSharedNodes(boolean wipeData) throws IOException {
@@ -1364,6 +1400,33 @@ public boolean hasFilterCache() {
         return hasFilterCache;
     }
 
+    public void setDisruptionScheme(ServiceDisruptionScheme scheme) {
+        clearDisruptionScheme(); // at most one scheme is tracked: detach any previous one first
+        scheme.applyToCluster(this);
+        activeDisruptionScheme = scheme; // remembered so that later node additions/removals can apply/remove it
+    }
+
+    public void clearDisruptionScheme() {
+        if (activeDisruptionScheme != null) { // nothing to detach when no scheme is registered
+            activeDisruptionScheme.removeFromCluster(this);
+        }
+        activeDisruptionScheme = null;
+    }
+
+    private void applyDisruptionSchemeToNode(NodeAndClient nodeAndClient) {
+        if (activeDisruptionScheme != null) { // no-op when no scheme is registered
+            assert nodes.containsKey(nodeAndClient.name); // publishNode() adds the node to the map before calling this
+            activeDisruptionScheme.applyToNode(nodeAndClient.name, this);
+        }
+    }
+
+    private void removeDistruptionSchemeFromNode(NodeAndClient nodeAndClient) { // NOTE(review): "Distruption" is a typo; a rename must touch every call site
+        if (activeDisruptionScheme != null) { // no-op when no scheme is registered
+            assert nodes.containsKey(nodeAndClient.name); // must be called while the node is still in the nodes map
+            activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
+        }
+    }
+
     private synchronized Collection<NodeAndClient> dataNodeAndClients() {
         return Collections2.filter(nodes.values(), new DataNodePredicate());
     }
diff --git a/src/test/java/org/elasticsearch/test/TestCluster.java b/src/test/java/org/elasticsearch/test/TestCluster.java
index 68560e52e928b..deb65ca22fc03 100644
--- a/src/test/java/org/elasticsearch/test/TestCluster.java
+++ b/src/test/java/org/elasticsearch/test/TestCluster.java
@@ -24,6 +24,7 @@
 import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.logging.ESLogger;
 import org.elasticsearch.common.logging.Loggers;
 import org.elasticsearch.indices.IndexMissingException;
diff --git a/src/test/java/org/elasticsearch/test/disruption/NetworkDelaysPartition.java b/src/test/java/org/elasticsearch/test/disruption/NetworkDelaysPartition.java
new file mode 100644
index 0000000000000..65dbc056130b4
--- /dev/null
+++ b/src/test/java/org/elasticsearch/test/disruption/NetworkDelaysPartition.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.test.disruption;
+
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.test.transport.MockTransportService;
+
+import java.util.Random;
+import java.util.Set;
+
+public class NetworkDelaysPartition extends NetworkPartition {
+
+    static long DEFAULT_DELAY_MIN = 10000;
+    static long DEFAULT_DELAY_MAX = 90000;
+
+
+    final long delayMin;
+    final long delayMax;
+
+    TimeValue duration;
+
+    public NetworkDelaysPartition(Random random) {
+        this(random, DEFAULT_DELAY_MIN, DEFAULT_DELAY_MAX);
+    }
+
+    public NetworkDelaysPartition(Random random, long delayMin, long delayMax) {
+        super(random);
+        this.delayMin = delayMin;
+        this.delayMax = delayMax;
+    }
+
+    public NetworkDelaysPartition(String node1, String node2, Random random) {
+        this(node1, node2, DEFAULT_DELAY_MIN, DEFAULT_DELAY_MAX, random);
+    }
+
+    public NetworkDelaysPartition(String node1, String node2, long delayMin, long delayMax, Random random) {
+        super(node1, node2, random);
+        this.delayMin = delayMin;
+        this.delayMax = delayMax;
+    }
+
+    public NetworkDelaysPartition(Set<String> nodesSideOne, Set<String> nodesSideTwo, Random random) {
+        this(nodesSideOne, nodesSideTwo, DEFAULT_DELAY_MIN, DEFAULT_DELAY_MAX, random);
+    }
+
+    public NetworkDelaysPartition(Set<String> nodesSideOne, Set<String> nodesSideTwo, long delayMin, long delayMax, Random random) {
+        super(nodesSideOne, nodesSideTwo, random);
+        this.delayMin = delayMin;
+        this.delayMax = delayMax;
+
+    }
+
+    @Override
+    public synchronized void startDisrupting() {
+        duration = new TimeValue(delayMin + random.nextInt((int) (delayMax - delayMin)));
+        super.startDisrupting();
+    }
+
+    @Override // delays traffic in BOTH directions between one node of side one and one node of side two
+    void applyDisruption(DiscoveryNode node1, MockTransportService transportService1,
+                         DiscoveryNode node2, MockTransportService transportService2) {
+        transportService1.addUnresponsiveRule(node2, duration); // node1 -> node2; undone by removeDisruption's transportService1.clearRule(node2)
+        transportService2.addUnresponsiveRule(node1, duration); // node2 -> node1; undone by removeDisruption's transportService2.clearRule(node1)
+    }
+
+    @Override
+    protected String getPartitionDescription() {
+        return "network delays for [" + duration + "]";
+    }
+
+}
diff --git a/src/test/java/org/elasticsearch/test/disruption/NetworkDisconnectPartition.java b/src/test/java/org/elasticsearch/test/disruption/NetworkDisconnectPartition.java
new file mode 100644
index 0000000000000..664c7a09977cd
--- /dev/null
+++ b/src/test/java/org/elasticsearch/test/disruption/NetworkDisconnectPartition.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.test.disruption;
+
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.test.transport.MockTransportService;
+
+import java.util.Random;
+import java.util.Set;
+
+public class NetworkDisconnectPartition extends NetworkPartition {
+
+
+    public NetworkDisconnectPartition(Random random) {
+        super(random);
+    }
+
+    public NetworkDisconnectPartition(String node1, String node2, Random random) {
+        super(node1, node2, random);
+    }
+
+    public NetworkDisconnectPartition(Set<String> nodesSideOne, Set<String> nodesSideTwo, Random random) {
+        super(nodesSideOne, nodesSideTwo, random);
+    }
+
+    @Override
+    protected String getPartitionDescription() {
+        return "disconnected";
+    }
+
+    @Override
+    void applyDisruption(DiscoveryNode node1, MockTransportService transportService1,
+                         DiscoveryNode node2, MockTransportService transportService2) {
+        transportService1.addFailToSendNoConnectRule(node2);
+        transportService2.addFailToSendNoConnectRule(node1);
+    }
+}
diff --git a/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java b/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java
new file mode 100644
index 0000000000000..c8953fad593ff
--- /dev/null
+++ b/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.test.disruption;
+
+import com.google.common.collect.ImmutableList;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.common.logging.ESLogger;
+import org.elasticsearch.common.logging.Loggers;
+import org.elasticsearch.discovery.Discovery;
+import org.elasticsearch.test.InternalTestCluster;
+import org.elasticsearch.test.TestCluster;
+import org.elasticsearch.test.transport.MockTransportService;
+import org.elasticsearch.transport.TransportService;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Random;
+import java.util.Set;
+
+public abstract class NetworkPartition implements ServiceDisruptionScheme {
+
+    protected final ESLogger logger = Loggers.getLogger(getClass());
+
+    final Set<String> nodesSideOne;
+    final Set<String> nodesSideTwo;
+    volatile boolean autoExpand;
+    protected final Random random;
+    protected volatile InternalTestCluster cluster;
+
+
+    public NetworkPartition(Random random) {
+        this.random = new Random(random.nextLong());
+        nodesSideOne = new HashSet<>();
+        nodesSideTwo = new HashSet<>();
+        autoExpand = true;
+    }
+
+    public NetworkPartition(String node1, String node2, Random random) {
+        this(random);
+        nodesSideOne.add(node1);
+        nodesSideTwo.add(node2);
+        autoExpand = false;
+    }
+
+    public NetworkPartition(Set<String> nodesSideOne, Set<String> nodesSideTwo, Random random) {
+        this(random);
+        this.nodesSideOne.addAll(nodesSideOne);
+        this.nodesSideTwo.addAll(nodesSideTwo);
+        autoExpand = false;
+    }
+
+
+    public List<String> getNodesSideOne() {
+        return ImmutableList.copyOf(nodesSideOne);
+    }
+
+    public List<String> getNodesSideTwo() {
+        return ImmutableList.copyOf(nodesSideTwo);
+    }
+
+    public List<String> getMjaoritySide() {
+        if (nodesSideOne.size() >= nodesSideTwo.size()) {
+            return getNodesSideOne();
+        } else {
+            return getNodesSideTwo();
+        }
+    }
+
+    public List<String> getMinoritySide() {
+        if (nodesSideOne.size() >= nodesSideTwo.size()) {
+            return getNodesSideTwo();
+        } else {
+            return getNodesSideOne();
+        }
+    }
+
+    @Override
+    public void applyToCluster(InternalTestCluster cluster) {
+        this.cluster = cluster;
+        if (autoExpand) {
+            for (String node : cluster.getNodeNames()) {
+                applyToNode(node, cluster);
+            }
+        }
+    }
+
+    @Override
+    public void removeFromCluster(InternalTestCluster cluster) {
+        stopDisrupting();
+    }
+
+    @Override
+    public synchronized void applyToNode(String node, InternalTestCluster cluster) {
+        if (!autoExpand || nodesSideOne.contains(node) || nodesSideTwo.contains(node)) {
+            return;
+        }
+        if (nodesSideOne.isEmpty()) {
+            nodesSideOne.add(node);
+        } else if (nodesSideTwo.isEmpty()) {
+            nodesSideTwo.add(node);
+        } else if (random.nextBoolean()) {
+            nodesSideOne.add(node);
+        } else {
+            nodesSideTwo.add(node);
+        }
+    }
+
+    @Override
+    public synchronized void removeFromNode(String node, InternalTestCluster cluster) {
+        MockTransportService transportService = (MockTransportService) cluster.getInstance(TransportService.class, node);
+        DiscoveryNode discoveryNode = discoveryNode(node);
+        Set<String> otherSideNodes;
+        if (nodesSideOne.contains(node)) {
+            otherSideNodes = nodesSideTwo;
+        } else if (nodesSideTwo.contains(node)) {
+            otherSideNodes = nodesSideOne;
+        } else {
+            return;
+        }
+        for (String node2 : otherSideNodes) {
+            MockTransportService transportService2 = (MockTransportService) cluster.getInstance(TransportService.class, node2);
+            DiscoveryNode discoveryNode2 = discoveryNode(node2);
+            removeDisruption(discoveryNode, transportService, discoveryNode2, transportService2);
+        }
+    }
+
+    @Override
+    public synchronized void testClusterClosed() {
+
+    }
+
+    protected abstract String getPartitionDescription();
+
+
+    protected DiscoveryNode discoveryNode(String node) {
+        return cluster.getInstance(Discovery.class, node).localNode();
+    }
+
+    @Override
+    public synchronized void startDisrupting() {
+        if (nodesSideOne.size() == 0 || nodesSideTwo.size() == 0) {
+            return;
+        }
+        logger.info("nodes {} will be partitioned from {}. partition type [{}]", nodesSideOne, nodesSideTwo, getPartitionDescription());
+        for (String node1 : nodesSideOne) {
+            MockTransportService transportService1 = (MockTransportService) cluster.getInstance(TransportService.class, node1);
+            DiscoveryNode discoveryNode1 = discoveryNode(node1);
+            for (String node2 : nodesSideTwo) {
+                DiscoveryNode discoveryNode2 = discoveryNode(node2);
+                MockTransportService transportService2 = (MockTransportService) cluster.getInstance(TransportService.class, node2);
+                applyDisruption(discoveryNode1, transportService1, discoveryNode2, transportService2);
+            }
+        }
+    }
+
+
+    @Override
+    public void stopDisrupting() {
+        if (nodesSideOne.size() == 0 || nodesSideTwo.size() == 0) {
+            return;
+        }
+        logger.info("restoring partition between nodes {} & nodes {}", nodesSideOne, nodesSideTwo);
+        for (String node1 : nodesSideOne) {
+            MockTransportService transportService1 = (MockTransportService) cluster.getInstance(TransportService.class, node1);
+            DiscoveryNode discoveryNode1 = discoveryNode(node1);
+            for (String node2 : nodesSideTwo) {
+                DiscoveryNode discoveryNode2 = discoveryNode(node2);
+                MockTransportService transportService2 = (MockTransportService) cluster.getInstance(TransportService.class, node2);
+                removeDisruption(discoveryNode1, transportService1, discoveryNode2, transportService2);
+            }
+        }
+    }
+
+    abstract void applyDisruption(DiscoveryNode node1, MockTransportService transportService1,
+                                  DiscoveryNode node2, MockTransportService transportService2);
+
+
+    protected void removeDisruption(DiscoveryNode node1, MockTransportService transportService1,
+                                    DiscoveryNode node2, MockTransportService transportService2) {
+        transportService1.clearRule(node2);
+        transportService2.clearRule(node1);
+    }
+}
diff --git a/src/test/java/org/elasticsearch/test/disruption/NetworkUnresponsivePartition.java b/src/test/java/org/elasticsearch/test/disruption/NetworkUnresponsivePartition.java
new file mode 100644
index 0000000000000..95b853cf9b54d
--- /dev/null
+++ b/src/test/java/org/elasticsearch/test/disruption/NetworkUnresponsivePartition.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.test.disruption;
+
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.test.transport.MockTransportService;
+
+import java.util.Random;
+import java.util.Set;
+
+public class NetworkUnresponsivePartition extends NetworkPartition {
+
+    public NetworkUnresponsivePartition(Random random) {
+        super(random);
+    }
+
+    public NetworkUnresponsivePartition(String node1, String node2, Random random) {
+        super(node1, node2, random);
+    }
+
+    public NetworkUnresponsivePartition(Set<String> nodesSideOne, Set<String> nodesSideTwo, Random random) {
+        super(nodesSideOne, nodesSideTwo, random);
+    }
+
+    @Override
+    protected String getPartitionDescription() {
+        return "unresponsive";
+    }
+
+    @Override
+    void applyDisruption(DiscoveryNode node1, MockTransportService transportService1,
+                         DiscoveryNode node2, MockTransportService transportService2) {
+        transportService1.addUnresponsiveRule(node2);
+        transportService2.addUnresponsiveRule(node1);
+    }
+}
diff --git a/src/test/java/org/elasticsearch/test/disruption/NoOpDisruptionScheme.java b/src/test/java/org/elasticsearch/test/disruption/NoOpDisruptionScheme.java
new file mode 100644
index 0000000000000..6ce11582904b7
--- /dev/null
+++ b/src/test/java/org/elasticsearch/test/disruption/NoOpDisruptionScheme.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.test.disruption;
+
+import org.elasticsearch.test.InternalTestCluster;
+
+public class NoOpDisruptionScheme implements ServiceDisruptionScheme {
+
+    @Override
+    public void applyToCluster(InternalTestCluster cluster) {
+
+    }
+
+    @Override
+    public void removeFromCluster(InternalTestCluster cluster) {
+
+    }
+
+    @Override
+    public void applyToNode(String node, InternalTestCluster cluster) {
+
+    }
+
+    @Override
+    public void removeFromNode(String node, InternalTestCluster cluster) {
+
+    }
+
+    @Override
+    public void startDisrupting() {
+
+    }
+
+    @Override
+    public void stopDisrupting() {
+
+    }
+
+    @Override
+    public void testClusterClosed() {
+
+    }
+}
diff --git a/src/test/java/org/elasticsearch/test/disruption/ServiceDisruptionScheme.java b/src/test/java/org/elasticsearch/test/disruption/ServiceDisruptionScheme.java
new file mode 100644
index 0000000000000..1290e387e12b5
--- /dev/null
+++ b/src/test/java/org/elasticsearch/test/disruption/ServiceDisruptionScheme.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.test.disruption;
+
+import org.elasticsearch.test.InternalTestCluster;
+import org.elasticsearch.test.TestCluster;
+
+public interface ServiceDisruptionScheme {
+
+    public void applyToCluster(InternalTestCluster cluster);
+
+    public void removeFromCluster(InternalTestCluster cluster);
+
+    public void applyToNode(String node, InternalTestCluster cluster);
+
+    public void removeFromNode(String node, InternalTestCluster cluster);
+
+    public void startDisrupting();
+
+    public void stopDisrupting();
+
+    public void testClusterClosed();
+
+}
diff --git a/src/test/java/org/elasticsearch/test/disruption/SingleNodeDisruption.java b/src/test/java/org/elasticsearch/test/disruption/SingleNodeDisruption.java
new file mode 100644
index 0000000000000..3148254011ef2
--- /dev/null
+++ b/src/test/java/org/elasticsearch/test/disruption/SingleNodeDisruption.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.test.disruption;
+
+import org.elasticsearch.common.logging.ESLogger;
+import org.elasticsearch.common.logging.Loggers;
+import org.elasticsearch.test.InternalTestCluster;
+
+import java.util.Random;
+
+public abstract class SingleNodeDisruption implements ServiceDisruptionScheme {
+
+    protected final ESLogger logger = Loggers.getLogger(getClass());
+
+    protected volatile String disruptedNode;
+    protected volatile InternalTestCluster cluster;
+    protected final Random random;
+
+
+    public SingleNodeDisruption(String disruptedNode, Random random) {
+        this(random);
+        this.disruptedNode = disruptedNode;
+    }
+
+    public SingleNodeDisruption(Random random) {
+        this.random = new Random(random.nextLong());
+    }
+
+    @Override
+    public void applyToCluster(InternalTestCluster cluster) {
+        this.cluster = cluster;
+        if (disruptedNode == null) {
+            String[] nodes = cluster.getNodeNames();
+            disruptedNode = nodes[random.nextInt(nodes.length)];
+        }
+    }
+
+    @Override
+    public void removeFromCluster(InternalTestCluster cluster) {
+        if (disruptedNode != null) {
+            removeFromNode(disruptedNode, cluster);
+        }
+    }
+
+    @Override
+    public synchronized void applyToNode(String node, InternalTestCluster cluster) {
+
+    }
+
+    @Override
+    public synchronized void removeFromNode(String node, InternalTestCluster cluster) {
+        if (disruptedNode == null) {
+            return;
+        }
+        if (!node.equals(disruptedNode)) {
+            return;
+        }
+        stopDisrupting();
+        disruptedNode = null;
+    }
+
+    @Override
+    public synchronized void testClusterClosed() {
+        disruptedNode = null;
+    }
+
+}
diff --git a/src/test/java/org/elasticsearch/test/disruption/SlowClusterStateProcessing.java b/src/test/java/org/elasticsearch/test/disruption/SlowClusterStateProcessing.java
new file mode 100644
index 0000000000000..6bfe5e7366ad2
--- /dev/null
+++ b/src/test/java/org/elasticsearch/test/disruption/SlowClusterStateProcessing.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.test.disruption;
+
+import org.elasticsearch.cluster.ClusterService;
+import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.cluster.ClusterStateNonMasterUpdateTask;
+import org.elasticsearch.common.Priority;
+import org.elasticsearch.common.unit.TimeValue;
+
+import java.util.Random;
+
+public class SlowClusterStateProcessing extends SingleNodeDisruption {
+
+    volatile boolean disrupting;
+    volatile Thread worker;
+
+    final long intervalBetweenDelaysMin;
+    final long intervalBetweenDelaysMax;
+    final long delayDurationMin;
+    final long delayDurationMax;
+
+
+    public SlowClusterStateProcessing(Random random) {
+        this(null, random);
+    }
+
+    public SlowClusterStateProcessing(String disruptedNode, Random random) {
+        this(disruptedNode, random, 100, 200, 300, 20000);
+    }
+
+    public SlowClusterStateProcessing(String disruptedNode, Random random, long intervalBetweenDelaysMin,
+                                      long intervalBetweenDelaysMax, long delayDurationMin, long delayDurationMax) {
+        this(random, intervalBetweenDelaysMin, intervalBetweenDelaysMax, delayDurationMin, delayDurationMax);
+        this.disruptedNode = disruptedNode;
+    }
+
+    public SlowClusterStateProcessing(Random random,
+                                      long intervalBetweenDelaysMin, long intervalBetweenDelaysMax, long delayDurationMin,
+                                      long delayDurationMax) {
+        super(random);
+        this.intervalBetweenDelaysMin = intervalBetweenDelaysMin;
+        this.intervalBetweenDelaysMax = intervalBetweenDelaysMax;
+        this.delayDurationMin = delayDurationMin;
+        this.delayDurationMax = delayDurationMax;
+    }
+
+
+    @Override
+    public void startDisrupting() {
+        disrupting = true;
+        worker = new Thread(new BackgroundWorker());
+        worker.setDaemon(true);
+        worker.start();
+    }
+
+    @Override
+    public void stopDisrupting() {
+        disrupting = false;
+        try {
+            worker.join(2 * (intervalBetweenDelaysMax + delayDurationMax));
+        } catch (InterruptedException e) {
+            logger.info("background thread failed to stop");
+        }
+        worker = null;
+    }
+
+
+    private synchronized boolean interruptClusterStateProcessing(final TimeValue duration) {
+        if (disruptedNode == null) {
+            return false;
+        }
+        logger.info("delaying cluster state updates on node [{}] for [{}]", disruptedNode, duration);
+        ClusterService clusterService = cluster.getInstance(ClusterService.class, disruptedNode);
+        clusterService.submitStateUpdateTask("service_disruption_delay", Priority.IMMEDIATE, new ClusterStateNonMasterUpdateTask() {
+
+            @Override
+            public ClusterState execute(ClusterState currentState) throws Exception {
+                Thread.sleep(duration.millis());
+                return currentState;
+            }
+
+            @Override
+            public void onFailure(String source, Throwable t) {
+
+            }
+        });
+        return true;
+    }
+
+    class BackgroundWorker implements Runnable {
+
+        @Override
+        public void run() {
+            while (disrupting) {
+                try {
+                    TimeValue duration = new TimeValue(delayDurationMin + random.nextInt((int) (delayDurationMax - delayDurationMin)));
+                    if (!interruptClusterStateProcessing(duration)) {
+                        continue;
+                    }
+                    Thread.sleep(duration.millis());
+
+                    if (disruptedNode == null) {
+                        return;
+                    }
+
+                } catch (Exception e) {
+                    logger.error("error in background worker", e);
+                }
+            }
+        }
+    }
+
+}
diff --git a/src/test/java/org/elasticsearch/test/transport/MockTransportService.java b/src/test/java/org/elasticsearch/test/transport/MockTransportService.java
index 14f0296121e13..5012384dbf037 100644
--- a/src/test/java/org/elasticsearch/test/transport/MockTransportService.java
+++ b/src/test/java/org/elasticsearch/test/transport/MockTransportService.java
@@ -24,9 +24,14 @@
 import org.elasticsearch.common.component.Lifecycle;
 import org.elasticsearch.common.component.LifecycleListener;
 import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.io.stream.BytesStreamInput;
+import org.elasticsearch.common.io.stream.BytesStreamOutput;
+import org.elasticsearch.common.network.NetworkService;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.transport.BoundTransportAddress;
 import org.elasticsearch.common.transport.TransportAddress;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.common.util.concurrent.AbstractRunnable;
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.transport.*;
@@ -46,6 +51,7 @@ public class MockTransportService extends TransportService {
     public MockTransportService(Settings settings, Transport transport, ThreadPool threadPool) {
         super(settings, new LookupTestTransport(transport), threadPool);
         this.original = transport;
+
     }
 
     /**
@@ -97,7 +103,7 @@ public void sendRequest(DiscoveryNode node, long requestId, String action, Trans
      */
     public void addFailToSendNoConnectRule(DiscoveryNode node, final Set<String> blockedActions) {
 
-        ((LookupTestTransport) transport).transports.put(node.getAddress(), new DelegateTransport(original) {
+        addDelegate(node, new DelegateTransport(original) {
             @Override
             public void connectToNode(DiscoveryNode node) throws ConnectTransportException {
                 original.connectToNode(node);
@@ -124,7 +130,6 @@ public void sendRequest(DiscoveryNode node, long requestId, String action, Trans
      * and failing to connect once the rule was added.
      */
     public void addUnresponsiveRule(DiscoveryNode node) {
-        // TODO add a parameter to delay the connect timeout?
         addDelegate(node, new DelegateTransport(original) {
             @Override
             public void connectToNode(DiscoveryNode node) throws ConnectTransportException {
@@ -143,8 +148,98 @@ public void sendRequest(DiscoveryNode node, long requestId, String action, Trans
         });
     }
 
+    /**
+     * Adds a rule that delays each send request to the given node, simulating a temporarily unresponsive node.
+     * Connection attempts are delayed too, and fail while the remaining delay exceeds the connect timeout.
+     *
+     * @param duration the amount of time to delay sending and connecting.
+     */
+    public void addUnresponsiveRule(DiscoveryNode node, final TimeValue duration) {
+        final long startTime = System.currentTimeMillis();
+
+        addDelegate(node, new DelegateTransport(original) {
+
+            TimeValue getDelay() {
+                return new TimeValue(duration.millis() - (System.currentTimeMillis() - startTime));
+            }
+
+            @Override
+            public void connectToNode(DiscoveryNode node) throws ConnectTransportException {
+                TimeValue delay = getDelay();
+                if (delay.millis() <= 0) {
+                    original.connectToNode(node);
+                }
+
+                // TODO: Replace with proper setting
+                TimeValue connectingTimeout = NetworkService.TcpSettings.TCP_DEFAULT_CONNECT_TIMEOUT;
+                try {
+                    if (delay.millis() < connectingTimeout.millis()) {
+                        Thread.sleep(delay.millis());
+                        original.connectToNode(node);
+                    } else {
+                        Thread.sleep(connectingTimeout.millis());
+                        throw new ConnectTransportException(node, "UNRESPONSIVE: simulated");
+                    }
+                } catch (InterruptedException e) {
+                    throw new ConnectTransportException(node, "UNRESPONSIVE: interrupted while sleeping", e);
+                }
+            }
+
+            @Override
+            public void connectToNodeLight(DiscoveryNode node) throws ConnectTransportException {
+                TimeValue delay = getDelay();
+                if (delay.millis() <= 0) {
+                    original.connectToNodeLight(node);
+                }
+
+                // TODO: Replace with proper setting
+                TimeValue connectingTimeout = NetworkService.TcpSettings.TCP_DEFAULT_CONNECT_TIMEOUT;
+                try {
+                    if (delay.millis() < connectingTimeout.millis()) {
+                        Thread.sleep(delay.millis());
+                        original.connectToNodeLight(node);
+                    } else {
+                        Thread.sleep(connectingTimeout.millis());
+                        throw new ConnectTransportException(node, "UNRESPONSIVE: simulated");
+                    }
+                } catch (InterruptedException e) {
+                    throw new ConnectTransportException(node, "UNRESPONSIVE: interrupted while sleeping", e);
+                }
+            }
+
+            @Override
+            public void sendRequest(final DiscoveryNode node, final long requestId, final String action, TransportRequest request, final TransportRequestOptions options) throws IOException, TransportException {
+                // delayed sending - even if larger than the request timeout, to simulate a potential late response from target node
+
+                TimeValue delay = getDelay();
+                if (delay.millis() <= 0) {
+                    original.sendRequest(node, requestId, action, request, options);
+                }
+
+                // poor mans request cloning...
+                TransportRequestHandler handler = MockTransportService.this.getHandler(action);
+                BytesStreamOutput bStream = new BytesStreamOutput();
+                request.writeTo(bStream);
+                final TransportRequest clonedRequest = handler.newInstance();
+                clonedRequest.readFrom(new BytesStreamInput(bStream.bytes()));
+
+                threadPool.schedule(delay, ThreadPool.Names.GENERIC, new AbstractRunnable() {
+                    @Override
+                    public void run() {
+                        try {
+                            original.sendRequest(node, requestId, action, clonedRequest, options);
+                        } catch (Throwable e) {
+                            logger.debug("failed to send delayed request", e);
+                        }
+                    }
+                });
+            }
+        });
+    }
+
     /**
      * Adds a new delegate transport that is used for communication with the given node.
+     *
      * @return <tt>true</tt> iff no other delegate was registered for this node before, otherwise <tt>false</tt>
      */
     public boolean addDelegate(DiscoveryNode node, DelegateTransport transport) {
@@ -214,7 +309,6 @@ public DelegateTransport(Transport transport) {
         }
 
 
-
         @Override
         public void transportServiceAdapter(TransportServiceAdapter service) {
             transport.transportServiceAdapter(service);

From 8aed9ee46fd9ba93d9e17fafee1abcbdab33189f Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Thu, 19 Jun 2014 07:07:08 +0200
Subject: [PATCH 24/74] [TEST] Check if worker is null to prevent NPE on double
 stopping

---
 .../test/disruption/SlowClusterStateProcessing.java            | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/test/java/org/elasticsearch/test/disruption/SlowClusterStateProcessing.java b/src/test/java/org/elasticsearch/test/disruption/SlowClusterStateProcessing.java
index 6bfe5e7366ad2..3de223ae84427 100644
--- a/src/test/java/org/elasticsearch/test/disruption/SlowClusterStateProcessing.java
+++ b/src/test/java/org/elasticsearch/test/disruption/SlowClusterStateProcessing.java
@@ -72,6 +72,9 @@ public void startDisrupting() {
 
     @Override
     public void stopDisrupting() {
+        if (worker == null) {
+            return;
+        }
         disrupting = false;
         try {
             worker.join(2 * (intervalBetweenDelaysMax + delayDurationMax));

From 785d0e55abc9fd0d1cac559ee9f43b83c3183a7c Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Mon, 23 Jun 2014 09:36:40 +0200
Subject: [PATCH 25/74] [TEST] Reduced failures in
 DiscoveryWithNetworkFailuresTests#testAckedIndexing test: * waiting time
 should be long enough depending on the type of the disruption scheme *
 MockTransportService#addUnresponsiveRule if remaining delay is smaller than 0
 don't double execute transport logic

---
 .../DiscoveryWithNetworkFailuresTests.java    | 55 ++++++++++++-------
 .../disruption/NetworkDelaysPartition.java    |  4 ++
 .../test/disruption/NetworkPartition.java     |  7 ++-
 .../test/disruption/NoOpDisruptionScheme.java |  6 ++
 .../disruption/ServiceDisruptionScheme.java   |  4 +-
 .../test/disruption/SingleNodeDisruption.java |  5 ++
 .../test/transport/MockTransportService.java  |  3 +
 7 files changed, 63 insertions(+), 21 deletions(-)

diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index a0abf9fdd91ba..cc9feaf0e256c 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -35,14 +35,17 @@
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
+import org.elasticsearch.cluster.routing.operation.hash.djb.DjbHashFunction;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.search.SearchHit;
 import org.elasticsearch.search.sort.SortOrder;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
+import org.elasticsearch.test.InternalTestCluster;
 import org.elasticsearch.test.disruption.*;
 import org.elasticsearch.test.junit.annotations.TestLogging;
 import org.elasticsearch.test.transport.MockTransportService;
@@ -66,7 +69,7 @@
 
 /**
  */
-@ClusterScope(scope = Scope.TEST, numDataNodes = 0)
+@ClusterScope(scope = Scope.TEST, numDataNodes = 0, transportClientRatio = 0)
 public class DiscoveryWithNetworkFailuresTests extends ElasticsearchIntegrationTest {
 
     private static final Settings nodeSettings = ImmutableSettings.settingsBuilder()
@@ -363,6 +366,7 @@ public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
 
     @Test
     @LuceneTestCase.AwaitsFix(bugUrl = "MvG will fix")
+    @TestLogging("action.index:TRACE,action.get:TRACE,discovery:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
     public void testAckedIndexing() throws Exception {
         final List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
         ensureStableCluster(3);
@@ -370,9 +374,8 @@ public void testAckedIndexing() throws Exception {
         assertAcked(prepareCreate("test")
                 .setSettings(ImmutableSettings.builder()
                                 .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1 + randomInt(2))
-                                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, randomInt(2))
+                                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1 + randomInt(1))
                 ));
-
         ensureGreen();
 
         ServiceDisruptionScheme disruptionScheme = addRandomDisruptionScheme();
@@ -384,7 +387,7 @@ public void testAckedIndexing() throws Exception {
         List<Thread> indexers = new ArrayList<>(nodes.size());
         List<Semaphore> semaphores = new ArrayList<>(nodes.size());
         final AtomicInteger idGenerator = new AtomicInteger(0);
-        final AtomicReference<CountDownLatch> countDownLatch = new AtomicReference<>();
+        final AtomicReference<CountDownLatch> countDownLatchRef = new AtomicReference<>();
         logger.info("starting indexers");
 
         for (final String node : nodes) {
@@ -392,25 +395,31 @@ public void testAckedIndexing() throws Exception {
             semaphores.add(semaphore);
             final Client client = client(node);
             final String name = "indexer_" + indexers.size();
+            final int numPrimaries = getNumShards("test").numPrimaries;
             Thread thread = new Thread(new Runnable() {
                 @Override
                 public void run() {
                     while (!stop.get()) {
+                        String id = null;
                         try {
                             if (!semaphore.tryAcquire(10, TimeUnit.SECONDS)) {
                                 continue;
                             }
+                            logger.info("[{}] Acquired semaphore and it has {} permits left", name, semaphore.availablePermits());
                             try {
-                                String id = Integer.toString(idGenerator.incrementAndGet());
-                                logger.trace("[{}] indexing id [{}] through node [{}]", name, id, node);
+                                id = Integer.toString(idGenerator.incrementAndGet());
+                                int shard = ((InternalTestCluster) cluster()).getInstance(DjbHashFunction.class).hash(id) % numPrimaries;
+                                logger.trace("[{}] indexing id [{}] through node [{}] targeting shard [{}]", name, id, node, shard);
                                 IndexResponse response = client.prepareIndex("test", "type", id).setSource("{}").setTimeout("1s").get();
+                                assertThat(response.getVersion(), equalTo(1l));
                                 ackedDocs.put(id, node);
+                                logger.trace("[{}] indexed id [{}] through node [{}]", name, id, node);
                             } finally {
-                                countDownLatch.get().countDown();
-                                logger.trace("[{}] decreased counter : {}", name, countDownLatch.get().getCount());
+                                countDownLatchRef.get().countDown();
+                                logger.trace("[{}] decreased counter : {}", name, countDownLatchRef.get().getCount());
                             }
                         } catch (ElasticsearchException | InterruptedException e) {
-                            // expected
+                            logger.trace("[{}] failed id [{}] through node [{}]", e, name, id, node);
                         } catch (Throwable t) {
                             logger.info("unexpected exception in background thread of [{}]", t, node);
                         }
@@ -424,31 +433,31 @@ public void run() {
             indexers.add(thread);
         }
 
-        logger.info("indexing some docs before partition");
         int docsPerIndexer = randomInt(3);
-        countDownLatch.set(new CountDownLatch(docsPerIndexer * indexers.size()));
+        logger.info("indexing " + docsPerIndexer + " docs per indexer before partition");
+        countDownLatchRef.set(new CountDownLatch(docsPerIndexer * indexers.size()));
         for (Semaphore semaphore : semaphores) {
             semaphore.release(docsPerIndexer);
         }
-        assertTrue(countDownLatch.get().await(1, TimeUnit.MINUTES));
+        assertTrue(countDownLatchRef.get().await(1, TimeUnit.MINUTES));
 
         for (int iter = 1 + randomInt(2); iter > 0; iter--) {
-
             logger.info("starting disruptions & indexing (iteration [{}])", iter);
             disruptionScheme.startDisrupting();
 
             docsPerIndexer = 1 + randomInt(5);
-            countDownLatch.set(new CountDownLatch(docsPerIndexer * indexers.size()));
+            logger.info("indexing " + docsPerIndexer + " docs per indexer during partition");
+            countDownLatchRef.set(new CountDownLatch(docsPerIndexer * indexers.size()));
             Collections.shuffle(semaphores);
             for (Semaphore semaphore : semaphores) {
+                assertThat(semaphore.availablePermits(), equalTo(0));
                 semaphore.release(docsPerIndexer);
             }
-            assertTrue(countDownLatch.get().await(1, TimeUnit.MINUTES));
+            assertTrue(countDownLatchRef.get().await(disruptionScheme.afterDisruptionTimeOut().millis() * (docsPerIndexer * indexers.size()), TimeUnit.MILLISECONDS));
 
             logger.info("stopping disruption");
             disruptionScheme.stopDisrupting();
-
-            ensureStableCluster(3);
+            ensureStableCluster(3, disruptionScheme.afterDisruptionTimeOut());
             ensureGreen("test");
 
             logger.info("validating successful docs");
@@ -475,7 +484,6 @@ public void run() {
         }
     }
 
-
     @Test
     @TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
     public void testRejoinDocumentExistsInAllShardCopies() throws Exception {
@@ -588,13 +596,22 @@ private DiscoveryNode findMasterNode(List<String> nodes) {
 
 
     private void ensureStableCluster(int nodeCount) {
-        ensureStableCluster(nodeCount, null);
+        ensureStableCluster(nodeCount, TimeValue.timeValueSeconds(30), null);
+    }
+
+    private void ensureStableCluster(int nodeCount, TimeValue timeValue) {
+        ensureStableCluster(nodeCount, timeValue, null);
     }
 
     private void ensureStableCluster(int nodeCount, @Nullable String viaNode) {
+        ensureStableCluster(nodeCount, TimeValue.timeValueSeconds(30), null);
+    }
+
+    private void ensureStableCluster(int nodeCount, TimeValue timeValue, @Nullable String viaNode) {
         ClusterHealthResponse clusterHealthResponse = client(viaNode).admin().cluster().prepareHealth()
                 .setWaitForEvents(Priority.LANGUID)
                 .setWaitForNodes(Integer.toString(nodeCount))
+                .setTimeout(timeValue)
                 .setWaitForRelocatingShards(0)
                 .get();
         assertThat(clusterHealthResponse.isTimedOut(), is(false));
diff --git a/src/test/java/org/elasticsearch/test/disruption/NetworkDelaysPartition.java b/src/test/java/org/elasticsearch/test/disruption/NetworkDelaysPartition.java
index 65dbc056130b4..16a5913452b26 100644
--- a/src/test/java/org/elasticsearch/test/disruption/NetworkDelaysPartition.java
+++ b/src/test/java/org/elasticsearch/test/disruption/NetworkDelaysPartition.java
@@ -85,4 +85,8 @@ protected String getPartitionDescription() {
         return "network delays for [" + duration + "]";
     }
 
+    @Override
+    public TimeValue afterDisruptionTimeOut() {
+        return TimeValue.timeValueMillis(delayMax + super.afterDisruptionTimeOut().millis());
+    }
 }
diff --git a/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java b/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java
index c8953fad593ff..13fe1af77d667 100644
--- a/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java
+++ b/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java
@@ -22,9 +22,9 @@
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.common.logging.ESLogger;
 import org.elasticsearch.common.logging.Loggers;
+import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.discovery.Discovery;
 import org.elasticsearch.test.InternalTestCluster;
-import org.elasticsearch.test.TestCluster;
 import org.elasticsearch.test.transport.MockTransportService;
 import org.elasticsearch.transport.TransportService;
 
@@ -196,4 +196,9 @@ protected void removeDisruption(DiscoveryNode node1, MockTransportService transp
         transportService1.clearRule(node2);
         transportService2.clearRule(node1);
     }
+
+    @Override
+    public TimeValue afterDisruptionTimeOut() {
+        return TimeValue.timeValueSeconds(30);
+    }
 }
diff --git a/src/test/java/org/elasticsearch/test/disruption/NoOpDisruptionScheme.java b/src/test/java/org/elasticsearch/test/disruption/NoOpDisruptionScheme.java
index 6ce11582904b7..24096a40334c4 100644
--- a/src/test/java/org/elasticsearch/test/disruption/NoOpDisruptionScheme.java
+++ b/src/test/java/org/elasticsearch/test/disruption/NoOpDisruptionScheme.java
@@ -19,6 +19,7 @@
 
 package org.elasticsearch.test.disruption;
 
+import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.test.InternalTestCluster;
 
 public class NoOpDisruptionScheme implements ServiceDisruptionScheme {
@@ -57,4 +58,9 @@ public void stopDisrupting() {
     public void testClusterClosed() {
 
     }
+
+    @Override
+    public TimeValue afterDisruptionTimeOut() {
+        return TimeValue.timeValueSeconds(30);
+    }
 }
diff --git a/src/test/java/org/elasticsearch/test/disruption/ServiceDisruptionScheme.java b/src/test/java/org/elasticsearch/test/disruption/ServiceDisruptionScheme.java
index 1290e387e12b5..5f6c949000f92 100644
--- a/src/test/java/org/elasticsearch/test/disruption/ServiceDisruptionScheme.java
+++ b/src/test/java/org/elasticsearch/test/disruption/ServiceDisruptionScheme.java
@@ -18,8 +18,8 @@
  */
 package org.elasticsearch.test.disruption;
 
+import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.test.InternalTestCluster;
-import org.elasticsearch.test.TestCluster;
 
 public interface ServiceDisruptionScheme {
 
@@ -37,4 +37,6 @@ public interface ServiceDisruptionScheme {
 
     public void testClusterClosed();
 
+    public TimeValue afterDisruptionTimeOut();
+
 }
diff --git a/src/test/java/org/elasticsearch/test/disruption/SingleNodeDisruption.java b/src/test/java/org/elasticsearch/test/disruption/SingleNodeDisruption.java
index 3148254011ef2..dd18445a7052d 100644
--- a/src/test/java/org/elasticsearch/test/disruption/SingleNodeDisruption.java
+++ b/src/test/java/org/elasticsearch/test/disruption/SingleNodeDisruption.java
@@ -20,6 +20,7 @@
 
 import org.elasticsearch.common.logging.ESLogger;
 import org.elasticsearch.common.logging.Loggers;
+import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.test.InternalTestCluster;
 
 import java.util.Random;
@@ -80,4 +81,8 @@ public synchronized void testClusterClosed() {
         disruptedNode = null;
     }
 
+    @Override
+    public TimeValue afterDisruptionTimeOut() {
+        return TimeValue.timeValueSeconds(30);
+    }
 }
diff --git a/src/test/java/org/elasticsearch/test/transport/MockTransportService.java b/src/test/java/org/elasticsearch/test/transport/MockTransportService.java
index 5012384dbf037..3f180f9c5e5bb 100644
--- a/src/test/java/org/elasticsearch/test/transport/MockTransportService.java
+++ b/src/test/java/org/elasticsearch/test/transport/MockTransportService.java
@@ -168,6 +168,7 @@ public void connectToNode(DiscoveryNode node) throws ConnectTransportException {
                 TimeValue delay = getDelay();
                 if (delay.millis() <= 0) {
                     original.connectToNode(node);
+                    return;
                 }
 
                 // TODO: Replace with proper setting
@@ -190,6 +191,7 @@ public void connectToNodeLight(DiscoveryNode node) throws ConnectTransportExcept
                 TimeValue delay = getDelay();
                 if (delay.millis() <= 0) {
                     original.connectToNodeLight(node);
+                    return;
                 }
 
                 // TODO: Replace with proper setting
@@ -214,6 +216,7 @@ public void sendRequest(final DiscoveryNode node, final long requestId, final St
                 TimeValue delay = getDelay();
                 if (delay.millis() <= 0) {
                     original.sendRequest(node, requestId, action, request, options);
+                    return;
                 }
 
                 // poor mans request cloning...

From f7b962a417ba5a2daa411ffc1820f4fe779dc3ff Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Mon, 23 Jun 2014 14:04:54 +0200
Subject: [PATCH 26/74] [TEST] Renamed afterDisruption timeout to
 expectedTimeToHeal; accumulate expected shard failures to log later

---
 .../DiscoveryWithNetworkFailuresTests.java    | 168 +++++++++---------
 .../test/ElasticsearchIntegrationTest.java    |   2 +-
 .../disruption/NetworkDelaysPartition.java    |   4 +-
 .../NetworkDisconnectPartition.java           |   6 +
 .../test/disruption/NetworkPartition.java     |   5 -
 .../NetworkUnresponsivePartition.java         |   6 +
 .../test/disruption/NoOpDisruptionScheme.java |   4 +-
 .../disruption/ServiceDisruptionScheme.java   |   2 +-
 .../test/disruption/SingleNodeDisruption.java |   5 -
 .../SlowClusterStateProcessing.java           |   5 +
 10 files changed, 109 insertions(+), 98 deletions(-)

diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index cc9feaf0e256c..688bf51fec580 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -20,7 +20,6 @@
 package org.elasticsearch.discovery;
 
 import com.google.common.base.Predicate;
-import org.apache.lucene.util.LuceneTestCase;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
 import org.elasticsearch.action.get.GetResponse;
@@ -365,7 +364,7 @@ public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
     }
 
     @Test
-    @LuceneTestCase.AwaitsFix(bugUrl = "MvG will fix")
+//    @LuceneTestCase.AwaitsFix(bugUrl = "MvG will fix")
     @TestLogging("action.index:TRACE,action.get:TRACE,discovery:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
     public void testAckedIndexing() throws Exception {
         final List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
@@ -374,7 +373,7 @@ public void testAckedIndexing() throws Exception {
         assertAcked(prepareCreate("test")
                 .setSettings(ImmutableSettings.builder()
                                 .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1 + randomInt(2))
-                                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1 + randomInt(1))
+                                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, randomInt(2))
                 ));
         ensureGreen();
 
@@ -388,99 +387,104 @@ public void testAckedIndexing() throws Exception {
         List<Semaphore> semaphores = new ArrayList<>(nodes.size());
         final AtomicInteger idGenerator = new AtomicInteger(0);
         final AtomicReference<CountDownLatch> countDownLatchRef = new AtomicReference<>();
-        logger.info("starting indexers");
+        final List<Exception> exceptedExceptions = Collections.synchronizedList(new ArrayList<Exception>());
 
-        for (final String node : nodes) {
-            final Semaphore semaphore = new Semaphore(0);
-            semaphores.add(semaphore);
-            final Client client = client(node);
-            final String name = "indexer_" + indexers.size();
-            final int numPrimaries = getNumShards("test").numPrimaries;
-            Thread thread = new Thread(new Runnable() {
-                @Override
-                public void run() {
-                    while (!stop.get()) {
-                        String id = null;
-                        try {
-                            if (!semaphore.tryAcquire(10, TimeUnit.SECONDS)) {
-                                continue;
-                            }
-                            logger.info("[{}] Acquired semaphore and it has {} permits left", name, semaphore.availablePermits());
+        logger.info("starting indexers");
+        try {
+            for (final String node : nodes) {
+                final Semaphore semaphore = new Semaphore(0);
+                semaphores.add(semaphore);
+                final Client client = client(node);
+                final String name = "indexer_" + indexers.size();
+                final int numPrimaries = getNumShards("test").numPrimaries;
+                Thread thread = new Thread(new Runnable() {
+                    @Override
+                    public void run() {
+                        while (!stop.get()) {
+                            String id = null;
                             try {
-                                id = Integer.toString(idGenerator.incrementAndGet());
-                                int shard = ((InternalTestCluster) cluster()).getInstance(DjbHashFunction.class).hash(id) % numPrimaries;
-                                logger.trace("[{}] indexing id [{}] through node [{}] targeting shard [{}]", name, id, node, shard);
-                                IndexResponse response = client.prepareIndex("test", "type", id).setSource("{}").setTimeout("1s").get();
-                                assertThat(response.getVersion(), equalTo(1l));
-                                ackedDocs.put(id, node);
-                                logger.trace("[{}] indexed id [{}] through node [{}]", name, id, node);
-                            } finally {
-                                countDownLatchRef.get().countDown();
-                                logger.trace("[{}] decreased counter : {}", name, countDownLatchRef.get().getCount());
+                                if (!semaphore.tryAcquire(10, TimeUnit.SECONDS)) {
+                                    continue;
+                                }
+                                logger.info("[{}] Acquired semaphore and it has {} permits left", name, semaphore.availablePermits());
+                                try {
+                                    id = Integer.toString(idGenerator.incrementAndGet());
+                                    int shard = ((InternalTestCluster) cluster()).getInstance(DjbHashFunction.class).hash(id) % numPrimaries;
+                                    logger.trace("[{}] indexing id [{}] through node [{}] targeting shard [{}]", name, id, node, shard);
+                                    IndexResponse response = client.prepareIndex("test", "type", id).setSource("{}").setTimeout("1s").get();
+                                    assertThat(response.getVersion(), equalTo(1l));
+                                    ackedDocs.put(id, node);
+                                    logger.trace("[{}] indexed id [{}] through node [{}]", name, id, node);
+                                } finally {
+                                    countDownLatchRef.get().countDown();
+                                    logger.trace("[{}] decreased counter : {}", name, countDownLatchRef.get().getCount());
+                                }
+                            } catch (ElasticsearchException | InterruptedException e) {
+                                exceptedExceptions.add(e);
+                                logger.trace("[{}] failed id [{}] through node [{}]", e, name, id, node);
+                            } catch (Throwable t) {
+                                logger.info("unexpected exception in background thread of [{}]", t, node);
                             }
-                        } catch (ElasticsearchException | InterruptedException e) {
-                            logger.trace("[{}] failed id [{}] through node [{}]", e, name, id, node);
-                        } catch (Throwable t) {
-                            logger.info("unexpected exception in background thread of [{}]", t, node);
                         }
                     }
-                }
-            });
-
-            thread.setName(name);
-            thread.setDaemon(true);
-            thread.start();
-            indexers.add(thread);
-        }
-
-        int docsPerIndexer = randomInt(3);
-        logger.info("indexing " + docsPerIndexer + " docs per indexer before partition");
-        countDownLatchRef.set(new CountDownLatch(docsPerIndexer * indexers.size()));
-        for (Semaphore semaphore : semaphores) {
-            semaphore.release(docsPerIndexer);
-        }
-        assertTrue(countDownLatchRef.get().await(1, TimeUnit.MINUTES));
+                });
 
-        for (int iter = 1 + randomInt(2); iter > 0; iter--) {
-            logger.info("starting disruptions & indexing (iteration [{}])", iter);
-            disruptionScheme.startDisrupting();
+                thread.setName(name);
+                thread.setDaemon(true);
+                thread.start();
+                indexers.add(thread);
+            }
 
-            docsPerIndexer = 1 + randomInt(5);
-            logger.info("indexing " + docsPerIndexer + " docs per indexer during partition");
+            int docsPerIndexer = randomInt(3);
+            logger.info("indexing " + docsPerIndexer + " docs per indexer before partition");
             countDownLatchRef.set(new CountDownLatch(docsPerIndexer * indexers.size()));
-            Collections.shuffle(semaphores);
             for (Semaphore semaphore : semaphores) {
-                assertThat(semaphore.availablePermits(), equalTo(0));
                 semaphore.release(docsPerIndexer);
             }
-            assertTrue(countDownLatchRef.get().await(disruptionScheme.afterDisruptionTimeOut().millis() * (docsPerIndexer * indexers.size()), TimeUnit.MILLISECONDS));
-
-            logger.info("stopping disruption");
-            disruptionScheme.stopDisrupting();
-            ensureStableCluster(3, disruptionScheme.afterDisruptionTimeOut());
-            ensureGreen("test");
-
-            logger.info("validating successful docs");
-            for (String node : nodes) {
-                try {
-                    logger.debug("validating through node [{}]", node);
-                    for (String id : ackedDocs.keySet()) {
-                        assertTrue("doc [" + id + "] indexed via node [" + ackedDocs.get(id) + "] not found",
-                                client(node).prepareGet("test", "type", id).setPreference("_local").get().isExists());
+            assertTrue(countDownLatchRef.get().await(1, TimeUnit.MINUTES));
+
+            for (int iter = 1 + randomInt(2); iter > 0; iter--) {
+                logger.info("starting disruptions & indexing (iteration [{}])", iter);
+                disruptionScheme.startDisrupting();
+
+                docsPerIndexer = 1 + randomInt(5);
+                logger.info("indexing " + docsPerIndexer + " docs per indexer during partition");
+                countDownLatchRef.set(new CountDownLatch(docsPerIndexer * indexers.size()));
+                Collections.shuffle(semaphores);
+                for (Semaphore semaphore : semaphores) {
+                    assertThat(semaphore.availablePermits(), equalTo(0));
+                    semaphore.release(docsPerIndexer);
+                }
+                assertTrue(countDownLatchRef.get().await(30000 + disruptionScheme.expectedTimeToHeal().millis() * (docsPerIndexer * indexers.size()), TimeUnit.MILLISECONDS));
+
+                logger.info("stopping disruption");
+                disruptionScheme.stopDisrupting();
+                ensureStableCluster(3, TimeValue.timeValueMillis(disruptionScheme.expectedTimeToHeal().millis() + 30000));
+                ensureGreen("test");
+
+                logger.info("validating successful docs");
+                for (String node : nodes) {
+                    try {
+                        logger.debug("validating through node [{}]", node);
+                        for (String id : ackedDocs.keySet()) {
+                            assertTrue("doc [" + id + "] indexed via node [" + ackedDocs.get(id) + "] not found",
+                                    client(node).prepareGet("test", "type", id).setPreference("_local").get().isExists());
+                        }
+                    } catch (AssertionError e) {
+                        throw new AssertionError(e.getMessage() + " (checked via node [" + node + "]", e);
                     }
-                } catch (AssertionError e) {
-                    throw new AssertionError(e.getMessage() + " (checked via node [" + node + "]", e);
                 }
-            }
 
-            logger.info("done validating (iteration [{}])", iter);
-        }
-
-        logger.info("shutting down indexers");
-        stop.set(true);
-        for (Thread indexer : indexers) {
-            indexer.interrupt();
-            indexer.join(60000);
+                logger.info("done validating (iteration [{}])", iter);
+            }
+        } finally {
+            logger.debug("Excepted exception during disruption [{}]", exceptedExceptions);
+            logger.info("shutting down indexers");
+            stop.set(true);
+            for (Thread indexer : indexers) {
+                indexer.interrupt();
+                indexer.join(60000);
+            }
         }
     }
 
diff --git a/src/test/java/org/elasticsearch/test/ElasticsearchIntegrationTest.java b/src/test/java/org/elasticsearch/test/ElasticsearchIntegrationTest.java
index 5a59036ff41b3..74f624eee7f89 100644
--- a/src/test/java/org/elasticsearch/test/ElasticsearchIntegrationTest.java
+++ b/src/test/java/org/elasticsearch/test/ElasticsearchIntegrationTest.java
@@ -907,7 +907,7 @@ public void allowNodes(String index, int n) {
      * It is useful to ensure that all action on the cluster have finished and all shards that were currently relocating
      * are now allocated and started.
      */
-    public ClusterHealthStatus ensureGreen(String... indices) {
+    public ClusterHealthStatus  ensureGreen(String... indices) {
         ClusterHealthResponse actionGet = client().admin().cluster()
                 .health(Requests.clusterHealthRequest(indices).waitForGreenStatus().waitForEvents(Priority.LANGUID).waitForRelocatingShards(0)).actionGet();
         if (actionGet.isTimedOut()) {
diff --git a/src/test/java/org/elasticsearch/test/disruption/NetworkDelaysPartition.java b/src/test/java/org/elasticsearch/test/disruption/NetworkDelaysPartition.java
index 16a5913452b26..9eb99302e461f 100644
--- a/src/test/java/org/elasticsearch/test/disruption/NetworkDelaysPartition.java
+++ b/src/test/java/org/elasticsearch/test/disruption/NetworkDelaysPartition.java
@@ -86,7 +86,7 @@ protected String getPartitionDescription() {
     }
 
     @Override
-    public TimeValue afterDisruptionTimeOut() {
-        return TimeValue.timeValueMillis(delayMax + super.afterDisruptionTimeOut().millis());
+    public TimeValue expectedTimeToHeal() {
+        return TimeValue.timeValueMillis(delayMax);
     }
 }
diff --git a/src/test/java/org/elasticsearch/test/disruption/NetworkDisconnectPartition.java b/src/test/java/org/elasticsearch/test/disruption/NetworkDisconnectPartition.java
index 664c7a09977cd..8653b50f7497c 100644
--- a/src/test/java/org/elasticsearch/test/disruption/NetworkDisconnectPartition.java
+++ b/src/test/java/org/elasticsearch/test/disruption/NetworkDisconnectPartition.java
@@ -19,6 +19,7 @@
 package org.elasticsearch.test.disruption;
 
 import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.test.transport.MockTransportService;
 
 import java.util.Random;
@@ -50,4 +51,9 @@ void applyDisruption(DiscoveryNode node1, MockTransportService transportService1
         transportService1.addFailToSendNoConnectRule(node2);
         transportService2.addFailToSendNoConnectRule(node1);
     }
+
+    @Override
+    public TimeValue expectedTimeToHeal() {
+        return TimeValue.timeValueSeconds(0);
+    }
 }
diff --git a/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java b/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java
index 13fe1af77d667..f9b05a1c1c86c 100644
--- a/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java
+++ b/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java
@@ -22,7 +22,6 @@
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.common.logging.ESLogger;
 import org.elasticsearch.common.logging.Loggers;
-import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.discovery.Discovery;
 import org.elasticsearch.test.InternalTestCluster;
 import org.elasticsearch.test.transport.MockTransportService;
@@ -197,8 +196,4 @@ protected void removeDisruption(DiscoveryNode node1, MockTransportService transp
         transportService2.clearRule(node1);
     }
 
-    @Override
-    public TimeValue afterDisruptionTimeOut() {
-        return TimeValue.timeValueSeconds(30);
-    }
 }
diff --git a/src/test/java/org/elasticsearch/test/disruption/NetworkUnresponsivePartition.java b/src/test/java/org/elasticsearch/test/disruption/NetworkUnresponsivePartition.java
index 95b853cf9b54d..1feb56c46c746 100644
--- a/src/test/java/org/elasticsearch/test/disruption/NetworkUnresponsivePartition.java
+++ b/src/test/java/org/elasticsearch/test/disruption/NetworkUnresponsivePartition.java
@@ -19,6 +19,7 @@
 package org.elasticsearch.test.disruption;
 
 import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.test.transport.MockTransportService;
 
 import java.util.Random;
@@ -49,4 +50,9 @@ void applyDisruption(DiscoveryNode node1, MockTransportService transportService1
         transportService1.addUnresponsiveRule(node2);
         transportService2.addUnresponsiveRule(node1);
     }
+
+    @Override
+    public TimeValue expectedTimeToHeal() {
+        return TimeValue.timeValueSeconds(0);
+    }
 }
diff --git a/src/test/java/org/elasticsearch/test/disruption/NoOpDisruptionScheme.java b/src/test/java/org/elasticsearch/test/disruption/NoOpDisruptionScheme.java
index 24096a40334c4..7b348b1afea51 100644
--- a/src/test/java/org/elasticsearch/test/disruption/NoOpDisruptionScheme.java
+++ b/src/test/java/org/elasticsearch/test/disruption/NoOpDisruptionScheme.java
@@ -60,7 +60,7 @@ public void testClusterClosed() {
     }
 
     @Override
-    public TimeValue afterDisruptionTimeOut() {
-        return TimeValue.timeValueSeconds(30);
+    public TimeValue expectedTimeToHeal() {
+        return TimeValue.timeValueSeconds(0);
     }
 }
diff --git a/src/test/java/org/elasticsearch/test/disruption/ServiceDisruptionScheme.java b/src/test/java/org/elasticsearch/test/disruption/ServiceDisruptionScheme.java
index 5f6c949000f92..70774a823568a 100644
--- a/src/test/java/org/elasticsearch/test/disruption/ServiceDisruptionScheme.java
+++ b/src/test/java/org/elasticsearch/test/disruption/ServiceDisruptionScheme.java
@@ -37,6 +37,6 @@ public interface ServiceDisruptionScheme {
 
     public void testClusterClosed();
 
-    public TimeValue afterDisruptionTimeOut();
+    public TimeValue expectedTimeToHeal();
 
 }
diff --git a/src/test/java/org/elasticsearch/test/disruption/SingleNodeDisruption.java b/src/test/java/org/elasticsearch/test/disruption/SingleNodeDisruption.java
index dd18445a7052d..3148254011ef2 100644
--- a/src/test/java/org/elasticsearch/test/disruption/SingleNodeDisruption.java
+++ b/src/test/java/org/elasticsearch/test/disruption/SingleNodeDisruption.java
@@ -20,7 +20,6 @@
 
 import org.elasticsearch.common.logging.ESLogger;
 import org.elasticsearch.common.logging.Loggers;
-import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.test.InternalTestCluster;
 
 import java.util.Random;
@@ -81,8 +80,4 @@ public synchronized void testClusterClosed() {
         disruptedNode = null;
     }
 
-    @Override
-    public TimeValue afterDisruptionTimeOut() {
-        return TimeValue.timeValueSeconds(30);
-    }
 }
diff --git a/src/test/java/org/elasticsearch/test/disruption/SlowClusterStateProcessing.java b/src/test/java/org/elasticsearch/test/disruption/SlowClusterStateProcessing.java
index 3de223ae84427..a7f6b88592400 100644
--- a/src/test/java/org/elasticsearch/test/disruption/SlowClusterStateProcessing.java
+++ b/src/test/java/org/elasticsearch/test/disruption/SlowClusterStateProcessing.java
@@ -107,6 +107,11 @@ public void onFailure(String source, Throwable t) {
         return true;
     }
 
+    @Override
+    public TimeValue expectedTimeToHeal() {
+        return TimeValue.timeValueSeconds(delayDurationMax + intervalBetweenDelaysMax);
+    }
+
     class BackgroundWorker implements Runnable {
 
         @Override

From a7a61a0392e6e5b01f5af1720be37db33a89c0cd Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Tue, 24 Jun 2014 11:33:43 +0200
Subject: [PATCH 27/74] [Test] ensureStableCluster failed to pass viaNode
 parameter correctly

Also improved timeouts & logs
---
 .../DiscoveryWithNetworkFailuresTests.java    | 25 +++++++++++++------
 .../test/InternalTestCluster.java             |  8 ++++++
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index 688bf51fec580..d8a8c806b59b2 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -272,7 +272,7 @@ public boolean apply(Object input) {
         networkPartition.stopDisrupting();
 
         // Wait until the master node sees all 3 nodes again.
-        ensureStableCluster(3);
+        ensureStableCluster(3, new TimeValue(30000 + networkPartition.expectedTimeToHeal().millis()));
 
         logger.info("verifying all nodes return all data");
         for (Client client : clients()) {
@@ -329,7 +329,7 @@ public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
         // restore isolation
         scheme.stopDisrupting();
 
-        ensureStableCluster(3);
+        ensureStableCluster(3, new TimeValue(30000 + scheme.expectedTimeToHeal().millis()));
 
         logger.info("issue a reroute");
         // trigger a reroute now, instead of waiting for the background reroute of RerouteService
@@ -415,13 +415,15 @@ public void run() {
                                     assertThat(response.getVersion(), equalTo(1l));
                                     ackedDocs.put(id, node);
                                     logger.trace("[{}] indexed id [{}] through node [{}]", name, id, node);
+                                } catch (ElasticsearchException e) {
+                                    exceptedExceptions.add(e);
+                                    logger.trace("[{}] failed id [{}] through node [{}]", e, name, id, node);
                                 } finally {
                                     countDownLatchRef.get().countDown();
                                     logger.trace("[{}] decreased counter : {}", name, countDownLatchRef.get().getCount());
                                 }
-                            } catch (ElasticsearchException | InterruptedException e) {
-                                exceptedExceptions.add(e);
-                                logger.trace("[{}] failed id [{}] through node [{}]", e, name, id, node);
+                            } catch (InterruptedException e) {
+                                // fine - semaphore interrupt
                             } catch (Throwable t) {
                                 logger.info("unexpected exception in background thread of [{}]", t, node);
                             }
@@ -455,7 +457,7 @@ public void run() {
                     assertThat(semaphore.availablePermits(), equalTo(0));
                     semaphore.release(docsPerIndexer);
                 }
-                assertTrue(countDownLatchRef.get().await(30000 + disruptionScheme.expectedTimeToHeal().millis() * (docsPerIndexer * indexers.size()), TimeUnit.MILLISECONDS));
+                assertTrue(countDownLatchRef.get().await(60000 + disruptionScheme.expectedTimeToHeal().millis() * (docsPerIndexer * indexers.size()), TimeUnit.MILLISECONDS));
 
                 logger.info("stopping disruption");
                 disruptionScheme.stopDisrupting();
@@ -478,7 +480,13 @@ public void run() {
                 logger.info("done validating (iteration [{}])", iter);
             }
         } finally {
-            logger.debug("Excepted exception during disruption [{}]", exceptedExceptions);
+            if (exceptedExceptions.size() > 0) {
+                StringBuilder sb = new StringBuilder("Indexing exceptions during disruption:");
+                for (Exception e : exceptedExceptions) {
+                    sb.append("\n").append(e.getMessage());
+                }
+                logger.debug(sb.toString());
+            }
             logger.info("shutting down indexers");
             stop.set(true);
             for (Thread indexer : indexers) {
@@ -608,10 +616,11 @@ private void ensureStableCluster(int nodeCount, TimeValue timeValue) {
     }
 
     private void ensureStableCluster(int nodeCount, @Nullable String viaNode) {
-        ensureStableCluster(nodeCount, TimeValue.timeValueSeconds(30), null);
+        ensureStableCluster(nodeCount, TimeValue.timeValueSeconds(30), viaNode);
     }
 
     private void ensureStableCluster(int nodeCount, TimeValue timeValue, @Nullable String viaNode) {
+        logger.debug("ensuring cluster is stable with [{}] nodes. access node: [{}]. timeout: [{}]", nodeCount, viaNode, timeValue);
         ClusterHealthResponse clusterHealthResponse = client(viaNode).admin().cluster().prepareHealth()
                 .setWaitForEvents(Priority.LANGUID)
                 .setWaitForNodes(Integer.toString(nodeCount))
diff --git a/src/test/java/org/elasticsearch/test/InternalTestCluster.java b/src/test/java/org/elasticsearch/test/InternalTestCluster.java
index 2ec3df2297252..92646ff390045 100644
--- a/src/test/java/org/elasticsearch/test/InternalTestCluster.java
+++ b/src/test/java/org/elasticsearch/test/InternalTestCluster.java
@@ -836,7 +836,15 @@ public synchronized void beforeTest(Random random, double transportClientRatio)
     }
 
     private synchronized void reset(boolean wipeData) throws IOException {
+        TimeValue expectedHealingTime = activeDisruptionScheme != null ? activeDisruptionScheme.expectedTimeToHeal() : null;
         clearDisruptionScheme();
+        if (expectedHealingTime != null && expectedHealingTime.millis() > 0) {
+            try {
+                Thread.sleep(expectedHealingTime.millis());
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+        }
         randomlyResetClients();
         if (wipeData) {
             wipeDataDirectories();

From 1af82fd96af7d8467700640f862186175aaf75b0 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Tue, 24 Jun 2014 23:00:09 +0200
Subject: [PATCH 28/74] [Tests] Disabling testAckedIndexing

The test is currently unstable and needs some more work
---
 .../discovery/DiscoveryWithNetworkFailuresTests.java           | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index d8a8c806b59b2..1428e33ebcc59 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -20,6 +20,7 @@
 package org.elasticsearch.discovery;
 
 import com.google.common.base.Predicate;
+import org.apache.lucene.util.LuceneTestCase;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
 import org.elasticsearch.action.get.GetResponse;
@@ -364,7 +365,7 @@ public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
     }
 
     @Test
-//    @LuceneTestCase.AwaitsFix(bugUrl = "MvG will fix")
+    @LuceneTestCase.AwaitsFix(bugUrl = "needs some more work to stabilize")
     @TestLogging("action.index:TRACE,action.get:TRACE,discovery:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
     public void testAckedIndexing() throws Exception {
         final List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();

From c3e84eb63953ce34f17b166c967e51e6ca7ebebb Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Thu, 26 Jun 2014 10:58:15 +0200
Subject: [PATCH 29/74] Fixed compilation issue caused by the lack of a thread
 pool name

---
 .../org/elasticsearch/discovery/ZenFaultDetectionTests.java     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java b/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java
index fc1634ddf7beb..3f65ed1591eb3 100644
--- a/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java
+++ b/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java
@@ -60,7 +60,7 @@ public class ZenFaultDetectionTests extends ElasticsearchTestCase {
     @Before
     public void setUp() throws Exception {
         super.setUp();
-        threadPool = new ThreadPool();
+        threadPool = new ThreadPool(getClass().getName());
         serviceA = build(ImmutableSettings.builder().put("name", "TS_A").build(), version0);
         nodeA = new DiscoveryNode("TS_A", "TS_A", serviceA.boundAddress().publishAddress(), ImmutableMap.<String, String>of(), version0);
         serviceB = build(ImmutableSettings.builder().put("name", "TS_B").build(), version1);

From 98084c02cef1c8a8b13377e24de5e8e6eb0af110 Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Mon, 30 Jun 2014 12:41:26 +0200
Subject: [PATCH 30/74] [TEST] Added test to verify if
 'discovery.zen.rejoin_on_master_gone' is updatable at runtime.

---
 .../ClusterDynamicSettingsModule.java         |  2 +
 .../discovery/zen/ZenDiscovery.java           |  6 ++-
 .../zen/ZenDiscoveryRejoinOnMaster.java       | 51 +++++++++++++++++++
 3 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryRejoinOnMaster.java

diff --git a/src/main/java/org/elasticsearch/cluster/settings/ClusterDynamicSettingsModule.java b/src/main/java/org/elasticsearch/cluster/settings/ClusterDynamicSettingsModule.java
index 7a022982ab742..9f799d5af3e3f 100644
--- a/src/main/java/org/elasticsearch/cluster/settings/ClusterDynamicSettingsModule.java
+++ b/src/main/java/org/elasticsearch/cluster/settings/ClusterDynamicSettingsModule.java
@@ -27,6 +27,7 @@
 import org.elasticsearch.cluster.routing.allocation.decider.*;
 import org.elasticsearch.common.inject.AbstractModule;
 import org.elasticsearch.discovery.DiscoverySettings;
+import org.elasticsearch.discovery.zen.ZenDiscovery;
 import org.elasticsearch.discovery.zen.elect.ElectMasterService;
 import org.elasticsearch.indices.breaker.HierarchyCircuitBreakerService;
 import org.elasticsearch.indices.cache.filter.IndicesFilterCache;
@@ -57,6 +58,7 @@ public ClusterDynamicSettingsModule() {
         clusterDynamicSettings.addDynamicSetting(DisableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_DISABLE_ALLOCATION);
         clusterDynamicSettings.addDynamicSetting(DisableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_DISABLE_REPLICA_ALLOCATION);
         clusterDynamicSettings.addDynamicSetting(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, Validator.INTEGER);
+        clusterDynamicSettings.addDynamicSetting(ZenDiscovery.REJOIN_ON_MASTER_GONE, Validator.BOOLEAN);
         clusterDynamicSettings.addDynamicSetting(FilterAllocationDecider.CLUSTER_ROUTING_INCLUDE_GROUP + "*");
         clusterDynamicSettings.addDynamicSetting(FilterAllocationDecider.CLUSTER_ROUTING_EXCLUDE_GROUP + "*");
         clusterDynamicSettings.addDynamicSetting(FilterAllocationDecider.CLUSTER_ROUTING_REQUIRE_GROUP + "*");
diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index bccc274965617..bd9eb0f367445 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -77,7 +77,7 @@
  */
 public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implements Discovery, DiscoveryNodesProvider {
 
-    private final static String REJOIN_ON_MASTER_GONE = "discovery.zen.rejoin_on_master_gone";
+    public final static String REJOIN_ON_MASTER_GONE = "discovery.zen.rejoin_on_master_gone";
 
     public static final String DISCOVERY_REJOIN_ACTION_NAME = "internal:discovery/zen/rejoin";
 
@@ -973,6 +973,10 @@ public void onDisconnectedFromMaster() {
         }
     }
 
+    boolean isRejoinOnMasterGone() {
+        return rejoinOnMasterGone;
+    }
+
     static class RejoinClusterRequest extends TransportRequest {
 
         private String fromNodeId;
diff --git a/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryRejoinOnMaster.java b/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryRejoinOnMaster.java
new file mode 100644
index 0000000000000..83cc76af5f3b9
--- /dev/null
+++ b/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryRejoinOnMaster.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.discovery.zen;
+
+import org.elasticsearch.common.settings.ImmutableSettings;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.discovery.Discovery;
+import org.elasticsearch.test.ElasticsearchIntegrationTest;
+import org.junit.Test;
+
+import static org.hamcrest.Matchers.is;
+
+/** Checks that the rejoin-on-master-gone flag ({@link ZenDiscovery#REJOIN_ON_MASTER_GONE}) of a running node
+ * follows a transient cluster settings update: it is asserted true after startup and false after the update. */
+@ElasticsearchIntegrationTest.ClusterScope(scope = ElasticsearchIntegrationTest.Scope.TEST, numDataNodes = 0, numClientNodes = 0)
+public class ZenDiscoveryRejoinOnMaster extends ElasticsearchIntegrationTest {
+
+    @Test
+    public void testChangeRejoinOnMaster() throws Exception {
+        Settings nodeSettings = ImmutableSettings.settingsBuilder()
+                .put("discovery.type", "zen") // <-- To override the local setting if set externally
+                .build();
+        String nodeName = internalCluster().startNode(nodeSettings);
+        ZenDiscovery zenDiscovery = (ZenDiscovery) internalCluster().getInstance(Discovery.class, nodeName);
+        assertThat(zenDiscovery.isRejoinOnMasterGone(), is(true));
+
+        client().admin().cluster().prepareUpdateSettings()
+                .setTransientSettings(ImmutableSettings.builder().put(ZenDiscovery.REJOIN_ON_MASTER_GONE, false))
+                .get();
+
+        assertThat(zenDiscovery.isRejoinOnMasterGone(), is(false));
+    }
+
+}

From 52f69c64f7a720a280fe58fdca405fb0d14fefab Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Mon, 30 Jun 2014 19:03:24 +0200
Subject: [PATCH 31/74] [TEST] Verify no master block during partition for read
 and write apis

---
 .../ClusterDynamicSettingsModule.java         |   1 +
 .../DiscoveryWithNetworkFailuresTests.java    | 175 +++++++++---------
 2 files changed, 86 insertions(+), 90 deletions(-)

diff --git a/src/main/java/org/elasticsearch/cluster/settings/ClusterDynamicSettingsModule.java b/src/main/java/org/elasticsearch/cluster/settings/ClusterDynamicSettingsModule.java
index 9f799d5af3e3f..e28438f6cfe45 100644
--- a/src/main/java/org/elasticsearch/cluster/settings/ClusterDynamicSettingsModule.java
+++ b/src/main/java/org/elasticsearch/cluster/settings/ClusterDynamicSettingsModule.java
@@ -59,6 +59,7 @@ public ClusterDynamicSettingsModule() {
         clusterDynamicSettings.addDynamicSetting(DisableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_DISABLE_REPLICA_ALLOCATION);
         clusterDynamicSettings.addDynamicSetting(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, Validator.INTEGER);
         clusterDynamicSettings.addDynamicSetting(ZenDiscovery.REJOIN_ON_MASTER_GONE, Validator.BOOLEAN);
+        clusterDynamicSettings.addDynamicSetting(DiscoverySettings.NO_MASTER_BLOCK);
         clusterDynamicSettings.addDynamicSetting(FilterAllocationDecider.CLUSTER_ROUTING_INCLUDE_GROUP + "*");
         clusterDynamicSettings.addDynamicSetting(FilterAllocationDecider.CLUSTER_ROUTING_EXCLUDE_GROUP + "*");
         clusterDynamicSettings.addDynamicSetting(FilterAllocationDecider.CLUSTER_ROUTING_REQUIRE_GROUP + "*");
diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index 1428e33ebcc59..ab1f597b1f913 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -22,12 +22,10 @@
 import com.google.common.base.Predicate;
 import org.apache.lucene.util.LuceneTestCase;
 import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.action.ActionRequestBuilder;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
 import org.elasticsearch.action.get.GetResponse;
-import org.elasticsearch.action.index.IndexRequestBuilder;
 import org.elasticsearch.action.index.IndexResponse;
-import org.elasticsearch.action.search.SearchResponse;
-import org.elasticsearch.action.update.UpdateResponse;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.block.ClusterBlock;
@@ -42,8 +40,6 @@
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.rest.RestStatus;
-import org.elasticsearch.search.SearchHit;
-import org.elasticsearch.search.sort.SortOrder;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
 import org.elasticsearch.test.InternalTestCluster;
 import org.elasticsearch.test.disruption.*;
@@ -64,8 +60,8 @@
 import static org.elasticsearch.test.ElasticsearchIntegrationTest.ClusterScope;
 import static org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
-import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
-import static org.hamcrest.Matchers.*;
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.is;
 
 /**
  */
@@ -161,39 +157,11 @@ public boolean apply(Object input) {
     }
 
     @Test
-    @TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
-    public void testDataConsistency() throws Exception {
-        List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
-
+    public void testVerifyApiBlocksDuringPartition() throws Exception {
+        internalCluster().startNodesAsync(3, nodeSettings).get();
         // Wait until a 3 nodes are part of the cluster
         ensureStableCluster(3);
-
-        assertAcked(prepareCreate("test")
-                .addMapping("type", "field", "type=long")
-                .get());
-
-        IndexRequestBuilder[] indexRequests = new IndexRequestBuilder[scaledRandomIntBetween(1, 1000)];
-        for (int i = 0; i < indexRequests.length; i++) {
-            indexRequests[i] = client().prepareIndex("test", "type", String.valueOf(i)).setSource("field", i);
-        }
-        indexRandom(true, indexRequests);
-
-
-        for (int i = 0; i < indexRequests.length; i++) {
-            GetResponse getResponse = client().prepareGet("test", "type", String.valueOf(i)).get();
-            assertThat(getResponse.isExists(), is(true));
-            assertThat(getResponse.getVersion(), equalTo(1l));
-            assertThat(getResponse.getId(), equalTo(String.valueOf(i)));
-        }
-        SearchResponse searchResponse = client().prepareSearch("test").setTypes("type")
-                .addSort("field", SortOrder.ASC)
-                .get();
-        assertHitCount(searchResponse, indexRequests.length);
-        for (int i = 0; i < searchResponse.getHits().getHits().length; i++) {
-            SearchHit searchHit = searchResponse.getHits().getAt(i);
-            assertThat(searchHit.id(), equalTo(String.valueOf(i)));
-            assertThat((long) searchHit.sortValues()[0], equalTo((long) i));
-        }
+        createIndex("test");
 
         // Everything is stable now, it is now time to simulate evil...
         // but first make sure we have no initializing shards and all is green
@@ -233,35 +201,91 @@ public boolean apply(Object input) {
         assertThat(applied, is(true));
         ensureStableCluster(2, nonIsolatedNode);
 
-        // Reads on the right side of the split must work
-        logger.info("verifying healthy part of cluster returns data");
-        searchResponse = client(nonIsolatedNode).prepareSearch("test").setTypes("type")
-                .addSort("field", SortOrder.ASC)
+        // Reads on the wrong side of the split are allowed
+        client(isolatedNode).prepareSearch("test").setTypes("type")
+                .setPreference("_only_local")
+                .get();
+        client(isolatedNode).preparePercolate().setDocumentType("type").setIndices("test")
+                .setPreference("_only_local").setSource("{\"doc\" : {}}")
+                .get();
+        client(isolatedNode).prepareCount("test").setTypes("type")
+                .setPreference("_only_local")
+                .get();
+        client(isolatedNode).prepareGet("test", "type", "0").setPreference("_only_local")
                 .get();
-        assertHitCount(searchResponse, indexRequests.length);
-        for (int i = 0; i < searchResponse.getHits().getHits().length; i++) {
-            SearchHit searchHit = searchResponse.getHits().getAt(i);
-            assertThat(searchHit.id(), equalTo(String.valueOf(i)));
-            assertThat((long) searchHit.sortValues()[0], equalTo((long) i));
-        }
 
-        // Reads on the wrong side of the split are partial
-        logger.info("verifying isolated node [{}] returns partial data", isolatedNode);
-        searchResponse = client(isolatedNode).prepareSearch("test").setTypes("type")
-                .addSort("field", SortOrder.ASC).setPreference("_only_local")
+        // Writes on the wrong side of the split are *not* allowed
+        executeBlockedApi(
+                client(isolatedNode).prepareIndex("test", "type", "0").setSource("{}").setTimeout("1s") // Fail quick, otherwise we wait 60 seconds.
+        );
+        executeBlockedApi(
+                client(isolatedNode).prepareUpdate("test", "type", "0").setDoc("{}").setTimeout("1s") // Fail quick, otherwise we wait 60 seconds.
+        );
+        networkPartition.stopDisrupting();
+
+        // Wait until the master node sees all 3 nodes again.
+        ensureStableCluster(3, new TimeValue(30000 + networkPartition.expectedTimeToHeal().millis()));
+
+        logger.info("Verify no master block with {} set to {}", DiscoverySettings.NO_MASTER_BLOCK, "all");
+        client().admin().cluster().prepareUpdateSettings()
+                .setTransientSettings(ImmutableSettings.builder().put(DiscoverySettings.NO_MASTER_BLOCK, "all"))
                 .get();
-        assertThat(searchResponse.getSuccessfulShards(), lessThan(searchResponse.getTotalShards()));
-        assertThat(searchResponse.getHits().totalHits(), lessThan((long) indexRequests.length));
+        networkPartition.startDisrupting();
 
-        logger.info("verifying writes on healthy cluster");
-        UpdateResponse updateResponse = client(nonIsolatedNode).prepareUpdate("test", "type", "0").setDoc("field2", 2).get();
-        assertThat(updateResponse.getVersion(), equalTo(2l));
+        logger.info("wait until elected master has removed [{}]", isolatedNode);
+        applied = awaitBusy(new Predicate<Object>() {
+            @Override
+            public boolean apply(Object input) {
+                return client(nonIsolatedNode).admin().cluster().prepareState().setLocal(true).get().getState().nodes().size() == 2;
+            }
+        }, 1, TimeUnit.MINUTES);
+        assertThat(applied, is(true));
 
+        // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
+        // continuously ping until network failures have been resolved. However,
+        // it may take a bit before the node detects it has been cut off from the elected master
+        logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
+        applied = awaitBusy(new Predicate<Object>() {
+            @Override
+            public boolean apply(Object input) {
+                ClusterState localClusterState = client(isolatedNode).admin().cluster().prepareState().setLocal(true).get().getState();
+                DiscoveryNodes localDiscoveryNodes = localClusterState.nodes();
+                logger.info("localDiscoveryNodes=" + localDiscoveryNodes.prettyPrint());
+                return localDiscoveryNodes.masterNode() == null;
+            }
+        }, 10, TimeUnit.SECONDS);
+        assertThat(applied, is(true));
+        ensureStableCluster(2, nonIsolatedNode);
+
+        // Now reads and writes on the wrong side of the split are *not* allowed
+        executeBlockedApi(
+                client(isolatedNode).prepareSearch("test").setTypes("type").setPreference("_only_local")
+        );
+        executeBlockedApi(
+                client(isolatedNode).preparePercolate().setDocumentType("type").setIndices("test").setPreference("_only_local").setSource("{\"doc\" : {}}")
+        );
+        executeBlockedApi(
+                client(isolatedNode).prepareCount("test").setTypes("type").setPreference("_only_local")
+        );
+        executeBlockedApi(
+                client(isolatedNode).prepareGet("test", "type", "0").setPreference("_only_local")
+        );
+        executeBlockedApi(
+                client(isolatedNode).prepareIndex("test", "type", "0").setSource("{}").setTimeout("1s") // Fail quick, otherwise we wait 60 seconds.
+        );
+        executeBlockedApi(
+                client(isolatedNode).prepareUpdate("test", "type", "0").setDoc("{}").setTimeout("1s") // Fail quick, otherwise we wait 60 seconds.
+        );
+        networkPartition.stopDisrupting();
+
+        // Wait until the master node sees all 3 nodes again.
+        ensureStableCluster(3, new TimeValue(30000 + networkPartition.expectedTimeToHeal().millis()));
+    }
+
+    private void executeBlockedApi(ActionRequestBuilder builder) {
         try {
-            logger.info("verifying writes on isolated [{}] fail", isolatedNode);
-            client(isolatedNode).prepareUpdate("test", "type", "0").setDoc("field2", 2)
-                    .setTimeout("1s") // Fail quick, otherwise we wait 60 seconds.
-                    .get();
+            logger.info("verifying request [{}] on isolated node fails", builder.getClass().getSimpleName());
+            builder.get(); // must throw a ClusterBlockException carrying the no-master block, otherwise fail() below trips
             fail();
         } catch (ClusterBlockException exception) {
             assertThat(exception.status(), equalTo(RestStatus.SERVICE_UNAVAILABLE));
@@ -269,37 +293,8 @@ public boolean apply(Object input) {
             ClusterBlock clusterBlock = exception.blocks().iterator().next();
             assertThat(clusterBlock.id(), equalTo(DiscoverySettings.NO_MASTER_BLOCK_ID));
         }
-
-        networkPartition.stopDisrupting();
-
-        // Wait until the master node sees all 3 nodes again.
-        ensureStableCluster(3, new TimeValue(30000 + networkPartition.expectedTimeToHeal().millis()));
-
-        logger.info("verifying all nodes return all data");
-        for (Client client : clients()) {
-            searchResponse = client.prepareSearch("test").setTypes("type")
-                    .addSort("field", SortOrder.ASC)
-                    .get();
-            for (int i = 0; i < searchResponse.getHits().getHits().length; i++) {
-                SearchHit searchHit = searchResponse.getHits().getAt(i);
-                assertThat(searchHit.id(), equalTo(String.valueOf(i)));
-                assertThat((long) searchHit.sortValues()[0], equalTo((long) i));
-            }
-
-            GetResponse getResponse = client.prepareGet("test", "type", "0").setPreference("_local").get();
-            assertThat(getResponse.isExists(), is(true));
-            assertThat(getResponse.getId(), equalTo("0"));
-            assertThat(getResponse.getVersion(), equalTo(2l));
-            for (int i = 1; i < indexRequests.length; i++) {
-                getResponse = client.prepareGet("test", "type", String.valueOf(i)).setPreference("_local").get();
-                assertThat(getResponse.isExists(), is(true));
-                assertThat(getResponse.getVersion(), equalTo(1l));
-                assertThat(getResponse.getId(), equalTo(String.valueOf(i)));
-            }
-        }
     }
 
-
     @Test
     @TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
     public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {

From 77dae631e169a4ef7ce7e18c01d68dfd6eea76a6 Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Tue, 1 Jul 2014 19:45:21 +0200
Subject: [PATCH 32/74] [TEST] Make sure get request is always local

---
 .../discovery/DiscoveryWithNetworkFailuresTests.java       | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index ab1f597b1f913..d5b2a81bd3f39 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -161,7 +161,12 @@ public void testVerifyApiBlocksDuringPartition() throws Exception {
         internalCluster().startNodesAsync(3, nodeSettings).get();
         // Wait until a 3 nodes are part of the cluster
         ensureStableCluster(3);
-        createIndex("test");
+
+        // Makes sure that the get request can be executed on each node locally:
+        assertAcked(prepareCreate("test").setSettings(ImmutableSettings.builder()
+                .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
+                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 2)
+        ));
 
         // Everything is stable now, it is now time to simulate evil...
         // but first make sure we have no initializing shards and all is green

From 5e5f8a9daf8c442d88c25d7fa9e8ac02f090df06 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Wed, 2 Jul 2014 12:00:49 +0200
Subject: [PATCH 33/74] Added java docs to all tests in
 DiscoveryWithNetworkFailuresTests

Changed testVerifyApiBlocksDuringPartition to verify the cluster blocks directly rather than rely on specific API rejections.
Did some cleaning while at it.
---
 .../cluster/block/ClusterBlocks.java          |   4 +
 .../discovery/DiscoverySettings.java          |   8 +-
 .../DiscoveryWithNetworkFailuresTests.java    | 247 +++++++-----------
 .../test/InternalTestCluster.java             |  16 +-
 .../test/disruption/NetworkPartition.java     |   2 +-
 5 files changed, 119 insertions(+), 158 deletions(-)

diff --git a/src/main/java/org/elasticsearch/cluster/block/ClusterBlocks.java b/src/main/java/org/elasticsearch/cluster/block/ClusterBlocks.java
index e53cd24af8adb..2aa6e4e014b79 100644
--- a/src/main/java/org/elasticsearch/cluster/block/ClusterBlocks.java
+++ b/src/main/java/org/elasticsearch/cluster/block/ClusterBlocks.java
@@ -117,6 +117,10 @@ public boolean hasGlobalBlock(int blockId) {
         return false;
     }
 
+    public boolean hasGlobalBlock(ClusterBlockLevel level) {
+        return !global(level).isEmpty();
+    }
+
     /**
      * Is there a global block with the provided status?
      */
diff --git a/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java b/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
index 8304893f0ba0a..dda300424ad7d 100644
--- a/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
+++ b/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
@@ -43,8 +43,8 @@ public class DiscoverySettings extends AbstractComponent {
     public static final String DEFAULT_NO_MASTER_BLOCK = "write";
     public final static int NO_MASTER_BLOCK_ID = 2;
 
-    private final static ClusterBlock ALL = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, true, RestStatus.SERVICE_UNAVAILABLE, ClusterBlockLevel.ALL);
-    private final static ClusterBlock WRITE = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, false, RestStatus.SERVICE_UNAVAILABLE, EnumSet.of(ClusterBlockLevel.WRITE, ClusterBlockLevel.METADATA));
+    public final static ClusterBlock NO_MASTER_BLOCK_ALL = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, true, RestStatus.SERVICE_UNAVAILABLE, ClusterBlockLevel.ALL);
+    public final static ClusterBlock NO_MASTER_BLOCK_WRITES = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, false, RestStatus.SERVICE_UNAVAILABLE, EnumSet.of(ClusterBlockLevel.WRITE, ClusterBlockLevel.METADATA));
 
     private volatile ClusterBlock noMasterBlock;
     private volatile TimeValue publishTimeout = DEFAULT_PUBLISH_TIMEOUT;
@@ -90,9 +90,9 @@ public void onRefreshSettings(Settings settings) {
 
     private ClusterBlock parseNoMasterBlock(String value) {
         if ("all".equals(value)) {
-            return ALL;
+            return NO_MASTER_BLOCK_ALL;
         } else if ("write".equals(value)) {
-            return WRITE;
+            return NO_MASTER_BLOCK_WRITES;
         } else {
             throw new ElasticsearchIllegalArgumentException("invalid master block [" + value + "]");
         }
diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index d5b2a81bd3f39..5dbe4e203db14 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -22,24 +22,21 @@
 import com.google.common.base.Predicate;
 import org.apache.lucene.util.LuceneTestCase;
 import org.elasticsearch.ElasticsearchException;
-import org.elasticsearch.action.ActionRequestBuilder;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
 import org.elasticsearch.action.get.GetResponse;
 import org.elasticsearch.action.index.IndexResponse;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.ClusterState;
-import org.elasticsearch.cluster.block.ClusterBlock;
-import org.elasticsearch.cluster.block.ClusterBlockException;
+import org.elasticsearch.cluster.block.ClusterBlockLevel;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
-import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.routing.operation.hash.djb.DjbHashFunction;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.Priority;
+import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.TimeValue;
-import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
 import org.elasticsearch.test.InternalTestCluster;
 import org.elasticsearch.test.disruption.*;
@@ -66,6 +63,7 @@
 /**
  */
 @ClusterScope(scope = Scope.TEST, numDataNodes = 0, transportClientRatio = 0)
+@TestLogging("discovery.zen:TRACE")
 public class DiscoveryWithNetworkFailuresTests extends ElasticsearchIntegrationTest {
 
     private static final Settings nodeSettings = ImmutableSettings.settingsBuilder()
@@ -87,8 +85,12 @@ protected int numberOfReplicas() {
         return 1;
     }
 
+    /**
+     * Test that no split brain occurs under partial network partition. See https://github.com/elasticsearch/elasticsearch/issues/2488
+     *
+     * @throws Exception
+     */
     @Test
-    @TestLogging("discovery.zen:TRACE")
     public void failWithMinimumMasterNodesConfigured() throws Exception {
 
         List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
@@ -97,50 +99,37 @@ public void failWithMinimumMasterNodesConfigured() throws Exception {
         ensureStableCluster(3);
 
         // Figure out what is the elected master node
-        DiscoveryNode masterDiscoNode = findMasterNode(nodes);
-        logger.info("---> legit elected master node=" + masterDiscoNode);
-        final Client masterClient = internalCluster().masterClient();
-
-        // Everything is stable now, it is now time to simulate evil...
+        final String masterNode = internalCluster().getMasterName();
+        logger.info("---> legit elected master node=" + masterNode);
 
         // Pick a node that isn't the elected master.
-        String unluckyNode = null;
-        for (String node : nodes) {
-            if (!node.equals(masterDiscoNode.getName())) {
-                unluckyNode = node;
-            }
-        }
-        assert unluckyNode != null;
+        Set<String> nonMasters = new HashSet<>(nodes);
+        nonMasters.remove(masterNode);
+        final String unluckyNode = randomFrom(nonMasters.toArray(Strings.EMPTY_ARRAY));
+
 
         // Simulate a network issue between the unlucky node and elected master node in both directions.
 
-        NetworkDisconnectPartition networkDisconnect = new NetworkDisconnectPartition(masterDiscoNode.name(), unluckyNode, getRandom());
+        NetworkDisconnectPartition networkDisconnect = new NetworkDisconnectPartition(masterNode, unluckyNode, getRandom());
         setDisruptionScheme(networkDisconnect);
         networkDisconnect.startDisrupting();
 
         // Wait until elected master has removed that the unlucky node...
-        boolean applied = awaitBusy(new Predicate<Object>() {
-            @Override
-            public boolean apply(Object input) {
-                return masterClient.admin().cluster().prepareState().setLocal(true).get().getState().nodes().size() == 2;
-            }
-        }, 1, TimeUnit.MINUTES);
-        assertThat(applied, is(true));
+        ensureStableCluster(2, masterNode);
 
         // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
         // continuously ping until network failures have been resolved. However
-        final Client isolatedNodeClient = internalCluster().client(unluckyNode);
         // It may a take a bit before the node detects it has been cut off from the elected master
-        applied = awaitBusy(new Predicate<Object>() {
+        boolean success = awaitBusy(new Predicate<Object>() {
             @Override
             public boolean apply(Object input) {
-                ClusterState localClusterState = isolatedNodeClient.admin().cluster().prepareState().setLocal(true).get().getState();
+                ClusterState localClusterState = getNodeClusterState(unluckyNode);
                 DiscoveryNodes localDiscoveryNodes = localClusterState.nodes();
                 logger.info("localDiscoveryNodes=" + localDiscoveryNodes.prettyPrint());
                 return localDiscoveryNodes.masterNode() == null;
             }
         }, 10, TimeUnit.SECONDS);
-        assertThat(applied, is(true));
+        assertThat(success, is(true));
 
         networkDisconnect.stopDisrupting();
 
@@ -148,14 +137,17 @@ public boolean apply(Object input) {
         ensureStableCluster(3);
 
         for (String node : nodes) {
-            ClusterState state = internalCluster().client(node).admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
+            ClusterState state = getNodeClusterState(node);
             assertThat(state.nodes().size(), equalTo(3));
             // The elected master shouldn't have changed, since the unlucky node never could have elected himself as
             // master since m_m_n of 2 could never be satisfied.
-            assertThat(state.nodes().masterNode(), equalTo(masterDiscoNode));
+            assertThat(state.nodes().masterNode().name(), equalTo(masterNode));
         }
     }
 
+    /**
+     * Verify that the proper block is applied when nodes loose their master
+     */
     @Test
     public void testVerifyApiBlocksDuringPartition() throws Exception {
         internalCluster().startNodesAsync(3, nodeSettings).get();
@@ -164,8 +156,8 @@ public void testVerifyApiBlocksDuringPartition() throws Exception {
 
         // Makes sure that the get request can be executed on each node locally:
         assertAcked(prepareCreate("test").setSettings(ImmutableSettings.builder()
-                .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
-                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 2)
+                        .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
+                        .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 2)
         ));
 
         // Everything is stable now, it is now time to simulate evil...
@@ -176,56 +168,58 @@ public void testVerifyApiBlocksDuringPartition() throws Exception {
         NetworkPartition networkPartition = addRandomPartition();
 
         final String isolatedNode = networkPartition.getMinoritySide().get(0);
-        final String nonIsolatedNode = networkPartition.getMjaoritySide().get(0);
+        final String nonIsolatedNode = networkPartition.getMajoritySide().get(0);
 
         // Simulate a network issue between the unlucky node and the rest of the cluster.
         networkPartition.startDisrupting();
 
-        logger.info("wait until elected master has removed [{}]", isolatedNode);
-        boolean applied = awaitBusy(new Predicate<Object>() {
-            @Override
-            public boolean apply(Object input) {
-                return client(nonIsolatedNode).admin().cluster().prepareState().setLocal(true).get().getState().nodes().size() == 2;
-            }
-        }, 1, TimeUnit.MINUTES);
-        assertThat(applied, is(true));
 
         // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
         // continuously ping until network failures have been resolved. However
         // It may a take a bit before the node detects it has been cut off from the elected master
         logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
-        applied = awaitBusy(new Predicate<Object>() {
+        final ClusterState[] lastState = new ClusterState[1];
+        boolean success = awaitBusy(new Predicate<Object>() {
             @Override
             public boolean apply(Object input) {
-                ClusterState localClusterState = client(isolatedNode).admin().cluster().prepareState().setLocal(true).get().getState();
-                DiscoveryNodes localDiscoveryNodes = localClusterState.nodes();
+                lastState[0] = getNodeClusterState(isolatedNode);
+                DiscoveryNodes localDiscoveryNodes = lastState[0].nodes();
                 logger.info("localDiscoveryNodes=" + localDiscoveryNodes.prettyPrint());
-                return localDiscoveryNodes.masterNode() == null;
+                if (localDiscoveryNodes.masterNode() == null) {
+                    return false;
+                }
+                for (ClusterBlockLevel level : DiscoverySettings.NO_MASTER_BLOCK_WRITES.levels()) {
+                    if (lastState[0].getBlocks().hasGlobalBlock(level)) {
+                        return false;
+                    }
+                }
+                return true;
             }
         }, 10, TimeUnit.SECONDS);
-        assertThat(applied, is(true));
+        if (!success) {
+            fail("isolated node still has a master or the wrong blocks. Cluster state:\n" + lastState[0].prettyPrint());
+        }
+
+
+        logger.info("wait until elected master has been removed and a new 2 node cluster was from (via [{}])", isolatedNode);
         ensureStableCluster(2, nonIsolatedNode);
 
-        // Reads on the wrong side of the split are allowed
-        client(isolatedNode).prepareSearch("test").setTypes("type")
-                .setPreference("_only_local")
-                .get();
-        client(isolatedNode).preparePercolate().setDocumentType("type").setIndices("test")
-                .setPreference("_only_local").setSource("{\"doc\" : {}}")
-                .get();
-        client(isolatedNode).prepareCount("test").setTypes("type")
-                .setPreference("_only_local")
-                .get();
-        client(isolatedNode).prepareGet("test", "type", "0").setPreference("_only_local")
-                .get();
+        for (String node : networkPartition.getMajoritySide()) {
+            ClusterState nodeState = getNodeClusterState(node);
+            success = true;
+            if (nodeState.nodes().getMasterNode() == null) {
+                success = false;
+            }
+            if (!nodeState.blocks().global().isEmpty()) {
+                success = false;
+            }
+            if (!success) {
+                fail("node [" + node + "] has no master or has blocks, despite of being on the right side of the partition. State dump:\n"
+                        + nodeState.prettyPrint());
+            }
+        }
+
 
-        // Writes on the wrong side of the split are *not* allowed
-        executeBlockedApi(
-                client(isolatedNode).prepareIndex("test", "type", "0").setSource("{}").setTimeout("1s") // Fail quick, otherwise we wait 60 seconds.
-        );
-        executeBlockedApi(
-                client(isolatedNode).prepareUpdate("test", "type", "0").setDoc("{}").setTimeout("1s") // Fail quick, otherwise we wait 60 seconds.
-        );
         networkPartition.stopDisrupting();
 
         // Wait until the master node sees all 3 nodes again.
@@ -237,69 +231,37 @@ public boolean apply(Object input) {
                 .get();
         networkPartition.startDisrupting();
 
-        logger.info("wait until elected master has removed [{}]", isolatedNode);
-        applied = awaitBusy(new Predicate<Object>() {
-            @Override
-            public boolean apply(Object input) {
-                return client(nonIsolatedNode).admin().cluster().prepareState().setLocal(true).get().getState().nodes().size() == 2;
-            }
-        }, 1, TimeUnit.MINUTES);
-        assertThat(applied, is(true));
 
         // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
         // continuously ping until network failures have been resolved. However
         // It may a take a bit before the node detects it has been cut off from the elected master
         logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
-        applied = awaitBusy(new Predicate<Object>() {
+        success = awaitBusy(new Predicate<Object>() {
             @Override
             public boolean apply(Object input) {
-                ClusterState localClusterState = client(isolatedNode).admin().cluster().prepareState().setLocal(true).get().getState();
-                DiscoveryNodes localDiscoveryNodes = localClusterState.nodes();
+                lastState[0] = getNodeClusterState(isolatedNode);
+                DiscoveryNodes localDiscoveryNodes = lastState[0].nodes();
                 logger.info("localDiscoveryNodes=" + localDiscoveryNodes.prettyPrint());
-                return localDiscoveryNodes.masterNode() == null;
+                if (localDiscoveryNodes.masterNode() == null) {
+                    return false;
+                }
+                for (ClusterBlockLevel level : DiscoverySettings.NO_MASTER_BLOCK_ALL.levels()) {
+                    if (lastState[0].getBlocks().hasGlobalBlock(level)) {
+                        return false;
+                    }
+                }
+                return true;
             }
         }, 10, TimeUnit.SECONDS);
-        assertThat(applied, is(true));
-        ensureStableCluster(2, nonIsolatedNode);
-
-        // Now reads and writes on the wrong side of the split are allowed
-        executeBlockedApi(
-                client(isolatedNode).prepareSearch("test").setTypes("type").setPreference("_only_local")
-        );
-        executeBlockedApi(
-                client(isolatedNode).preparePercolate().setDocumentType("type").setIndices("test").setPreference("_only_local").setSource("{\"doc\" : {}}")
-        );
-        executeBlockedApi(
-                client(isolatedNode).prepareCount("test").setTypes("type").setPreference("_only_local")
-        );
-        executeBlockedApi(
-                client(isolatedNode).prepareGet("test", "type", "0").setPreference("_only_local")
-        );
-        executeBlockedApi(
-                client(isolatedNode).prepareIndex("test", "type", "0").setSource("{}").setTimeout("1s") // Fail quick, otherwise we wait 60 seconds.
-        );
-        executeBlockedApi(
-                client(isolatedNode).prepareUpdate("test", "type", "0").setDoc("{}").setTimeout("1s") // Fail quick, otherwise we wait 60 seconds.
-        );
-        networkPartition.stopDisrupting();
-
-        // Wait until the master node sees all 3 nodes again.
-        ensureStableCluster(3, new TimeValue(30000 + networkPartition.expectedTimeToHeal().millis()));
-    }
-
-    private void executeBlockedApi(ActionRequestBuilder builder) {
-        try {
-            logger.info("verifying request[{}] on isolated [{}] and fail", builder.getClass().getSimpleName());
-            builder.get();
-            fail();
-        } catch (ClusterBlockException exception) {
-            assertThat(exception.status(), equalTo(RestStatus.SERVICE_UNAVAILABLE));
-            assertThat(exception.blocks().size(), equalTo(1));
-            ClusterBlock clusterBlock = exception.blocks().iterator().next();
-            assertThat(clusterBlock.id(), equalTo(DiscoverySettings.NO_MASTER_BLOCK_ID));
+        if (!success) {
+            fail("isolated node still has a master or the wrong blocks (expected 'all' block). Cluster state:\n" + lastState[0].prettyPrint());
         }
     }
 
+    /**
+     * This test isolates the master from rest of the cluster, waits for a new master to be elected, restores the partition
+     * and verifies that all node agree on the new cluster state
+     */
     @Test
     @TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
     public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
@@ -313,24 +275,19 @@ public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
                 ));
 
         ensureGreen();
-        String isolatedNode = findMasterNode(nodes).name();
-        String nonIsolatedNode = null;
-        for (String node : nodes) {
-            if (!node.equals(isolatedNode)) {
-                nonIsolatedNode = node;
-                break;
-            }
-        }
-        ServiceDisruptionScheme scheme = addRandomIsolation(isolatedNode);
-        scheme.startDisrupting();
+        String isolatedNode = internalCluster().getMasterName();
+        NetworkPartition networkPartition = addRandomIsolation(isolatedNode);
+        networkPartition.startDisrupting();
+
+        String nonIsolatedNode = networkPartition.getMajoritySide().get(0);
 
         // make sure cluster reforms
         ensureStableCluster(2, nonIsolatedNode);
 
         // restore isolation
-        scheme.stopDisrupting();
+        networkPartition.stopDisrupting();
 
-        ensureStableCluster(3, new TimeValue(30000 + scheme.expectedTimeToHeal().millis()));
+        ensureStableCluster(3, new TimeValue(30000 + networkPartition.expectedTimeToHeal().millis()));
 
         logger.info("issue a reroute");
         // trigger a reroute now, instead of waiting for the background reroute of RerouteService
@@ -341,7 +298,7 @@ public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
         // verify all cluster states are the same
         ClusterState state = null;
         for (String node : nodes) {
-            ClusterState nodeState = client(node).admin().cluster().prepareState().setLocal(true).get().getState();
+            ClusterState nodeState = getNodeClusterState(node);
             if (state == null) {
                 state = nodeState;
                 continue;
@@ -364,6 +321,10 @@ public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
         }
     }
 
+    /**
+     * Test the we do not loose document whose indexing request was successful, under a randomly selected disruption scheme
+     * We also collect & report the type of indexing failures that occur.
+     */
     @Test
     @LuceneTestCase.AwaitsFix(bugUrl = "needs some more work to stabilize")
     @TestLogging("action.index:TRACE,action.get:TRACE,discovery:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
@@ -497,6 +458,12 @@ public void run() {
         }
     }
 
+    /**
+     * Test that a document which is indexed on the majority side of a partition, is available from the minory side,
+     * once the partition is healed
+     *
+     * @throws Exception
+     */
     @Test
     @TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
     public void testRejoinDocumentExistsInAllShardCopies() throws Exception {
@@ -592,22 +559,6 @@ private ServiceDisruptionScheme addRandomDisruptionScheme() {
         return list.get(0);
     }
 
-    private DiscoveryNode findMasterNode(List<String> nodes) {
-        DiscoveryNode masterDiscoNode = null;
-        for (String node : nodes) {
-            ClusterState state = internalCluster().client(node).admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
-            assertThat(state.nodes().size(), equalTo(3));
-            if (masterDiscoNode == null) {
-                masterDiscoNode = state.nodes().masterNode();
-            } else {
-                assertThat(state.nodes().masterNode(), equalTo(masterDiscoNode));
-            }
-        }
-        assert masterDiscoNode != null;
-        return masterDiscoNode;
-    }
-
-
     private void ensureStableCluster(int nodeCount) {
         ensureStableCluster(nodeCount, TimeValue.timeValueSeconds(30), null);
     }
@@ -631,4 +582,8 @@ private void ensureStableCluster(int nodeCount, TimeValue timeValue, @Nullable S
         assertThat(clusterHealthResponse.isTimedOut(), is(false));
     }
 
+    private ClusterState getNodeClusterState(String node) {
+        return client(node).admin().cluster().prepareState().setLocal(true).get().getState();
+    }
+
 }
diff --git a/src/test/java/org/elasticsearch/test/InternalTestCluster.java b/src/test/java/org/elasticsearch/test/InternalTestCluster.java
index 92646ff390045..88065424b8d69 100644
--- a/src/test/java/org/elasticsearch/test/InternalTestCluster.java
+++ b/src/test/java/org/elasticsearch/test/InternalTestCluster.java
@@ -222,7 +222,7 @@ public InternalTestCluster(long clusterSeed, int minNumDataNodes, int maxNumData
                 this.numSharedClientNodes = numClientNodes;
             }
         }
-        assert this.numSharedClientNodes >=0;
+        assert this.numSharedClientNodes >= 0;
 
         this.enableRandomBenchNodes = enableRandomBenchNodes;
 
@@ -247,7 +247,7 @@ public InternalTestCluster(long clusterSeed, int minNumDataNodes, int maxNumData
             if (numOfDataPaths > 0) {
                 StringBuilder dataPath = new StringBuilder();
                 for (int i = 0; i < numOfDataPaths; i++) {
-                    dataPath.append(new File("data/d"+i).getAbsolutePath()).append(',');
+                    dataPath.append(new File("data/d" + i).getAbsolutePath()).append(',');
                 }
                 builder.put("path.data", dataPath.toString());
             }
@@ -270,7 +270,7 @@ public InternalTestCluster(long clusterSeed, int minNumDataNodes, int maxNumData
 
     public static String nodeMode() {
         Builder builder = ImmutableSettings.builder();
-        if (Strings.isEmpty(System.getProperty("es.node.mode"))&& Strings.isEmpty(System.getProperty("es.node.local"))) {
+        if (Strings.isEmpty(System.getProperty("es.node.mode")) && Strings.isEmpty(System.getProperty("es.node.local"))) {
             return "local"; // default if nothing is specified
         }
         if (Strings.hasLength(System.getProperty("es.node.mode"))) {
@@ -327,7 +327,7 @@ private static Settings getRandomNodeSettings(long seed) {
                 //.put("index.store.type", random.nextInt(10) == 0 ? MockRamIndexStoreModule.class.getName() : MockFSIndexStoreModule.class.getName())
                 // decrease the routing schedule so new nodes will be added quickly - some random value between 30 and 80 ms
                 .put("cluster.routing.schedule", (30 + random.nextInt(50)) + "ms")
-                // default to non gateway
+                        // default to non gateway
                 .put("gateway.type", "none")
                 .put(SETTING_CLUSTER_NODE_SEED, seed);
         if (ENABLE_MOCK_MODULES && usually(random)) {
@@ -351,7 +351,7 @@ private static Settings getRandomNodeSettings(long seed) {
             builder.put(SearchService.KEEPALIVE_INTERVAL_KEY, TimeValue.timeValueSeconds(10 + random.nextInt(5 * 60)));
         }
         if (random.nextBoolean()) { // sometimes set a
-            builder.put(SearchService.DEFAUTL_KEEPALIVE_KEY, TimeValue.timeValueSeconds(100 + random.nextInt(5*60)));
+            builder.put(SearchService.DEFAUTL_KEEPALIVE_KEY, TimeValue.timeValueSeconds(100 + random.nextInt(5 * 60)));
         }
         if (random.nextBoolean()) {
             // change threadpool types to make sure we don't have components that rely on the type of thread pools
@@ -782,7 +782,6 @@ public void close() throws IOException {
 
     public static final String TRANSPORT_CLIENT_PREFIX = "transport_client_";
     static class TransportClientFactory {
-
         private static TransportClientFactory NO_SNIFF_CLIENT_FACTORY = new TransportClientFactory(false, ImmutableSettings.EMPTY);
         private static TransportClientFactory SNIFF_CLIENT_FACTORY = new TransportClientFactory(true, ImmutableSettings.EMPTY);
 
@@ -1229,7 +1228,10 @@ public void fullRestart(RestartCallback function) throws Exception {
     }
 
 
-    private String getMasterName() {
+    /**
+     * get the name of the current master node
+     */
+    public String getMasterName() {
         try {
             ClusterState state = client().admin().cluster().prepareState().execute().actionGet().getState();
             return state.nodes().masterNode().name();
diff --git a/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java b/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java
index f9b05a1c1c86c..7c9b14ad36805 100644
--- a/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java
+++ b/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java
@@ -73,7 +73,7 @@ public List<String> getNodesSideTwo() {
         return ImmutableList.copyOf(nodesSideTwo);
     }
 
-    public List<String> getMjaoritySide() {
+    public List<String> getMajoritySide() {
         if (nodesSideOne.size() >= nodesSideTwo.size()) {
             return getNodesSideOne();
         } else {

From e897dccb5232306e982a57cf4e0eaa283e94df50 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Wed, 2 Jul 2014 21:39:12 +0200
Subject: [PATCH 34/74] [Tests] improved automatic disruption healing after
 tests

---
 .../test/InternalTestCluster.java             | 29 ++++++++++---------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/src/test/java/org/elasticsearch/test/InternalTestCluster.java b/src/test/java/org/elasticsearch/test/InternalTestCluster.java
index 88065424b8d69..cdc95022591de 100644
--- a/src/test/java/org/elasticsearch/test/InternalTestCluster.java
+++ b/src/test/java/org/elasticsearch/test/InternalTestCluster.java
@@ -835,19 +835,6 @@ public synchronized void beforeTest(Random random, double transportClientRatio)
     }
 
     private synchronized void reset(boolean wipeData) throws IOException {
-        TimeValue expectedHealingTime = activeDisruptionScheme != null ? activeDisruptionScheme.expectedTimeToHeal() : null;
-        clearDisruptionScheme();
-        if (expectedHealingTime != null && expectedHealingTime.millis() > 0) {
-            try {
-                Thread.sleep(expectedHealingTime.millis());
-            } catch (InterruptedException e) {
-                Thread.currentThread().interrupt();
-            }
-        }
-        randomlyResetClients();
-        if (wipeData) {
-            wipeDataDirectories();
-        }
         // clear all rules for mock transport services
         for (NodeAndClient nodeAndClient : nodes.values()) {
             TransportService transportService = nodeAndClient.node.injector().getInstance(TransportService.class);
@@ -855,6 +842,22 @@ private synchronized void reset(boolean wipeData) throws IOException {
                 ((MockTransportService) transportService).clearAllRules();
             }
         }
+        if (activeDisruptionScheme != null) {
+            TimeValue expectedHealingTime = activeDisruptionScheme.expectedTimeToHeal();
+            clearDisruptionScheme();
+            if (expectedHealingTime != null && expectedHealingTime.millis() > 0) {
+                try {
+                    Thread.sleep(expectedHealingTime.millis());
+                } catch (InterruptedException e) {
+                    Thread.currentThread().interrupt();
+                }
+            }
+            assert !client().admin().cluster().prepareHealth().setWaitForNodes("" + nodes.size()).get().isTimedOut() : "cluster failed to form after disruption was healed";
+        }
+        randomlyResetClients();
+        if (wipeData) {
+            wipeDataDirectories();
+        }
         if (nextNodeId.get() == sharedNodesSeeds.length && nodes.size() == sharedNodesSeeds.length) {
             logger.debug("Cluster hasn't changed - moving out - nodes: [{}] nextNodeId: [{}] numSharedNodes: [{}]", nodes.keySet(), nextNodeId.get(), sharedNodesSeeds.length);
             return;

From d99ca806cb61a917f0974c88e1198c04b71f0b6e Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Thu, 3 Jul 2014 13:37:46 +0200
Subject: [PATCH 35/74] [TEST] Properly clear the disruption schemes after test
 completed.

---
 .../test/InternalTestCluster.java             | 28 +++++++++++--------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/test/java/org/elasticsearch/test/InternalTestCluster.java b/src/test/java/org/elasticsearch/test/InternalTestCluster.java
index cdc95022591de..500df9541efaf 100644
--- a/src/test/java/org/elasticsearch/test/InternalTestCluster.java
+++ b/src/test/java/org/elasticsearch/test/InternalTestCluster.java
@@ -108,6 +108,7 @@
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoTimeout;
 import static org.hamcrest.Matchers.equalTo;
 import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertFalse;
 
 /**
  * InternalTestCluster manages a set of JVM private nodes and allows convenient access to them.
@@ -842,18 +843,6 @@ private synchronized void reset(boolean wipeData) throws IOException {
                 ((MockTransportService) transportService).clearAllRules();
             }
         }
-        if (activeDisruptionScheme != null) {
-            TimeValue expectedHealingTime = activeDisruptionScheme.expectedTimeToHeal();
-            clearDisruptionScheme();
-            if (expectedHealingTime != null && expectedHealingTime.millis() > 0) {
-                try {
-                    Thread.sleep(expectedHealingTime.millis());
-                } catch (InterruptedException e) {
-                    Thread.currentThread().interrupt();
-                }
-            }
-            assert !client().admin().cluster().prepareHealth().setWaitForNodes("" + nodes.size()).get().isTimedOut() : "cluster failed to form after disruption was healed";
-        }
         randomlyResetClients();
         if (wipeData) {
             wipeDataDirectories();
@@ -1421,7 +1410,22 @@ public void setDisruptionScheme(ServiceDisruptionScheme scheme) {
 
     public void clearDisruptionScheme() {
         if (activeDisruptionScheme != null) {
+            TimeValue expectedHealingTime = activeDisruptionScheme.expectedTimeToHeal();
+            logger.info("Clearing active scheme {}, expected healing time {}", activeDisruptionScheme, expectedHealingTime);
             activeDisruptionScheme.removeFromCluster(this);
+            // We don't what scheme is picked, certain schemes don't partition the cluster, but process slow, so we need
+            // to to sleep, cluster health alone doesn't verify if these schemes have been cleared.
+            if (expectedHealingTime != null && expectedHealingTime.millis() > 0) {
+                try {
+                    Thread.sleep(expectedHealingTime.millis());
+                } catch (InterruptedException e) {
+                    Thread.currentThread().interrupt();
+                }
+            }
+            assertFalse("cluster failed to form after disruption was healed", client().admin().cluster().prepareHealth()
+                    .setWaitForNodes("" + nodes.size())
+                    .setWaitForRelocatingShards(0)
+                    .get().isTimedOut());
         }
         activeDisruptionScheme = null;
     }

From 48c7da1fd43bebcc6d73cee634f86bbf03ba0859 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Sun, 6 Jul 2014 10:21:02 +0200
Subject: [PATCH 36/74] [Test] testVerifyApiBlocksDuringPartition - wait for
 stable cluster after partition

---
 .../discovery/DiscoveryWithNetworkFailuresTests.java      | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index 5dbe4e203db14..8310f29101113 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -149,6 +149,7 @@ public boolean apply(Object input) {
      * Verify that the proper block is applied when nodes loose their master
      */
     @Test
+    @TestLogging(value = "cluster.service:TRACE,indices.recovery:TRACE")
     public void testVerifyApiBlocksDuringPartition() throws Exception {
         internalCluster().startNodesAsync(3, nodeSettings).get();
         // Wait until a 3 nodes are part of the cluster
@@ -229,6 +230,7 @@ public boolean apply(Object input) {
         client().admin().cluster().prepareUpdateSettings()
                 .setTransientSettings(ImmutableSettings.builder().put(DiscoverySettings.NO_MASTER_BLOCK, "all"))
                 .get();
+
         networkPartition.startDisrupting();
 
 
@@ -256,6 +258,12 @@ public boolean apply(Object input) {
         if (!success) {
             fail("isolated node still has a master or the wrong blocks (expected 'all' block). Cluster state:\n" + lastState[0].prettyPrint());
         }
+
+        // make sure we have stable cluster & cross partition recoveries are canceled by the removal of the missing node
+        // the unresponsive partition causes recoveries to only time out after 15m (default) and these will cause
+        // the test to fail due to unfreed resources
+        ensureStableCluster(2, nonIsolatedNode);
+
     }
 
     /**

From 5302a53145a62ec2add7bdab095a92074819266a Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Thu, 3 Jul 2014 12:24:04 +0200
Subject: [PATCH 37/74] [Discovery] immediately start Master|Node fault
 detection pinging

After a node joins the cluster, it starts pinging the master to verify its health. Before, the cluster join request was processed async and we had to give it some time to complete. With #6480 we changed this to wait for the join process to complete on the master. We can therefore start pinging immediately for fast detection of failures. A similar change can be made to the Node fault detection from the master side.

Closes #6706
---
 .../discovery/zen/fd/MasterFaultDetection.java            | 8 +++++---
 .../discovery/zen/fd/NodesFaultDetection.java             | 6 ++++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java b/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
index 1a1fe2cecee4e..b2a17165b6a1a 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
@@ -155,8 +155,9 @@ private void innerStart(final DiscoveryNode masterNode) {
             masterPinger.stop();
         }
         this.masterPinger = new MasterPinger();
-        // start the ping process
-        threadPool.schedule(pingInterval, ThreadPool.Names.SAME, masterPinger);
+
+        // we use schedule with a 0 time value to run the pinger on the pool as it will run on later
+        threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.SAME, masterPinger);
     }
 
     public void stop(String reason) {
@@ -200,7 +201,8 @@ private void handleTransportDisconnect(DiscoveryNode node) {
                         masterPinger.stop();
                     }
                     this.masterPinger = new MasterPinger();
-                    threadPool.schedule(pingInterval, ThreadPool.Names.SAME, masterPinger);
+                    // we use schedule with a 0 time value to run the pinger on the pool as it will run on later
+                    threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.SAME, masterPinger);
                 } catch (Exception e) {
                     logger.trace("[master] [{}] transport disconnected (with verified connect)", masterNode);
                     notifyMasterFailure(masterNode, "transport disconnected (with verified connect)");
diff --git a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
index 877cd2fa941ee..b808e080f2103 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
@@ -121,7 +121,8 @@ public void updateNodes(DiscoveryNodes nodes) {
             }
             if (!nodesFD.containsKey(newNode)) {
                 nodesFD.put(newNode, new NodeFD());
-                threadPool.schedule(pingInterval, ThreadPool.Names.SAME, new SendPingRequest(newNode));
+                // we use schedule with a 0 time value to run the pinger on the pool as it will run on later
+                threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.SAME, new SendPingRequest(newNode));
             }
         }
         for (DiscoveryNode removedNode : delta.removedNodes()) {
@@ -167,7 +168,8 @@ private void handleTransportDisconnect(DiscoveryNode node) {
             try {
                 transportService.connectToNode(node);
                 nodesFD.put(node, new NodeFD());
-                threadPool.schedule(pingInterval, ThreadPool.Names.SAME, new SendPingRequest(node));
+                // we use schedule with a 0 time value to run the pinger on the pool as it will run on later
+                threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.SAME, new SendPingRequest(node));
             } catch (Exception e) {
                 logger.trace("[node  ] [{}] transport disconnected (with verified connect)", node);
                 notifyNodeFailure(node, "transport disconnected (with verified connect)");

From 3586e38c409f47004fc2a3882e5a9f956beace84 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Wed, 9 Jul 2014 11:40:28 +0200
Subject: [PATCH 38/74] [Discovery] Start master fault detection after
 pingInterval

This is to allow the master election to complete on the chosen master.

 Relates to #6706
---
 .../elasticsearch/discovery/zen/fd/MasterFaultDetection.java  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java b/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
index b2a17165b6a1a..b601884002cd4 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
@@ -156,8 +156,8 @@ private void innerStart(final DiscoveryNode masterNode) {
         }
         this.masterPinger = new MasterPinger();
 
-        // we use schedule with a 0 time value to run the pinger on the pool as it will run on later
-        threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.SAME, masterPinger);
+        // we start pinging slightly later to allow the chosen master to complete it's own master election
+        threadPool.schedule(pingInterval, ThreadPool.Names.SAME, masterPinger);
     }
 
     public void stop(String reason) {

From 522d4afe0ca6026f9a81f00497d62b41c9c4b272 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Thu, 10 Jul 2014 18:50:46 +0200
Subject: [PATCH 39/74] [Tests] Use local gateway

This is important for proper primary allocation decisions
---
 .../discovery/DiscoveryWithNetworkFailuresTests.java             | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index 8310f29101113..866f1a74fa93f 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -67,6 +67,7 @@
 public class DiscoveryWithNetworkFailuresTests extends ElasticsearchIntegrationTest {
 
     private static final Settings nodeSettings = ImmutableSettings.settingsBuilder()
+            .put("gateway.type", "local")
             .put("discovery.type", "zen") // <-- To override the local setting if set externally
             .put("discovery.zen.fd.ping_timeout", "1s") // <-- for hitting simulated network failures quickly
             .put("discovery.zen.fd.ping_retries", "1") // <-- for hitting simulated network failures quickly

From 7b6e194923aadf3172b7e9185264b3be7c1eed02 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Thu, 10 Jul 2014 20:41:17 +0200
Subject: [PATCH 40/74] [Tests] Don't log about restoring a partition if the
 partition is not active.

---
 .../elasticsearch/test/disruption/NetworkPartition.java    | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java b/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java
index 7c9b14ad36805..8206fafef4eb5 100644
--- a/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java
+++ b/src/test/java/org/elasticsearch/test/disruption/NetworkPartition.java
@@ -41,6 +41,7 @@ public abstract class NetworkPartition implements ServiceDisruptionScheme {
     volatile boolean autoExpand;
     protected final Random random;
     protected volatile InternalTestCluster cluster;
+    protected volatile boolean activeDisruption = false;
 
 
     public NetworkPartition(Random random) {
@@ -157,6 +158,7 @@ public synchronized void startDisrupting() {
             return;
         }
         logger.info("nodes {} will be partitioned from {}. partition type [{}]", nodesSideOne, nodesSideTwo, getPartitionDescription());
+        activeDisruption = true;
         for (String node1 : nodesSideOne) {
             MockTransportService transportService1 = (MockTransportService) cluster.getInstance(TransportService.class, node1);
             DiscoveryNode discoveryNode1 = discoveryNode(node1);
@@ -170,8 +172,8 @@ public synchronized void startDisrupting() {
 
 
     @Override
-    public void stopDisrupting() {
-        if (nodesSideOne.size() == 0 || nodesSideTwo.size() == 0) {
+    public synchronized void stopDisrupting() {
+        if (nodesSideOne.size() == 0 || nodesSideTwo.size() == 0 || !activeDisruption) {
             return;
         }
         logger.info("restoring partition between nodes {} & nodes {}", nodesSideOne, nodesSideTwo);
@@ -184,6 +186,7 @@ public void stopDisrupting() {
                 removeDisruption(discoveryNode1, transportService1, discoveryNode2, transportService2);
             }
         }
+        activeDisruption = false;
     }
 
     abstract void applyDisruption(DiscoveryNode node1, MockTransportService transportService1,

From c12d0901f66abea8cf78bcac0d1ab1f739b79ed9 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Thu, 10 Jul 2014 21:14:13 +0200
Subject: [PATCH 41/74] [Tests] Increase timeout when waiting for partitions to
 heal

the current 30s addition is tricky because we use 30s as timeout in many places...
---
 .../discovery/DiscoveryWithNetworkFailuresTests.java   | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index 866f1a74fa93f..4ec9672e4e51e 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -66,6 +66,8 @@
 @TestLogging("discovery.zen:TRACE")
 public class DiscoveryWithNetworkFailuresTests extends ElasticsearchIntegrationTest {
 
+    private static final TimeValue DISRUPTION_HEALING_OVERHEAD = TimeValue.timeValueSeconds(40); // we use 30s as timeout in many places.
+
     private static final Settings nodeSettings = ImmutableSettings.settingsBuilder()
             .put("gateway.type", "local")
             .put("discovery.type", "zen") // <-- To override the local setting if set externally
@@ -224,8 +226,8 @@ public boolean apply(Object input) {
 
         networkPartition.stopDisrupting();
 
-        // Wait until the master node sees all 3 nodes again.
-        ensureStableCluster(3, new TimeValue(30000 + networkPartition.expectedTimeToHeal().millis()));
+        // Wait until the master node sees al 3 nodes again.
+        ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + networkPartition.expectedTimeToHeal().millis()));
 
         logger.info("Verify no master block with {} set to {}", DiscoverySettings.NO_MASTER_BLOCK, "all");
         client().admin().cluster().prepareUpdateSettings()
@@ -296,7 +298,7 @@ public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
         // restore isolation
         networkPartition.stopDisrupting();
 
-        ensureStableCluster(3, new TimeValue(30000 + networkPartition.expectedTimeToHeal().millis()));
+        ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + networkPartition.expectedTimeToHeal().millis()));
 
         logger.info("issue a reroute");
         // trigger a reroute now, instead of waiting for the background reroute of RerouteService
@@ -432,7 +434,7 @@ public void run() {
 
                 logger.info("stopping disruption");
                 disruptionScheme.stopDisrupting();
-                ensureStableCluster(3, TimeValue.timeValueMillis(disruptionScheme.expectedTimeToHeal().millis() + 30000));
+                ensureStableCluster(3, TimeValue.timeValueMillis(disruptionScheme.expectedTimeToHeal().millis() + DISRUPTION_HEALING_OVERHEAD.millis()));
                 ensureGreen("test");
 
                 logger.info("validating successful docs");

From e0543b3426872e5c0c830034f45c3c3349f63b7a Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Fri, 11 Jul 2014 20:03:02 +0200
Subject: [PATCH 42/74] [Internal] Migrate new initial state cluster update
 task to a ClusterStateNonMasterUpdateTask

---
 .../elasticsearch/cluster/service/InternalClusterService.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java b/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java
index ff6f392425340..be4f8d26df79c 100644
--- a/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java
+++ b/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java
@@ -136,7 +136,7 @@ protected void doStart() throws ElasticsearchException {
         discoveryService.addLifecycleListener(new LifecycleListener() {
             @Override
             public void afterStart() {
-                submitStateUpdateTask("update local node", Priority.IMMEDIATE, new ClusterStateUpdateTask() {
+                submitStateUpdateTask("update local node", Priority.IMMEDIATE, new ClusterStateNonMasterUpdateTask() {
                     @Override
                     public ClusterState execute(ClusterState currentState) throws Exception {
                         return ClusterState.builder(currentState)
@@ -146,7 +146,7 @@ public ClusterState execute(ClusterState currentState) throws Exception {
 
                     @Override
                     public void onFailure(String source, Throwable t) {
-                        logger.warn("failed ot update local node", t);
+                        logger.warn("failed to update local node", t);
                     }
                 });
             }

From 7fa3d7081b14a2e8aa04b9a688766c0ea1611e85 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Tue, 15 Jul 2014 11:46:34 +0200
Subject: [PATCH 43/74] [logging] don't log an error if scheduled reroute is
 rejected because local node is no longer master

Since it runs in a background thread after a node is added, or submits a cluster state update when a node leaves, it may be that by the time it is executed the local node is no longer master.
---
 .../org/elasticsearch/cluster/routing/RoutingService.java   | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/elasticsearch/cluster/routing/RoutingService.java b/src/main/java/org/elasticsearch/cluster/routing/RoutingService.java
index b33804de564dd..828244494a976 100644
--- a/src/main/java/org/elasticsearch/cluster/routing/RoutingService.java
+++ b/src/main/java/org/elasticsearch/cluster/routing/RoutingService.java
@@ -151,8 +151,10 @@ public ClusterState execute(ClusterState currentState) {
 
                 @Override
                 public void onFailure(String source, Throwable t) {
-                    ClusterState state = clusterService.state();
-                    logger.error("unexpected failure during [{}], current state:\n{}", t, source, state.prettyPrint());
+                    if (!(t instanceof ClusterService.NoLongerMasterException)) {
+                        ClusterState state = clusterService.state();
+                        logger.error("unexpected failure during [{}], current state:\n{}", t, source, state.prettyPrint());
+                    }
                 }
             });
             routingTableDirty = false;

From ccabb4aa20752f6a7d00fb2f6f208228a971c141 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Wed, 16 Jul 2014 22:54:34 +0200
Subject: [PATCH 44/74] Remove unneeded reference to DiscoveryService which
 potentially causes circular references

---
 .../org/elasticsearch/discovery/zen/ZenDiscovery.java  | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index bd9eb0f367445..8cdf2bd3f529c 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -86,7 +86,6 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
     private final ClusterService clusterService;
     private AllocationService allocationService;
     private final ClusterName clusterName;
-    private final DiscoveryService discoveryService;
     private final DiscoveryNodeService discoveryNodeService;
     private final DiscoverySettings discoverySettings;
     private final ZenPingService pingService;
@@ -130,14 +129,13 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
     @Inject
     public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threadPool,
                         TransportService transportService, ClusterService clusterService, NodeSettingsService nodeSettingsService,
-                        DiscoveryNodeService discoveryNodeService, ZenPingService pingService, Version version, DiscoverySettings discoverySettings,
-                        DiscoveryService discoveryService) {
+                        DiscoveryNodeService discoveryNodeService, ZenPingService pingService, Version version,
+                        DiscoverySettings discoverySettings) {
         super(settings);
         this.clusterName = clusterName;
         this.threadPool = threadPool;
         this.clusterService = clusterService;
         this.transportService = transportService;
-        this.discoveryService = discoveryService;
         this.discoveryNodeService = discoveryNodeService;
         this.discoverySettings = discoverySettings;
         this.pingService = pingService;
@@ -649,7 +647,7 @@ public void onFailure(String source, Throwable t) {
 
 
                 assert newClusterState.nodes().masterNode() != null : "received a cluster state without a master";
-                assert !newClusterState.blocks().hasGlobalBlock(discoveryService.getNoMasterBlock()) : "received a cluster state with a master block";
+                assert !newClusterState.blocks().hasGlobalBlock(discoverySettings.getNoMasterBlock()) : "received a cluster state with a master block";
 
                 clusterService.submitStateUpdateTask("zen-disco-receive(from master [" + newClusterState.nodes().masterNode() + "])", Priority.URGENT, new ProcessedClusterStateNonMasterUpdateTask() {
                     @Override
@@ -716,7 +714,7 @@ public ClusterState execute(ClusterState currentState) {
                             masterFD.restart(latestDiscoNodes.masterNode(), "new cluster state received and we are monitoring the wrong master [" + masterFD.masterNode() + "]");
                         }
 
-                        if (currentState.blocks().hasGlobalBlock(discoveryService.getNoMasterBlock())) {
+                        if (currentState.blocks().hasGlobalBlock(discoverySettings.getNoMasterBlock())) {
                             // its a fresh update from the master as we transition from a start of not having a master to having one
                             logger.debug("got first state from fresh master [{}]", updatedState.nodes().masterNodeId());
                             return updatedState;

From ea2783787c0f94c5de57ee9d24d7320a53a1acee Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Sun, 13 Jul 2014 09:42:36 +0200
Subject: [PATCH 45/74] [Tests] Introduced ClusterDiscoveryConfiguration

Closes #6890
---
 .../ClusterDiscoveryConfiguration.java        | 136 ++++++++++++++++++
 .../DiscoveryWithNetworkFailuresTests.java    |  71 +++++----
 .../discovery/ZenUnicastDiscoveryTests.java   |  48 ++-----
 .../test/InternalTestCluster.java             |   4 +-
 .../elasticsearch/test/SettingsSource.java    |   4 +-
 5 files changed, 199 insertions(+), 64 deletions(-)
 create mode 100644 src/test/java/org/elasticsearch/discovery/ClusterDiscoveryConfiguration.java

diff --git a/src/test/java/org/elasticsearch/discovery/ClusterDiscoveryConfiguration.java b/src/test/java/org/elasticsearch/discovery/ClusterDiscoveryConfiguration.java
new file mode 100644
index 0000000000000..00f1a8421d665
--- /dev/null
+++ b/src/test/java/org/elasticsearch/discovery/ClusterDiscoveryConfiguration.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.discovery;
+
+import com.carrotsearch.randomizedtesting.RandomizedTest;
+import com.google.common.primitives.Ints;
+import org.elasticsearch.common.settings.ImmutableSettings;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.test.ElasticsearchIntegrationTest;
+import org.elasticsearch.test.InternalTestCluster;
+import org.elasticsearch.test.SettingsSource;
+import org.elasticsearch.transport.local.LocalTransport;
+
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+
+public class ClusterDiscoveryConfiguration extends SettingsSource {
+
+    public static Settings DEFAULT_SETTINGS = ImmutableSettings.settingsBuilder()
+            .put("gateway.type", "local")
+            .put("discovery.type", "zen")
+            .build();
+
+    final int numOfNodes;
+
+    final Settings baseSettings;
+
+    public ClusterDiscoveryConfiguration(int numOfNodes) {
+        this(numOfNodes, ImmutableSettings.EMPTY);
+    }
+
+    public ClusterDiscoveryConfiguration(int numOfNodes, Settings extraSettings) {
+        this.numOfNodes = numOfNodes;
+        this.baseSettings = ImmutableSettings.builder().put(DEFAULT_SETTINGS).put(extraSettings).build();
+    }
+
+    @Override
+    public Settings node(int nodeOrdinal) {
+        return baseSettings;
+    }
+
+    @Override
+    public Settings transportClient() {
+        return baseSettings;
+    }
+
+    public static class UnicastZen extends ClusterDiscoveryConfiguration {
+
+        private final static AtomicInteger portRangeCounter = new AtomicInteger();
+
+        private final int[] unicastHostOrdinals;
+        private final int basePort;
+
+        public UnicastZen(int numOfNodes) {
+            this(numOfNodes, numOfNodes);
+        }
+
+        public UnicastZen(int numOfNodes, int numOfUnicastHosts) {
+            this(numOfNodes, numOfUnicastHosts, ImmutableSettings.EMPTY);
+        }
+
+        public UnicastZen(int numOfNodes, int numOfUnicastHosts, Settings extraSettings) {
+            super(numOfNodes, extraSettings);
+            if (numOfUnicastHosts == numOfNodes) {
+                unicastHostOrdinals = new int[numOfNodes];
+                for (int i = 0; i < numOfNodes; i++) {
+                    unicastHostOrdinals[i] = i;
+                }
+            } else {
+                Set<Integer> ordinals = new HashSet<>(numOfUnicastHosts);
+                while (ordinals.size() != numOfUnicastHosts) {
+                    ordinals.add(RandomizedTest.randomInt(numOfNodes - 1));
+                }
+                unicastHostOrdinals = Ints.toArray(ordinals);
+            }
+            this.basePort = calcBasePort();
+        }
+
+        public UnicastZen(int numOfNodes, int[] unicastHostOrdinals) {
+            this(numOfNodes, ImmutableSettings.EMPTY, unicastHostOrdinals);
+        }
+
+        public UnicastZen(int numOfNodes, Settings extraSettings, int[] unicastHostOrdinals) {
+            super(numOfNodes, extraSettings);
+            this.unicastHostOrdinals = unicastHostOrdinals;
+            this.basePort = calcBasePort();
+        }
+
+        private final static int calcBasePort() {
+            return 10000 +
+                    1000 * (ElasticsearchIntegrationTest.CHILD_VM_ID.hashCode() % 60) + // up to 60 jvms
+                    100 * portRangeCounter.incrementAndGet(); // up to 100 nodes
+        }
+
+
+        @Override
+        public Settings node(int nodeOrdinal) {
+            ImmutableSettings.Builder builder = ImmutableSettings.builder()
+                    .put("discovery.zen.ping.multicast.enabled", false);
+
+            String[] unicastHosts = new String[unicastHostOrdinals.length];
+            if (InternalTestCluster.NODE_MODE.equals("local")) {
+                builder.put(LocalTransport.TRANSPORT_LOCAL_ADDRESS, "node_" + nodeOrdinal);
+                for (int i = 0; i < unicastHosts.length; i++) {
+                    unicastHosts[i] = "node_" + unicastHostOrdinals[i];
+                }
+            } else {
+                // we need to pin the node port & host so we'd know where to point things
+                builder.put("transport.tcp.port", basePort + nodeOrdinal);
+                builder.put("transport.host", "localhost");
+                for (int i = 0; i < unicastHosts.length; i++) {
+                    unicastHosts[i] = "localhost:" + (basePort + unicastHostOrdinals[i]);
+                }
+            }
+            builder.putArray("discovery.zen.ping.unicast.hosts", unicastHosts);
+            return builder.put(super.node(nodeOrdinal)).build();
+        }
+    }
+}
diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index 4ec9672e4e51e..1059b587aa568 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -37,19 +37,18 @@
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.discovery.zen.elect.ElectMasterService;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
 import org.elasticsearch.test.InternalTestCluster;
 import org.elasticsearch.test.disruption.*;
 import org.elasticsearch.test.junit.annotations.TestLogging;
 import org.elasticsearch.test.transport.MockTransportService;
 import org.elasticsearch.transport.TransportModule;
+import org.junit.Before;
 import org.junit.Test;
 
 import java.util.*;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.CountDownLatch;
-import java.util.concurrent.Semaphore;
-import java.util.concurrent.TimeUnit;
+import java.util.concurrent.*;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicReference;
@@ -68,15 +67,18 @@ public class DiscoveryWithNetworkFailuresTests extends ElasticsearchIntegrationT
 
     private static final TimeValue DISRUPTION_HEALING_OVERHEAD = TimeValue.timeValueSeconds(40); // we use 30s as timeout in many places.
 
-    private static final Settings nodeSettings = ImmutableSettings.settingsBuilder()
-            .put("gateway.type", "local")
-            .put("discovery.type", "zen") // <-- To override the local setting if set externally
-            .put("discovery.zen.fd.ping_timeout", "1s") // <-- for hitting simulated network failures quickly
-            .put("discovery.zen.fd.ping_retries", "1") // <-- for hitting simulated network failures quickly
-            .put(DiscoverySettings.PUBLISH_TIMEOUT, "1s") // <-- for hitting simulated network failures quickly
-            .put("discovery.zen.minimum_master_nodes", 2)
-            .put(TransportModule.TRANSPORT_SERVICE_TYPE_KEY, MockTransportService.class.getName())
-            .build();
+    private ClusterDiscoveryConfiguration discoveryConfig;
+
+
+    @Override
+    protected Settings nodeSettings(int nodeOrdinal) {
+        return discoveryConfig.node(nodeOrdinal);
+    }
+
+    @Before
+    public void clearConfig() {
+        discoveryConfig = null;
+    }
 
     @Override
     protected int numberOfShards() {
@@ -88,6 +90,31 @@ protected int numberOfReplicas() {
         return 1;
     }
 
+    private List<String> startCluster(int numberOfNodes) throws ExecutionException, InterruptedException {
+        Settings settings = ImmutableSettings.builder()
+                // TODO: this is a temporary solution so that nodes will not base their reaction to a partition based on previous successful results
+                .put("discovery.zen.ping_timeout", "0.5s")
+                        // end of temporary solution
+                .put("discovery.zen.fd.ping_timeout", "1s") // <-- for hitting simulated network failures quickly
+                .put("discovery.zen.fd.ping_retries", "1") // <-- for hitting simulated network failures quickly
+                .put(DiscoverySettings.PUBLISH_TIMEOUT, "1s") // <-- for hitting simulated network failures quickly
+                .put("http.enabled", false) // just to make test quicker
+                .put(TransportModule.TRANSPORT_SERVICE_TYPE_KEY, MockTransportService.class.getName())
+                .put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, numberOfNodes / 2 + 1).build();
+
+        if (discoveryConfig == null) {
+            if (randomBoolean()) {
+                discoveryConfig = new ClusterDiscoveryConfiguration.UnicastZen(numberOfNodes, numberOfNodes, settings);
+            } else {
+                discoveryConfig = new ClusterDiscoveryConfiguration(numberOfNodes, settings);
+            }
+        }
+        List<String> nodes = internalCluster().startNodesAsync(numberOfNodes).get();
+        ensureStableCluster(numberOfNodes);
+
+        return nodes;
+    }
+
     /**
      * Test that no split brain occurs under partial network partition. See https://github.com/elasticsearch/elasticsearch/issues/2488
      *
@@ -96,10 +123,7 @@ protected int numberOfReplicas() {
     @Test
     public void failWithMinimumMasterNodesConfigured() throws Exception {
 
-        List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
-
-        // Wait until 3 nodes are part of the cluster
-        ensureStableCluster(3);
+        List<String> nodes = startCluster(3);
 
         // Figure out what is the elected master node
         final String masterNode = internalCluster().getMasterName();
@@ -154,9 +178,7 @@ public boolean apply(Object input) {
     @Test
     @TestLogging(value = "cluster.service:TRACE,indices.recovery:TRACE")
     public void testVerifyApiBlocksDuringPartition() throws Exception {
-        internalCluster().startNodesAsync(3, nodeSettings).get();
-        // Wait until a 3 nodes are part of the cluster
-        ensureStableCluster(3);
+        startCluster(3);
 
         // Makes sure that the get request can be executed on each node locally:
         assertAcked(prepareCreate("test").setSettings(ImmutableSettings.builder()
@@ -276,8 +298,7 @@ public boolean apply(Object input) {
     @Test
     @TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
     public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
-        final List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
-        ensureStableCluster(3);
+        final List<String> nodes = startCluster(3);
 
         assertAcked(prepareCreate("test")
                 .setSettings(ImmutableSettings.builder()
@@ -340,8 +361,7 @@ public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
     @LuceneTestCase.AwaitsFix(bugUrl = "needs some more work to stabilize")
     @TestLogging("action.index:TRACE,action.get:TRACE,discovery:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
     public void testAckedIndexing() throws Exception {
-        final List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
-        ensureStableCluster(3);
+        final List<String> nodes = startCluster(3);
 
         assertAcked(prepareCreate("test")
                 .setSettings(ImmutableSettings.builder()
@@ -478,8 +498,7 @@ public void run() {
     @Test
     @TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
     public void testRejoinDocumentExistsInAllShardCopies() throws Exception {
-        List<String> nodes = internalCluster().startNodesAsync(3, nodeSettings).get();
-        ensureStableCluster(3);
+        List<String> nodes = startCluster(3);
 
         assertAcked(prepareCreate("test")
                 .setSettings(ImmutableSettings.builder()
diff --git a/src/test/java/org/elasticsearch/discovery/ZenUnicastDiscoveryTests.java b/src/test/java/org/elasticsearch/discovery/ZenUnicastDiscoveryTests.java
index 5f3f6cf978e94..c36834d7cf9d0 100644
--- a/src/test/java/org/elasticsearch/discovery/ZenUnicastDiscoveryTests.java
+++ b/src/test/java/org/elasticsearch/discovery/ZenUnicastDiscoveryTests.java
@@ -26,7 +26,6 @@
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
 import org.elasticsearch.test.ElasticsearchIntegrationTest.ClusterScope;
 import org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
-import org.elasticsearch.transport.local.LocalTransport;
 import org.junit.Before;
 import org.junit.Test;
 
@@ -38,46 +37,24 @@
 @ClusterScope(scope = Scope.TEST, numDataNodes = 0)
 public class ZenUnicastDiscoveryTests extends ElasticsearchIntegrationTest {
 
-    private static int currentNumNodes = -1;
-
-    static int currentBaseHttpPort = -1;
-    static int currentNumOfUnicastHosts = -1;
-
-    @Before
-    public void setUP() throws Exception {
-        ElasticsearchIntegrationTest.beforeClass();
-        currentNumNodes = randomIntBetween(3, 5);
-        currentNumOfUnicastHosts = randomIntBetween(1, currentNumNodes);
-        currentBaseHttpPort = 25000 + randomInt(100);
-    }
+    private ClusterDiscoveryConfiguration discoveryConfig;
 
     @Override
     protected Settings nodeSettings(int nodeOrdinal) {
-        ImmutableSettings.Builder builder = ImmutableSettings.settingsBuilder()
-                .put("discovery.type", "zen")
-                .put("discovery.zen.ping.multicast.enabled", false)
-                .put("http.enabled", false) // just to make test quicker
-                .put(super.nodeSettings(nodeOrdinal));
+        return discoveryConfig.node(nodeOrdinal);
+    }
 
-        String[] unicastHosts = new String[currentNumOfUnicastHosts];
-        if (internalCluster().getDefaultSettings().get("node.mode").equals("local")) {
-            builder.put(LocalTransport.TRANSPORT_LOCAL_ADDRESS, "unicast_test_" + nodeOrdinal);
-            for (int i = 0; i < unicastHosts.length; i++) {
-                unicastHosts[i] = "unicast_test_" + i;
-            }
-        } else {
-            // we need to pin the node ports so we'd know where to point things
-            builder.put("transport.tcp.port", currentBaseHttpPort + nodeOrdinal);
-            for (int i = 0; i < unicastHosts.length; i++) {
-                unicastHosts[i] = "localhost:" + (currentBaseHttpPort + i);
-            }
-        }
-        builder.putArray("discovery.zen.ping.unicast.hosts", unicastHosts);
-        return builder.build();
+    @Before
+    public void clearConfig() {
+        discoveryConfig = null;
     }
 
     @Test
     public void testNormalClusterForming() throws ExecutionException, InterruptedException {
+        int currentNumNodes = randomIntBetween(3, 5);
+        int currentNumOfUnicastHosts = randomIntBetween(1, currentNumNodes);
+        discoveryConfig = new ClusterDiscoveryConfiguration.UnicastZen(currentNumNodes, currentNumOfUnicastHosts);
+
         internalCluster().startNodesAsync(currentNumNodes).get();
 
         if (client().admin().cluster().prepareHealth().setWaitForNodes("" + currentNumNodes).get().isTimedOut()) {
@@ -91,9 +68,12 @@ public void testNormalClusterForming() throws ExecutionException, InterruptedExc
     // test fails, because 2 nodes elect themselves as master and the health request times out b/c waiting_for_nodes=N
     // can't be satisfied.
     public void testMinimumMasterNodes() throws Exception {
+        int currentNumNodes = randomIntBetween(3, 5);
+        int currentNumOfUnicastHosts = randomIntBetween(1, currentNumNodes);
         final Settings settings = ImmutableSettings.settingsBuilder().put("discovery.zen.minimum_master_nodes", currentNumNodes / 2 + 1).build();
+        discoveryConfig = new ClusterDiscoveryConfiguration.UnicastZen(currentNumNodes, currentNumOfUnicastHosts, settings);
 
-        List<String> nodes = internalCluster().startNodesAsync(currentNumNodes, settings).get();
+        List<String> nodes = internalCluster().startNodesAsync(currentNumNodes).get();
 
         ensureGreen();
 
diff --git a/src/test/java/org/elasticsearch/test/InternalTestCluster.java b/src/test/java/org/elasticsearch/test/InternalTestCluster.java
index 500df9541efaf..113520a832793 100644
--- a/src/test/java/org/elasticsearch/test/InternalTestCluster.java
+++ b/src/test/java/org/elasticsearch/test/InternalTestCluster.java
@@ -107,8 +107,8 @@
 import static org.elasticsearch.test.ElasticsearchTestCase.assertBusy;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoTimeout;
 import static org.hamcrest.Matchers.equalTo;
-import static org.junit.Assert.assertThat;
 import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertThat;
 
 /**
  * InternalTestCluster manages a set of JVM private nodes and allows convenient access to them.
@@ -155,7 +155,7 @@ public final class InternalTestCluster extends TestCluster {
 
     static final boolean DEFAULT_ENABLE_RANDOM_BENCH_NODES = true;
 
-    static final String NODE_MODE = nodeMode();
+    public static final String NODE_MODE = nodeMode();
 
     /* sorted map to make traverse order reproducible, concurrent since we do checks on it not within a sync block */
     private final NavigableMap<String, NodeAndClient> nodes = new TreeMap<>();
diff --git a/src/test/java/org/elasticsearch/test/SettingsSource.java b/src/test/java/org/elasticsearch/test/SettingsSource.java
index 8829885bf7b03..6341d842d6789 100644
--- a/src/test/java/org/elasticsearch/test/SettingsSource.java
+++ b/src/test/java/org/elasticsearch/test/SettingsSource.java
@@ -20,7 +20,7 @@
 
 import org.elasticsearch.common.settings.Settings;
 
-abstract class SettingsSource {
+public abstract class SettingsSource {
 
     public static final SettingsSource EMPTY = new SettingsSource() {
         @Override
@@ -35,7 +35,7 @@ public Settings transportClient() {
     };
 
     /**
-     * @return  the settings for the node represented by the given ordinal, or {@code null} if there are no settings defined
+     * @return the settings for the node represented by the given ordinal, or {@code null} if there are no settings defined
      */
     public abstract Settings node(int nodeOrdinal);
 

From bebaf9799ce0d98230810e099e2b6df1508756c9 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Mon, 14 Jul 2014 21:30:11 +0200
Subject: [PATCH 46/74] [Tests] stability improvements

added explicit cleaning of temp unicast ping results
reduce gateway local.list_timeout to 10s.
testVerifyApiBlocksDuringPartition: verify master node has stepped down before restoring partition
---
 .../zen/ping/unicast/UnicastZenPing.java      |   7 ++
 .../DiscoveryWithNetworkFailuresTests.java    | 102 ++++++++----------
 2 files changed, 52 insertions(+), 57 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java b/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java
index 25a43ead8ef78..5b7cf0334676a 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java
@@ -143,6 +143,13 @@ public void setNodesProvider(DiscoveryNodesProvider nodesProvider) {
         this.nodesProvider = nodesProvider;
     }
 
+    /**
+     * Clears the list of cached ping responses.
+     */
+    public void clearTemporalReponses() {
+        temporalResponses.clear();
+    }
+
     public PingResponse[] pingAndWait(TimeValue timeout) {
         final AtomicReference<PingResponse[]> response = new AtomicReference<>();
         final CountDownLatch latch = new CountDownLatch(1);
diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index 1059b587aa568..f494ab5df6a14 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -19,7 +19,6 @@
 
 package org.elasticsearch.discovery;
 
-import com.google.common.base.Predicate;
 import org.apache.lucene.util.LuceneTestCase;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
@@ -27,9 +26,9 @@
 import org.elasticsearch.action.index.IndexResponse;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.cluster.block.ClusterBlock;
 import org.elasticsearch.cluster.block.ClusterBlockLevel;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
-import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.routing.operation.hash.djb.DjbHashFunction;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.Priority;
@@ -38,6 +37,9 @@
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.discovery.zen.elect.ElectMasterService;
+import org.elasticsearch.discovery.zen.ping.ZenPing;
+import org.elasticsearch.discovery.zen.ping.ZenPingService;
+import org.elasticsearch.discovery.zen.ping.unicast.UnicastZenPing;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
 import org.elasticsearch.test.InternalTestCluster;
 import org.elasticsearch.test.disruption.*;
@@ -91,14 +93,14 @@ protected int numberOfReplicas() {
     }
 
     private List<String> startCluster(int numberOfNodes) throws ExecutionException, InterruptedException {
+
+        // TODO: Rarely use default settings for some of these
         Settings settings = ImmutableSettings.builder()
-                // TODO: this is a temporary solution so that nodes will not base their reaction to a partition based on previous successful results
-                .put("discovery.zen.ping_timeout", "0.5s")
-                        // end of temporary solution
                 .put("discovery.zen.fd.ping_timeout", "1s") // <-- for hitting simulated network failures quickly
                 .put("discovery.zen.fd.ping_retries", "1") // <-- for hitting simulated network failures quickly
                 .put(DiscoverySettings.PUBLISH_TIMEOUT, "1s") // <-- for hitting simulated network failures quickly
                 .put("http.enabled", false) // just to make test quicker
+                .put("gateway.local.list_timeout", "10s") // still long enough to induce failures, but not so long that the test times out
                 .put(TransportModule.TRANSPORT_SERVICE_TYPE_KEY, MockTransportService.class.getName())
                 .put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, numberOfNodes / 2 + 1).build();
 
@@ -112,6 +114,15 @@ private List<String> startCluster(int numberOfNodes) throws ExecutionException,
         List<String> nodes = internalCluster().startNodesAsync(numberOfNodes).get();
         ensureStableCluster(numberOfNodes);
 
+        // TODO: this is a temporary solution so that nodes will not base their reaction to a partition based on previous successful results
+        for (ZenPingService pingService : internalCluster().getInstances(ZenPingService.class)) {
+            for (ZenPing zenPing : pingService.zenPings()) {
+                if (zenPing instanceof UnicastZenPing) {
+                    ((UnicastZenPing) zenPing).clearTemporalReponses();
+                }
+            }
+        }
+
         return nodes;
     }
 
@@ -147,16 +158,7 @@ public void failWithMinimumMasterNodesConfigured() throws Exception {
         // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
         // continuously ping until network failures have been resolved. However
         // It may a take a bit before the node detects it has been cut off from the elected master
-        boolean success = awaitBusy(new Predicate<Object>() {
-            @Override
-            public boolean apply(Object input) {
-                ClusterState localClusterState = getNodeClusterState(unluckyNode);
-                DiscoveryNodes localDiscoveryNodes = localClusterState.nodes();
-                logger.info("localDiscoveryNodes=" + localDiscoveryNodes.prettyPrint());
-                return localDiscoveryNodes.masterNode() == null;
-            }
-        }, 10, TimeUnit.SECONDS);
-        assertThat(success, is(true));
+        assertNoMaster(unluckyNode);
 
         networkDisconnect.stopDisrupting();
 
@@ -204,27 +206,7 @@ public void testVerifyApiBlocksDuringPartition() throws Exception {
         // continuously ping until network failures have been resolved. However
         // It may a take a bit before the node detects it has been cut off from the elected master
         logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
-        final ClusterState[] lastState = new ClusterState[1];
-        boolean success = awaitBusy(new Predicate<Object>() {
-            @Override
-            public boolean apply(Object input) {
-                lastState[0] = getNodeClusterState(isolatedNode);
-                DiscoveryNodes localDiscoveryNodes = lastState[0].nodes();
-                logger.info("localDiscoveryNodes=" + localDiscoveryNodes.prettyPrint());
-                if (localDiscoveryNodes.masterNode() == null) {
-                    return false;
-                }
-                for (ClusterBlockLevel level : DiscoverySettings.NO_MASTER_BLOCK_WRITES.levels()) {
-                    if (lastState[0].getBlocks().hasGlobalBlock(level)) {
-                        return false;
-                    }
-                }
-                return true;
-            }
-        }, 10, TimeUnit.SECONDS);
-        if (!success) {
-            fail("isolated node still has a master or the wrong blocks. Cluster state:\n" + lastState[0].prettyPrint());
-        }
+        assertNoMaster(isolatedNode, DiscoverySettings.NO_MASTER_BLOCK_WRITES, TimeValue.timeValueSeconds(10));
 
 
         logger.info("wait until elected master has been removed and a new 2 node cluster was from (via [{}])", isolatedNode);
@@ -232,7 +214,7 @@ public boolean apply(Object input) {
 
         for (String node : networkPartition.getMajoritySide()) {
             ClusterState nodeState = getNodeClusterState(node);
-            success = true;
+            boolean success = true;
             if (nodeState.nodes().getMasterNode() == null) {
                 success = false;
             }
@@ -263,26 +245,7 @@ public boolean apply(Object input) {
         // continuously ping until network failures have been resolved. However
         // It may a take a bit before the node detects it has been cut off from the elected master
         logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
-        success = awaitBusy(new Predicate<Object>() {
-            @Override
-            public boolean apply(Object input) {
-                lastState[0] = getNodeClusterState(isolatedNode);
-                DiscoveryNodes localDiscoveryNodes = lastState[0].nodes();
-                logger.info("localDiscoveryNodes=" + localDiscoveryNodes.prettyPrint());
-                if (localDiscoveryNodes.masterNode() == null) {
-                    return false;
-                }
-                for (ClusterBlockLevel level : DiscoverySettings.NO_MASTER_BLOCK_ALL.levels()) {
-                    if (lastState[0].getBlocks().hasGlobalBlock(level)) {
-                        return false;
-                    }
-                }
-                return true;
-            }
-        }, 10, TimeUnit.SECONDS);
-        if (!success) {
-            fail("isolated node still has a master or the wrong blocks (expected 'all' block). Cluster state:\n" + lastState[0].prettyPrint());
-        }
+        assertNoMaster(isolatedNode, DiscoverySettings.NO_MASTER_BLOCK_ALL, TimeValue.timeValueSeconds(10));
 
         // make sure we have stable cluster & cross partition recoveries are canceled by the removal of the missing node
         // the unresponsive partition causes recoveries to only time out after 15m (default) and these will cause
@@ -316,6 +279,9 @@ public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
         // make sure cluster reforms
         ensureStableCluster(2, nonIsolatedNode);
 
+        // make sure the isolated node picks up on things.
+        assertNoMaster(isolatedNode, TimeValue.timeValueSeconds(40));
+
         // restore isolation
         networkPartition.stopDisrupting();
 
@@ -616,4 +582,26 @@ private ClusterState getNodeClusterState(String node) {
         return client(node).admin().cluster().prepareState().setLocal(true).get().getState();
     }
 
+    private void assertNoMaster(final String node) throws Exception {
+        assertNoMaster(node, null, TimeValue.timeValueSeconds(10));
+    }
+
+    private void assertNoMaster(final String node, TimeValue maxWaitTime) throws Exception {
+        assertNoMaster(node, null, maxWaitTime);
+    }
+
+    private void assertNoMaster(final String node, @Nullable final ClusterBlock expectedBlocks, TimeValue maxWaitTime) throws Exception {
+        assertBusy(new Runnable() {
+            @Override
+            public void run() {
+                ClusterState state = getNodeClusterState(node);
+                assertNull("node [" + node + "] still has [" + state.nodes().masterNode() + "] as master", state.nodes().masterNode());
+                if (expectedBlocks != null) {
+                    for (ClusterBlockLevel level : expectedBlocks.levels()) {
+                        assertTrue("node [" + node + "] does have level [" + level + "] in it's blocks", state.getBlocks().hasGlobalBlock(level));
+                    }
+                }
+            }
+        }, maxWaitTime.getMillis(), TimeUnit.MILLISECONDS);
+    }
 }

From f029a24d53f3881724f9297e372b4120f1692179 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Thu, 17 Jul 2014 15:55:03 +0200
Subject: [PATCH 47/74] [Store] migrate non-allocated shard deletion to use
 ClusterStateNonMasterUpdateTask

---
 src/main/java/org/elasticsearch/indices/store/IndicesStore.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/java/org/elasticsearch/indices/store/IndicesStore.java b/src/main/java/org/elasticsearch/indices/store/IndicesStore.java
index 02420d0e3d534..ecf5e6b6b2290 100644
--- a/src/main/java/org/elasticsearch/indices/store/IndicesStore.java
+++ b/src/main/java/org/elasticsearch/indices/store/IndicesStore.java
@@ -307,7 +307,7 @@ private void allNodesResponded() {
                 return;
             }
 
-            clusterService.submitStateUpdateTask("indices_store", new ClusterStateUpdateTask() {
+            clusterService.submitStateUpdateTask("indices_store", new ClusterStateNonMasterUpdateTask() {
                 @Override
                 public ClusterState execute(ClusterState currentState) throws Exception {
                     if (clusterState.getVersion() != currentState.getVersion()) {

From 67685cb026b3cf3425faf29f3a6e9f8dc44574cd Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Thu, 17 Jul 2014 12:55:35 +0200
Subject: [PATCH 48/74] Discovery: If not enough possible masters are found,
 but there are masters to ping (ping responses did include master node) then
 these nodes should be resolved.

After the findMaster() call we try to connect to the node and if it isn't the master we start looking for a new master via pinging again.

Closes #6904
---
 .../discovery/zen/ZenDiscovery.java           |  11 +-
 .../ClusterDiscoveryConfiguration.java        |   4 +
 .../DiscoveryWithNetworkFailuresTests.java    | 115 +++++++++++++++---
 3 files changed, 107 insertions(+), 23 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 8cdf2bd3f529c..0188b14e24a7d 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -873,14 +873,13 @@ private DiscoveryNode findMaster() {
         for (ZenPing.PingResponse pingResponse : pingResponses) {
             possibleMasterNodes.add(pingResponse.target());
         }
-        // if we don't have enough master nodes, we bail, even if we get a response that indicates
-        // there is a master by other node, we don't see enough...
-        if (!electMaster.hasEnoughMasterNodes(possibleMasterNodes)) {
-            logger.trace("not enough master nodes [{}]", possibleMasterNodes);
-            return null;
-        }
 
         if (pingMasters.isEmpty()) {
+            // if we don't have enough master nodes, we bail, because there are not enough master to elect from
+            if (!electMaster.hasEnoughMasterNodes(possibleMasterNodes)) {
+                logger.trace("not enough master nodes [{}]", possibleMasterNodes);
+                return null;
+            }
             // lets tie break between discovered nodes
             DiscoveryNode electedMaster = electMaster.electMaster(possibleMasterNodes);
             if (localNode.equals(electedMaster)) {
diff --git a/src/test/java/org/elasticsearch/discovery/ClusterDiscoveryConfiguration.java b/src/test/java/org/elasticsearch/discovery/ClusterDiscoveryConfiguration.java
index 00f1a8421d665..345422aad526b 100644
--- a/src/test/java/org/elasticsearch/discovery/ClusterDiscoveryConfiguration.java
+++ b/src/test/java/org/elasticsearch/discovery/ClusterDiscoveryConfiguration.java
@@ -72,6 +72,10 @@ public UnicastZen(int numOfNodes) {
             this(numOfNodes, numOfNodes);
         }
 
+        public UnicastZen(int numOfNodes, Settings extraSettings) {
+            this(numOfNodes, numOfNodes, extraSettings);
+        }
+
         public UnicastZen(int numOfNodes, int numOfUnicastHosts) {
             this(numOfNodes, numOfUnicastHosts, ImmutableSettings.EMPTY);
         }
diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index f494ab5df6a14..b03c6ea6b660d 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -93,22 +93,56 @@ protected int numberOfReplicas() {
     }
 
     private List<String> startCluster(int numberOfNodes) throws ExecutionException, InterruptedException {
+        if (randomBoolean()) {
+            return startMulticastCluster(numberOfNodes, -1);
+        } else {
+            return startUnicastCluster(numberOfNodes, null, -1);
+        }
+    }
+
+    final static Settings DEFAULT_SETTINGS = ImmutableSettings.builder()
+            .put("discovery.zen.fd.ping_timeout", "1s") // <-- for hitting simulated network failures quickly
+            .put("discovery.zen.fd.ping_retries", "1") // <-- for hitting simulated network failures quickly
+            .put(DiscoverySettings.PUBLISH_TIMEOUT, "1s") // <-- for hitting simulated network failures quickly
+            .put("http.enabled", false) // just to make test quicker
+            .put("gateway.local.list_timeout", "10s") // still long to induce failures but to long so test won't time out
+            .put(TransportModule.TRANSPORT_SERVICE_TYPE_KEY, MockTransportService.class.getName())
+            .build();
+
+    private List<String> startMulticastCluster(int numberOfNodes, int minimumMasterNode) throws ExecutionException, InterruptedException {
+        if (minimumMasterNode < 0) {
+            minimumMasterNode = numberOfNodes / 2 + 1;
+        }
+        // TODO: Rarely use default settings form some of these
+        Settings settings = ImmutableSettings.builder()
+                .put(DEFAULT_SETTINGS)
+                .put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, minimumMasterNode)
+                .build();
+
+        if (discoveryConfig == null) {
+            discoveryConfig = new ClusterDiscoveryConfiguration(numberOfNodes, settings);
+        }
+        List<String> nodes = internalCluster().startNodesAsync(numberOfNodes).get();
+        ensureStableCluster(numberOfNodes);
+
+        return nodes;
+    }
 
+    private List<String> startUnicastCluster(int numberOfNodes,@Nullable int[] unicastHostsOrdinals, int minimumMasterNode) throws ExecutionException, InterruptedException {
+        if (minimumMasterNode < 0) {
+            minimumMasterNode = numberOfNodes / 2 + 1;
+        }
         // TODO: Rarely use default settings form some of these
         Settings settings = ImmutableSettings.builder()
-                .put("discovery.zen.fd.ping_timeout", "1s") // <-- for hitting simulated network failures quickly
-                .put("discovery.zen.fd.ping_retries", "1") // <-- for hitting simulated network failures quickly
-                .put(DiscoverySettings.PUBLISH_TIMEOUT, "1s") // <-- for hitting simulated network failures quickly
-                .put("http.enabled", false) // just to make test quicker
-                .put("gateway.local.list_timeout", "10s") // still long to induce failures but to long so test won't time out
-                .put(TransportModule.TRANSPORT_SERVICE_TYPE_KEY, MockTransportService.class.getName())
-                .put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, numberOfNodes / 2 + 1).build();
+                .put(DEFAULT_SETTINGS)
+                .put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, minimumMasterNode)
+                .build();
 
         if (discoveryConfig == null) {
-            if (randomBoolean()) {
-                discoveryConfig = new ClusterDiscoveryConfiguration.UnicastZen(numberOfNodes, numberOfNodes, settings);
+            if (unicastHostsOrdinals == null) {
+                discoveryConfig = new ClusterDiscoveryConfiguration.UnicastZen(numberOfNodes, settings);
             } else {
-                discoveryConfig = new ClusterDiscoveryConfiguration(numberOfNodes, settings);
+                discoveryConfig = new ClusterDiscoveryConfiguration.UnicastZen(numberOfNodes, settings, unicastHostsOrdinals);
             }
         }
         List<String> nodes = internalCluster().startNodesAsync(numberOfNodes).get();
@@ -126,6 +160,7 @@ private List<String> startCluster(int numberOfNodes) throws ExecutionException,
         return nodes;
     }
 
+
     /**
      * Test that no split brain occurs under partial network partition. See https://github.com/elasticsearch/elasticsearch/issues/2488
      *
@@ -165,13 +200,9 @@ public void failWithMinimumMasterNodesConfigured() throws Exception {
         // Wait until the master node sees all 3 nodes again.
         ensureStableCluster(3);
 
-        for (String node : nodes) {
-            ClusterState state = getNodeClusterState(node);
-            assertThat(state.nodes().size(), equalTo(3));
-            // The elected master shouldn't have changed, since the unlucky node never could have elected himself as
-            // master since m_m_n of 2 could never be satisfied.
-            assertThat(state.nodes().masterNode().name(), equalTo(masterNode));
-        }
+        // The elected master shouldn't have changed, since the unlucky node never could have elected himself as
+        // master since m_m_n of 2 could never be satisfied.
+        assertMaster(masterNode, nodes);
     }
 
     /**
@@ -512,6 +543,48 @@ public void testRejoinDocumentExistsInAllShardCopies() throws Exception {
         }
     }
 
+    /**
+     * A 4 node cluster with m_m_n set to 3 and each node has one unicast endpoint. One node partitions from the master node.
+     * The temporal unicast responses are empty. When the partition is resolved, the one ping response contains a master node.
+     * The rejoining node should take this master node and connect.
+     */
+    @Test
+    @TestLogging("discovery.zen:TRACE,action:TRACE")
+    public void unicastSinglePingResponseContainsMaster() throws Exception {
+        List<String> nodes = startUnicastCluster(4, new int[] {0}, -1);
+        // Figure out what is the elected master node
+        final String masterNode = internalCluster().getMasterName();
+        logger.info("---> legit elected master node=" + masterNode);
+        List<String> otherNodes = new ArrayList<>(nodes);
+        otherNodes.remove(masterNode);
+        otherNodes.remove(nodes.get(0)); // <-- Don't isolate the node that is in the unicast endpoint for all the other nodes.
+        final String isolatedNode = otherNodes.get(0);
+
+        // Forcefully clean temporal response lists on all nodes. Otherwise the node in the unicast host list
+        // includes all the other nodes that have pinged it and the issue doesn't manifest
+        for (ZenPingService pingService : internalCluster().getInstances(ZenPingService.class)) {
+            for (ZenPing zenPing : pingService.zenPings()) {
+                ((UnicastZenPing) zenPing).clearTemporalReponses();
+            }
+        }
+
+        // Simulate a network issue between the unlucky node and elected master node in both directions.
+        NetworkDisconnectPartition networkDisconnect = new NetworkDisconnectPartition(masterNode, isolatedNode, getRandom());
+        setDisruptionScheme(networkDisconnect);
+        networkDisconnect.startDisrupting();
+        // Wait until elected master has removed that the unlucky node...
+        ensureStableCluster(3, masterNode);
+
+        // The isolate master node must report no master, so it starts with pinging
+        assertNoMaster(isolatedNode);
+        networkDisconnect.stopDisrupting();
+        // Wait until the master node sees all 4 nodes again.
+        ensureStableCluster(4);
+        // The elected master shouldn't have changed, since the isolated node never could have elected himself as
+        // master since m_m_n of 3 could never be satisfied.
+        assertMaster(masterNode, nodes);
+    }
+
     protected NetworkPartition addRandomPartition() {
         NetworkPartition partition;
         if (randomBoolean()) {
@@ -604,4 +677,12 @@ public void run() {
             }
         }, maxWaitTime.getMillis(), TimeUnit.MILLISECONDS);
     }
+
+    private void assertMaster(String masterNode, List<String> nodes) {
+        for (String node : nodes) {
+            ClusterState state = getNodeClusterState(node);
+            assertThat(state.nodes().size(), equalTo(nodes.size()));
+            assertThat(state.nodes().masterNode().name(), equalTo(masterNode));
+        }
+    }
 }

From 5e38e9eb4f4f4081fbce38ef55eae7e9f3b8e6a9 Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Thu, 17 Jul 2014 22:49:08 +0200
Subject: [PATCH 49/74] Discovery: Only add local node to possibleMasterNodes
 if it is a master node.

---
 .../java/org/elasticsearch/discovery/zen/ZenDiscovery.java    | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 0188b14e24a7d..63d53933b11f7 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -869,7 +869,9 @@ private DiscoveryNode findMaster() {
         }
 
         Set<DiscoveryNode> possibleMasterNodes = Sets.newHashSet();
-        possibleMasterNodes.add(localNode);
+        if (localNode.masterNode()) {
+            possibleMasterNodes.add(localNode);
+        }
         for (ZenPing.PingResponse pingResponse : pingResponses) {
             possibleMasterNodes.add(pingResponse.target());
         }

From c2142c0f6d56c2a4ae7b31aff7092073ba3038bb Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Thu, 17 Jul 2014 23:49:08 +0200
Subject: [PATCH 50/74] Discovery: Don't include local node to pingMasters
 list. We might end up electing ourselves without any form of verification.

---
 .../java/org/elasticsearch/discovery/zen/ZenDiscovery.java  | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 63d53933b11f7..e18672f2cb962 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -864,7 +864,11 @@ private DiscoveryNode findMaster() {
         List<DiscoveryNode> pingMasters = newArrayList();
         for (ZenPing.PingResponse pingResponse : pingResponses) {
             if (pingResponse.master() != null) {
-                pingMasters.add(pingResponse.master());
+                // We can't include the local node in pingMasters list, otherwise we may end up electing ourselves without
+                // any check / verifications from other nodes in ZenDiscovery#innerJoinCluster()
+                if (!localNode.equals(pingResponse.master())) {
+                    pingMasters.add(pingResponse.master());
+                }
             }
         }
 

From a40984887b175c41727c1d58e6512760bc810163 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Sun, 20 Jul 2014 21:00:25 +0300
Subject: [PATCH 51/74] [Tests] Fixed some issues with
 SlowClusterStateProcessing

Reduced expected time to heal to 0 (we interrupt and wait on stop disruption). It was also wrongly indicated in seconds.
We didn't properly wait between slow cluster state tasks.
---
 .../SlowClusterStateProcessing.java           | 37 +++++++++++++------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/src/test/java/org/elasticsearch/test/disruption/SlowClusterStateProcessing.java b/src/test/java/org/elasticsearch/test/disruption/SlowClusterStateProcessing.java
index a7f6b88592400..46ae0afe54c68 100644
--- a/src/test/java/org/elasticsearch/test/disruption/SlowClusterStateProcessing.java
+++ b/src/test/java/org/elasticsearch/test/disruption/SlowClusterStateProcessing.java
@@ -25,6 +25,7 @@
 import org.elasticsearch.common.unit.TimeValue;
 
 import java.util.Random;
+import java.util.concurrent.CountDownLatch;
 
 public class SlowClusterStateProcessing extends SingleNodeDisruption {
 
@@ -75,7 +76,9 @@ public void stopDisrupting() {
         if (worker == null) {
             return;
         }
+        logger.info("stopping to slow down cluster state processing on [{}]", disruptedNode);
         disrupting = false;
+        worker.interrupt();
         try {
             worker.join(2 * (intervalBetweenDelaysMax + delayDurationMax));
         } catch (InterruptedException e) {
@@ -85,49 +88,61 @@ public void stopDisrupting() {
     }
 
 
-    private synchronized boolean interruptClusterStateProcessing(final TimeValue duration) {
-        if (disruptedNode == null) {
+    private boolean interruptClusterStateProcessing(final TimeValue duration) throws InterruptedException {
+        final String disruptionNodeCopy = disruptedNode;
+        if (disruptionNodeCopy == null) {
+            return false;
+        }
+        logger.info("delaying cluster state updates on node [{}] for [{}]", disruptionNodeCopy, duration);
+        final CountDownLatch countDownLatch = new CountDownLatch(1);
+        ClusterService clusterService = cluster.getInstance(ClusterService.class, disruptionNodeCopy);
+        if (clusterService == null) {
             return false;
         }
-        logger.info("delaying cluster state updates on node [{}] for [{}]", disruptedNode, duration);
-        ClusterService clusterService = cluster.getInstance(ClusterService.class, disruptedNode);
         clusterService.submitStateUpdateTask("service_disruption_delay", Priority.IMMEDIATE, new ClusterStateNonMasterUpdateTask() {
 
             @Override
             public ClusterState execute(ClusterState currentState) throws Exception {
                 Thread.sleep(duration.millis());
+                countDownLatch.countDown();
                 return currentState;
             }
 
             @Override
             public void onFailure(String source, Throwable t) {
-
+                countDownLatch.countDown();
             }
         });
+        try {
+            countDownLatch.await();
+        } catch (InterruptedException e) {
+            // try to wait again, we really want the cluster state thread to be freed up when stopping disruption
+            countDownLatch.await();
+        }
         return true;
     }
 
     @Override
     public TimeValue expectedTimeToHeal() {
-        return TimeValue.timeValueSeconds(delayDurationMax + intervalBetweenDelaysMax);
+        return TimeValue.timeValueMillis(0);
     }
 
     class BackgroundWorker implements Runnable {
 
         @Override
         public void run() {
-            while (disrupting) {
+            while (disrupting && disruptedNode != null) {
                 try {
                     TimeValue duration = new TimeValue(delayDurationMin + random.nextInt((int) (delayDurationMax - delayDurationMin)));
                     if (!interruptClusterStateProcessing(duration)) {
                         continue;
                     }
-                    Thread.sleep(duration.millis());
 
-                    if (disruptedNode == null) {
-                        return;
+                    duration = new TimeValue(intervalBetweenDelaysMin + random.nextInt((int) (intervalBetweenDelaysMax - intervalBetweenDelaysMin)));
+                    if (disrupting && disruptedNode != null) {
+                        Thread.sleep(duration.millis());
                     }
-
+                } catch (InterruptedException e) {
                 } catch (Exception e) {
                     logger.error("error in background worker", e);
                 }

From ffcf1077d8f4f6b4d3630501df2b0ac7e2f8b93b Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Fri, 18 Jul 2014 08:14:38 +0200
Subject: [PATCH 52/74] [Discovery] join master after first election

Currently, pinging results are only used if the local node is elected master or if they detect another *already* active master. This has the effect that master election requires two pinging rounds - one for the elected master to take its role and another for the other nodes to detect it and join the cluster. We can be smarter and use the election of the first round on other nodes as well. Those nodes can try to join the elected master immediately. There is a catch though - the elected master node may still be processing the election and may reject the join request if not ready yet. To compensate, a retry mechanism is introduced to try again (up to 3 times by default) if this happens.

Closes #6943
---
 .../discovery/zen/ZenDiscovery.java           | 92 ++++++++++++-------
 1 file changed, 60 insertions(+), 32 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index e18672f2cb962..f1621484df973 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -99,6 +99,12 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
     private final TimeValue pingTimeout;
     private final TimeValue joinTimeout;
 
+    /** how many retry attempts to perform if join request failed with an retriable error */
+    private final int joinRetryAttempts;
+    /** how long to wait before performing another join attempt after a join request failed with an retriable error */
+    private final TimeValue joinRetryDelay;
+
+
     // a flag that should be used only for testing
     private final boolean sendLeaveRequest;
 
@@ -144,6 +150,8 @@ public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threa
         // also support direct discovery.zen settings, for cases when it gets extended
         this.pingTimeout = settings.getAsTime("discovery.zen.ping.timeout", settings.getAsTime("discovery.zen.ping_timeout", componentSettings.getAsTime("ping_timeout", componentSettings.getAsTime("initial_ping_timeout", timeValueSeconds(3)))));
         this.joinTimeout = settings.getAsTime("discovery.zen.join_timeout", TimeValue.timeValueMillis(pingTimeout.millis() * 20));
+        this.joinRetryAttempts = settings.getAsInt("discovery.zen.join_retry_attempts", 3);
+        this.joinRetryDelay = settings.getAsTime("discovery.zen.join_retry_delay", TimeValue.timeValueMillis(100));
         this.sendLeaveRequest = componentSettings.getAsBoolean("send_leave_request", true);
 
         this.masterElectionFilterClientNodes = settings.getAsBoolean("discovery.zen.master_election.filter_client", true);
@@ -350,30 +358,12 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS
                 });
             } else {
                 this.master = false;
-                try {
-                    // first, make sure we can connect to the master
-                    transportService.connectToNode(masterNode);
-                } catch (Exception e) {
-                    logger.warn("failed to connect to master [{}], retrying...", e, masterNode);
-                    retry = true;
-                    continue;
-                }
                 // send join request
-                try {
-                    membership.sendJoinRequestBlocking(masterNode, localNode, joinTimeout);
-                } catch (Exception e) {
-                    if (e instanceof ElasticsearchException) {
-                        logger.info("failed to send join request to master [{}], reason [{}]", masterNode, ((ElasticsearchException) e).getDetailedMessage());
-                    } else {
-                        logger.info("failed to send join request to master [{}], reason [{}]", masterNode, e.getMessage());
-                    }
-                    if (logger.isTraceEnabled()) {
-                        logger.trace("detailed failed reason", e);
-                    }
-                    // failed to send the join request, retry
-                    retry = true;
+                retry = !joinElectedMaster(masterNode);
+                if (retry) {
                     continue;
                 }
+
                 masterFD.start(masterNode, "initial_join");
                 // no need to submit the received cluster state, we will get it from the master when it publishes
                 // the fact that we joined
@@ -381,6 +371,52 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS
         }
     }
 
+    /**
+     * Connects to the elected master and sends it a join request. A request that fails with an
+     * {@link ElasticsearchIllegalStateException} is retried, up to joinRetryAttempts attempts, joinRetryDelay apart.
+     * @param masterNode the node that was elected as master
+     * @return true if a join request succeeded; false on any other failure, exhausted attempts or interruption
+     */
+    private boolean joinElectedMaster(DiscoveryNode masterNode) {
+        try {
+            // first, make sure we can connect to the master
+            transportService.connectToNode(masterNode);
+        } catch (Exception e) {
+            logger.warn("failed to connect to master [{}], retrying...", e, masterNode);
+            return false;
+        }
+        for (int joinAttempt = 0; joinAttempt < this.joinRetryAttempts; joinAttempt++) {
+            try {
+                logger.trace("joining master {}", masterNode);
+                membership.sendJoinRequestBlocking(masterNode, localNode, joinTimeout);
+                return true;
+            } catch (ElasticsearchIllegalStateException e) {
+                // joinAttempt is zero-based: once joinAttempt + 1 reaches the limit there is nothing left to retry
+                if (joinAttempt + 1 >= this.joinRetryAttempts) {
+                    logger.info("failed to send join request to master [{}], reason [{}]. Tried [{}] times", masterNode, e.getDetailedMessage(), joinAttempt + 1);
+                    return false;
+                }
+                logger.trace("master {} failed with [{}]. retrying... (attempts done: [{}])", masterNode, e.getDetailedMessage(), joinAttempt + 1);
+            } catch (Exception e) {
+                if (logger.isTraceEnabled()) {
+                    logger.trace("failed to send join request to master [{}]", e, masterNode);
+                } else if (e instanceof ElasticsearchException) {
+                    logger.info("failed to send join request to master [{}], reason [{}]", masterNode, ((ElasticsearchException) e).getDetailedMessage());
+                } else {
+                    logger.info("failed to send join request to master [{}], reason [{}]", masterNode, e.getMessage());
+                }
+                return false;
+            }
+            try {
+                Thread.sleep(this.joinRetryDelay.millis());
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                return false; // interrupted while waiting: give up instead of retrying with the interrupt flag set
+            }
+        }
+        return false;
+    }
+
     private void handleLeaveRequest(final DiscoveryNode node) {
         if (lifecycleState() != Lifecycle.State.STARTED) {
             // not started, ignore a node failure
@@ -887,17 +923,10 @@ private DiscoveryNode findMaster() {
                 return null;
             }
             // lets tie break between discovered nodes
-            DiscoveryNode electedMaster = electMaster.electMaster(possibleMasterNodes);
-            if (localNode.equals(electedMaster)) {
-                return localNode;
-            }
+            return electMaster.electMaster(possibleMasterNodes);
         } else {
-            DiscoveryNode electedMaster = electMaster.electMaster(pingMasters);
-            if (electedMaster != null) {
-                return electedMaster;
-            }
+            return electMaster.electMaster(pingMasters);
         }
-        return null;
     }
 
     private ClusterState rejoin(ClusterState clusterState, String reason) {
@@ -1028,8 +1057,7 @@ public ClusterState execute(ClusterState currentState) {
                 public void onFailure(String source, Throwable t) {
                     if (t instanceof ClusterService.NoLongerMasterException) {
                         logger.debug("not processing [{}] as we are no longer master", source);
-                    }
-                    else {
+                    } else {
                         logger.error("unexpected failure during [{}]", t, source);
                     }
                 }

From cccd060a0c1e052b261a15b4edf671b10e13ee1d Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Wed, 23 Jul 2014 01:03:18 +0300
Subject: [PATCH 53/74] [Discovery] verify we have a master after a successful
 join request

After master election, nodes send join requests to the elected master. Master is then responsible for publishing a new cluster state which sets the master on the local node's cluster state. If something goes wrong with the cluster state publishing, this process will not successfully complete. We should check it after the join request returns and if it failed, retry pinging.

Closes #6969
---
 .../discovery/zen/ZenDiscovery.java           |  6 ++
 .../DiscoveryWithNetworkFailuresTests.java    | 64 ++++++++++++++++++-
 .../test/transport/MockTransportService.java  | 11 +++-
 3 files changed, 77 insertions(+), 4 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index f1621484df973..fa40467da81bf 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -364,6 +364,12 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS
                     continue;
                 }
 
+                if (latestDiscoNodes.masterNode() == null) {
+                    logger.debug("no master node is set, despite of join request completing. retrying pings");
+                    retry = true;
+                    continue;
+                }
+
                 masterFD.start(masterNode, "initial_join");
                 // no need to submit the received cluster state, we will get it from the master when it publishes
                 // the fact that we joined
diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
index b03c6ea6b660d..a08ba12f4fb56 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
@@ -25,10 +25,13 @@
 import org.elasticsearch.action.get.GetResponse;
 import org.elasticsearch.action.index.IndexResponse;
 import org.elasticsearch.client.Client;
+import org.elasticsearch.cluster.ClusterService;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.block.ClusterBlock;
 import org.elasticsearch.cluster.block.ClusterBlockLevel;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.routing.operation.hash.djb.DjbHashFunction;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.Priority;
@@ -45,10 +48,11 @@
 import org.elasticsearch.test.disruption.*;
 import org.elasticsearch.test.junit.annotations.TestLogging;
 import org.elasticsearch.test.transport.MockTransportService;
-import org.elasticsearch.transport.TransportModule;
+import org.elasticsearch.transport.*;
 import org.junit.Before;
 import org.junit.Test;
 
+import java.io.IOException;
 import java.util.*;
 import java.util.concurrent.*;
 import java.util.concurrent.atomic.AtomicBoolean;
@@ -93,10 +97,14 @@ protected int numberOfReplicas() {
     }
 
     private List<String> startCluster(int numberOfNodes) throws ExecutionException, InterruptedException {
+        return startCluster(numberOfNodes, -1);
+    }
+
+    private List<String> startCluster(int numberOfNodes, int minimumMasterNode) throws ExecutionException, InterruptedException {
         if (randomBoolean()) {
-            return startMulticastCluster(numberOfNodes, -1);
+            return startMulticastCluster(numberOfNodes, minimumMasterNode);
         } else {
-            return startUnicastCluster(numberOfNodes, null, -1);
+            return startUnicastCluster(numberOfNodes, null, minimumMasterNode);
         }
     }
 
@@ -585,6 +593,56 @@ public void unicastSinglePingResponseContainsMaster() throws Exception {
         assertMaster(masterNode, nodes);
     }
 
+
+    /** Test cluster join with issues in cluster state publishing * */
+    @Test
+    @TestLogging("discovery.zen:TRACE,action:TRACE")
+    public void testClusterJoinDespiteOfPublishingIssues() throws Exception {
+        List<String> nodes = startCluster(2, 1);
+
+        String masterNode = internalCluster().getMasterName();
+        String nonMasterNode;
+        if (masterNode.equals(nodes.get(0))) {
+            nonMasterNode = nodes.get(1);
+        } else {
+            nonMasterNode = nodes.get(0);
+        }
+
+        DiscoveryNodes discoveryNodes = internalCluster().getInstance(ClusterService.class, nonMasterNode).state().nodes();
+
+        logger.info("blocking requests from non master [{}] to master [{}]", nonMasterNode, masterNode);
+        MockTransportService nonMasterTransportService = (MockTransportService) internalCluster().getInstance(TransportService.class, nonMasterNode);
+        nonMasterTransportService.addFailToSendNoConnectRule(discoveryNodes.masterNode());
+
+        assertNoMaster(nonMasterNode);
+
+        logger.info("blocking cluster state publishing from master [{}] to non master [{}]", masterNode, nonMasterNode);
+        MockTransportService masterTransportService = (MockTransportService) internalCluster().getInstance(TransportService.class, masterNode);
+        masterTransportService.addFailToSendNoConnectRule(discoveryNodes.localNode(), "discovery/zen/publish");
+
+        logger.info("allowing requests from non master [{}] to master [{}], waiting for two join request", nonMasterNode, masterNode);
+        final CountDownLatch countDownLatch = new CountDownLatch(2);
+        nonMasterTransportService.addDelegate(discoveryNodes.masterNode(), new MockTransportService.DelegateTransport(nonMasterTransportService.original()) {
+            @Override
+            public void sendRequest(DiscoveryNode node, long requestId, String action, TransportRequest request, TransportRequestOptions options) throws IOException, TransportException {
+                if (action.equals("discovery/zen/join")) {
+                    countDownLatch.countDown();
+                }
+                super.sendRequest(node, requestId, action, request, options);
+            }
+        });
+
+        countDownLatch.await();
+
+        logger.info("waiting for cluster to reform");
+        masterTransportService.clearRule(discoveryNodes.localNode());
+        nonMasterTransportService.clearRule(discoveryNodes.masterNode());
+
+        ensureStableCluster(2);
+
+    }
+
+
     protected NetworkPartition addRandomPartition() {
         NetworkPartition partition;
         if (randomBoolean()) {
diff --git a/src/test/java/org/elasticsearch/test/transport/MockTransportService.java b/src/test/java/org/elasticsearch/test/transport/MockTransportService.java
index 3f180f9c5e5bb..cf088bab4763f 100644
--- a/src/test/java/org/elasticsearch/test/transport/MockTransportService.java
+++ b/src/test/java/org/elasticsearch/test/transport/MockTransportService.java
@@ -37,6 +37,8 @@
 import org.elasticsearch.transport.*;
 
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.Set;
 import java.util.concurrent.ConcurrentMap;
 
@@ -98,6 +100,13 @@ public void sendRequest(DiscoveryNode node, long requestId, String action, Trans
         });
     }
 
+    /**
+     * Adds a rule that will cause matching operations to throw ConnectTransportExceptions
+     */
+    public void addFailToSendNoConnectRule(DiscoveryNode node, final String... blockedActions) {
+        addFailToSendNoConnectRule(node, new HashSet<>(Arrays.asList(blockedActions)));
+    }
+
     /**
      * Adds a rule that will cause matching operations to throw ConnectTransportExceptions
      */
@@ -307,11 +316,11 @@ public static class DelegateTransport implements Transport {
 
         protected final Transport transport;
 
+
         public DelegateTransport(Transport transport) {
             this.transport = transport;
         }
 
-
         @Override
         public void transportServiceAdapter(TransportServiceAdapter service) {
             transport.transportServiceAdapter(service);

From 0244ddb0cda278b9c48326985304e26c60cb858f Mon Sep 17 00:00:00 2001
From: Shay Banon <kimchy@gmail.com>
Date: Wed, 23 Jul 2014 00:46:35 +0200
Subject: [PATCH 54/74] retry logic to unwrap exception to check for illegal
 state; it probably comes wrapped in a remote exception, which we should unwrap
 in order to detect it. Also simplified the retry logic a bit

---
 .../discovery/zen/ZenDiscovery.java           | 35 ++++++++++---------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index fa40467da81bf..5149c4e3b3ef5 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -24,6 +24,7 @@
 import com.google.common.collect.Sets;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ElasticsearchIllegalStateException;
+import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.Version;
 import org.elasticsearch.cluster.*;
 import org.elasticsearch.cluster.block.ClusterBlocks;
@@ -390,28 +391,29 @@ private boolean joinElectedMaster(DiscoveryNode masterNode) {
             logger.warn("failed to connect to master [{}], retrying...", e, masterNode);
             return false;
         }
-        for (int joinAttempt = 0; joinAttempt < this.joinRetryAttempts; joinAttempt++) {
+        int joinAttempt = 0; // we retry on illegal state if the master is not yet ready
+        while (true) {
             try {
                 logger.trace("joining master {}", masterNode);
                 membership.sendJoinRequestBlocking(masterNode, localNode, joinTimeout);
                 return true;
-            } catch (ElasticsearchIllegalStateException e) {
-                if (joinAttempt >= this.joinRetryAttempts) {
-                    logger.info("failed to send join request to master [{}], reason [{}]. Tried [{}] times",
-                            masterNode, e.getDetailedMessage(), joinAttempt + 1);
-                    return false;
-                } else {
-                    logger.trace("master {} failed with [{}]. retrying... (attempts done: [{}])", masterNode, e.getDetailedMessage(), joinAttempt + 1);
-                }
-            } catch (Exception e) {
-                if (logger.isTraceEnabled()) {
-                    logger.trace("failed to send join request to master [{}]", e);
-                } else if (e instanceof ElasticsearchException) {
-                    logger.info("failed to send join request to master [{}], reason [{}]", masterNode, ((ElasticsearchException) e).getDetailedMessage());
+            } catch (Throwable t) {
+                Throwable unwrap = ExceptionsHelper.unwrapCause(t);
+                if (unwrap instanceof ElasticsearchIllegalStateException) {
+                    if (++joinAttempt == this.joinRetryAttempts) {
+                        logger.info("failed to send join request to master [{}], reason [{}], tried [{}] times", masterNode, ExceptionsHelper.detailedMessage(t), joinAttempt);
+                        return false;
+                    } else {
+                        logger.trace("master {} failed with [{}]. retrying... (attempts done: [{}])", masterNode, ExceptionsHelper.detailedMessage(t), joinAttempt);
+                    }
                 } else {
-                    logger.info("failed to send join request to master [{}], reason [{}]", masterNode, e.getMessage());
+                    if (logger.isTraceEnabled()) {
+                        logger.trace("failed to send join request to master [{}]", t);
+                    } else {
+                        logger.info("failed to send join request to master [{}], reason [{}]", masterNode, ExceptionsHelper.detailedMessage(t));
+                    }
+                    return false;
                 }
-                return false;
             }
 
             try {
@@ -420,7 +422,6 @@ private boolean joinElectedMaster(DiscoveryNode masterNode) {
                 Thread.currentThread().interrupt();
             }
         }
-        return false;
     }
 
     private void handleLeaveRequest(final DiscoveryNode node) {

From 130e680cfbce810a4c70d1cb724d0ec25cd87d55 Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Wed, 23 Jul 2014 20:29:27 +0200
Subject: [PATCH 55/74] [Discovery] Made the handling of the join request
 batch oriented.

In large clusters, when a new master is elected, there are many join requests to handle. By batching them up, the cluster state doesn't get published for each individual join request; instead many join requests are handled at the same time, which results in a single new cluster state being published.

Closes #6984
---
 .../discovery/zen/ZenDiscovery.java           | 56 +++++++++++++------
 1 file changed, 39 insertions(+), 17 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 5149c4e3b3ef5..7da058ca2af15 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -36,6 +36,7 @@
 import org.elasticsearch.cluster.routing.allocation.AllocationService;
 import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
 import org.elasticsearch.common.Priority;
+import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.common.component.AbstractLifecycleComponent;
 import org.elasticsearch.common.component.Lifecycle;
 import org.elasticsearch.common.inject.Inject;
@@ -129,10 +130,11 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
 
     private volatile boolean rejoinOnMasterGone;
 
-
     @Nullable
     private NodeService nodeService;
 
+    private final BlockingQueue<Tuple<DiscoveryNode, MembershipAction.JoinCallback>> processJoinRequests = ConcurrentCollections.newBlockingQueue();
+
     @Inject
     public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threadPool,
                         TransportService transportService, ClusterService clusterService, NodeSettingsService nodeSettingsService,
@@ -822,26 +824,42 @@ private void handleJoinRequest(final DiscoveryNode node, final MembershipAction.
             // validate the join request, will throw a failure if it fails, which will get back to the
             // node calling the join request
             membership.sendValidateJoinRequestBlocking(node, joinTimeout);
-
+            processJoinRequests.add(new Tuple<>(node, callback));
             clusterService.submitStateUpdateTask("zen-disco-receive(join from node[" + node + "])", Priority.IMMEDIATE, new ProcessedClusterStateUpdateTask() {
+
+                private final List<Tuple<DiscoveryNode, MembershipAction.JoinCallback>> drainedTasks = new ArrayList<>();
+
                 @Override
                 public ClusterState execute(ClusterState currentState) {
-                    if (currentState.nodes().nodeExists(node.id())) {
-                        // the node already exists in the cluster
-                        logger.info("received a join request for an existing node [{}]", node);
-                        // still send a new cluster state, so it will be re published and possibly update the other node
-                        return ClusterState.builder(currentState).build();
+                    processJoinRequests.drainTo(drainedTasks);
+                    if (drainedTasks.isEmpty()) {
+                        return currentState;
                     }
-                    DiscoveryNodes.Builder builder = DiscoveryNodes.builder(currentState.nodes());
-                    for (DiscoveryNode existingNode : currentState.nodes()) {
-                        if (node.address().equals(existingNode.address())) {
-                            builder.remove(existingNode.id());
-                            logger.warn("received join request from node [{}], but found existing node {} with same address, removing existing node", node, existingNode);
+
+                    boolean modified = false;
+                    DiscoveryNodes.Builder nodesBuilder = DiscoveryNodes.builder(currentState.nodes());
+                    for (Tuple<DiscoveryNode, MembershipAction.JoinCallback> task : drainedTasks) {
+                        DiscoveryNode node = task.v1();
+                        if (currentState.nodes().nodeExists(node.id())) {
+                            logger.debug("received a join request for an existing node [{}]", node);
+                        } else {
+                            modified = true;
+                            nodesBuilder.put(node);
+                            for (DiscoveryNode existingNode : currentState.nodes()) {
+                                if (node.address().equals(existingNode.address())) {
+                                    nodesBuilder.remove(existingNode.id());
+                                    logger.warn("received join request from node [{}], but found existing node {} with same address, removing existing node", node, existingNode);
+                                }
+                            }
                         }
                     }
-                    latestDiscoNodes = builder.build();
-                    // add the new node now (will update latestDiscoNodes on publish)
-                    return ClusterState.builder(currentState).nodes(latestDiscoNodes.newNode(node)).build();
+
+                    ClusterState.Builder stateBuilder = ClusterState.builder(currentState);
+                    if (modified) {
+                        latestDiscoNodes = nodesBuilder.build();
+                        stateBuilder.nodes(latestDiscoNodes);
+                    }
+                    return stateBuilder.build();
                 }
 
                 @Override
@@ -851,12 +869,16 @@ public void onFailure(String source, Throwable t) {
                     } else {
                         logger.error("unexpected failure during [{}]", t, source);
                     }
-                    callback.onFailure(t);
+                    for (Tuple<DiscoveryNode, MembershipAction.JoinCallback> drainedTask : drainedTasks) {
+                        drainedTask.v2().onFailure(t);
+                    }
                 }
 
                 @Override
                 public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
-                    callback.onSuccess();
+                    for (Tuple<DiscoveryNode, MembershipAction.JoinCallback> drainedTask : drainedTasks) {
+                        drainedTask.v2().onSuccess();
+                    }
                 }
             });
         }

From 364374dd03ee5986926e03846013552c43b4f5e7 Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Fri, 25 Jul 2014 17:31:45 +0200
Subject: [PATCH 56/74] [TEST] Added test that verifies that no shard
 relocations happen during / after a master re-election.

---
 .../recovery/TransportRecoveryAction.java     |  4 +-
 .../zen/ZenDiscoveryRejoinOnMaster.java       | 54 ++++++++++++++++++-
 2 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/src/main/java/org/elasticsearch/action/admin/indices/recovery/TransportRecoveryAction.java b/src/main/java/org/elasticsearch/action/admin/indices/recovery/TransportRecoveryAction.java
index c0e9a65de346b..2e54d5cf181ce 100644
--- a/src/main/java/org/elasticsearch/action/admin/indices/recovery/TransportRecoveryAction.java
+++ b/src/main/java/org/elasticsearch/action/admin/indices/recovery/TransportRecoveryAction.java
@@ -173,12 +173,12 @@ protected GroupShardsIterator shards(ClusterState state, RecoveryRequest request
 
     @Override
     protected ClusterBlockException checkGlobalBlock(ClusterState state, RecoveryRequest request) {
-        return state.blocks().globalBlockedException(ClusterBlockLevel.METADATA);
+        return state.blocks().globalBlockedException(ClusterBlockLevel.READ);
     }
 
     @Override
     protected ClusterBlockException checkRequestBlock(ClusterState state, RecoveryRequest request, String[] concreteIndices) {
-        return state.blocks().indicesBlockedException(ClusterBlockLevel.METADATA, concreteIndices);
+        return state.blocks().indicesBlockedException(ClusterBlockLevel.READ, concreteIndices);
     }
 
     static class ShardRecoveryRequest extends BroadcastShardOperationRequest {
diff --git a/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryRejoinOnMaster.java b/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryRejoinOnMaster.java
index 83cc76af5f3b9..31cede9260254 100644
--- a/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryRejoinOnMaster.java
+++ b/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryRejoinOnMaster.java
@@ -19,13 +19,16 @@
 
 package org.elasticsearch.discovery.zen;
 
+import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
+import org.elasticsearch.action.admin.indices.recovery.RecoveryResponse;
+import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.discovery.Discovery;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
 import org.junit.Test;
 
-import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.*;
 
 /**
  */
@@ -33,7 +36,7 @@
 public class ZenDiscoveryRejoinOnMaster extends ElasticsearchIntegrationTest {
 
     @Test
-    public void testChangeRejoinOnMaster() throws Exception {
+    public void testChangeRejoinOnMasterOptionIsDynamic() throws Exception {
         Settings nodeSettings = ImmutableSettings.settingsBuilder()
                 .put("discovery.type", "zen") // <-- To override the local setting if set externally
                 .build();
@@ -48,4 +51,51 @@ public void testChangeRejoinOnMaster() throws Exception {
         assertThat(zenDiscovery.isRejoinOnMasterGone(), is(false));
     }
 
+    @Test
+    public void testNoShardRelocationsOccurWhenElectedMasterNodeFails() throws Exception {
+        Settings defaultSettings = ImmutableSettings.builder()
+                .put("discovery.zen.fd.ping_timeout", "1s")
+                .put("discovery.zen.fd.ping_retries", "1")
+                .put("discovery.type", "zen")
+                .build();
+
+        Settings masterNodeSettings = ImmutableSettings.builder()
+                .put("node.data", false)
+                .put(defaultSettings)
+                .build();
+        internalCluster().startNodesAsync(2, masterNodeSettings).get();
+        Settings dateNodeSettings = ImmutableSettings.builder()
+                .put("node.master", false)
+                .put(defaultSettings)
+                .build();
+        internalCluster().startNodesAsync(2, dateNodeSettings).get();
+        ClusterHealthResponse clusterHealthResponse = client().admin().cluster().prepareHealth()
+                .setWaitForEvents(Priority.LANGUID)
+                .setWaitForNodes("4")
+                .setWaitForRelocatingShards(0)
+                .get();
+        assertThat(clusterHealthResponse.isTimedOut(), is(false));
+
+        createIndex("test");
+        ensureSearchable("test");
+        RecoveryResponse r = client().admin().indices().prepareRecoveries("test").get();
+        int numRecoveriesBeforeNewMaster = r.shardResponses().get("test").size();
+
+        final String oldMaster = internalCluster().getMasterName();
+        internalCluster().stopCurrentMasterNode();
+        assertBusy(new Runnable() {
+            @Override
+            public void run() {
+                String current = internalCluster().getMasterName();
+                assertThat(current, notNullValue());
+                assertThat(current, not(equalTo(oldMaster)));
+            }
+        });
+        ensureSearchable("test");
+
+        r = client().admin().indices().prepareRecoveries("test").get();
+        int numRecoveriesAfterNewMaster = r.shardResponses().get("test").size();
+        assertThat(numRecoveriesAfterNewMaster, equalTo(numRecoveriesBeforeNewMaster));
+    }
+
 }

From 4b8456e9540bfb351c1ed9acdf4c292f4b10931e Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Mon, 28 Jul 2014 16:04:25 +0200
Subject: [PATCH 57/74] [Discovery] Master fault detection and nodes fault
 detection should take cluster name into account.

Both master fault detection and nodes fault detection requests should also send the cluster name, so that the receiving side can fail the handling of these requests with an error when the cluster name doesn't match its own. This error can be caught on the sending side: for master fault detection the node can fail the master locally, and for nodes fault detection the pinged node can be failed.

Note that this validation will most likely never fail in a production cluster, but it can fail during automated tests, where clusters / nodes are created and destroyed very frequently.
---
 .../discovery/zen/ZenDiscovery.java           |  4 +-
 .../zen/fd/MasterFaultDetection.java          | 40 ++++++++++++++++---
 .../discovery/zen/fd/NodesFaultDetection.java | 39 ++++++++++++++----
 .../discovery/ZenFaultDetectionTests.java     |  7 +++-
 4 files changed, 74 insertions(+), 16 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 7da058ca2af15..07028f19a36a7 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -166,10 +166,10 @@ public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threa
         this.electMaster = new ElectMasterService(settings);
         nodeSettingsService.addListener(new ApplySettings());
 
-        this.masterFD = new MasterFaultDetection(settings, threadPool, transportService, this);
+        this.masterFD = new MasterFaultDetection(settings, threadPool, transportService, this, clusterName);
         this.masterFD.addListener(new MasterNodeFailureListener());
 
-        this.nodesFD = new NodesFaultDetection(settings, threadPool, transportService);
+        this.nodesFD = new NodesFaultDetection(settings, threadPool, transportService, clusterName);
         this.nodesFD.addListener(new NodeFailureListener());
 
         this.publishClusterState = new PublishClusterStateAction(settings, transportService, this, new NewClusterStateListener(), discoverySettings);
diff --git a/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java b/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
index b601884002cd4..b4f635184e7ea 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
@@ -20,6 +20,8 @@
 package org.elasticsearch.discovery.zen.fd;
 
 import org.elasticsearch.ElasticsearchIllegalStateException;
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.ClusterName;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.common.component.AbstractComponent;
@@ -58,6 +60,8 @@ public static interface Listener {
 
     private final DiscoveryNodesProvider nodesProvider;
 
+    private final ClusterName clusterName;
+
     private final CopyOnWriteArrayList<Listener> listeners = new CopyOnWriteArrayList<>();
 
 
@@ -85,11 +89,13 @@ public static interface Listener {
 
     private final AtomicBoolean notifiedMasterFailure = new AtomicBoolean();
 
-    public MasterFaultDetection(Settings settings, ThreadPool threadPool, TransportService transportService, DiscoveryNodesProvider nodesProvider) {
+    public MasterFaultDetection(Settings settings, ThreadPool threadPool, TransportService transportService,
+                                DiscoveryNodesProvider nodesProvider, ClusterName clusterName) {
         super(settings);
         this.threadPool = threadPool;
         this.transportService = transportService;
         this.nodesProvider = nodesProvider;
+        this.clusterName = clusterName;
 
         this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", false);
         this.pingInterval = componentSettings.getAsTime("ping_interval", timeValueSeconds(1));
@@ -270,8 +276,10 @@ public void run() {
                 threadPool.schedule(pingInterval, ThreadPool.Names.SAME, MasterPinger.this);
                 return;
             }
-            transportService.sendRequest(masterToPing, MASTER_PING_ACTION_NAME, new MasterPingRequest(nodesProvider.nodes().localNode().id(), masterToPing.id()), options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout),
-                    new BaseTransportResponseHandler<MasterPingResponseResponse>() {
+            final MasterPingRequest request = new MasterPingRequest(nodesProvider.nodes().localNode().id(), masterToPing.id(), clusterName);
+            final TransportRequestOptions options = options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout);
+            transportService.sendRequest(masterToPing, MASTER_PING_ACTION_NAME, request, options, new BaseTransportResponseHandler<MasterPingResponseResponse>() {
+
                         @Override
                         public MasterPingResponseResponse newInstance() {
                             return new MasterPingResponseResponse();
@@ -328,7 +336,7 @@ public void handleException(TransportException exp) {
                                         notifyMasterFailure(masterToPing, "failed to ping, tried [" + pingRetryCount + "] times, each with  maximum [" + pingRetryTimeout + "] timeout");
                                     } else {
                                         // resend the request, not reschedule, rely on send timeout
-                                        transportService.sendRequest(masterToPing, MASTER_PING_ACTION_NAME, new MasterPingRequest(nodesProvider.nodes().localNode().id(), masterToPing.id()), options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout), this);
+                                        transportService.sendRequest(masterToPing, MASTER_PING_ACTION_NAME, request, options, this);
                                     }
                                 }
                             }
@@ -351,6 +359,14 @@ public Throwable fillInStackTrace() {
     }
 
     static class NotMasterException extends ElasticsearchIllegalStateException {
+
+        NotMasterException(String msg) {
+            super(msg);
+        }
+
+        NotMasterException() {
+        }
+
         @Override
         public Throwable fillInStackTrace() {
             return null;
@@ -379,6 +395,12 @@ public void messageReceived(MasterPingRequest request, TransportChannel channel)
             if (!request.masterNodeId.equals(nodes.localNodeId())) {
                 throw new NotMasterException();
             }
+
+            if (request.clusterName != null && !request.clusterName.equals(clusterName)) {
+                logger.trace("master fault detection ping request is targeted for a different [{}] cluster then us [{}]", request.clusterName, clusterName);
+                throw new NotMasterException("master fault detection ping request is targeted for a different [" + request.clusterName + "] cluster then us [" + clusterName + "]");
+            }
+
             // if we are no longer master, fail...
             if (!nodes.localNodeMaster()) {
                 throw new NoLongerMasterException();
@@ -402,13 +424,15 @@ private static class MasterPingRequest extends TransportRequest {
         private String nodeId;
 
         private String masterNodeId;
+        private ClusterName clusterName;
 
         private MasterPingRequest() {
         }
 
-        private MasterPingRequest(String nodeId, String masterNodeId) {
+        private MasterPingRequest(String nodeId, String masterNodeId, ClusterName clusterName) {
             this.nodeId = nodeId;
             this.masterNodeId = masterNodeId;
+            this.clusterName = clusterName;
         }
 
         @Override
@@ -416,6 +440,9 @@ public void readFrom(StreamInput in) throws IOException {
             super.readFrom(in);
             nodeId = in.readString();
             masterNodeId = in.readString();
+            if (in.getVersion().onOrAfter(Version.V_1_4_0)) {
+                clusterName = ClusterName.readClusterName(in);
+            }
         }
 
         @Override
@@ -423,6 +450,9 @@ public void writeTo(StreamOutput out) throws IOException {
             super.writeTo(out);
             out.writeString(nodeId);
             out.writeString(masterNodeId);
+            if (out.getVersion().onOrAfter(Version.V_1_4_0)) {
+                clusterName.writeTo(out);
+            }
         }
     }
 
diff --git a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
index b808e080f2103..87dfe2ba2f7ac 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
@@ -20,6 +20,8 @@
 package org.elasticsearch.discovery.zen.fd;
 
 import org.elasticsearch.ElasticsearchIllegalStateException;
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.ClusterName;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.common.component.AbstractComponent;
@@ -54,6 +56,7 @@ public static interface Listener {
     private final ThreadPool threadPool;
 
     private final TransportService transportService;
+    private final ClusterName clusterName;
 
 
     private final boolean connectOnNetworkDisconnect;
@@ -78,10 +81,11 @@ public static interface Listener {
 
     private volatile boolean running = false;
 
-    public NodesFaultDetection(Settings settings, ThreadPool threadPool, TransportService transportService) {
+    public NodesFaultDetection(Settings settings, ThreadPool threadPool, TransportService transportService, ClusterName clusterName) {
         super(settings);
         this.threadPool = threadPool;
         this.transportService = transportService;
+        this.clusterName = clusterName;
 
         this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", false);
         this.pingInterval = componentSettings.getAsTime("ping_interval", timeValueSeconds(1));
@@ -204,8 +208,9 @@ public void run() {
             if (!running) {
                 return;
             }
-            transportService.sendRequest(node, PING_ACTION_NAME, new PingRequest(node.id()), options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout),
-                    new BaseTransportResponseHandler<PingResponse>() {
+            final PingRequest pingRequest = new PingRequest(node.id(), clusterName);
+            final TransportRequestOptions options = options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout);
+            transportService.sendRequest(node, PING_ACTION_NAME, pingRequest, options, new BaseTransportResponseHandler<PingResponse>() {
                         @Override
                         public PingResponse newInstance() {
                             return new PingResponse();
@@ -252,8 +257,7 @@ public void handleException(TransportException exp) {
                                     }
                                 } else {
                                     // resend the request, not reschedule, rely on send timeout
-                                    transportService.sendRequest(node, PING_ACTION_NAME, new PingRequest(node.id()),
-                                            options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout), this);
+                                    transportService.sendRequest(node, PING_ACTION_NAME, pingRequest, options, this);
                                 }
                             }
                         }
@@ -298,6 +302,10 @@ public void messageReceived(PingRequest request, TransportChannel channel) throw
             if (!latestNodes.localNodeId().equals(request.nodeId)) {
                 throw new ElasticsearchIllegalStateException("Got pinged as node [" + request.nodeId + "], but I am node [" + latestNodes.localNodeId() + "]");
             }
+            if (request.clusterName != null && !request.clusterName.equals(clusterName)) {
+                // Don't introduce new exception for bwc reasons
+                throw new ElasticsearchIllegalStateException("Got pinged with cluster name [" + request.clusterName + "], but I'm part of cluster [" + clusterName + "]");
+            }
             channel.sendResponse(new PingResponse());
         }
 
@@ -308,28 +316,45 @@ public String executor() {
     }
 
 
-    static class PingRequest extends TransportRequest {
+    public static class PingRequest extends TransportRequest {
 
         // the (assumed) node id we are pinging
         private String nodeId;
 
+        private ClusterName clusterName;
+
         PingRequest() {
         }
 
-        PingRequest(String nodeId) {
+        PingRequest(String nodeId, ClusterName clusterName) {
             this.nodeId = nodeId;
+            this.clusterName = clusterName;
+        }
+
+        public String nodeId() {
+            return nodeId;
+        }
+
+        public ClusterName clusterName() {
+            return clusterName;
         }
 
         @Override
         public void readFrom(StreamInput in) throws IOException {
             super.readFrom(in);
             nodeId = in.readString();
+            if (in.getVersion().onOrAfter(Version.V_1_4_0)) {
+                clusterName = ClusterName.readClusterName(in);
+            }
         }
 
         @Override
         public void writeTo(StreamOutput out) throws IOException {
             super.writeTo(out);
             out.writeString(nodeId);
+            if (out.getVersion().onOrAfter(Version.V_1_4_0)) {
+                clusterName.writeTo(out);
+            }
         }
     }
 
diff --git a/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java b/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java
index 3f65ed1591eb3..457e7a5b4cd3d 100644
--- a/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java
+++ b/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java
@@ -21,6 +21,7 @@
 
 import com.google.common.collect.ImmutableMap;
 import org.elasticsearch.Version;
+import org.elasticsearch.cluster.ClusterName;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.common.settings.ImmutableSettings;
@@ -131,7 +132,7 @@ public void testNodesFaultDetectionConnectOnDisconnect() throws InterruptedExcep
         boolean shouldRetry = randomBoolean();
         // make sure we don't ping
         settings.put("discovery.zen.fd.connect_on_network_disconnect", shouldRetry).put("discovery.zen.fd.ping_interval", "5m");
-        NodesFaultDetection nodesFD = new NodesFaultDetection(settings.build(), threadPool, serviceA);
+        NodesFaultDetection nodesFD = new NodesFaultDetection(settings.build(), threadPool, serviceA, new ClusterName("test"));
         nodesFD.start();
         nodesFD.updateNodes(buildNodesForA(true));
         final String[] failureReason = new String[1];
@@ -165,6 +166,7 @@ public void testMasterFaultDetectionConnectOnDisconnect() throws InterruptedExce
         boolean shouldRetry = randomBoolean();
         // make sure we don't ping
         settings.put("discovery.zen.fd.connect_on_network_disconnect", shouldRetry).put("discovery.zen.fd.ping_interval", "5m");
+        ClusterName clusterName = new ClusterName(randomAsciiOfLengthBetween(3, 20));
         final DiscoveryNodes nodes = buildNodesForA(false);
         MasterFaultDetection masterFD = new MasterFaultDetection(settings.build(), threadPool, serviceA,
                 new DiscoveryNodesProvider() {
@@ -177,7 +179,8 @@ public DiscoveryNodes nodes() {
                     public NodeService nodeService() {
                         return null;
                     }
-                }
+                },
+                clusterName
         );
         masterFD.start(nodeB, "test");
 

From 50f852ffeb93a09c086738dfd69eb60cd8b4da03 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Tue, 29 Jul 2014 21:16:25 +0200
Subject: [PATCH 58/74] [TEST] Added LongGCDisruption and a test simulating GC
 on master nodes. Also rename DiscoveryWithNetworkFailuresTests to
 DiscoveryWithServiceDisruptions, which better suits what we do.

---
 pom.xml                                       |   5 +
 .../discovery/zen/ZenDiscovery.java           |  11 +-
 .../ClusterDiscoveryConfiguration.java        |   3 +-
 ...a => DiscoveryWithServiceDisruptions.java} | 112 +++++++++--
 .../test/disruption/LongGCDisruption.java     | 177 ++++++++++++++++++
 5 files changed, 293 insertions(+), 15 deletions(-)
 rename src/test/java/org/elasticsearch/discovery/{DiscoveryWithNetworkFailuresTests.java => DiscoveryWithServiceDisruptions.java} (85%)
 create mode 100644 src/test/java/org/elasticsearch/test/disruption/LongGCDisruption.java

diff --git a/pom.xml b/pom.xml
index 2c6753eec0219..e01f6b8e6895d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1219,6 +1219,11 @@
                                 <bundledSignature>jdk-unsafe</bundledSignature>
                                 <bundledSignature>jdk-deprecated</bundledSignature>
                             </bundledSignatures>
+                            <excludes>
+                                <!-- start exclude for test GC simulation using Thread.suspend -->
+                                <exclude>org/elasticsearch/test/disruption/LongGCDisruption.class</exclude>
+                                <!-- end exclude for test GC simulation using Thread.suspend -->
+                            </excludes>
                             <signaturesFiles>
                                 <signaturesFile>test-signatures.txt</signaturesFile>
                                 <signaturesFile>all-signatures.txt</signaturesFile>
diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 07028f19a36a7..a1d8d626b1e48 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -313,6 +313,15 @@ public void run() {
         });
     }
 
+
+    /**
+     * Returns true if there is currently a background thread active for (re)joining the cluster.
+     * Used for testing.
+     */
+    public boolean joiningCluster() {
+        return currentJoinThread != null;
+    }
+
     private void innerJoinCluster() {
         boolean retry = true;
         while (retry) {
@@ -410,7 +419,7 @@ private boolean joinElectedMaster(DiscoveryNode masterNode) {
                     }
                 } else {
                     if (logger.isTraceEnabled()) {
-                        logger.trace("failed to send join request to master [{}]", t);
+                        logger.trace("failed to send join request to master [{}]", t, masterNode);
                     } else {
                         logger.info("failed to send join request to master [{}], reason [{}]", masterNode, ExceptionsHelper.detailedMessage(t));
                     }
diff --git a/src/test/java/org/elasticsearch/discovery/ClusterDiscoveryConfiguration.java b/src/test/java/org/elasticsearch/discovery/ClusterDiscoveryConfiguration.java
index 345422aad526b..7036dda1a20e1 100644
--- a/src/test/java/org/elasticsearch/discovery/ClusterDiscoveryConfiguration.java
+++ b/src/test/java/org/elasticsearch/discovery/ClusterDiscoveryConfiguration.java
@@ -120,7 +120,8 @@ public Settings node(int nodeOrdinal) {
                     .put("discovery.zen.ping.multicast.enabled", false);
 
             String[] unicastHosts = new String[unicastHostOrdinals.length];
-            if (InternalTestCluster.NODE_MODE.equals("local")) {
+            String mode = baseSettings.get("node.mode", InternalTestCluster.NODE_MODE);
+            if (mode.equals("local")) {
                 builder.put(LocalTransport.TRANSPORT_LOCAL_ADDRESS, "node_" + nodeOrdinal);
                 for (int i = 0; i < unicastHosts.length; i++) {
                     unicastHosts[i] = "node_" + unicastHostOrdinals[i];
diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java
similarity index 85%
rename from src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
rename to src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java
index a08ba12f4fb56..c1099ead87ba8 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithNetworkFailuresTests.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java
@@ -19,9 +19,11 @@
 
 package org.elasticsearch.discovery;
 
+import com.google.common.base.Predicate;
 import org.apache.lucene.util.LuceneTestCase;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
+import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
 import org.elasticsearch.action.get.GetResponse;
 import org.elasticsearch.action.index.IndexResponse;
 import org.elasticsearch.client.Client;
@@ -39,6 +41,7 @@
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.discovery.zen.ZenDiscovery;
 import org.elasticsearch.discovery.zen.elect.ElectMasterService;
 import org.elasticsearch.discovery.zen.ping.ZenPing;
 import org.elasticsearch.discovery.zen.ping.ZenPingService;
@@ -62,14 +65,14 @@
 import static org.elasticsearch.test.ElasticsearchIntegrationTest.ClusterScope;
 import static org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
-import static org.hamcrest.Matchers.equalTo;
-import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.*;
 
 /**
  */
-@ClusterScope(scope = Scope.TEST, numDataNodes = 0, transportClientRatio = 0)
+@LuceneTestCase.Slow
 @TestLogging("discovery.zen:TRACE")
-public class DiscoveryWithNetworkFailuresTests extends ElasticsearchIntegrationTest {
+@ClusterScope(scope = Scope.TEST, numDataNodes = 0, transportClientRatio = 0)
+public class DiscoveryWithServiceDisruptions extends ElasticsearchIntegrationTest {
 
     private static final TimeValue DISRUPTION_HEALING_OVERHEAD = TimeValue.timeValueSeconds(40); // we use 30s as timeout in many places.
 
@@ -109,8 +112,9 @@ private List<String> startCluster(int numberOfNodes, int minimumMasterNode) thro
     }
 
     final static Settings DEFAULT_SETTINGS = ImmutableSettings.builder()
-            .put("discovery.zen.fd.ping_timeout", "1s") // <-- for hitting simulated network failures quickly
-            .put("discovery.zen.fd.ping_retries", "1") // <-- for hitting simulated network failures quickly
+            .put("discovery.zen.fd.ping_timeout", "1s") // for hitting simulated network failures quickly
+            .put("discovery.zen.fd.ping_retries", "1") // for hitting simulated network failures quickly
+            .put("discovery.zen.join_timeout", "10s")  // still long enough to induce failures, but not too long, so the test won't time out
             .put(DiscoverySettings.PUBLISH_TIMEOUT, "1s") // <-- for hitting simulated network failures quickly
             .put("http.enabled", false) // just to make test quicker
             .put("gateway.local.list_timeout", "10s") // still long to induce failures but to long so test won't time out
@@ -136,21 +140,26 @@ private List<String> startMulticastCluster(int numberOfNodes, int minimumMasterN
         return nodes;
     }
 
-    private List<String> startUnicastCluster(int numberOfNodes,@Nullable int[] unicastHostsOrdinals, int minimumMasterNode) throws ExecutionException, InterruptedException {
+    private List<String> startUnicastCluster(int numberOfNodes, @Nullable int[] unicastHostsOrdinals, int minimumMasterNode) throws ExecutionException, InterruptedException {
+        return startUnicastCluster(numberOfNodes, unicastHostsOrdinals, minimumMasterNode, ImmutableSettings.EMPTY);
+    }
+
+    private List<String> startUnicastCluster(int numberOfNodes, @Nullable int[] unicastHostsOrdinals, int minimumMasterNode, Settings settings) throws ExecutionException, InterruptedException {
         if (minimumMasterNode < 0) {
             minimumMasterNode = numberOfNodes / 2 + 1;
         }
         // TODO: Rarely use default settings form some of these
-        Settings settings = ImmutableSettings.builder()
+        Settings nodeSettings = ImmutableSettings.builder()
                 .put(DEFAULT_SETTINGS)
+                .put(settings)
                 .put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, minimumMasterNode)
                 .build();
 
         if (discoveryConfig == null) {
             if (unicastHostsOrdinals == null) {
-                discoveryConfig = new ClusterDiscoveryConfiguration.UnicastZen(numberOfNodes, settings);
+                discoveryConfig = new ClusterDiscoveryConfiguration.UnicastZen(numberOfNodes, nodeSettings);
             } else {
-                discoveryConfig = new ClusterDiscoveryConfiguration.UnicastZen(numberOfNodes, settings, unicastHostsOrdinals);
+                discoveryConfig = new ClusterDiscoveryConfiguration.UnicastZen(numberOfNodes, nodeSettings, unicastHostsOrdinals);
             }
         }
         List<String> nodes = internalCluster().startNodesAsync(numberOfNodes).get();
@@ -494,6 +503,58 @@ public void run() {
         }
     }
 
+    /**
+     * Test that cluster recovers from a long GC on master that causes other nodes to elect a new one
+     */
+    @Test
+    @TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
+    public void testMasterNodeGCs() throws Exception {
+        // TODO: on mac OS multicast threads are shared between nodes and therefore we can't simulate GC and stop pinging for just one node
+        // find a way to block thread creation in the generic thread pool to avoid this.
+        // TODO: with local transport the threads of the source node enter the target node, since everything is local and like above we can't simulate GC on one node
+        // with netty transport the threads of different nodes don't touch each other due to the network threading Netty uses
+        List<String> nodes = startUnicastCluster(3, null, -1, ImmutableSettings.builder().put("node.mode", "network").build());
+
+        String oldMasterNode = internalCluster().getMasterName();
+        // a very long GC, but it's OK as we remove the disruption when it has had an effect
+        SingleNodeDisruption masterNodeDisruption = new LongGCDisruption(oldMasterNode, getRandom(), 100, 200, 30000, 60000);
+        internalCluster().setDisruptionScheme(masterNodeDisruption);
+        masterNodeDisruption.startDisrupting();
+
+        Set<String> oldNonMasterNodesSet = new HashSet<>(nodes);
+        oldNonMasterNodesSet.remove(oldMasterNode);
+
+        List<String> oldNonMasterNodes = new ArrayList<>(oldNonMasterNodesSet);
+
+        logger.info("waiting for nodes to de-elect master [{}]", oldMasterNode);
+        for (String node : oldNonMasterNodesSet) {
+            assertDifferentMaster(node, oldMasterNode);
+        }
+
+        logger.info("waiting for nodes to elect a new master");
+        ensureStableCluster(2, oldNonMasterNodes.get(0));
+
+        logger.info("waiting for any pinging to stop");
+        for (final String node : oldNonMasterNodes) {
+            assertTrue("node [" + node + "] is still joining master", awaitBusy(new Predicate<Object>() {
+                @Override
+                public boolean apply(Object input) {
+                    return !((ZenDiscovery) internalCluster().getInstance(Discovery.class, node)).joiningCluster();
+                }
+            }, 30, TimeUnit.SECONDS));
+        }
+
+        // restore GC
+        masterNodeDisruption.stopDisrupting();
+        ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + masterNodeDisruption.expectedTimeToHeal().millis()),
+                oldNonMasterNodes.get(0));
+
+        // make sure all nodes agree on master
+        String newMaster = internalCluster().getMasterName();
+        assertThat(newMaster, not(equalTo(oldMasterNode)));
+        assertMaster(newMaster, nodes);
+    }
+
     /**
      * Test that a document which is indexed on the majority side of a partition, is available from the minory side,
      * once the partition is healed
@@ -559,7 +620,7 @@ public void testRejoinDocumentExistsInAllShardCopies() throws Exception {
     @Test
     @TestLogging("discovery.zen:TRACE,action:TRACE")
     public void unicastSinglePingResponseContainsMaster() throws Exception {
-        List<String> nodes = startUnicastCluster(4, new int[] {0}, -1);
+        List<String> nodes = startUnicastCluster(4, new int[]{0}, -1);
         // Figure out what is the elected master node
         final String masterNode = internalCluster().getMasterName();
         logger.info("---> legit elected master node=" + masterNode);
@@ -699,6 +760,9 @@ private void ensureStableCluster(int nodeCount, @Nullable String viaNode) {
     }
 
     private void ensureStableCluster(int nodeCount, TimeValue timeValue, @Nullable String viaNode) {
+        if (viaNode == null) {
+            viaNode = randomFrom(internalCluster().getNodeNames());
+        }
         logger.debug("ensuring cluster is stable with [{}] nodes. access node: [{}]. timeout: [{}]", nodeCount, viaNode, timeValue);
         ClusterHealthResponse clusterHealthResponse = client(viaNode).admin().cluster().prepareHealth()
                 .setWaitForEvents(Priority.LANGUID)
@@ -706,6 +770,11 @@ private void ensureStableCluster(int nodeCount, TimeValue timeValue, @Nullable S
                 .setTimeout(timeValue)
                 .setWaitForRelocatingShards(0)
                 .get();
+        if (clusterHealthResponse.isTimedOut()) {
+            ClusterStateResponse stateResponse = client(viaNode).admin().cluster().prepareState().get();
+            fail("failed to reach a stable cluster of [" + nodeCount + "] nodes. Tried via [" + viaNode + "]. last cluster state:\n"
+                    + stateResponse.getState().prettyPrint());
+        }
         assertThat(clusterHealthResponse.isTimedOut(), is(false));
     }
 
@@ -736,11 +805,28 @@ public void run() {
         }, maxWaitTime.getMillis(), TimeUnit.MILLISECONDS);
     }
 
+    private void assertDifferentMaster(final String node, final String oldMasterNode) throws Exception {
+        assertBusy(new Runnable() {
+            @Override
+            public void run() {
+                ClusterState state = getNodeClusterState(node);
+                String masterNode = null;
+                if (state.nodes().masterNode() != null) {
+                    masterNode = state.nodes().masterNode().name();
+                }
+                logger.trace("[{}] master is [{}]", node, state.nodes().masterNode());
+                assertThat("node [" + node + "] still has [" + masterNode + "] as master",
+                        oldMasterNode, not(equalTo(masterNode)));
+            }
+        }, 10, TimeUnit.SECONDS);
+    }
+
     private void assertMaster(String masterNode, List<String> nodes) {
         for (String node : nodes) {
             ClusterState state = getNodeClusterState(node);
-            assertThat(state.nodes().size(), equalTo(nodes.size()));
-            assertThat(state.nodes().masterNode().name(), equalTo(masterNode));
+            String failMsgSuffix = "cluster_state:\n" + state.prettyPrint();
+            assertThat("wrong node count on [" + node + "]. " + failMsgSuffix, state.nodes().size(), equalTo(nodes.size()));
+            assertThat("wrong master on node [" + node + "]. " + failMsgSuffix, state.nodes().masterNode().name(), equalTo(masterNode));
         }
     }
 }
diff --git a/src/test/java/org/elasticsearch/test/disruption/LongGCDisruption.java b/src/test/java/org/elasticsearch/test/disruption/LongGCDisruption.java
new file mode 100644
index 0000000000000..d2fa09cb7dd7a
--- /dev/null
+++ b/src/test/java/org/elasticsearch/test/disruption/LongGCDisruption.java
@@ -0,0 +1,177 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.test.disruption;
+
+import org.elasticsearch.common.unit.TimeValue;
+
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Pattern;
+
+public class LongGCDisruption extends SingleNodeDisruption {
+
+    volatile boolean disrupting;
+    volatile Thread worker;
+
+    final long intervalBetweenDelaysMin;
+    final long intervalBetweenDelaysMax;
+    final long delayDurationMin;
+    final long delayDurationMax;
+
+
+    public LongGCDisruption(Random random) {
+        this(null, random);
+    }
+
+    public LongGCDisruption(String disruptedNode, Random random) {
+        this(disruptedNode, random, 100, 200, 300, 20000);
+    }
+
+    public LongGCDisruption(String disruptedNode, Random random, long intervalBetweenDelaysMin,
+                            long intervalBetweenDelaysMax, long delayDurationMin, long delayDurationMax) {
+        this(random, intervalBetweenDelaysMin, intervalBetweenDelaysMax, delayDurationMin, delayDurationMax);
+        this.disruptedNode = disruptedNode;
+    }
+
+    public LongGCDisruption(Random random,
+                            long intervalBetweenDelaysMin, long intervalBetweenDelaysMax, long delayDurationMin,
+                            long delayDurationMax) {
+        super(random);
+        this.intervalBetweenDelaysMin = intervalBetweenDelaysMin;
+        this.intervalBetweenDelaysMax = intervalBetweenDelaysMax;
+        this.delayDurationMin = delayDurationMin;
+        this.delayDurationMax = delayDurationMax;
+    }
+
+    final static AtomicInteger thread_ids = new AtomicInteger();
+
+    @Override
+    public void startDisrupting() {
+        disrupting = true;
+        worker = new Thread(new BackgroundWorker(), "long_gc_simulation_" + thread_ids.incrementAndGet());
+        worker.setDaemon(true);
+        worker.start();
+    }
+
+    @Override
+    public void stopDisrupting() {
+        if (worker == null) {
+            return;
+        }
+        logger.info("stopping long GCs on [{}]", disruptedNode);
+        disrupting = false;
+        worker.interrupt();
+        try {
+            worker.join(2 * (intervalBetweenDelaysMax + delayDurationMax));
+        } catch (InterruptedException e) {
+            logger.info("background thread failed to stop");
+        }
+        worker = null;
+    }
+
+    final static Pattern[] unsafeClasses = new Pattern[]{
+            // logging has shared JVM locks - we may suspend a thread and block other nodes from doing their thing
+            Pattern.compile("Logger")
+    };
+
+    private boolean stopNodeThreads(String node, Set<Thread> nodeThreads) {
+        Set<Thread> allThreadsSet = Thread.getAllStackTraces().keySet();
+        boolean stopped = false;
+        final String nodeThreadNamePart = "[" + node + "]";
+        for (Thread thread : allThreadsSet) {
+            String name = thread.getName();
+            if (name.contains(nodeThreadNamePart)) {
+                if (thread.isAlive() && nodeThreads.add(thread)) {
+                    stopped = true;
+                    thread.suspend();
+                    // double check the thread is not in a shared resource like logging. If so, let it go and come back..
+                    boolean safe = true;
+                    safe:
+                    for (StackTraceElement stackElement : thread.getStackTrace()) {
+                        String className = stackElement.getClassName();
+                        for (Pattern unsafePattern : unsafeClasses) {
+                            if (unsafePattern.matcher(className).find()) {
+                                safe = false;
+                                break safe;
+                            }
+                        }
+                    }
+                    if (!safe) {
+                        thread.resume();
+                        nodeThreads.remove(thread);
+                    }
+                }
+            }
+        }
+        return stopped;
+    }
+
+    private void resumeThreads(Set<Thread> threads) {
+        for (Thread thread : threads) {
+            thread.resume();
+        }
+    }
+
+    private void simulateLongGC(final TimeValue duration) throws InterruptedException {
+        final String disruptionNodeCopy = disruptedNode;
+        if (disruptionNodeCopy == null) {
+            return;
+        }
+        logger.info("node [{}] goes into GC for for [{}]", disruptionNodeCopy, duration);
+        final Set<Thread> nodeThreads = new HashSet<>();
+        try {
+            while (stopNodeThreads(disruptionNodeCopy, nodeThreads)) ;
+            if (!nodeThreads.isEmpty()) {
+                Thread.sleep(duration.millis());
+            }
+        } finally {
+            logger.info("node [{}] resumes from GC", disruptionNodeCopy);
+            resumeThreads(nodeThreads);
+        }
+    }
+
+    @Override
+    public TimeValue expectedTimeToHeal() {
+        return TimeValue.timeValueMillis(0);
+    }
+
+    class BackgroundWorker implements Runnable {
+
+        @Override
+        public void run() {
+            while (disrupting && disruptedNode != null) {
+                try {
+                    TimeValue duration = new TimeValue(delayDurationMin + random.nextInt((int) (delayDurationMax - delayDurationMin)));
+                    simulateLongGC(duration);
+
+                    duration = new TimeValue(intervalBetweenDelaysMin + random.nextInt((int) (intervalBetweenDelaysMax - intervalBetweenDelaysMin)));
+                    if (disrupting && disruptedNode != null) {
+                        Thread.sleep(duration.millis());
+                    }
+                } catch (InterruptedException e) {
+                } catch (Exception e) {
+                    logger.error("error in background worker", e);
+                }
+            }
+        }
+    }
+
+}

From 403ebc9e07e0fa0da226c354321b75f6a8b173cf Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Thu, 31 Jul 2014 10:41:52 +0200
Subject: [PATCH 59/74] [Discovery] Added cluster version and master node to
 the nodes fault detecting ping request

The cluster state version allows resolving the case where an old master node becomes unresponsive and later wakes up and pings all the nodes in the cluster, allowing the newly elected master to decide whether it should step down or ask the old master to rejoin.
---
 .../discovery/zen/ZenDiscovery.java           | 97 +++++++++++++------
 .../discovery/zen/fd/NodesFaultDetection.java | 50 +++++++++-
 .../discovery/ZenFaultDetectionTests.java     |  2 +-
 3 files changed, 116 insertions(+), 33 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index a1d8d626b1e48..7c76afd16400c 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -70,6 +70,7 @@
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
 
 import static com.google.common.collect.Lists.newArrayList;
 import static org.elasticsearch.common.unit.TimeValue.timeValueSeconds;
@@ -106,6 +107,8 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
     /** how long to wait before performing another join attempt after a join request failed with an retriable error */
     private final TimeValue joinRetryDelay;
 
+    /** how many pings from *another* master to tolerate before forcing a rejoin on other or local master */
+    private final int maxPingsFromAnotherMaster;
 
     // a flag that should be used only for testing
     private final boolean sendLeaveRequest;
@@ -155,6 +158,7 @@ public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threa
         this.joinTimeout = settings.getAsTime("discovery.zen.join_timeout", TimeValue.timeValueMillis(pingTimeout.millis() * 20));
         this.joinRetryAttempts = settings.getAsInt("discovery.zen.join_retry_attempts", 3);
         this.joinRetryDelay = settings.getAsTime("discovery.zen.join_retry_delay", TimeValue.timeValueMillis(100));
+        this.maxPingsFromAnotherMaster = settings.getAsInt("discovery.zen.max_pings_from_another_master", 3);
         this.sendLeaveRequest = componentSettings.getAsBoolean("send_leave_request", true);
 
         this.masterElectionFilterClientNodes = settings.getAsBoolean("discovery.zen.master_election.filter_client", true);
@@ -170,7 +174,7 @@ public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threa
         this.masterFD.addListener(new MasterNodeFailureListener());
 
         this.nodesFD = new NodesFaultDetection(settings, threadPool, transportService, clusterName);
-        this.nodesFD.addListener(new NodeFailureListener());
+        this.nodesFD.addListener(new NodeFaultDetectionListener());
 
         this.publishClusterState = new PublishClusterStateAction(settings, transportService, this, new NewClusterStateListener(), discoverySettings);
         this.pingService.setNodesProvider(this);
@@ -196,7 +200,7 @@ protected void doStart() throws ElasticsearchException {
         final String nodeId = DiscoveryService.generateNodeId(settings);
         localNode = new DiscoveryNode(settings.get("name"), nodeId, transportService.boundAddress().publishAddress(), nodeAttributes, version);
         latestDiscoNodes = new DiscoveryNodes.Builder().put(localNode).localNodeId(localNode.id()).build();
-        nodesFD.updateNodes(latestDiscoNodes);
+        nodesFD.updateNodes(latestDiscoNodes, -1);
         pingService.start();
 
         // do the join on a different thread, the DiscoveryService waits for 30s anyhow till it is discovered
@@ -290,7 +294,7 @@ public void publish(ClusterState clusterState, AckListener ackListener) {
             throw new ElasticsearchIllegalStateException("Shouldn't publish state when not master");
         }
         latestDiscoNodes = clusterState.nodes();
-        nodesFD.updateNodes(clusterState.nodes());
+        nodesFD.updateNodes(clusterState.nodes(), clusterState.version());
         publishClusterState.publish(clusterState, ackListener);
     }
 
@@ -650,29 +654,7 @@ void handleNewClusterStateFromMaster(ClusterState newClusterState, final Publish
             clusterService.submitStateUpdateTask("zen-disco-master_receive_cluster_state_from_another_master [" + newState.nodes().masterNode() + "]", Priority.URGENT, new ProcessedClusterStateUpdateTask() {
                 @Override
                 public ClusterState execute(ClusterState currentState) {
-                    if (newState.version() > currentState.version()) {
-                        logger.warn("received cluster state from [{}] which is also master but with a newer cluster_state, rejoining to cluster...", newState.nodes().masterNode());
-                        return rejoin(currentState, "zen-disco-master_receive_cluster_state_from_another_master [" + newState.nodes().masterNode() + "]");
-                    } else {
-                        logger.warn("received cluster state from [{}] which is also master but with an older cluster_state, telling [{}] to rejoin the cluster", newState.nodes().masterNode(), newState.nodes().masterNode());
-
-                        try {
-                            // make sure we're connected to this node (connect to node does nothing if we're already connected)
-                            // since the network connections are asymmetric, it may be that we received a state but have disconnected from the node
-                            // in the past (after a master failure, for example)
-                            transportService.connectToNode(newState.nodes().masterNode());
-                            transportService.sendRequest(newState.nodes().masterNode(), DISCOVERY_REJOIN_ACTION_NAME, new RejoinClusterRequest(currentState.nodes().localNodeId()), new EmptyTransportResponseHandler(ThreadPool.Names.SAME) {
-                                @Override
-                                public void handleException(TransportException exp) {
-                                    logger.warn("failed to send rejoin request to [{}]", exp, newState.nodes().masterNode());
-                                }
-                            });
-                        } catch (Exception e) {
-                            logger.warn("failed to send rejoin request to [{}]", e, newState.nodes().masterNode());
-                        }
-
-                        return currentState;
-                    }
+                    return handleAnotherMaster(currentState, newState.nodes().masterNode(), newState.version(), "via a new cluster state");
                 }
 
                 @Override
@@ -988,6 +970,31 @@ private ClusterState rejoin(ClusterState clusterState, String reason) {
                 .build();
     }
 
+    private ClusterState handleAnotherMaster(ClusterState localClusterState, final DiscoveryNode otherMaster, long otherClusterStateVersion, String reason) {
+        assert master : "handleAnotherMaster called but current node is not a master";
+        if (otherClusterStateVersion > localClusterState.version()) {
+            return rejoin(localClusterState, "zen-disco-discovered another master with a new cluster_state [" + otherMaster + "][" + reason + "]");
+        } else {
+            logger.warn("discovered [{}] which is also master but with an older cluster_state, telling [{}] to rejoin the cluster ([{}])", otherMaster, otherMaster, reason);
+            try {
+                // make sure we're connected to this node (connect to node does nothing if we're already connected)
+                // since the network connections are asymmetric, it may be that we received a state but have disconnected from the node
+                // in the past (after a master failure, for example)
+                transportService.connectToNode(otherMaster);
+                transportService.sendRequest(otherMaster, DISCOVERY_REJOIN_ACTION_NAME, new RejoinClusterRequest(localClusterState.nodes().localNodeId()), new EmptyTransportResponseHandler(ThreadPool.Names.SAME) {
+
+                    @Override
+                    public void handleException(TransportException exp) {
+                        logger.warn("failed to send rejoin request to [{}]", exp, otherMaster);
+                    }
+                });
+            } catch (Exception e) {
+                logger.warn("failed to send rejoin request to [{}]", e, otherMaster);
+            }
+            return localClusterState;
+        }
+    }
+
     private void sendInitialStateEventIfNeeded() {
         if (initialStateSent.compareAndSet(false, true)) {
             for (InitialStateDiscoveryListener listener : initialStateListeners) {
@@ -1016,12 +1023,48 @@ public void onLeave(DiscoveryNode node) {
         }
     }
 
-    private class NodeFailureListener implements NodesFaultDetection.Listener {
+    private class NodeFaultDetectionListener extends NodesFaultDetection.Listener {
+
+        private final AtomicInteger pingsWhileMaster = new AtomicInteger(0);
 
         @Override
         public void onNodeFailure(DiscoveryNode node, String reason) {
             handleNodeFailure(node, reason);
         }
+
+        @Override
+        public void onPingReceived(final NodesFaultDetection.PingRequest pingRequest) {
+            // if we are master, we don't expect any fault detection from another node. If we get it
+            // means we potentially have two masters in the cluster.
+            if (!master) {
+                pingsWhileMaster.set(0);
+                return;
+            }
+
+            // nodes pre 1.4.0 do not send this information
+            if (pingRequest.masterNode() == null) {
+                return;
+            }
+
+            if (pingsWhileMaster.incrementAndGet() < maxPingsFromAnotherMaster) {
+                logger.trace("got a ping from another master {}. current ping count: [{}]", pingRequest.masterNode(), pingsWhileMaster.get());
+                return;
+            }
+            logger.debug("got a ping from another master {}. resolving who should rejoin. current ping count: [{}]", pingRequest.masterNode(), pingsWhileMaster.get());
+            clusterService.submitStateUpdateTask("ping from another master", Priority.URGENT, new ClusterStateUpdateTask() {
+
+                @Override
+                public ClusterState execute(ClusterState currentState) throws Exception {
+                    pingsWhileMaster.set(0);
+                    return handleAnotherMaster(currentState, pingRequest.masterNode(), pingRequest.clusterStateVersion(), "node fd ping");
+                }
+
+                @Override
+                public void onFailure(String source, Throwable t) {
+                    logger.debug("unexpected error during cluster state update task after pings from another master", t);
+                }
+            });
+        }
     }
 
     private class MasterNodeFailureListener implements MasterFaultDetection.Listener {
diff --git a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
index 87dfe2ba2f7ac..8f90a7564c451 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
@@ -47,10 +47,13 @@
 public class NodesFaultDetection extends AbstractComponent {
 
     public static final String PING_ACTION_NAME = "internal:discovery/zen/fd/ping";
+    
+    public abstract static class Listener {
 
-    public static interface Listener {
+        public void onNodeFailure(DiscoveryNode node, String reason) {}
+
+        public void onPingReceived(PingRequest pingRequest) {}
 
-        void onNodeFailure(DiscoveryNode node, String reason);
     }
 
     private final ThreadPool threadPool;
@@ -79,6 +82,8 @@ public static interface Listener {
 
     private volatile DiscoveryNodes latestNodes = EMPTY_NODES;
 
+    private volatile long clusterStateVersion = -1;
+
     private volatile boolean running = false;
 
     public NodesFaultDetection(Settings settings, ThreadPool threadPool, TransportService transportService, ClusterName clusterName) {
@@ -111,9 +116,10 @@ public void removeListener(Listener listener) {
         listeners.remove(listener);
     }
 
-    public void updateNodes(DiscoveryNodes nodes) {
+    public void updateNodes(DiscoveryNodes nodes, long clusterStateVersion) {
         DiscoveryNodes prevNodes = latestNodes;
         this.latestNodes = nodes;
+        this.clusterStateVersion = clusterStateVersion;
         if (!running) {
             return;
         }
@@ -195,6 +201,19 @@ public void run() {
         });
     }
 
+    private void notifyPingRecieved(final PingRequest pingRequest) {
+        threadPool.generic().execute(new Runnable() {
+
+            @Override
+            public void run() {
+                for (Listener listener : listeners) {
+                    listener.onPingReceived(pingRequest);
+                }
+            }
+
+        });
+    }
+
     private class SendPingRequest implements Runnable {
 
         private final DiscoveryNode node;
@@ -208,7 +227,7 @@ public void run() {
             if (!running) {
                 return;
             }
-            final PingRequest pingRequest = new PingRequest(node.id(), clusterName);
+            final PingRequest pingRequest = new PingRequest(node.id(), clusterName, latestNodes.localNode(), clusterStateVersion);
             final TransportRequestOptions options = options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout);
             transportService.sendRequest(node, PING_ACTION_NAME, pingRequest, options, new BaseTransportResponseHandler<PingResponse>() {
                         @Override
@@ -306,6 +325,9 @@ public void messageReceived(PingRequest request, TransportChannel channel) throw
                 // Don't introduce new exception for bwc reasons
                 throw new ElasticsearchIllegalStateException("Got pinged with cluster name [" + request.clusterName + "], but I'm part of cluster [" + clusterName + "]");
             }
+
+            notifyPingRecieved(request);
+
             channel.sendResponse(new PingResponse());
         }
 
@@ -323,12 +345,18 @@ public static class PingRequest extends TransportRequest {
 
         private ClusterName clusterName;
 
+        private DiscoveryNode masterNode;
+
+        private long clusterStateVersion = -1;
+
         PingRequest() {
         }
 
-        PingRequest(String nodeId, ClusterName clusterName) {
+        PingRequest(String nodeId, ClusterName clusterName, DiscoveryNode masterNode, long clusterStateVersion) {
             this.nodeId = nodeId;
             this.clusterName = clusterName;
+            this.masterNode = masterNode;
+            this.clusterStateVersion = clusterStateVersion;
         }
 
         public String nodeId() {
@@ -339,12 +367,22 @@ public ClusterName clusterName() {
             return clusterName;
         }
 
+        public DiscoveryNode masterNode() {
+            return masterNode;
+        }
+
+        public long clusterStateVersion() {
+            return clusterStateVersion;
+        }
+
         @Override
         public void readFrom(StreamInput in) throws IOException {
             super.readFrom(in);
             nodeId = in.readString();
             if (in.getVersion().onOrAfter(Version.V_1_4_0)) {
                 clusterName = ClusterName.readClusterName(in);
+                masterNode = DiscoveryNode.readNode(in);
+                clusterStateVersion = in.readLong();
             }
         }
 
@@ -354,6 +392,8 @@ public void writeTo(StreamOutput out) throws IOException {
             out.writeString(nodeId);
             if (out.getVersion().onOrAfter(Version.V_1_4_0)) {
                 clusterName.writeTo(out);
+                masterNode.writeTo(out);
+                out.writeLong(clusterStateVersion);
             }
         }
     }
diff --git a/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java b/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java
index 457e7a5b4cd3d..553267971e50e 100644
--- a/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java
+++ b/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java
@@ -134,7 +134,7 @@ public void testNodesFaultDetectionConnectOnDisconnect() throws InterruptedExcep
         settings.put("discovery.zen.fd.connect_on_network_disconnect", shouldRetry).put("discovery.zen.fd.ping_interval", "5m");
         NodesFaultDetection nodesFD = new NodesFaultDetection(settings.build(), threadPool, serviceA, new ClusterName("test"));
         nodesFD.start();
-        nodesFD.updateNodes(buildNodesForA(true));
+        nodesFD.updateNodes(buildNodesForA(true), -1);
         final String[] failureReason = new String[1];
         final DiscoveryNode[] failureNode = new DiscoveryNode[1];
         final CountDownLatch notified = new CountDownLatch(1);

From 47326adb6786e4ae0f5dee8c2ebafb9b8926492c Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Thu, 31 Jul 2014 12:14:42 +0200
Subject: [PATCH 60/74] [TEST] Make sure all shards are allocated before
 killing a random data node.

---
 .../java/org/elasticsearch/cluster/NoMasterNodeTests.java   | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java b/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
index 94c0268cdd6d2..6d30da7fd81c9 100644
--- a/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
+++ b/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
@@ -21,6 +21,7 @@
 
 import org.elasticsearch.action.ActionRequestBuilder;
 import com.google.common.base.Predicate;
+import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
 import org.elasticsearch.action.bulk.BulkRequestBuilder;
 import org.elasticsearch.action.count.CountResponse;
 import org.elasticsearch.action.get.GetResponse;
@@ -225,6 +226,11 @@ public void testNoMasterActions_writeMasterBlock() throws Exception {
         client().prepareIndex("test2", "type1", "1").setSource("field", "value1").get();
         refresh();
 
+        ensureSearchable("test1", "test2");
+
+        ClusterStateResponse clusterState = client().admin().cluster().prepareState().get();
+        logger.info("Cluster state:\n" + clusterState.getState().prettyPrint());
+
         internalCluster().stopRandomDataNode();
         assertThat(awaitBusy(new Predicate<Object>() {
             public boolean apply(Object o) {

From 966a55d21cdb60074913c3c2a8fa4ac33487a409 Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Thu, 31 Jul 2014 14:01:54 +0200
Subject: [PATCH 61/74] Typo: s/Recieved/Received

---
 .../elasticsearch/discovery/zen/fd/NodesFaultDetection.java   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
index 8f90a7564c451..bd485b927e394 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
@@ -201,7 +201,7 @@ public void run() {
         });
     }
 
-    private void notifyPingRecieved(final PingRequest pingRequest) {
+    private void notifyPingReceived(final PingRequest pingRequest) {
         threadPool.generic().execute(new Runnable() {
 
             @Override
@@ -326,7 +326,7 @@ public void messageReceived(PingRequest request, TransportChannel channel) throw
                 throw new ElasticsearchIllegalStateException("Got pinged with cluster name [" + request.clusterName + "], but I'm part of cluster [" + clusterName + "]");
             }
 
-            notifyPingRecieved(request);
+            notifyPingReceived(request);
 
             channel.sendResponse(new PingResponse());
         }

From 26d90882e5a62fd4e1b7c9942040766f67f19df9 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Thu, 31 Jul 2014 16:59:20 +0200
Subject: [PATCH 62/74] [Transport] Introduced worker threads to prevent alien
 threads from entering a node.

Requests are handled by the worker thread pool of the target node instead of the generic thread pool of the source node.
Also, this change is required in order to make GC disruption work with local transport. Previously the handling of a request was performed on a node that was being GC disrupted, resulting in some actions being performed while GC was being simulated.
---
 .../transport/local/LocalTransport.java       | 26 ++++++++++++++++---
 .../local/LocalTransportChannel.java          |  4 +--
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/src/main/java/org/elasticsearch/transport/local/LocalTransport.java b/src/main/java/org/elasticsearch/transport/local/LocalTransport.java
index 142f33f9d07a1..627f37c61a485 100644
--- a/src/main/java/org/elasticsearch/transport/local/LocalTransport.java
+++ b/src/main/java/org/elasticsearch/transport/local/LocalTransport.java
@@ -33,6 +33,7 @@
 import org.elasticsearch.common.transport.LocalTransportAddress;
 import org.elasticsearch.common.transport.TransportAddress;
 import org.elasticsearch.common.util.concurrent.AbstractRunnable;
+import org.elasticsearch.common.util.concurrent.EsExecutors;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.transport.*;
 import org.elasticsearch.transport.support.TransportStatus;
@@ -40,6 +41,8 @@
 import java.io.IOException;
 import java.util.Map;
 import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicLong;
 
 import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMap;
@@ -50,6 +53,7 @@
 public class LocalTransport extends AbstractLifecycleComponent<Transport> implements Transport {
 
     private final ThreadPool threadPool;
+    private final ThreadPoolExecutor workers;
     private final Version version;
     private volatile TransportServiceAdapter transportServiceAdapter;
     private volatile BoundTransportAddress boundAddress;
@@ -58,13 +62,20 @@ public class LocalTransport extends AbstractLifecycleComponent<Transport> implem
     private static final AtomicLong transportAddressIdGenerator = new AtomicLong();
     private final ConcurrentMap<DiscoveryNode, LocalTransport> connectedNodes = newConcurrentMap();
 
-    public static final String TRANSPORT_LOCAL_ADDRESS = "transport.local_address";
+    public static final String TRANSPORT_LOCAL_ADDRESS = "transport.local.address";
+    public static final String TRANSPORT_LOCAL_WORKERS = "transport.local.workers";
+    public static final String TRANSPORT_LOCAL_QUEUE = "transport.local.queue";
 
     @Inject
     public LocalTransport(Settings settings, ThreadPool threadPool, Version version) {
         super(settings);
         this.threadPool = threadPool;
         this.version = version;
+
+        int workerCount = this.settings.getAsInt(TRANSPORT_LOCAL_WORKERS, EsExecutors.boundedNumberOfProcessors(settings));
+        int queueSize = this.settings.getAsInt(TRANSPORT_LOCAL_QUEUE, -1);
+        logger.debug("creating [{}] workers, queue_size [{}]", workerCount, queueSize);
+        this.workers = EsExecutors.newFixed(workerCount, queueSize, EsExecutors.daemonThreadFactory(this.settings, "local_transport"));
     }
 
     @Override
@@ -106,6 +117,13 @@ protected void doStop() throws ElasticsearchException {
 
     @Override
     protected void doClose() throws ElasticsearchException {
+        workers.shutdown();
+        try {
+            workers.awaitTermination(10, TimeUnit.SECONDS);
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+        }
+        workers.shutdownNow();
     }
 
     @Override
@@ -185,7 +203,7 @@ public void sendRequest(final DiscoveryNode node, final long requestId, final St
 
         transportServiceAdapter.sent(data.length);
 
-        threadPool.generic().execute(new Runnable() {
+        targetTransport.workers().execute(new Runnable() {
             @Override
             public void run() {
                 targetTransport.messageReceived(data, action, LocalTransport.this, version, requestId);
@@ -193,8 +211,8 @@ public void run() {
         });
     }
 
-    ThreadPool threadPool() {
-        return this.threadPool;
+    ThreadPoolExecutor workers() {
+        return this.workers;
     }
 
     protected void messageReceived(byte[] data, String action, LocalTransport sourceTransport, Version version, @Nullable final Long sendRequestId) {
diff --git a/src/main/java/org/elasticsearch/transport/local/LocalTransportChannel.java b/src/main/java/org/elasticsearch/transport/local/LocalTransportChannel.java
index f4d5e83053a50..f316e9ba69d8c 100644
--- a/src/main/java/org/elasticsearch/transport/local/LocalTransportChannel.java
+++ b/src/main/java/org/elasticsearch/transport/local/LocalTransportChannel.java
@@ -72,7 +72,7 @@ public void sendResponse(TransportResponse response, TransportResponseOptions op
         response.writeTo(stream);
         stream.close();
         final byte[] data = bStream.bytes().toBytes();
-        targetTransport.threadPool().generic().execute(new Runnable() {
+        targetTransport.workers().execute(new Runnable() {
             @Override
             public void run() {
                 targetTransport.messageReceived(data, action, sourceTransport, version, null);
@@ -98,7 +98,7 @@ public void sendResponse(Throwable error) throws IOException {
             too.close();
         }
         final byte[] data = stream.bytes().toBytes();
-        targetTransport.threadPool().generic().execute(new Runnable() {
+        targetTransport.workers().execute(new Runnable() {
             @Override
             public void run() {
                 targetTransport.messageReceived(data, action, sourceTransport, version, null);

From 702890e461b7d99bacf0febe970194960c5cdbe5 Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Thu, 31 Jul 2014 17:01:05 +0200
Subject: [PATCH 63/74] [TEST] Remove the forceful `node.mode` setting in
 DiscoveryWithServiceDisruptions#testMasterNodeGCs now that local transport
 uses worker threads.

---
 .../discovery/DiscoveryWithServiceDisruptions.java       | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java
index c1099ead87ba8..21ca342f17a1c 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java
@@ -141,17 +141,12 @@ private List<String> startMulticastCluster(int numberOfNodes, int minimumMasterN
     }
 
     private List<String> startUnicastCluster(int numberOfNodes, @Nullable int[] unicastHostsOrdinals, int minimumMasterNode) throws ExecutionException, InterruptedException {
-        return startUnicastCluster(numberOfNodes, unicastHostsOrdinals, minimumMasterNode, ImmutableSettings.EMPTY);
-    }
-
-    private List<String> startUnicastCluster(int numberOfNodes, @Nullable int[] unicastHostsOrdinals, int minimumMasterNode, Settings settings) throws ExecutionException, InterruptedException {
         if (minimumMasterNode < 0) {
             minimumMasterNode = numberOfNodes / 2 + 1;
         }
         // TODO: Rarely use default settings form some of these
         Settings nodeSettings = ImmutableSettings.builder()
                 .put(DEFAULT_SETTINGS)
-                .put(settings)
                 .put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, minimumMasterNode)
                 .build();
 
@@ -511,9 +506,7 @@ public void run() {
     public void testMasterNodeGCs() throws Exception {
         // TODO: on mac OS multicast threads are shared between nodes and we therefore we can't simulate GC and stop pinging for just one node
         // find a way to block thread creation in the generic thread pool to avoid this.
-        // TODO: with local transport the threads of the source node enter the target node, since everything is local and like above we can't simulate GC on one node
-        // with netty transport the threads of different nodes don't touch each other due to the network threading Netty uses
-        List<String> nodes = startUnicastCluster(3, null, -1, ImmutableSettings.builder().put("node.mode", "network").build());
+        List<String> nodes = startUnicastCluster(3, null, -1);
 
         String oldMasterNode = internalCluster().getMasterName();
         // a very long GC, but it's OK as we remove the disruption when it has had an effect

From c8919e4bf5df2264d7482d10aeee4c578d83c9f0 Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Tue, 5 Aug 2014 14:57:47 +0200
Subject: [PATCH 64/74] [TEST] Changed action names.

---
 .../discovery/DiscoveryWithServiceDisruptions.java          | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java
index 21ca342f17a1c..82bf85716c74f 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java
@@ -43,9 +43,11 @@
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.discovery.zen.ZenDiscovery;
 import org.elasticsearch.discovery.zen.elect.ElectMasterService;
+import org.elasticsearch.discovery.zen.membership.MembershipAction;
 import org.elasticsearch.discovery.zen.ping.ZenPing;
 import org.elasticsearch.discovery.zen.ping.ZenPingService;
 import org.elasticsearch.discovery.zen.ping.unicast.UnicastZenPing;
+import org.elasticsearch.discovery.zen.publish.PublishClusterStateAction;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
 import org.elasticsearch.test.InternalTestCluster;
 import org.elasticsearch.test.disruption.*;
@@ -672,14 +674,14 @@ public void testClusterJoinDespiteOfPublishingIssues() throws Exception {
 
         logger.info("blocking cluster state publishing from master [{}] to non master [{}]", masterNode, nonMasterNode);
         MockTransportService masterTransportService = (MockTransportService) internalCluster().getInstance(TransportService.class, masterNode);
-        masterTransportService.addFailToSendNoConnectRule(discoveryNodes.localNode(), "discovery/zen/publish");
+        masterTransportService.addFailToSendNoConnectRule(discoveryNodes.localNode(), PublishClusterStateAction.ACTION_NAME);
 
         logger.info("allowing requests from non master [{}] to master [{}], waiting for two join request", nonMasterNode, masterNode);
         final CountDownLatch countDownLatch = new CountDownLatch(2);
         nonMasterTransportService.addDelegate(discoveryNodes.masterNode(), new MockTransportService.DelegateTransport(nonMasterTransportService.original()) {
             @Override
             public void sendRequest(DiscoveryNode node, long requestId, String action, TransportRequest request, TransportRequestOptions options) throws IOException, TransportException {
-                if (action.equals("discovery/zen/join")) {
+                if (action.equals(MembershipAction.DISCOVERY_JOIN_ACTION_NAME)) {
                     countDownLatch.countDown();
                 }
                 super.sendRequest(node, requestId, action, request, options);

From 5932371f213bcb912b0210d584befd9f88a614a9 Mon Sep 17 00:00:00 2001
From: Martijn van Groningen <martijn.v.groningen@gmail.com>
Date: Tue, 5 Aug 2014 14:59:29 +0200
Subject: [PATCH 65/74] [TEST] Adapt testNoMasterActions since metadata isn't
 cleared if there is a no master block

---
 .../org/elasticsearch/cluster/NoMasterNodeTests.java  | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java b/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
index 6d30da7fd81c9..65b2bdfaaeffc 100644
--- a/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
+++ b/src/test/java/org/elasticsearch/cluster/NoMasterNodeTests.java
@@ -136,7 +136,7 @@ public void run() {
                 ClusterBlockException.class, RestStatus.SERVICE_UNAVAILABLE
         );
 
-        checkWriteAction(autoCreateIndex, timeout,
+        checkWriteAction(false, timeout,
                 client().prepareUpdate("test", "type1", "1").setScript("test script", ScriptService.ScriptType.INLINE).setTimeout(timeout));
 
 
@@ -144,7 +144,7 @@ public void run() {
                 client().prepareUpdate("no_index", "type1", "1").setScript("test script", ScriptService.ScriptType.INLINE).setTimeout(timeout));
 
 
-        checkWriteAction(autoCreateIndex, timeout,
+        checkWriteAction(false, timeout,
                 client().prepareIndex("test", "type1", "1").setSource(XContentFactory.jsonBuilder().startObject().endObject()).setTimeout(timeout));
 
         checkWriteAction(autoCreateIndex, timeout,
@@ -154,7 +154,7 @@ public void run() {
         bulkRequestBuilder.add(client().prepareIndex("test", "type1", "1").setSource(XContentFactory.jsonBuilder().startObject().endObject()));
         bulkRequestBuilder.add(client().prepareIndex("test", "type1", "2").setSource(XContentFactory.jsonBuilder().startObject().endObject()));
         bulkRequestBuilder.setTimeout(timeout);
-        checkBulkAction(autoCreateIndex, timeout, bulkRequestBuilder);
+        checkBulkAction(false, timeout, bulkRequestBuilder);
 
         bulkRequestBuilder = client().prepareBulk();
         bulkRequestBuilder.add(client().prepareIndex("no_index", "type1", "1").setSource(XContentFactory.jsonBuilder().startObject().endObject()));
@@ -193,9 +193,8 @@ void checkBulkAction(boolean autoCreateIndex, TimeValue timeout, BulkRequestBuil
             builder.get();
             fail("Expected ClusterBlockException");
         } catch (ClusterBlockException e) {
-            // today, we clear the metadata on when there is no master, so it will go through the auto create logic and
-            // add it... (if set to true), if we didn't remove the metedata when there is no master, then, the non
-            // retry in bulk should be taken into account
+            // If the index exists, the bulk doesn't retry on a global block; if the index doesn't exist, the bulk api
+            // delegates to the create index api, which does retry / wait on a global block.
             if (!autoCreateIndex) {
                 assertThat(System.currentTimeMillis() - now, lessThan(timeout.millis() / 2));
             } else {

From ff8b7409f7cb7c96b9887a9c69841ce865c20ea7 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Sat, 16 Aug 2014 15:36:37 +0200
Subject: [PATCH 66/74] [Discovery] add a debug log if a node responds to a
 publish request after publishing timed out.

---
 .../publish/PublishClusterStateAction.java    | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java b/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java
index 554848422bae2..6f7098c6311c3 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java
@@ -39,6 +39,7 @@
 import org.elasticsearch.transport.*;
 
 import java.util.Map;
+import java.util.concurrent.atomic.AtomicBoolean;
 
 /**
  *
@@ -82,12 +83,15 @@ public void publish(ClusterState clusterState, final Discovery.AckListener ackLi
         publish(clusterState, new AckClusterStatePublishResponseHandler(clusterState.nodes().size() - 1, ackListener));
     }
 
-    private void publish(ClusterState clusterState, final ClusterStatePublishResponseHandler publishResponseHandler) {
+    private void publish(final ClusterState clusterState, final ClusterStatePublishResponseHandler publishResponseHandler) {
 
         DiscoveryNode localNode = nodesProvider.nodes().localNode();
 
         Map<Version, BytesReference> serializedStates = Maps.newHashMap();
 
+        final AtomicBoolean timedOutWaitingForNodes = new AtomicBoolean(false);
+        final TimeValue publishTimeout = discoverySettings.getPublishTimeout();
+
         for (final DiscoveryNode node : clusterState.nodes()) {
             if (node.equals(localNode)) {
                 continue;
@@ -122,28 +126,30 @@ private void publish(ClusterState clusterState, final ClusterStatePublishRespons
 
                             @Override
                             public void handleResponse(TransportResponse.Empty response) {
+                                if (timedOutWaitingForNodes.get()) {
+                                    logger.debug("node {} responded for cluster state [{}] (took longer than [{}])", node, clusterState.version(), publishTimeout);
+                                }
                                 publishResponseHandler.onResponse(node);
                             }
 
                             @Override
                             public void handleException(TransportException exp) {
-                                logger.debug("failed to send cluster state to [{}]", exp, node);
+                                logger.debug("failed to send cluster state to {}", exp, node);
                                 publishResponseHandler.onFailure(node, exp);
                             }
                         });
             } catch (Throwable t) {
-                logger.debug("error sending cluster state to [{}]", t, node);
+                logger.debug("error sending cluster state to {}", t, node);
                 publishResponseHandler.onFailure(node, t);
             }
         }
 
-        TimeValue publishTimeout = discoverySettings.getPublishTimeout();
         if (publishTimeout.millis() > 0) {
             // only wait if the publish timeout is configured...
             try {
-                boolean awaited = publishResponseHandler.awaitAllNodes(publishTimeout);
-                if (!awaited) {
-                    logger.debug("awaiting all nodes to process published state {} timed out, timeout {}", clusterState.version(), publishTimeout);
+                timedOutWaitingForNodes.set(!publishResponseHandler.awaitAllNodes(publishTimeout));
+                if (timedOutWaitingForNodes.get()) {
+                    logger.debug("timed out waiting for all nodes to process published state [{}] (timeout [{}])", clusterState.version(), publishTimeout);
                 }
             } catch (InterruptedException e) {
                 // ignore & restore interrupt

From d5552a980fd5b5e08644dc0621b2a7d12068c5df Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Tue, 19 Aug 2014 14:09:14 +0200
Subject: [PATCH 67/74] [Discovery] UnicastZenPing should also ping last known
 discoNodes

At the moment, when a node loses connection to the master (due to a partition or because the master was stopped), we ping the unicast hosts in order to discover other nodes and either elect a new master or learn of another master that has been elected in the meantime. This can go wrong if all unicast targets are on the same side of a minority partition, as they will then never rejoin once the partition is healed.

Closes #7336
---
 .../discovery/zen/ZenDiscovery.java           |   5 +-
 .../discovery/zen/ZenDiscoveryModule.java     |   2 +
 .../zen/elect/ElectMasterService.java         |  25 ++++-
 .../discovery/zen/ping/ZenPingService.java    |   9 +-
 .../zen/ping/unicast/UnicastZenPing.java      |  30 ++++-
 .../DiscoveryWithServiceDisruptions.java      |  37 +++++-
 .../discovery/zen/ElectMasterServiceTest.java | 105 ++++++++++++++++++
 .../zen/ping/unicast/UnicastZenPingTests.java |   6 +-
 8 files changed, 201 insertions(+), 18 deletions(-)
 create mode 100644 src/test/java/org/elasticsearch/discovery/zen/ElectMasterServiceTest.java

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 7c76afd16400c..8bfcb5bdb141d 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -141,7 +141,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
     @Inject
     public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threadPool,
                         TransportService transportService, ClusterService clusterService, NodeSettingsService nodeSettingsService,
-                        DiscoveryNodeService discoveryNodeService, ZenPingService pingService, Version version,
+                        DiscoveryNodeService discoveryNodeService, ZenPingService pingService, ElectMasterService electMasterService, Version version,
                         DiscoverySettings discoverySettings) {
         super(settings);
         this.clusterName = clusterName;
@@ -152,6 +152,8 @@ public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threa
         this.discoverySettings = discoverySettings;
         this.pingService = pingService;
         this.version = version;
+        this.electMaster = electMasterService;
+
 
         // also support direct discovery.zen settings, for cases when it gets extended
         this.pingTimeout = settings.getAsTime("discovery.zen.ping.timeout", settings.getAsTime("discovery.zen.ping_timeout", componentSettings.getAsTime("ping_timeout", componentSettings.getAsTime("initial_ping_timeout", timeValueSeconds(3)))));
@@ -167,7 +169,6 @@ public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threa
 
         logger.debug("using ping.timeout [{}], join.timeout [{}], master_election.filter_client [{}], master_election.filter_data [{}]", pingTimeout, joinTimeout, masterElectionFilterClientNodes, masterElectionFilterDataNodes);
 
-        this.electMaster = new ElectMasterService(settings);
         nodeSettingsService.addListener(new ApplySettings());
 
         this.masterFD = new MasterFaultDetection(settings, threadPool, transportService, this, clusterName);
diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscoveryModule.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscoveryModule.java
index e67c4e2af39ef..33987662bfa40 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscoveryModule.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscoveryModule.java
@@ -23,6 +23,7 @@
 import org.elasticsearch.common.inject.AbstractModule;
 import org.elasticsearch.common.inject.multibindings.Multibinder;
 import org.elasticsearch.discovery.Discovery;
+import org.elasticsearch.discovery.zen.elect.ElectMasterService;
 import org.elasticsearch.discovery.zen.ping.ZenPingService;
 import org.elasticsearch.discovery.zen.ping.unicast.UnicastHostsProvider;
 
@@ -44,6 +45,7 @@ public ZenDiscoveryModule addUnicastHostProvider(Class<? extends UnicastHostsPro
 
     @Override
     protected void configure() {
+        bind(ElectMasterService.class).asEagerSingleton();
         bind(ZenPingService.class).asEagerSingleton();
         Multibinder<UnicastHostsProvider> unicastHostsProviderMultibinder = Multibinder.newSetBinder(binder(), UnicastHostsProvider.class);
         for (Class<? extends UnicastHostsProvider> unicastHostProvider : unicastHostProviders) {
diff --git a/src/main/java/org/elasticsearch/discovery/zen/elect/ElectMasterService.java b/src/main/java/org/elasticsearch/discovery/zen/elect/ElectMasterService.java
index bcfa1dc2f029a..9ba26387ec52f 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/elect/ElectMasterService.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/elect/ElectMasterService.java
@@ -24,12 +24,10 @@
 import org.apache.lucene.util.CollectionUtil;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.common.component.AbstractComponent;
+import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.settings.Settings;
 
-import java.util.Arrays;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
+import java.util.*;
 
 /**
  *
@@ -42,6 +40,7 @@ public class ElectMasterService extends AbstractComponent {
 
     private volatile int minimumMasterNodes;
 
+    @Inject
     public ElectMasterService(Settings settings) {
         super(settings);
         this.minimumMasterNodes = settings.getAsInt(DISCOVERY_ZEN_MINIMUM_MASTER_NODES, -1);
@@ -69,6 +68,18 @@ public boolean hasEnoughMasterNodes(Iterable<DiscoveryNode> nodes) {
         return count >= minimumMasterNodes;
     }
 
+    /**
+     * Returns the given nodes sorted by likelihood of being elected as master, most likely first.
+     * Non-master nodes are not removed but are rather put at the end.
+     * @param nodes
+     * @return
+     */
+    public List<DiscoveryNode> sortByMasterLikelihood(Iterable<DiscoveryNode> nodes) {
+        ArrayList<DiscoveryNode> sortedNodes = Lists.newArrayList(nodes);
+        CollectionUtil.introSort(sortedNodes, nodeComparator);
+        return sortedNodes;
+    }
+
     /**
      * Returns a list of the next possible masters.
      */
@@ -120,6 +131,12 @@ private static class NodeComparator implements Comparator<DiscoveryNode> {
 
         @Override
         public int compare(DiscoveryNode o1, DiscoveryNode o2) {
+            if (o1.masterNode() && !o2.masterNode()) {
+                return -1;
+            }
+            if (!o1.masterNode() && o2.masterNode()) {
+                return 1;
+            }
             return o1.id().compareTo(o2.id());
         }
     }
diff --git a/src/main/java/org/elasticsearch/discovery/zen/ping/ZenPingService.java b/src/main/java/org/elasticsearch/discovery/zen/ping/ZenPingService.java
index 53ee9248eac6a..39f710f7acd82 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ping/ZenPingService.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ping/ZenPingService.java
@@ -34,6 +34,7 @@
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
 import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
 import org.elasticsearch.discovery.zen.DiscoveryNodesProvider;
+import org.elasticsearch.discovery.zen.elect.ElectMasterService;
 import org.elasticsearch.discovery.zen.ping.multicast.MulticastZenPing;
 import org.elasticsearch.discovery.zen.ping.unicast.UnicastHostsProvider;
 import org.elasticsearch.discovery.zen.ping.unicast.UnicastZenPing;
@@ -55,20 +56,20 @@ public class ZenPingService extends AbstractLifecycleComponent<ZenPing> implemen
 
     // here for backward comp. with discovery plugins
     public ZenPingService(Settings settings, ThreadPool threadPool, TransportService transportService, ClusterName clusterName, NetworkService networkService,
-                          @Nullable Set<UnicastHostsProvider> unicastHostsProviders) {
-        this(settings, threadPool, transportService, clusterName, networkService, Version.CURRENT, unicastHostsProviders);
+                          ElectMasterService electMasterService, @Nullable Set<UnicastHostsProvider> unicastHostsProviders) {
+        this(settings, threadPool, transportService, clusterName, networkService, Version.CURRENT, electMasterService, unicastHostsProviders);
     }
 
     @Inject
     public ZenPingService(Settings settings, ThreadPool threadPool, TransportService transportService, ClusterName clusterName, NetworkService networkService,
-                          Version version, @Nullable Set<UnicastHostsProvider> unicastHostsProviders) {
+                          Version version, ElectMasterService electMasterService, @Nullable Set<UnicastHostsProvider> unicastHostsProviders) {
         super(settings);
         ImmutableList.Builder<ZenPing> zenPingsBuilder = ImmutableList.builder();
         if (componentSettings.getAsBoolean("multicast.enabled", true)) {
             zenPingsBuilder.add(new MulticastZenPing(settings, threadPool, transportService, clusterName, networkService, version));
         }
         // always add the unicast hosts, so it will be able to receive unicast requests even when working in multicast
-        zenPingsBuilder.add(new UnicastZenPing(settings, threadPool, transportService, clusterName, version, unicastHostsProviders));
+        zenPingsBuilder.add(new UnicastZenPing(settings, threadPool, transportService, clusterName, version, electMasterService, unicastHostsProviders));
 
         this.zenPings = zenPingsBuilder.build();
     }
diff --git a/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java b/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java
index 5b7cf0334676a..ee9526f6d5f52 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java
@@ -19,8 +19,12 @@
 
 package org.elasticsearch.discovery.zen.ping.unicast;
 
+import com.carrotsearch.hppc.cursors.ObjectCursor;
 import com.google.common.collect.Lists;
-import org.elasticsearch.*;
+import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.ElasticsearchIllegalArgumentException;
+import org.elasticsearch.ElasticsearchIllegalStateException;
+import org.elasticsearch.Version;
 import org.elasticsearch.cluster.ClusterName;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
@@ -35,6 +39,7 @@
 import org.elasticsearch.common.util.concurrent.EsExecutors;
 import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
 import org.elasticsearch.discovery.zen.DiscoveryNodesProvider;
+import org.elasticsearch.discovery.zen.elect.ElectMasterService;
 import org.elasticsearch.discovery.zen.ping.ZenPing;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.transport.*;
@@ -62,6 +67,7 @@ public class UnicastZenPing extends AbstractLifecycleComponent<ZenPing> implemen
     private final ThreadPool threadPool;
     private final TransportService transportService;
     private final ClusterName clusterName;
+    private final ElectMasterService electMasterService;
 
     private final int concurrentConnects;
 
@@ -78,11 +84,13 @@ public class UnicastZenPing extends AbstractLifecycleComponent<ZenPing> implemen
 
     private final CopyOnWriteArrayList<UnicastHostsProvider> hostsProviders = new CopyOnWriteArrayList<>();
 
-    public UnicastZenPing(Settings settings, ThreadPool threadPool, TransportService transportService, ClusterName clusterName, Version version, @Nullable Set<UnicastHostsProvider> unicastHostsProviders) {
+    public UnicastZenPing(Settings settings, ThreadPool threadPool, TransportService transportService, ClusterName clusterName,
+                          Version version, ElectMasterService electMasterService, @Nullable Set<UnicastHostsProvider> unicastHostsProviders) {
         super(settings);
         this.threadPool = threadPool;
         this.transportService = transportService;
         this.clusterName = clusterName;
+        this.electMasterService = electMasterService;
 
         if (unicastHostsProviders != null) {
             for (UnicastHostsProvider unicastHostsProvider : unicastHostsProviders) {
@@ -244,18 +252,30 @@ void sendPings(final TimeValue timeout, @Nullable TimeValue waitTime, final Send
         DiscoveryNodes discoNodes = nodesProvider.nodes();
         pingRequest.pingResponse = new PingResponse(discoNodes.localNode(), discoNodes.masterNode(), clusterName);
 
-        HashSet<DiscoveryNode> nodesToPing = new HashSet<>(Arrays.asList(nodes));
+        HashSet<DiscoveryNode> nodesToPingSet = new HashSet<>();
         for (PingResponse temporalResponse : temporalResponses) {
             // Only send pings to nodes that have the same cluster name.
             if (clusterName.equals(temporalResponse.clusterName())) {
-                nodesToPing.add(temporalResponse.target());
+                nodesToPingSet.add(temporalResponse.target());
             }
         }
 
         for (UnicastHostsProvider provider : hostsProviders) {
-            nodesToPing.addAll(provider.buildDynamicNodes());
+            nodesToPingSet.addAll(provider.buildDynamicNodes());
         }
 
+        // add all possible master nodes that were active in the last known cluster configuration
+        for (ObjectCursor<DiscoveryNode> masterNode : discoNodes.getMasterNodes().values()) {
+            nodesToPingSet.add(masterNode.value);
+        }
+
+        // sort the nodes by likelihood of being an active master
+        List<DiscoveryNode> sortedNodesToPing = electMasterService.sortByMasterLikelihood(nodesToPingSet);
+
+        // put the unicast targets first, followed by the sorted nodes
+        ArrayList<DiscoveryNode> nodesToPing = Lists.newArrayList(nodes);
+        nodesToPing.addAll(sortedNodesToPing);
+
         final CountDownLatch latch = new CountDownLatch(nodesToPing.size());
         for (final DiscoveryNode node : nodesToPing) {
             // make sure we are connected
diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java
index 82bf85716c74f..ff52ca351c6a3 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java
@@ -649,6 +649,42 @@ public void unicastSinglePingResponseContainsMaster() throws Exception {
         assertMaster(masterNode, nodes);
     }
 
+    @Test
+    @TestLogging("discovery.zen:TRACE,action:TRACE")
+    public void isolatedUnicastNodes() throws Exception {
+        List<String> nodes = startUnicastCluster(3, new int[]{0}, -1);
+        // Figure out what is the elected master node
+        final String unicastTarget = nodes.get(0);
+
+        Set<String> unicastTargetSide = new HashSet<>();
+        unicastTargetSide.add(unicastTarget);
+
+        Set<String> restOfClusterSide = new HashSet<>();
+        restOfClusterSide.addAll(nodes);
+        restOfClusterSide.remove(unicastTarget);
+
+        // Forcefully clean temporal response lists on all nodes. Otherwise the node in the unicast host list
+        // includes all the other nodes that have pinged it and the issue doesn't manifest
+        for (ZenPingService pingService : internalCluster().getInstances(ZenPingService.class)) {
+            for (ZenPing zenPing : pingService.zenPings()) {
+                ((UnicastZenPing) zenPing).clearTemporalReponses();
+            }
+        }
+
+        // Simulate a network issue between the unicast target node and the rest of the cluster
+        NetworkDisconnectPartition networkDisconnect = new NetworkDisconnectPartition(unicastTargetSide, restOfClusterSide, getRandom());
+        setDisruptionScheme(networkDisconnect);
+        networkDisconnect.startDisrupting();
+        // Wait until the elected master has removed the unlucky node...
+        ensureStableCluster(2, nodes.get(1));
+
+        // The isolated node must report no master, so it starts with pinging
+        assertNoMaster(unicastTarget);
+        networkDisconnect.stopDisrupting();
+        // Wait until the master node sees all 3 nodes again.
+        ensureStableCluster(3);
+    }
+
 
     /** Test cluster join with issues in cluster state publishing * */
     @Test
@@ -695,7 +731,6 @@ public void sendRequest(DiscoveryNode node, long requestId, String action, Trans
         nonMasterTransportService.clearRule(discoveryNodes.masterNode());
 
         ensureStableCluster(2);
-
     }
 
 
diff --git a/src/test/java/org/elasticsearch/discovery/zen/ElectMasterServiceTest.java b/src/test/java/org/elasticsearch/discovery/zen/ElectMasterServiceTest.java
new file mode 100644
index 0000000000000..df8f67c536f0d
--- /dev/null
+++ b/src/test/java/org/elasticsearch/discovery/zen/ElectMasterServiceTest.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.discovery.zen;
+
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.common.settings.ImmutableSettings;
+import org.elasticsearch.common.transport.DummyTransportAddress;
+import org.elasticsearch.discovery.zen.elect.ElectMasterService;
+import org.elasticsearch.test.ElasticsearchTestCase;
+import org.junit.Test;
+
+import java.util.*;
+
+public class ElectMasterServiceTest extends ElasticsearchTestCase {
+
+    ElectMasterService electMasterService() {
+        return new ElectMasterService(ImmutableSettings.EMPTY);
+    }
+
+    List<DiscoveryNode> generateRandomNodes() {
+        int count = scaledRandomIntBetween(1, 100);
+        ArrayList<DiscoveryNode> nodes = new ArrayList<>(count);
+
+        Map<String, String> master = new HashMap<>();
+        master.put("master", "true");
+        Map<String, String> nonMaster = new HashMap<>();
+        nonMaster.put("master", "false");
+
+        for (int i = 0; i < count; i++) {
+            Map<String, String> attributes = randomBoolean() ? master : nonMaster;
+            DiscoveryNode node = new DiscoveryNode("n_" + i, "n_" + i, DummyTransportAddress.INSTANCE, attributes, Version.CURRENT);
+            nodes.add(node);
+        }
+
+        Collections.shuffle(nodes, getRandom());
+        return nodes;
+    }
+
+    @Test
+    public void sortByMasterLikelihood() {
+        List<DiscoveryNode> nodes = generateRandomNodes();
+        List<DiscoveryNode> sortedNodes = electMasterService().sortByMasterLikelihood(nodes);
+        assertEquals(nodes.size(), sortedNodes.size());
+        DiscoveryNode prevNode = sortedNodes.get(0);
+        for (int i = 1; i < sortedNodes.size(); i++) {
+            DiscoveryNode node = sortedNodes.get(i);
+            if (!prevNode.masterNode()) {
+                assertFalse(node.masterNode());
+            } else if (node.masterNode()) {
+                assertTrue(prevNode.id().compareTo(node.id()) < 0);
+            }
+            prevNode = node;
+        }
+
+    }
+
+    @Test
+    public void electMaster() {
+        List<DiscoveryNode> nodes = generateRandomNodes();
+        ElectMasterService service = electMasterService();
+        int min_master_nodes = randomIntBetween(0, nodes.size());
+        service.minimumMasterNodes(min_master_nodes);
+
+        int master_nodes = 0;
+        for (DiscoveryNode node : nodes) {
+            if (node.masterNode()) {
+                master_nodes++;
+            }
+        }
+        DiscoveryNode master = null;
+        if (service.hasEnoughMasterNodes(nodes)) {
+            master = service.electMaster(nodes);
+        }
+
+        if (master_nodes == 0) {
+            assertNull(master);
+        } else if (min_master_nodes > 0 && master_nodes < min_master_nodes) {
+            assertNull(master);
+        } else {
+            for (DiscoveryNode node : nodes) {
+                if (node.masterNode()) {
+                    assertTrue(master.id().compareTo(node.id()) <= 0);
+                }
+            }
+        }
+    }
+}
diff --git a/src/test/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPingTests.java b/src/test/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPingTests.java
index 8f18cb11d3804..7ecc23b68ef99 100644
--- a/src/test/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPingTests.java
+++ b/src/test/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPingTests.java
@@ -30,6 +30,7 @@
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.common.util.BigArrays;
 import org.elasticsearch.discovery.zen.DiscoveryNodesProvider;
+import org.elasticsearch.discovery.zen.elect.ElectMasterService;
 import org.elasticsearch.discovery.zen.ping.ZenPing;
 import org.elasticsearch.node.service.NodeService;
 import org.elasticsearch.test.ElasticsearchTestCase;
@@ -55,6 +56,7 @@ public void testSimplePings() {
         ThreadPool threadPool = new ThreadPool(getClass().getName());
         ClusterName clusterName = new ClusterName("test");
         NetworkService networkService = new NetworkService(settings);
+        ElectMasterService electMasterService = new ElectMasterService(settings);
 
         NettyTransport transportA = new NettyTransport(settings, threadPool, networkService, BigArrays.NON_RECYCLING_INSTANCE, Version.CURRENT);
         final TransportService transportServiceA = new TransportService(transportA, threadPool).start();
@@ -73,7 +75,7 @@ public void testSimplePings() {
                 addressB.address().getAddress().getHostAddress() + ":" + addressB.address().getPort())
                 .build();
 
-        UnicastZenPing zenPingA = new UnicastZenPing(hostsSettings, threadPool, transportServiceA, clusterName, Version.CURRENT, null);
+        UnicastZenPing zenPingA = new UnicastZenPing(hostsSettings, threadPool, transportServiceA, clusterName, Version.CURRENT, electMasterService, null);
         zenPingA.setNodesProvider(new DiscoveryNodesProvider() {
             @Override
             public DiscoveryNodes nodes() {
@@ -87,7 +89,7 @@ public NodeService nodeService() {
         });
         zenPingA.start();
 
-        UnicastZenPing zenPingB = new UnicastZenPing(hostsSettings, threadPool, transportServiceB, clusterName, Version.CURRENT, null);
+        UnicastZenPing zenPingB = new UnicastZenPing(hostsSettings, threadPool, transportServiceB, clusterName, Version.CURRENT, electMasterService, null);
         zenPingB.setNodesProvider(new DiscoveryNodesProvider() {
             @Override
             public DiscoveryNodes nodes() {

From 183ca37dfad453e7eb26cf5ae9c3337495541334 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Fri, 29 Aug 2014 09:01:05 +0200
Subject: [PATCH 68/74] Code style improvement

---
 pom.xml                                       | 465 +++++++++---------
 .../cluster/block/ClusterBlocks.java          |   2 +-
 .../discovery/DiscoverySettings.java          |  13 +-
 .../discovery/zen/ZenDiscovery.java           |   7 +-
 .../zen/fd/MasterFaultDetection.java          |   1 +
 .../discovery/zen/fd/NodesFaultDetection.java |   2 +
 .../zen/ping/unicast/UnicastZenPing.java      |  12 +-
 7 files changed, 256 insertions(+), 246 deletions(-)

diff --git a/pom.xml b/pom.xml
index e01f6b8e6895d..d6bcaeb42cd2a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -183,7 +183,7 @@
             <version>0.8.13</version>
             <optional>true</optional>
         </dependency>
-       <!-- Lucene spatial -->
+        <!-- Lucene spatial -->
 
 
         <!-- START: dependencies that are shaded -->
@@ -483,7 +483,8 @@
                             <haltOnFailure>${tests.failfast}</haltOnFailure>
                             <uniqueSuiteNames>false</uniqueSuiteNames>
                             <systemProperties>
-                                <java.io.tmpdir>.</java.io.tmpdir> <!-- we use '.' since this is different per JVM-->
+                                <java.io.tmpdir>.</java.io.tmpdir>
+                                <!-- we use '.' since this is different per JVM-->
                                 <!-- RandomizedTesting library system properties -->
                                 <tests.bwc>${tests.bwc}</tests.bwc>
                                 <tests.bwc.path>${tests.bwc.path}</tests.bwc.path>
@@ -537,15 +538,15 @@
                 <version>1.7</version>
                 <executions>
                     <execution>
-                    <phase>validate</phase>
-                    <goals>
-                        <goal>run</goal>
-                    </goals>
-                    <configuration>
-                           <target>
-                               <echo>Using ${java.runtime.name} ${java.runtime.version} ${java.vendor}</echo>
-                           </target>
-                    </configuration>
+                        <phase>validate</phase>
+                        <goals>
+                            <goal>run</goal>
+                        </goals>
+                        <configuration>
+                            <target>
+                                <echo>Using ${java.runtime.name} ${java.runtime.version} ${java.vendor}</echo>
+                            </target>
+                        </configuration>
                     </execution>
                     <execution>
                         <id>invalid-patterns</id>
@@ -573,7 +574,9 @@
                                     </fileset>
                                     <map from="${basedir}${file.separator}" to="* "/>
                                 </pathconvert>
-                                <fail if="validate.patternsFound">The following files contain tabs or nocommits:${line.separator}${validate.patternsFound}</fail>
+                                <fail if="validate.patternsFound">The following files contain tabs or
+                                    nocommits:${line.separator}${validate.patternsFound}
+                                </fail>
                             </target>
                         </configuration>
                     </execution>
@@ -581,7 +584,8 @@
                         <id>tests</id>
                         <phase>test</phase>
                         <configuration>
-                            <skip>${skipTests}</skip> <!-- don't run if we skip the tests -->
+                            <skip>${skipTests}</skip>
+                            <!-- don't run if we skip the tests -->
                             <failOnError>false</failOnError>
                             <target>
                                 <property name="runtime_classpath" refid="maven.runtime.classpath"/>
@@ -595,7 +599,7 @@
                                     </classpath>
                                 </taskdef>
                                 <tophints max="${tests.topn}">
-                                    <file file="${basedir}/${execution.hint.file}" />
+                                    <file file="${basedir}/${execution.hint.file}"/>
                                 </tophints>
                             </target>
                         </configuration>
@@ -708,7 +712,7 @@
                             <shadedPattern>org.elasticsearch.common.compress</shadedPattern>
                         </relocation>
                         <relocation>
-                        <pattern>com.github.mustachejava</pattern>
+                            <pattern>com.github.mustachejava</pattern>
                             <shadedPattern>org.elasticsearch.common.mustache</shadedPattern>
                         </relocation>
                         <relocation>
@@ -1222,7 +1226,7 @@
                             <excludes>
                                 <!-- start exclude for test GC simulation using Thread.suspend -->
                                 <exclude>org/elasticsearch/test/disruption/LongGCDisruption.class</exclude>
-                                <!-- end exclude for Channels -->
+                                <!-- end exclude for GC simulation  -->
                             </excludes>
                             <signaturesFiles>
                                 <signaturesFile>test-signatures.txt</signaturesFile>
@@ -1348,219 +1352,220 @@
         </pluginManagement>
     </build>
     <profiles>
-      <!-- default profile, with randomization setting kicks in -->
-      <profile>
-        <id>default</id>
-        <activation>
-          <activeByDefault>true</activeByDefault>
-        </activation>
-        <build>
-          <plugins>
-            <plugin>
-              <groupId>com.carrotsearch.randomizedtesting</groupId>
-              <artifactId>junit4-maven-plugin</artifactId>
-              <configuration>
-                <argLine>${tests.jvm.argline}</argLine>
-              </configuration>
-            </plugin>
-            <plugin>
-              <groupId>com.mycila</groupId>
-              <artifactId>license-maven-plugin</artifactId>
-              <version>2.5</version>
-              <configuration>
-                <header>dev-tools/elasticsearch_license_header.txt</header>
-                <headerDefinitions>
-                  <headerDefinition>dev-tools/license_header_definition.xml</headerDefinition>
-                </headerDefinitions>
-                <includes>
-                  <include>src/main/java/org/elasticsearch/**/*.java</include>
-                  <include>src/test/java/org/elasticsearch/**/*.java</include>
-                </includes>
-                <excludes>
-                  <exclude>src/main/java/org/elasticsearch/common/inject/**</exclude>
-                  <!-- Guice -->
-                  <exclude>src/main/java/org/elasticsearch/common/geo/GeoHashUtils.java</exclude>
-                  <exclude>src/main/java/org/elasticsearch/common/lucene/search/XBooleanFilter.java</exclude>
-                  <exclude>src/main/java/org/elasticsearch/common/lucene/search/XFilteredQuery.java</exclude>
-                  <exclude>src/main/java/org/apache/lucene/queryparser/XSimpleQueryParser.java</exclude>
-                  <exclude>src/main/java/org/apache/lucene/**/X*.java</exclude>
-                  <!-- t-digest -->
-                  <exclude>src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/TDigestState.java</exclude>
-                  <exclude>src/test/java/org/elasticsearch/search/aggregations/metrics/GroupTree.java</exclude>
-                </excludes>
-              </configuration>
-                <executions>
-                    <execution>
-                        <phase>compile</phase>
-                        <goals>
-                            <goal>check</goal>
-                        </goals>
-                    </execution>
-                </executions>
-            </plugin>
-          </plugins>
-        </build>
-      </profile>
-      <!-- profile for development that doesn't check forbidden-apis, no-commit validation or license headers run with mvn -Pdev -->
-      <profile>
-        <id>dev</id>
-        <properties>
-          <validate.skip>true</validate.skip>
-        </properties>
-          <build>
-        <plugins>
-          <plugin>
-              <groupId>de.thetaphi</groupId>
-              <artifactId>forbiddenapis</artifactId>
-              <version>1.5.1</version>
-              <executions>
-                <execution>
-                    <id>check-forbidden-apis</id>
-                    <phase>none</phase>
-                </execution>
-                <execution>
-                    <id>check-forbidden-test-apis</id>
-                    <phase>none</phase>
-                </execution>
-              </executions>
-          </plugin>
-          </plugins>
-          </build>
-      </profile>
-      <!-- license profile, to generate third party license file -->
-      <profile>
-        <id>license</id>
-        <activation>
-          <property>
-            <name>license.generation</name>
-            <value>true</value>
-          </property>
-        </activation>
-        <!-- not including license-maven-plugin is sufficent to expose default license -->
-      </profile>
-      <!-- jacoco coverage profile.  This will insert -jagent -->
-      <profile>
-        <id>coverage</id>
-        <activation>
-          <property>
-            <name>tests.coverage</name>
-            <value>true</value>
-          </property>
-        </activation>
-        <dependencies>
-          <dependency>
-            <!--  must be on the classpath  -->
-            <groupId>org.jacoco</groupId>
-            <artifactId>org.jacoco.agent</artifactId>
-            <classifier>runtime</classifier>
-            <version>0.6.4.201312101107</version>
-            <scope>test</scope>
-          </dependency>
-        </dependencies>
-        <build>
-          <plugins>
-            <plugin>
-              <groupId>org.jacoco</groupId>
-              <artifactId>jacoco-maven-plugin</artifactId>
-              <version>0.6.4.201312101107</version>
-              <executions>
-                <execution>
-                  <id>default-prepare-agent</id>
-                  <goals>
-                    <goal>prepare-agent</goal>
-                  </goals>
-                </execution>
-                <execution>
-                  <id>default-report</id>
-                  <phase>prepare-package</phase>
-                  <goals>
-                    <goal>report</goal>
-                  </goals>
-                </execution>
-                <execution>
-                  <id>default-check</id>
-                  <goals>
-                    <goal>check</goal>
-                  </goals>
-                </execution>
-              </executions>
-              <configuration>
-                <excludes>
-                  <exclude>jsr166e/**</exclude>
-                  <exclude>org/apache/lucene/**</exclude>
-                </excludes>
-              </configuration>
-            </plugin>
-          </plugins>
-        </build>
-      </profile>
-      <profile>
-        <id>static</id>
-        <activation>
-          <property>
-            <name>tests.static</name>
-            <value>true</value>
-          </property>
-        </activation>
-        <build>
-          <plugins>
-            <plugin>
-              <groupId>org.codehaus.mojo</groupId>
-              <artifactId>findbugs-maven-plugin</artifactId>
-              <version>2.5.3</version>
-            </plugin>
-          </plugins>
-        </build>
-        <reporting>
-          <plugins>
-            <plugin>
-              <groupId>org.apache.maven.plugins</groupId>
-              <artifactId>maven-jxr-plugin</artifactId>
-              <version>2.3</version>
-            </plugin>
-            <plugin>
-              <groupId>org.apache.maven.plugins</groupId>
-              <artifactId>maven-pmd-plugin</artifactId>
-              <version>3.0.1</version>
-              <configuration>
-                <rulesets>
-                  <ruleset>${basedir}/dev-tools/pmd/custom.xml</ruleset>
-                </rulesets>
-                <targetJdk>1.7</targetJdk>
-                <excludes>
-                  <exclude>**/jsr166e/**</exclude>
-                  <exclude>**/org/apache/lucene/**</exclude>
-                  <exclude>**/org/apache/elasticsearch/common/Base64.java</exclude>
-                </excludes>
-              </configuration>
-            </plugin>
-            <plugin>
-              <groupId>org.codehaus.mojo</groupId>
-              <artifactId>findbugs-maven-plugin</artifactId>
-              <version>2.5.3</version>
-              <configuration>
-                <xmlOutput>true</xmlOutput>
-                <xmlOutputDirectory>target/site</xmlOutputDirectory>
-                <fork>true</fork>
-                <maxHeap>2048</maxHeap>
-                <timeout>1800000</timeout>
-                <onlyAnalyze>org.elasticsearch.-</onlyAnalyze>
-              </configuration>
-            </plugin>
-            <plugin>
-              <groupId>org.apache.maven.plugins</groupId>
-              <artifactId>maven-project-info-reports-plugin</artifactId>
-              <version>2.7</version>
-              <reportSets>
-                <reportSet>
-                  <reports>
-                    <report>index</report>
-                  </reports>
-                </reportSet>
-              </reportSets>
-            </plugin>
-          </plugins>
-        </reporting>
-      </profile>
+        <!-- default profile, with randomization setting kicks in -->
+        <profile>
+            <id>default</id>
+            <activation>
+                <activeByDefault>true</activeByDefault>
+            </activation>
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>com.carrotsearch.randomizedtesting</groupId>
+                        <artifactId>junit4-maven-plugin</artifactId>
+                        <configuration>
+                            <argLine>${tests.jvm.argline}</argLine>
+                        </configuration>
+                    </plugin>
+                    <plugin>
+                        <groupId>com.mycila</groupId>
+                        <artifactId>license-maven-plugin</artifactId>
+                        <version>2.5</version>
+                        <configuration>
+                            <header>dev-tools/elasticsearch_license_header.txt</header>
+                            <headerDefinitions>
+                                <headerDefinition>dev-tools/license_header_definition.xml</headerDefinition>
+                            </headerDefinitions>
+                            <includes>
+                                <include>src/main/java/org/elasticsearch/**/*.java</include>
+                                <include>src/test/java/org/elasticsearch/**/*.java</include>
+                            </includes>
+                            <excludes>
+                                <exclude>src/main/java/org/elasticsearch/common/inject/**</exclude>
+                                <!-- Guice -->
+                                <exclude>src/main/java/org/elasticsearch/common/geo/GeoHashUtils.java</exclude>
+                                <exclude>src/main/java/org/elasticsearch/common/lucene/search/XBooleanFilter.java</exclude>
+                                <exclude>src/main/java/org/elasticsearch/common/lucene/search/XFilteredQuery.java</exclude>
+                                <exclude>src/main/java/org/apache/lucene/queryparser/XSimpleQueryParser.java</exclude>
+                                <exclude>src/main/java/org/apache/lucene/**/X*.java</exclude>
+                                <!-- t-digest -->
+                                <exclude>src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/TDigestState.java
+                                </exclude>
+                                <exclude>src/test/java/org/elasticsearch/search/aggregations/metrics/GroupTree.java</exclude>
+                            </excludes>
+                        </configuration>
+                        <executions>
+                            <execution>
+                                <phase>compile</phase>
+                                <goals>
+                                    <goal>check</goal>
+                                </goals>
+                            </execution>
+                        </executions>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
+        <!-- profile for development that doesn't check forbidden-apis, no-commit validation or license headers run with mvn -Pdev -->
+        <profile>
+            <id>dev</id>
+            <properties>
+                <validate.skip>true</validate.skip>
+            </properties>
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>de.thetaphi</groupId>
+                        <artifactId>forbiddenapis</artifactId>
+                        <version>1.5.1</version>
+                        <executions>
+                            <execution>
+                                <id>check-forbidden-apis</id>
+                                <phase>none</phase>
+                            </execution>
+                            <execution>
+                                <id>check-forbidden-test-apis</id>
+                                <phase>none</phase>
+                            </execution>
+                        </executions>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
+        <!-- license profile, to generate third party license file -->
+        <profile>
+            <id>license</id>
+            <activation>
+                <property>
+                    <name>license.generation</name>
+                    <value>true</value>
+                </property>
+            </activation>
+            <!-- not including license-maven-plugin is sufficient to expose default license -->
+        </profile>
+        <!-- jacoco coverage profile.  This will insert -javaagent -->
+        <profile>
+            <id>coverage</id>
+            <activation>
+                <property>
+                    <name>tests.coverage</name>
+                    <value>true</value>
+                </property>
+            </activation>
+            <dependencies>
+                <dependency>
+                    <!--  must be on the classpath  -->
+                    <groupId>org.jacoco</groupId>
+                    <artifactId>org.jacoco.agent</artifactId>
+                    <classifier>runtime</classifier>
+                    <version>0.6.4.201312101107</version>
+                    <scope>test</scope>
+                </dependency>
+            </dependencies>
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.jacoco</groupId>
+                        <artifactId>jacoco-maven-plugin</artifactId>
+                        <version>0.6.4.201312101107</version>
+                        <executions>
+                            <execution>
+                                <id>default-prepare-agent</id>
+                                <goals>
+                                    <goal>prepare-agent</goal>
+                                </goals>
+                            </execution>
+                            <execution>
+                                <id>default-report</id>
+                                <phase>prepare-package</phase>
+                                <goals>
+                                    <goal>report</goal>
+                                </goals>
+                            </execution>
+                            <execution>
+                                <id>default-check</id>
+                                <goals>
+                                    <goal>check</goal>
+                                </goals>
+                            </execution>
+                        </executions>
+                        <configuration>
+                            <excludes>
+                                <exclude>jsr166e/**</exclude>
+                                <exclude>org/apache/lucene/**</exclude>
+                            </excludes>
+                        </configuration>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
+        <profile>
+            <id>static</id>
+            <activation>
+                <property>
+                    <name>tests.static</name>
+                    <value>true</value>
+                </property>
+            </activation>
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.codehaus.mojo</groupId>
+                        <artifactId>findbugs-maven-plugin</artifactId>
+                        <version>2.5.3</version>
+                    </plugin>
+                </plugins>
+            </build>
+            <reporting>
+                <plugins>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-jxr-plugin</artifactId>
+                        <version>2.3</version>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-pmd-plugin</artifactId>
+                        <version>3.0.1</version>
+                        <configuration>
+                            <rulesets>
+                                <ruleset>${basedir}/dev-tools/pmd/custom.xml</ruleset>
+                            </rulesets>
+                            <targetJdk>1.7</targetJdk>
+                            <excludes>
+                                <exclude>**/jsr166e/**</exclude>
+                                <exclude>**/org/apache/lucene/**</exclude>
+                                <exclude>**/org/apache/elasticsearch/common/Base64.java</exclude>
+                            </excludes>
+                        </configuration>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.codehaus.mojo</groupId>
+                        <artifactId>findbugs-maven-plugin</artifactId>
+                        <version>2.5.3</version>
+                        <configuration>
+                            <xmlOutput>true</xmlOutput>
+                            <xmlOutputDirectory>target/site</xmlOutputDirectory>
+                            <fork>true</fork>
+                            <maxHeap>2048</maxHeap>
+                            <timeout>1800000</timeout>
+                            <onlyAnalyze>org.elasticsearch.-</onlyAnalyze>
+                        </configuration>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-project-info-reports-plugin</artifactId>
+                        <version>2.7</version>
+                        <reportSets>
+                            <reportSet>
+                                <reports>
+                                    <report>index</report>
+                                </reports>
+                            </reportSet>
+                        </reportSets>
+                    </plugin>
+                </plugins>
+            </reporting>
+        </profile>
     </profiles>
 </project>
diff --git a/src/main/java/org/elasticsearch/cluster/block/ClusterBlocks.java b/src/main/java/org/elasticsearch/cluster/block/ClusterBlocks.java
index 2aa6e4e014b79..bb7d332de4f96 100644
--- a/src/main/java/org/elasticsearch/cluster/block/ClusterBlocks.java
+++ b/src/main/java/org/elasticsearch/cluster/block/ClusterBlocks.java
@@ -118,7 +118,7 @@ public boolean hasGlobalBlock(int blockId) {
     }
 
     public boolean hasGlobalBlock(ClusterBlockLevel level) {
-        return !global(level).isEmpty();
+        return global(level).size() > 0;
     }
 
     /**
diff --git a/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java b/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
index dda300424ad7d..b8d48b16129a9 100644
--- a/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
+++ b/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java
@@ -89,12 +89,13 @@ public void onRefreshSettings(Settings settings) {
     }
 
     private ClusterBlock parseNoMasterBlock(String value) {
-        if ("all".equals(value)) {
-            return NO_MASTER_BLOCK_ALL;
-        } else if ("write".equals(value)) {
-            return NO_MASTER_BLOCK_WRITES;
-        } else {
-            throw new ElasticsearchIllegalArgumentException("invalid master block [" + value + "]");
+        switch (value) {
+            case "all":
+                return NO_MASTER_BLOCK_ALL;
+            case "write":
+                return NO_MASTER_BLOCK_WRITES;
+            default:
+                throw new ElasticsearchIllegalArgumentException("invalid master block [" + value + "]");
         }
     }
 }
diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 8bfcb5bdb141d..5662492d4d5f3 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -939,13 +939,14 @@ private DiscoveryNode findMaster() {
 
         if (pingMasters.isEmpty()) {
             // if we don't have enough master nodes, we bail, because there are not enough master to elect from
-            if (!electMaster.hasEnoughMasterNodes(possibleMasterNodes)) {
+            if (electMaster.hasEnoughMasterNodes(possibleMasterNodes)) {
+                return electMaster.electMaster(possibleMasterNodes);
+            } else {
                 logger.trace("not enough master nodes [{}]", possibleMasterNodes);
                 return null;
             }
-            // lets tie break between discovered nodes
-            return electMaster.electMaster(possibleMasterNodes);
         } else {
+            // lets tie break between discovered nodes
             return electMaster.electMaster(pingMasters);
         }
     }
diff --git a/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java b/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
index b4f635184e7ea..c4b385572a289 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
@@ -396,6 +396,7 @@ public void messageReceived(MasterPingRequest request, TransportChannel channel)
                 throw new NotMasterException();
             }
 
+            // pings from nodes of version < 1.4.0 will have clusterName set to null
             if (request.clusterName != null && !request.clusterName.equals(clusterName)) {
                 logger.trace("master fault detection ping request is targeted for a different [{}] cluster then us [{}]", request.clusterName, clusterName);
                 throw new NotMasterException("master fault detection ping request is targeted for a different [" + request.clusterName + "] cluster then us [" + clusterName + "]");
diff --git a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
index bd485b927e394..b21cab0bc4342 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
@@ -321,6 +321,8 @@ public void messageReceived(PingRequest request, TransportChannel channel) throw
             if (!latestNodes.localNodeId().equals(request.nodeId)) {
                 throw new ElasticsearchIllegalStateException("Got pinged as node [" + request.nodeId + "], but I am node [" + latestNodes.localNodeId() + "]");
             }
+
+            // PingRequest will have clusterName set to null if it came from a node of version <1.4.0
             if (request.clusterName != null && !request.clusterName.equals(clusterName)) {
                 // Don't introduce new exception for bwc reasons
                 throw new ElasticsearchIllegalStateException("Got pinged with cluster name [" + request.clusterName + "], but I'm part of cluster [" + clusterName + "]");
diff --git a/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java b/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java
index ee9526f6d5f52..123f2d7fc7f3d 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java
@@ -71,7 +71,7 @@ public class UnicastZenPing extends AbstractLifecycleComponent<ZenPing> implemen
 
     private final int concurrentConnects;
 
-    private final DiscoveryNode[] nodes;
+    private final DiscoveryNode[] configuredTargetNodes;
 
     private volatile DiscoveryNodesProvider nodesProvider;
 
@@ -79,7 +79,7 @@ public class UnicastZenPing extends AbstractLifecycleComponent<ZenPing> implemen
 
     private final Map<Integer, ConcurrentMap<DiscoveryNode, PingResponse>> receivedResponses = newConcurrentMap();
 
-    // a list of temporal responses a node will return for a request (holds requests from other nodes)
+    // a list of temporal responses a node will return for a request (holds requests from other nodes)
     private final Queue<PingResponse> temporalResponses = ConcurrentCollections.newQueue();
 
     private final CopyOnWriteArrayList<UnicastHostsProvider> hostsProviders = new CopyOnWriteArrayList<>();
@@ -107,20 +107,20 @@ public UnicastZenPing(Settings settings, ThreadPool threadPool, TransportService
         List<String> hosts = Lists.newArrayList(hostArr);
         logger.debug("using initial hosts {}, with concurrent_connects [{}]", hosts, concurrentConnects);
 
-        List<DiscoveryNode> nodes = Lists.newArrayList();
+        List<DiscoveryNode> configuredTargetNodes = Lists.newArrayList();
         int idCounter = 0;
         for (String host : hosts) {
             try {
                 TransportAddress[] addresses = transportService.addressesFromString(host);
                 // we only limit to 1 addresses, makes no sense to ping 100 ports
                 for (int i = 0; (i < addresses.length && i < LIMIT_PORTS_COUNT); i++) {
-                    nodes.add(new DiscoveryNode("#zen_unicast_" + (++idCounter) + "#", addresses[i], version.minimumCompatibilityVersion()));
+                    configuredTargetNodes.add(new DiscoveryNode("#zen_unicast_" + (++idCounter) + "#", addresses[i], version.minimumCompatibilityVersion()));
                 }
             } catch (Exception e) {
                 throw new ElasticsearchIllegalArgumentException("Failed to resolve address for [" + host + "]", e);
             }
         }
-        this.nodes = nodes.toArray(new DiscoveryNode[nodes.size()]);
+        this.configuredTargetNodes = configuredTargetNodes.toArray(new DiscoveryNode[configuredTargetNodes.size()]);
 
         transportService.registerHandler(ACTION_NAME, new UnicastPingRequestHandler());
     }
@@ -273,7 +273,7 @@ void sendPings(final TimeValue timeout, @Nullable TimeValue waitTime, final Send
         List<DiscoveryNode> sortedNodesToPing = electMasterService.sortByMasterLikelihood(nodesToPingSet);
 
         // new add the the unicast targets first
-        ArrayList<DiscoveryNode> nodesToPing = Lists.newArrayList(nodes);
+        ArrayList<DiscoveryNode> nodesToPing = Lists.newArrayList(configuredTargetNodes);
         nodesToPing.addAll(sortedNodesToPing);
 
         final CountDownLatch latch = new CountDownLatch(nodesToPing.size());

From d15909716bf14c6c48e57689d9ecbfc651b5906d Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Fri, 29 Aug 2014 09:46:28 +0200
Subject: [PATCH 69/74] [Internal] moved ZenDiscovery setting to use string
 constants

---
 .../ClusterDynamicSettingsModule.java         |  2 +-
 .../discovery/DiscoveryService.java           |  4 +-
 .../discovery/zen/ZenDiscovery.java           | 49 ++++++++++++-------
 .../zen/ZenDiscoveryRejoinOnMaster.java       |  2 +-
 4 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/src/main/java/org/elasticsearch/cluster/settings/ClusterDynamicSettingsModule.java b/src/main/java/org/elasticsearch/cluster/settings/ClusterDynamicSettingsModule.java
index e28438f6cfe45..de9f66b2e75a5 100644
--- a/src/main/java/org/elasticsearch/cluster/settings/ClusterDynamicSettingsModule.java
+++ b/src/main/java/org/elasticsearch/cluster/settings/ClusterDynamicSettingsModule.java
@@ -58,7 +58,7 @@ public ClusterDynamicSettingsModule() {
         clusterDynamicSettings.addDynamicSetting(DisableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_DISABLE_ALLOCATION);
         clusterDynamicSettings.addDynamicSetting(DisableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_DISABLE_REPLICA_ALLOCATION);
         clusterDynamicSettings.addDynamicSetting(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, Validator.INTEGER);
-        clusterDynamicSettings.addDynamicSetting(ZenDiscovery.REJOIN_ON_MASTER_GONE, Validator.BOOLEAN);
+        clusterDynamicSettings.addDynamicSetting(ZenDiscovery.SETTING_REJOIN_ON_MASTER_GONE, Validator.BOOLEAN);
         clusterDynamicSettings.addDynamicSetting(DiscoverySettings.NO_MASTER_BLOCK);
         clusterDynamicSettings.addDynamicSetting(FilterAllocationDecider.CLUSTER_ROUTING_INCLUDE_GROUP + "*");
         clusterDynamicSettings.addDynamicSetting(FilterAllocationDecider.CLUSTER_ROUTING_EXCLUDE_GROUP + "*");
diff --git a/src/main/java/org/elasticsearch/discovery/DiscoveryService.java b/src/main/java/org/elasticsearch/discovery/DiscoveryService.java
index f5a555417cc96..f73f2bbb5939b 100644
--- a/src/main/java/org/elasticsearch/discovery/DiscoveryService.java
+++ b/src/main/java/org/elasticsearch/discovery/DiscoveryService.java
@@ -39,6 +39,8 @@
  */
 public class DiscoveryService extends AbstractLifecycleComponent<DiscoveryService> {
 
+    public static final String SETTING_INITIAL_STATE_TIMEOUT = "discovery.initial_state_timeout";
+
     private static class InitialStateListener implements InitialStateDiscoveryListener {
 
         private final CountDownLatch latch = new CountDownLatch(1);
@@ -68,7 +70,7 @@ public DiscoveryService(Settings settings, DiscoverySettings discoverySettings,
         super(settings);
         this.discoverySettings = discoverySettings;
         this.discovery = discovery;
-        this.initialStateTimeout = componentSettings.getAsTime("initial_state_timeout", TimeValue.timeValueSeconds(30));
+        this.initialStateTimeout = settings.getAsTime(SETTING_INITIAL_STATE_TIMEOUT, TimeValue.timeValueSeconds(30));
     }
 
     public ClusterBlock getNoMasterBlock() {
diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 5662492d4d5f3..e03bac932e207 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -22,10 +22,7 @@
 import com.google.common.base.Objects;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Sets;
-import org.elasticsearch.ElasticsearchException;
-import org.elasticsearch.ElasticsearchIllegalStateException;
-import org.elasticsearch.ExceptionsHelper;
-import org.elasticsearch.Version;
+import org.elasticsearch.*;
 import org.elasticsearch.cluster.*;
 import org.elasticsearch.cluster.block.ClusterBlocks;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
@@ -80,7 +77,15 @@
  */
 public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implements Discovery, DiscoveryNodesProvider {
 
-    public final static String REJOIN_ON_MASTER_GONE = "discovery.zen.rejoin_on_master_gone";
+    public final static String SETTING_REJOIN_ON_MASTER_GONE = "discovery.zen.rejoin_on_master_gone";
+    public final static String SETTING_PING_TIMEOUT = "discovery.zen.ping.timeout";
+    public final static String SETTING_JOIN_TIMEOUT = "discovery.zen.join_timeout";
+    public final static String SETTING_JOIN_RETRY_ATTEMPTS = "discovery.zen.join_retry_attempts";
+    public final static String SETTING_JOIN_RETRY_DELAY = "discovery.zen.join_retry_delay";
+    public final static String SETTING_MAX_PINGS_FROM_ANOTHER_MASTER = "discovery.zen.max_pings_from_another_master";
+    public final static String SETTING_SEND_LEAVE_REQUEST = "discovery.zen.send_leave_request";
+    public final static String SETTING_MASTER_ELECTION_FILTER_CLIENT = "discovery.zen.master_election.filter_client";
+    public final static String SETTING_MASTER_ELECTION_FILTER_DATA = "discovery.zen.master_election.filter_data";
 
     public static final String DISCOVERY_REJOIN_ACTION_NAME = "internal:discovery/zen/rejoin";
 
@@ -154,18 +159,28 @@ public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threa
         this.version = version;
         this.electMaster = electMasterService;
 
+        // keep using componentSettings for BWC, in case this class gets extended. the local must hold the fully resolved value: the join timeout default and the debug log below read it.
+        TimeValue pingTimeout = componentSettings.getAsTime("initial_ping_timeout", timeValueSeconds(3));
+        pingTimeout = componentSettings.getAsTime("ping_timeout", pingTimeout);
+        pingTimeout = settings.getAsTime(SETTING_PING_TIMEOUT, settings.getAsTime("discovery.zen.ping_timeout", pingTimeout));
+        this.pingTimeout = pingTimeout;
 
-        // also support direct discovery.zen settings, for cases when it gets extended
-        this.pingTimeout = settings.getAsTime("discovery.zen.ping.timeout", settings.getAsTime("discovery.zen.ping_timeout", componentSettings.getAsTime("ping_timeout", componentSettings.getAsTime("initial_ping_timeout", timeValueSeconds(3)))));
-        this.joinTimeout = settings.getAsTime("discovery.zen.join_timeout", TimeValue.timeValueMillis(pingTimeout.millis() * 20));
-        this.joinRetryAttempts = settings.getAsInt("discovery.zen.join_retry_attempts", 3);
-        this.joinRetryDelay = settings.getAsTime("discovery.zen.join_retry_delay", TimeValue.timeValueMillis(100));
-        this.maxPingsFromAnotherMaster = settings.getAsInt("discovery.zen.max_pings_from_another_master", 3);
-        this.sendLeaveRequest = componentSettings.getAsBoolean("send_leave_request", true);
+        this.joinTimeout = settings.getAsTime(SETTING_JOIN_TIMEOUT, TimeValue.timeValueMillis(pingTimeout.millis() * 20));
+        this.joinRetryAttempts = settings.getAsInt(SETTING_JOIN_RETRY_ATTEMPTS, 3);
+        this.joinRetryDelay = settings.getAsTime(SETTING_JOIN_RETRY_DELAY, TimeValue.timeValueMillis(100));
+        this.maxPingsFromAnotherMaster = settings.getAsInt(SETTING_MAX_PINGS_FROM_ANOTHER_MASTER, 3);
+        this.sendLeaveRequest = settings.getAsBoolean(SETTING_SEND_LEAVE_REQUEST, true);
 
-        this.masterElectionFilterClientNodes = settings.getAsBoolean("discovery.zen.master_election.filter_client", true);
-        this.masterElectionFilterDataNodes = settings.getAsBoolean("discovery.zen.master_election.filter_data", false);
-        this.rejoinOnMasterGone = settings.getAsBoolean(REJOIN_ON_MASTER_GONE, true);
+        this.masterElectionFilterClientNodes = settings.getAsBoolean(SETTING_MASTER_ELECTION_FILTER_CLIENT, true);
+        this.masterElectionFilterDataNodes = settings.getAsBoolean(SETTING_MASTER_ELECTION_FILTER_DATA, false);
+        this.rejoinOnMasterGone = settings.getAsBoolean(SETTING_REJOIN_ON_MASTER_GONE, true);
+
+        if (this.joinRetryAttempts < 1) {
+            throw new ElasticsearchIllegalArgumentException("'" + SETTING_JOIN_RETRY_ATTEMPTS + "' must be a positive number. got [" + this.joinRetryAttempts + "]");
+        }
+        if (this.maxPingsFromAnotherMaster < 1) {
+            throw new ElasticsearchIllegalArgumentException("'" + SETTING_MAX_PINGS_FROM_ANOTHER_MASTER + "' must be a positive number. got [" + this.maxPingsFromAnotherMaster + "]");
+        }
 
         logger.debug("using ping.timeout [{}], join.timeout [{}], master_election.filter_client [{}], master_election.filter_data [{}]", pingTimeout, joinTimeout, masterElectionFilterClientNodes, masterElectionFilterDataNodes);
 
@@ -1164,9 +1179,9 @@ public void onRefreshSettings(Settings settings) {
                 handleMinimumMasterNodesChanged(minimumMasterNodes);
             }
 
-            boolean rejoinOnMasterGone = settings.getAsBoolean(REJOIN_ON_MASTER_GONE, ZenDiscovery.this.rejoinOnMasterGone);
+            boolean rejoinOnMasterGone = settings.getAsBoolean(SETTING_REJOIN_ON_MASTER_GONE, ZenDiscovery.this.rejoinOnMasterGone);
             if (rejoinOnMasterGone != ZenDiscovery.this.rejoinOnMasterGone) {
-                logger.info("updating {} from [{}] to [{}]", REJOIN_ON_MASTER_GONE, ZenDiscovery.this.rejoinOnMasterGone, rejoinOnMasterGone);
+                logger.info("updating {} from [{}] to [{}]", SETTING_REJOIN_ON_MASTER_GONE, ZenDiscovery.this.rejoinOnMasterGone, rejoinOnMasterGone);
                 ZenDiscovery.this.rejoinOnMasterGone = rejoinOnMasterGone;
             }
         }
diff --git a/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryRejoinOnMaster.java b/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryRejoinOnMaster.java
index 31cede9260254..f5b0067ea1836 100644
--- a/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryRejoinOnMaster.java
+++ b/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryRejoinOnMaster.java
@@ -45,7 +45,7 @@ public void testChangeRejoinOnMasterOptionIsDynamic() throws Exception {
         assertThat(zenDiscovery.isRejoinOnMasterGone(), is(true));
 
         client().admin().cluster().prepareUpdateSettings()
-                .setTransientSettings(ImmutableSettings.builder().put(ZenDiscovery.REJOIN_ON_MASTER_GONE, false))
+                .setTransientSettings(ImmutableSettings.builder().put(ZenDiscovery.SETTING_REJOIN_ON_MASTER_GONE, false))
                 .get();
 
         assertThat(zenDiscovery.isRejoinOnMasterGone(), is(false));

From 680fb366379d1b0c01439e1ae83befc01cf403f9 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Fri, 29 Aug 2014 17:03:08 +0200
Subject: [PATCH 70/74] [Discovery] Add try/catch around repetitive onSuccess
 calls

---
 .../java/org/elasticsearch/discovery/zen/ZenDiscovery.java  | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index e03bac932e207..b8c2d9eb0e54e 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -884,7 +884,11 @@ public void onFailure(String source, Throwable t) {
                 @Override
                 public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
                     for (Tuple<DiscoveryNode, MembershipAction.JoinCallback> drainedTask : drainedTasks) {
-                        drainedTask.v2().onSuccess();
+                        try {
+                            drainedTask.v2().onSuccess();
+                        } catch (Exception e) {
+                            logger.error("unexpected error during [{}]", e, source);
+                        }
                     }
                 }
             });

From ed5b2e0e35638bd186754657a8195750b4ac6b51 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Fri, 29 Aug 2014 17:07:24 +0200
Subject: [PATCH 71/74] Add an assertion to ZenDiscovery checking that local
 node is never elected if pings indicate an active master

---
 src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index b8c2d9eb0e54e..f924787338751 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -965,6 +965,8 @@ private DiscoveryNode findMaster() {
                 return null;
             }
         } else {
+
+            assert !pingMasters.contains(localNode) : "local node should never be elected as master when other nodes indicate an active master";
             // lets tie break between discovered nodes
             return electMaster.electMaster(pingMasters);
         }

From d8a5ff0047f530f489b34702a61e85037ce2e7b6 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Fri, 29 Aug 2014 22:57:23 +0200
Subject: [PATCH 72/74] [Internal] introduce ClusterState.UNKNOWN_VERSION
 constant

Used as null value for cluster state versions.
---
 src/main/java/org/elasticsearch/cluster/ClusterState.java    | 2 ++
 .../java/org/elasticsearch/discovery/zen/ZenDiscovery.java   | 2 +-
 .../elasticsearch/discovery/zen/fd/NodesFaultDetection.java  | 5 +++--
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/main/java/org/elasticsearch/cluster/ClusterState.java b/src/main/java/org/elasticsearch/cluster/ClusterState.java
index fec5fa486afdb..7594be3fad060 100644
--- a/src/main/java/org/elasticsearch/cluster/ClusterState.java
+++ b/src/main/java/org/elasticsearch/cluster/ClusterState.java
@@ -115,6 +115,8 @@ public static <T extends Custom> Custom.Factory<T> lookupFactorySafe(String type
     }
 
 
+    public static final long UNKNOWN_VERSION = -1;
+
     private final long version;
 
     private final RoutingTable routingTable;
diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index f924787338751..759b011dfb034 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -216,7 +216,7 @@ protected void doStart() throws ElasticsearchException {
         final String nodeId = DiscoveryService.generateNodeId(settings);
         localNode = new DiscoveryNode(settings.get("name"), nodeId, transportService.boundAddress().publishAddress(), nodeAttributes, version);
         latestDiscoNodes = new DiscoveryNodes.Builder().put(localNode).localNodeId(localNode.id()).build();
-        nodesFD.updateNodes(latestDiscoNodes, -1);
+        nodesFD.updateNodes(latestDiscoNodes, ClusterState.UNKNOWN_VERSION);
         pingService.start();
 
         // do the join on a different thread, the DiscoveryService waits for 30s anyhow till it is discovered
diff --git a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
index b21cab0bc4342..1788d3a257c48 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
@@ -22,6 +22,7 @@
 import org.elasticsearch.ElasticsearchIllegalStateException;
 import org.elasticsearch.Version;
 import org.elasticsearch.cluster.ClusterName;
+import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.common.component.AbstractComponent;
@@ -82,7 +83,7 @@ public void onPingReceived(PingRequest pingRequest) {}
 
     private volatile DiscoveryNodes latestNodes = EMPTY_NODES;
 
-    private volatile long clusterStateVersion = -1;
+    private volatile long clusterStateVersion = ClusterState.UNKNOWN_VERSION;
 
     private volatile boolean running = false;
 
@@ -349,7 +350,7 @@ public static class PingRequest extends TransportRequest {
 
         private DiscoveryNode masterNode;
 
-        private long clusterStateVersion = -1;
+        private long clusterStateVersion = ClusterState.UNKNOWN_VERSION;
 
         PingRequest() {
         }

From 596a4a073584c4262d574828c9caea35b5ed1de5 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Fri, 29 Aug 2014 10:10:58 +0200
Subject: [PATCH 73/74] [Internal] Extract a common base class for
 (Master|Nodes)FaultDetection

They share a lot of settings and some logic.

Closes #7512
---
 .../discovery/zen/fd/FaultDetection.java      | 95 +++++++++++++++++++
 .../zen/fd/MasterFaultDetection.java          | 57 +----------
 .../discovery/zen/fd/NodesFaultDetection.java | 57 +----------
 .../DiscoveryWithServiceDisruptions.java      |  5 +-
 .../discovery/ZenFaultDetectionTests.java     |  7 +-
 .../zen/ZenDiscoveryRejoinOnMaster.java       |  5 +-
 .../index/TransportIndexFailuresTest.java     |  5 +-
 7 files changed, 119 insertions(+), 112 deletions(-)
 create mode 100644 src/main/java/org/elasticsearch/discovery/zen/fd/FaultDetection.java

diff --git a/src/main/java/org/elasticsearch/discovery/zen/fd/FaultDetection.java b/src/main/java/org/elasticsearch/discovery/zen/fd/FaultDetection.java
new file mode 100644
index 0000000000000..d3e644f2166aa
--- /dev/null
+++ b/src/main/java/org/elasticsearch/discovery/zen/fd/FaultDetection.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.discovery.zen.fd;
+
+import org.elasticsearch.cluster.ClusterName;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.common.component.AbstractComponent;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.threadpool.ThreadPool;
+import org.elasticsearch.transport.TransportConnectionListener;
+import org.elasticsearch.transport.TransportService;
+
+import static org.elasticsearch.common.unit.TimeValue.timeValueSeconds;
+
+/**
+ * A base class for {@link org.elasticsearch.discovery.zen.fd.MasterFaultDetection} and {@link org.elasticsearch.discovery.zen.fd.NodesFaultDetection},
+ * making sure both use the same settings.
+ */
+public abstract class FaultDetection extends AbstractComponent {
+
+    public static final String SETTING_CONNECT_ON_NETWORK_DISCONNECT = "discovery.zen.fd.connect_on_network_disconnect";
+    public static final String SETTING_PING_INTERVAL = "discovery.zen.fd.ping_interval";
+    public static final String SETTING_PING_TIMEOUT = "discovery.zen.fd.ping_timeout";
+    public static final String SETTING_PING_RETRIES = "discovery.zen.fd.ping_retries";
+    public static final String SETTING_REGISTER_CONNECTION_LISTENER = "discovery.zen.fd.register_connection_listener";
+
+    protected final ThreadPool threadPool;
+    protected final ClusterName clusterName;
+    protected final TransportService transportService;
+
+    // used mainly for testing, should always be true
+    protected final boolean registerConnectionListener;
+    protected final FDConnectionListener connectionListener;
+    protected final boolean connectOnNetworkDisconnect;
+
+    protected final TimeValue pingInterval;
+    protected final TimeValue pingRetryTimeout;
+    protected final int pingRetryCount;
+
+    public FaultDetection(Settings settings, ThreadPool threadPool, TransportService transportService, ClusterName clusterName) {
+        super(settings);
+        this.threadPool = threadPool;
+        this.transportService = transportService;
+        this.clusterName = clusterName;
+
+        this.connectOnNetworkDisconnect = settings.getAsBoolean(SETTING_CONNECT_ON_NETWORK_DISCONNECT, false);
+        this.pingInterval = settings.getAsTime(SETTING_PING_INTERVAL, timeValueSeconds(1));
+        this.pingRetryTimeout = settings.getAsTime(SETTING_PING_TIMEOUT, timeValueSeconds(30));
+        this.pingRetryCount = settings.getAsInt(SETTING_PING_RETRIES, 3);
+        this.registerConnectionListener = settings.getAsBoolean(SETTING_REGISTER_CONNECTION_LISTENER, true);
+
+        this.connectionListener = new FDConnectionListener();
+        if (registerConnectionListener) {
+            transportService.addConnectionListener(connectionListener);
+        }
+    }
+
+    public void close() {
+        transportService.removeConnectionListener(connectionListener);
+    }
+
+    /**
+     * This method will be called when the {@link org.elasticsearch.transport.TransportService} raises a node disconnected event
+     */
+    abstract void handleTransportDisconnect(DiscoveryNode node);
+
+    private class FDConnectionListener implements TransportConnectionListener {
+        @Override
+        public void onNodeConnected(DiscoveryNode node) {
+        }
+
+        @Override
+        public void onNodeDisconnected(DiscoveryNode node) {
+            handleTransportDisconnect(node);
+        }
+    }
+
+}
diff --git a/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java b/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
index c4b385572a289..49709b7905bfd 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java
@@ -24,7 +24,6 @@
 import org.elasticsearch.cluster.ClusterName;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
-import org.elasticsearch.common.component.AbstractComponent;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.settings.Settings;
@@ -37,13 +36,12 @@
 import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.concurrent.atomic.AtomicBoolean;
 
-import static org.elasticsearch.common.unit.TimeValue.timeValueSeconds;
 import static org.elasticsearch.transport.TransportRequestOptions.options;
 
 /**
  * A fault detection that pings the master periodically to see if its alive.
  */
-public class MasterFaultDetection extends AbstractComponent {
+public class MasterFaultDetection extends FaultDetection {
 
     public static final String MASTER_PING_ACTION_NAME = "internal:discovery/zen/fd/master_ping";
 
@@ -54,31 +52,10 @@ public static interface Listener {
         void onDisconnectedFromMaster();
     }
 
-    private final ThreadPool threadPool;
-
-    private final TransportService transportService;
-
     private final DiscoveryNodesProvider nodesProvider;
 
-    private final ClusterName clusterName;
-
     private final CopyOnWriteArrayList<Listener> listeners = new CopyOnWriteArrayList<>();
 
-
-    private final boolean connectOnNetworkDisconnect;
-
-    private final TimeValue pingInterval;
-
-    private final TimeValue pingRetryTimeout;
-
-    private final int pingRetryCount;
-
-    // used mainly for testing, should always be true
-    private final boolean registerConnectionListener;
-
-
-    private final FDConnectionListener connectionListener;
-
     private volatile MasterPinger masterPinger;
 
     private final Object masterNodeMutex = new Object();
@@ -91,25 +68,11 @@ public static interface Listener {
 
     public MasterFaultDetection(Settings settings, ThreadPool threadPool, TransportService transportService,
                                 DiscoveryNodesProvider nodesProvider, ClusterName clusterName) {
-        super(settings);
-        this.threadPool = threadPool;
-        this.transportService = transportService;
+        super(settings, threadPool, transportService, clusterName);
         this.nodesProvider = nodesProvider;
-        this.clusterName = clusterName;
-
-        this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", false);
-        this.pingInterval = componentSettings.getAsTime("ping_interval", timeValueSeconds(1));
-        this.pingRetryTimeout = componentSettings.getAsTime("ping_timeout", timeValueSeconds(30));
-        this.pingRetryCount = componentSettings.getAsInt("ping_retries", 3);
-        this.registerConnectionListener = componentSettings.getAsBoolean("register_connection_listener", true);
 
         logger.debug("[master] uses ping_interval [{}], ping_timeout [{}], ping_retries [{}]", pingInterval, pingRetryTimeout, pingRetryCount);
 
-        this.connectionListener = new FDConnectionListener();
-        if (registerConnectionListener) {
-            transportService.addConnectionListener(connectionListener);
-        }
-
         transportService.registerHandler(MASTER_PING_ACTION_NAME, new MasterPingRequestHandler());
     }
 
@@ -188,13 +151,14 @@ private void innerStop() {
     }
 
     public void close() {
+        super.close();
         stop("closing");
         this.listeners.clear();
-        transportService.removeConnectionListener(connectionListener);
         transportService.removeHandler(MASTER_PING_ACTION_NAME);
     }
 
-    private void handleTransportDisconnect(DiscoveryNode node) {
+    @Override
+    protected void handleTransportDisconnect(DiscoveryNode node) {
         synchronized (masterNodeMutex) {
             if (!node.equals(this.masterNode)) {
                 return;
@@ -245,17 +209,6 @@ public void run() {
         }
     }
 
-    private class FDConnectionListener implements TransportConnectionListener {
-        @Override
-        public void onNodeConnected(DiscoveryNode node) {
-        }
-
-        @Override
-        public void onNodeDisconnected(DiscoveryNode node) {
-            handleTransportDisconnect(node);
-        }
-    }
-
     private class MasterPinger implements Runnable {
 
         private volatile boolean running = true;
diff --git a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
index 1788d3a257c48..9001209911612 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java
@@ -25,7 +25,6 @@
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
-import org.elasticsearch.common.component.AbstractComponent;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.settings.Settings;
@@ -38,14 +37,13 @@
 import java.util.concurrent.CopyOnWriteArrayList;
 
 import static org.elasticsearch.cluster.node.DiscoveryNodes.EMPTY_NODES;
-import static org.elasticsearch.common.unit.TimeValue.timeValueSeconds;
 import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMap;
 import static org.elasticsearch.transport.TransportRequestOptions.options;
 
 /**
  * A fault detection of multiple nodes.
  */
-public class NodesFaultDetection extends AbstractComponent {
+public class NodesFaultDetection extends FaultDetection {
 
     public static final String PING_ACTION_NAME = "internal:discovery/zen/fd/ping";
     
@@ -57,30 +55,10 @@ public void onPingReceived(PingRequest pingRequest) {}
 
     }
 
-    private final ThreadPool threadPool;
-
-    private final TransportService transportService;
-    private final ClusterName clusterName;
-
-
-    private final boolean connectOnNetworkDisconnect;
-
-    private final TimeValue pingInterval;
-
-    private final TimeValue pingRetryTimeout;
-
-    private final int pingRetryCount;
-
-    // used mainly for testing, should always be true
-    private final boolean registerConnectionListener;
-
-
     private final CopyOnWriteArrayList<Listener> listeners = new CopyOnWriteArrayList<>();
 
     private final ConcurrentMap<DiscoveryNode, NodeFD> nodesFD = newConcurrentMap();
 
-    private final FDConnectionListener connectionListener;
-
     private volatile DiscoveryNodes latestNodes = EMPTY_NODES;
 
     private volatile long clusterStateVersion = ClusterState.UNKNOWN_VERSION;
@@ -88,25 +66,11 @@ public void onPingReceived(PingRequest pingRequest) {}
     private volatile boolean running = false;
 
     public NodesFaultDetection(Settings settings, ThreadPool threadPool, TransportService transportService, ClusterName clusterName) {
-        super(settings);
-        this.threadPool = threadPool;
-        this.transportService = transportService;
-        this.clusterName = clusterName;
-
-        this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", false);
-        this.pingInterval = componentSettings.getAsTime("ping_interval", timeValueSeconds(1));
-        this.pingRetryTimeout = componentSettings.getAsTime("ping_timeout", timeValueSeconds(30));
-        this.pingRetryCount = componentSettings.getAsInt("ping_retries", 3);
-        this.registerConnectionListener = componentSettings.getAsBoolean("register_connection_listener", true);
+        super(settings, threadPool, transportService, clusterName);
 
         logger.debug("[node  ] uses ping_interval [{}], ping_timeout [{}], ping_retries [{}]", pingInterval, pingRetryTimeout, pingRetryCount);
 
         transportService.registerHandler(PING_ACTION_NAME, new PingRequestHandler());
-
-        this.connectionListener = new FDConnectionListener();
-        if (registerConnectionListener) {
-            transportService.addConnectionListener(connectionListener);
-        }
     }
 
     public void addListener(Listener listener) {
@@ -158,12 +122,13 @@ public NodesFaultDetection stop() {
     }
 
     public void close() {
+        super.close();
         stop();
         transportService.removeHandler(PING_ACTION_NAME);
-        transportService.removeConnectionListener(connectionListener);
     }
 
-    private void handleTransportDisconnect(DiscoveryNode node) {
+    @Override
+    protected void handleTransportDisconnect(DiscoveryNode node) {
         if (!latestNodes.nodeExists(node.id())) {
             return;
         }
@@ -296,18 +261,6 @@ static class NodeFD {
         volatile boolean running = true;
     }
 
-    private class FDConnectionListener implements TransportConnectionListener {
-        @Override
-        public void onNodeConnected(DiscoveryNode node) {
-        }
-
-        @Override
-        public void onNodeDisconnected(DiscoveryNode node) {
-            handleTransportDisconnect(node);
-        }
-    }
-
-
     class PingRequestHandler extends BaseTransportRequestHandler<PingRequest> {
 
         @Override
diff --git a/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java b/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java
index ff52ca351c6a3..82abe2eccb14b 100644
--- a/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java
+++ b/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptions.java
@@ -43,6 +43,7 @@
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.discovery.zen.ZenDiscovery;
 import org.elasticsearch.discovery.zen.elect.ElectMasterService;
+import org.elasticsearch.discovery.zen.fd.FaultDetection;
 import org.elasticsearch.discovery.zen.membership.MembershipAction;
 import org.elasticsearch.discovery.zen.ping.ZenPing;
 import org.elasticsearch.discovery.zen.ping.ZenPingService;
@@ -114,8 +115,8 @@ private List<String> startCluster(int numberOfNodes, int minimumMasterNode) thro
     }
 
     final static Settings DEFAULT_SETTINGS = ImmutableSettings.builder()
-            .put("discovery.zen.fd.ping_timeout", "1s") // for hitting simulated network failures quickly
-            .put("discovery.zen.fd.ping_retries", "1") // for hitting simulated network failures quickly
+            .put(FaultDetection.SETTING_PING_TIMEOUT, "1s") // for hitting simulated network failures quickly
+            .put(FaultDetection.SETTING_PING_RETRIES, "1") // for hitting simulated network failures quickly
             .put("discovery.zen.join_timeout", "10s")  // still long to induce failures but to long so test won't time out
             .put(DiscoverySettings.PUBLISH_TIMEOUT, "1s") // <-- for hitting simulated network failures quickly
             .put("http.enabled", false) // just to make test quicker
diff --git a/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java b/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java
index 553267971e50e..082148921e6db 100644
--- a/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java
+++ b/src/test/java/org/elasticsearch/discovery/ZenFaultDetectionTests.java
@@ -27,6 +27,7 @@
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.discovery.zen.DiscoveryNodesProvider;
+import org.elasticsearch.discovery.zen.fd.FaultDetection;
 import org.elasticsearch.discovery.zen.fd.MasterFaultDetection;
 import org.elasticsearch.discovery.zen.fd.NodesFaultDetection;
 import org.elasticsearch.node.service.NodeService;
@@ -131,7 +132,8 @@ public void testNodesFaultDetectionConnectOnDisconnect() throws InterruptedExcep
         ImmutableSettings.Builder settings = ImmutableSettings.builder();
         boolean shouldRetry = randomBoolean();
         // make sure we don't ping
-        settings.put("discovery.zen.fd.connect_on_network_disconnect", shouldRetry).put("discovery.zen.fd.ping_interval", "5m");
+        settings.put(FaultDetection.SETTING_CONNECT_ON_NETWORK_DISCONNECT, shouldRetry)
+                .put(FaultDetection.SETTING_PING_INTERVAL, "5m");
         NodesFaultDetection nodesFD = new NodesFaultDetection(settings.build(), threadPool, serviceA, new ClusterName("test"));
         nodesFD.start();
         nodesFD.updateNodes(buildNodesForA(true), -1);
@@ -165,7 +167,8 @@ public void testMasterFaultDetectionConnectOnDisconnect() throws InterruptedExce
         ImmutableSettings.Builder settings = ImmutableSettings.builder();
         boolean shouldRetry = randomBoolean();
         // make sure we don't ping
-        settings.put("discovery.zen.fd.connect_on_network_disconnect", shouldRetry).put("discovery.zen.fd.ping_interval", "5m");
+        settings.put(FaultDetection.SETTING_CONNECT_ON_NETWORK_DISCONNECT, shouldRetry)
+                .put(FaultDetection.SETTING_PING_INTERVAL, "5m");
         ClusterName clusterName = new ClusterName(randomAsciiOfLengthBetween(3, 20));
         final DiscoveryNodes nodes = buildNodesForA(false);
         MasterFaultDetection masterFD = new MasterFaultDetection(settings.build(), threadPool, serviceA,
diff --git a/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryRejoinOnMaster.java b/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryRejoinOnMaster.java
index f5b0067ea1836..1ee31505d5e0f 100644
--- a/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryRejoinOnMaster.java
+++ b/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryRejoinOnMaster.java
@@ -25,6 +25,7 @@
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.discovery.Discovery;
+import org.elasticsearch.discovery.zen.fd.FaultDetection;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
 import org.junit.Test;
 
@@ -54,8 +55,8 @@ public void testChangeRejoinOnMasterOptionIsDynamic() throws Exception {
     @Test
     public void testNoShardRelocationsOccurWhenElectedMasterNodeFails() throws Exception {
         Settings defaultSettings = ImmutableSettings.builder()
-                .put("discovery.zen.fd.ping_timeout", "1s")
-                .put("discovery.zen.fd.ping_retries", "1")
+                .put(FaultDetection.SETTING_PING_TIMEOUT, "1s")
+                .put(FaultDetection.SETTING_PING_RETRIES, "1")
                 .put("discovery.type", "zen")
                 .build();
 
diff --git a/src/test/java/org/elasticsearch/index/TransportIndexFailuresTest.java b/src/test/java/org/elasticsearch/index/TransportIndexFailuresTest.java
index f8fe46cae1f07..c7c20b790ddae 100644
--- a/src/test/java/org/elasticsearch/index/TransportIndexFailuresTest.java
+++ b/src/test/java/org/elasticsearch/index/TransportIndexFailuresTest.java
@@ -33,6 +33,7 @@
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.discovery.Discovery;
 import org.elasticsearch.discovery.DiscoverySettings;
+import org.elasticsearch.discovery.zen.fd.FaultDetection;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
 import org.elasticsearch.test.junit.annotations.TestLogging;
 import org.elasticsearch.test.transport.MockTransportService;
@@ -54,8 +55,8 @@ public class TransportIndexFailuresTest extends ElasticsearchIntegrationTest {
 
     private static final Settings nodeSettings = ImmutableSettings.settingsBuilder()
             .put("discovery.type", "zen") // <-- To override the local setting if set externally
-            .put("discovery.zen.fd.ping_timeout", "1s") // <-- for hitting simulated network failures quickly
-            .put("discovery.zen.fd.ping_retries", "1") // <-- for hitting simulated network failures quickly
+            .put(FaultDetection.SETTING_PING_TIMEOUT, "1s") // <-- for hitting simulated network failures quickly
+            .put(FaultDetection.SETTING_PING_RETRIES, "1") // <-- for hitting simulated network failures quickly
             .put(DiscoverySettings.PUBLISH_TIMEOUT, "1s") // <-- for hitting simulated network failures quickly
             .put("discovery.zen.minimum_master_nodes", 1)
             .put(TransportModule.TRANSPORT_SERVICE_TYPE_KEY, MockTransportService.class.getName())

From 34f4ca763c5bb16b4fa9fb0b657b89971003fb74 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Fri, 29 Aug 2014 16:46:53 +0200
Subject: [PATCH 74/74] [Cluster] Refactored ClusterStateUpdateTask protection
 against execution on a non master

The previous implementation used a marker interface and had no explicit failure callback for the case where an update task was run on a non-master node (i.e., the master stepped down after the task was submitted). That led to a couple of instanceof checks.

This approach moves ClusterStateUpdateTask from an interface to an abstract class, which allows adding a flag to indicate whether the task should only run on master nodes (defaults to true). It also adds an explicit onNoLongerMaster callback to allow different error handling for that case. This also removes the need for the NoLongerMasterException.

Closes #7511
---
 .../TransportClusterUpdateSettingsAction.java | 15 ++--
 .../action/bench/BenchmarkService.java        | 42 ++++++-----
 .../cluster/AckedClusterStateUpdateTask.java  |  4 +-
 .../elasticsearch/cluster/ClusterService.java | 11 ---
 .../ClusterStateNonMasterUpdateTask.java      |  7 +-
 .../cluster/ClusterStateUpdateTask.java       | 24 ++++++-
 ...cessedClusterStateNonMasterUpdateTask.java |  9 ++-
 .../ProcessedClusterStateUpdateTask.java      |  4 +-
 .../TimeoutClusterStateUpdateTask.java        |  4 +-
 .../cluster/routing/RoutingService.java       |  7 +-
 .../service/InternalClusterService.java       |  4 +-
 .../discovery/zen/ZenDiscovery.java           | 70 ++++++++++++-------
 .../cluster/ClusterServiceTests.java          |  4 +-
 13 files changed, 124 insertions(+), 81 deletions(-)

diff --git a/src/main/java/org/elasticsearch/action/admin/cluster/settings/TransportClusterUpdateSettingsAction.java b/src/main/java/org/elasticsearch/action/admin/cluster/settings/TransportClusterUpdateSettingsAction.java
index a94b322ceb376..edcf833464073 100644
--- a/src/main/java/org/elasticsearch/action/admin/cluster/settings/TransportClusterUpdateSettingsAction.java
+++ b/src/main/java/org/elasticsearch/action/admin/cluster/settings/TransportClusterUpdateSettingsAction.java
@@ -137,16 +137,17 @@ protected ClusterUpdateSettingsResponse newResponse(boolean acknowledged) {
                         return new ClusterUpdateSettingsResponse(updateSettingsAcked && acknowledged, transientUpdates.build(), persistentUpdates.build());
                     }
 
+                    @Override
+                    public void onNoLongerMaster(String source) {
+                        logger.debug("failed to preform reroute after cluster settings were updated - current node is no longer a master");
+                        listener.onResponse(new ClusterUpdateSettingsResponse(updateSettingsAcked, transientUpdates.build(), persistentUpdates.build()));
+                    }
+
                     @Override
                     public void onFailure(String source, Throwable t) {
                         //if the reroute fails we only log
-                        if (t instanceof ClusterService.NoLongerMasterException) {
-                            logger.debug("failed to preform reroute after cluster settings were updated - current node is no longer a master");
-                            listener.onResponse(new ClusterUpdateSettingsResponse(updateSettingsAcked, transientUpdates.build(), persistentUpdates.build()));
-                        } else {
-                            logger.debug("failed to perform [{}]", t, source);
-                            listener.onFailure(new ElasticsearchException("reroute after update settings failed", t));
-                        }
+                        logger.debug("failed to perform [{}]", t, source);
+                        listener.onFailure(new ElasticsearchException("reroute after update settings failed", t));
                     }
 
                     @Override
diff --git a/src/main/java/org/elasticsearch/action/bench/BenchmarkService.java b/src/main/java/org/elasticsearch/action/bench/BenchmarkService.java
index 0ebfd47593e0a..5868aa12b5a28 100644
--- a/src/main/java/org/elasticsearch/action/bench/BenchmarkService.java
+++ b/src/main/java/org/elasticsearch/action/bench/BenchmarkService.java
@@ -66,11 +66,11 @@ public class BenchmarkService extends AbstractLifecycleComponent<BenchmarkServic
     /**
      * Constructs a service component for running benchmarks
      *
-     * @param settings          Settings
-     * @param clusterService    Cluster service
-     * @param threadPool        Thread pool
-     * @param client            Client
-     * @param transportService  Transport service
+     * @param settings         Settings
+     * @param clusterService   Cluster service
+     * @param threadPool       Thread pool
+     * @param client           Client
+     * @param transportService Transport service
      */
     @Inject
     public BenchmarkService(Settings settings, ClusterService clusterService, ThreadPool threadPool,
@@ -86,19 +86,22 @@ public BenchmarkService(Settings settings, ClusterService clusterService, Thread
     }
 
     @Override
-    protected void doStart() throws ElasticsearchException { }
+    protected void doStart() throws ElasticsearchException {
+    }
 
     @Override
-    protected void doStop() throws ElasticsearchException { }
+    protected void doStop() throws ElasticsearchException {
+    }
 
     @Override
-    protected void doClose() throws ElasticsearchException { }
+    protected void doClose() throws ElasticsearchException {
+    }
 
     /**
      * Lists actively running benchmarks on the cluster
      *
-     * @param request   Status request
-     * @param listener  Response listener
+     * @param request  Status request
+     * @param listener Response listener
      */
     public void listBenchmarks(final BenchmarkStatusRequest request, final ActionListener<BenchmarkStatusResponse> listener) {
 
@@ -171,8 +174,8 @@ public void onFailure(Throwable t) {
     /**
      * Executes benchmarks on the cluster
      *
-     * @param request   Benchmark request
-     * @param listener  Response listener
+     * @param request  Benchmark request
+     * @param listener Response listener
      */
     public void startBenchmark(final BenchmarkRequest request, final ActionListener<BenchmarkResponse> listener) {
 
@@ -228,7 +231,7 @@ public void onFailure(Throwable t) {
                 listener.onFailure(t);
             }
         }, (benchmarkResponse.state() != BenchmarkResponse.State.ABORTED) &&
-           (benchmarkResponse.state() != BenchmarkResponse.State.FAILED)));
+                (benchmarkResponse.state() != BenchmarkResponse.State.FAILED)));
     }
 
     private final boolean isBenchmarkNode(DiscoveryNode node) {
@@ -403,6 +406,7 @@ protected CountDownAsyncHandler(int size) {
         }
 
         public abstract T newInstance();
+
         protected abstract void sendResponse();
 
         @Override
@@ -593,7 +597,7 @@ public ClusterState execute(ClusterState currentState) {
 
             if (bmd != null) {
                 for (BenchmarkMetaData.Entry entry : bmd.entries()) {
-                    if (request.benchmarkName().equals(entry.benchmarkId())){
+                    if (request.benchmarkName().equals(entry.benchmarkId())) {
                         if (entry.state() != BenchmarkMetaData.State.SUCCESS && entry.state() != BenchmarkMetaData.State.FAILED) {
                             throw new ElasticsearchException("A benchmark with ID [" + request.benchmarkName() + "] is already running in state [" + entry.state() + "]");
                         }
@@ -648,7 +652,7 @@ public FinishBenchmarkTask(String reason, String benchmarkId, BenchmarkStateList
         @Override
         protected BenchmarkMetaData.Entry process(BenchmarkMetaData.Entry entry) {
             BenchmarkMetaData.State state = entry.state();
-            assert state == BenchmarkMetaData.State.STARTED || state == BenchmarkMetaData.State.ABORTED :  "Expected state: STARTED or ABORTED but was: " + entry.state();
+            assert state == BenchmarkMetaData.State.STARTED || state == BenchmarkMetaData.State.ABORTED : "Expected state: STARTED or ABORTED but was: " + entry.state();
             if (success) {
                 return new BenchmarkMetaData.Entry(entry, BenchmarkMetaData.State.SUCCESS);
             } else {
@@ -661,7 +665,7 @@ public final class AbortBenchmarkTask extends UpdateBenchmarkStateTask {
         private final String[] patterns;
 
         public AbortBenchmarkTask(String[] patterns, BenchmarkStateListener listener) {
-            super("abort_benchmark", null , listener);
+            super("abort_benchmark", null, listener);
             this.patterns = patterns;
         }
 
@@ -675,7 +679,7 @@ protected BenchmarkMetaData.Entry process(BenchmarkMetaData.Entry entry) {
         }
     }
 
-    public abstract class UpdateBenchmarkStateTask implements ProcessedClusterStateUpdateTask {
+    public abstract class UpdateBenchmarkStateTask extends ProcessedClusterStateUpdateTask {
 
         private final String reason;
         protected final String benchmarkId;
@@ -702,7 +706,7 @@ public ClusterState execute(ClusterState currentState) {
                 ImmutableList.Builder<BenchmarkMetaData.Entry> builder = new ImmutableList.Builder<BenchmarkMetaData.Entry>();
                 for (BenchmarkMetaData.Entry e : bmd.entries()) {
                     if (benchmarkId == null || match(e)) {
-                        e = process(e) ;
+                        e = process(e);
                         instances.add(e);
                     }
                     // Don't keep finished benchmarks around in cluster state
@@ -741,7 +745,7 @@ public String reason() {
         }
     }
 
-    public abstract class BenchmarkStateChangeAction<R extends MasterNodeOperationRequest> implements TimeoutClusterStateUpdateTask {
+    public abstract class BenchmarkStateChangeAction<R extends MasterNodeOperationRequest> extends TimeoutClusterStateUpdateTask {
         protected final R request;
 
         public BenchmarkStateChangeAction(R request) {
diff --git a/src/main/java/org/elasticsearch/cluster/AckedClusterStateUpdateTask.java b/src/main/java/org/elasticsearch/cluster/AckedClusterStateUpdateTask.java
index 7cdee75387382..087bd1c6ad68a 100644
--- a/src/main/java/org/elasticsearch/cluster/AckedClusterStateUpdateTask.java
+++ b/src/main/java/org/elasticsearch/cluster/AckedClusterStateUpdateTask.java
@@ -28,7 +28,7 @@
  * An extension interface to {@link ClusterStateUpdateTask} that allows to be notified when
  * all the nodes have acknowledged a cluster state update request
  */
-public abstract class AckedClusterStateUpdateTask<Response> implements TimeoutClusterStateUpdateTask {
+public abstract class AckedClusterStateUpdateTask<Response> extends TimeoutClusterStateUpdateTask {
 
     private final ActionListener<Response> listener;
     private final AckedRequest request;
@@ -40,6 +40,7 @@ protected AckedClusterStateUpdateTask(AckedRequest request, ActionListener<Respo
 
     /**
      * Called to determine which nodes the acknowledgement is expected from
+     *
      * @param discoveryNode a node
      * @return true if the node is expected to send ack back, false otherwise
      */
@@ -50,6 +51,7 @@ public boolean mustAck(DiscoveryNode discoveryNode) {
     /**
      * Called once all the nodes have acknowledged the cluster state update request. Must be
      * very lightweight execution, since it gets executed on the cluster service thread.
+     *
      * @param t optional error that might have been thrown
      */
     public void onAllNodesAcked(@Nullable Throwable t) {
diff --git a/src/main/java/org/elasticsearch/cluster/ClusterService.java b/src/main/java/org/elasticsearch/cluster/ClusterService.java
index f032a0cd06454..080fce84a36c2 100644
--- a/src/main/java/org/elasticsearch/cluster/ClusterService.java
+++ b/src/main/java/org/elasticsearch/cluster/ClusterService.java
@@ -111,15 +111,4 @@ public interface ClusterService extends LifecycleComponent<ClusterService> {
      */
     List<PendingClusterTask> pendingTasks();
 
-    /**
-     * an exception to indicate a {@link org.elasticsearch.cluster.ClusterStateUpdateTask} was not executed as
-     * the current node is no longer master
-     */
-    public static class NoLongerMasterException extends ElasticsearchIllegalStateException {
-
-        public NoLongerMasterException(String msg) {
-            super(msg);
-        }
-
-    }
 }
diff --git a/src/main/java/org/elasticsearch/cluster/ClusterStateNonMasterUpdateTask.java b/src/main/java/org/elasticsearch/cluster/ClusterStateNonMasterUpdateTask.java
index 2fac718ae2de2..48afbb8f1fe8f 100644
--- a/src/main/java/org/elasticsearch/cluster/ClusterStateNonMasterUpdateTask.java
+++ b/src/main/java/org/elasticsearch/cluster/ClusterStateNonMasterUpdateTask.java
@@ -23,5 +23,10 @@
  * This is a marker interface to indicate that the task should be executed
  * even if the current node is not a master.
  */
-public interface ClusterStateNonMasterUpdateTask extends ClusterStateUpdateTask {
+public abstract class ClusterStateNonMasterUpdateTask extends ClusterStateUpdateTask {
+
+    @Override
+    public boolean runOnlyOnMaster() {
+        return false;
+    }
 }
diff --git a/src/main/java/org/elasticsearch/cluster/ClusterStateUpdateTask.java b/src/main/java/org/elasticsearch/cluster/ClusterStateUpdateTask.java
index 490a556ab1264..921b6d149ee41 100644
--- a/src/main/java/org/elasticsearch/cluster/ClusterStateUpdateTask.java
+++ b/src/main/java/org/elasticsearch/cluster/ClusterStateUpdateTask.java
@@ -19,19 +19,37 @@
 
 package org.elasticsearch.cluster;
 
+import org.elasticsearch.common.Nullable;
+import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
+
 /**
  * A task that can update the cluster state.
  */
-public interface ClusterStateUpdateTask {
+abstract public class ClusterStateUpdateTask {
 
     /**
      * Update the cluster state based on the current state. Return the *same instance* if no state
      * should be changed.
      */
-    ClusterState execute(ClusterState currentState) throws Exception;
+    abstract public ClusterState execute(ClusterState currentState) throws Exception;
 
     /**
      * A callback called when execute fails.
      */
-    void onFailure(String source, Throwable t);
+    abstract public void onFailure(String source, @Nullable Throwable t);
+
+
+    /**
+     * indicates whether this task should only run if current node is master
+     */
+    public boolean runOnlyOnMaster() {
+        return true;
+    }
+
+    /**
+     * called when the task was rejected because the local node is no longer master
+     */
+    public void onNoLongerMaster(String source) {
+        onFailure(source, new EsRejectedExecutionException("no longer master. source: [" + source + "]"));
+    }
 }
diff --git a/src/main/java/org/elasticsearch/cluster/ProcessedClusterStateNonMasterUpdateTask.java b/src/main/java/org/elasticsearch/cluster/ProcessedClusterStateNonMasterUpdateTask.java
index e46a2edc79245..4af05b43581e0 100644
--- a/src/main/java/org/elasticsearch/cluster/ProcessedClusterStateNonMasterUpdateTask.java
+++ b/src/main/java/org/elasticsearch/cluster/ProcessedClusterStateNonMasterUpdateTask.java
@@ -19,8 +19,13 @@
 package org.elasticsearch.cluster;
 
 /**
- * A combination interface between {@link org.elasticsearch.cluster.ProcessedClusterStateUpdateTask} and
+ * A combination between {@link org.elasticsearch.cluster.ProcessedClusterStateUpdateTask} and
  * {@link org.elasticsearch.cluster.ClusterStateNonMasterUpdateTask} to allow easy creation of anonymous classes
  */
-public interface ProcessedClusterStateNonMasterUpdateTask extends ProcessedClusterStateUpdateTask, ClusterStateNonMasterUpdateTask {
+abstract public class ProcessedClusterStateNonMasterUpdateTask extends ProcessedClusterStateUpdateTask {
+
+    @Override
+    public boolean runOnlyOnMaster() {
+        return false;
+    }
 }
diff --git a/src/main/java/org/elasticsearch/cluster/ProcessedClusterStateUpdateTask.java b/src/main/java/org/elasticsearch/cluster/ProcessedClusterStateUpdateTask.java
index 72074965f95ed..2d703ed26212a 100644
--- a/src/main/java/org/elasticsearch/cluster/ProcessedClusterStateUpdateTask.java
+++ b/src/main/java/org/elasticsearch/cluster/ProcessedClusterStateUpdateTask.java
@@ -23,11 +23,11 @@
  * An extension interface to {@link ClusterStateUpdateTask} that allows to be notified when
  * the cluster state update has been processed.
  */
-public interface ProcessedClusterStateUpdateTask extends ClusterStateUpdateTask {
+public abstract class ProcessedClusterStateUpdateTask extends ClusterStateUpdateTask {
 
     /**
      * Called when the result of the {@link #execute(ClusterState)} have been processed
      * properly by all listeners.
      */
-    void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState);
+    public abstract void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState);
 }
diff --git a/src/main/java/org/elasticsearch/cluster/TimeoutClusterStateUpdateTask.java b/src/main/java/org/elasticsearch/cluster/TimeoutClusterStateUpdateTask.java
index 1083e1ddcbeda..1ae767c6560ae 100644
--- a/src/main/java/org/elasticsearch/cluster/TimeoutClusterStateUpdateTask.java
+++ b/src/main/java/org/elasticsearch/cluster/TimeoutClusterStateUpdateTask.java
@@ -25,11 +25,11 @@
  * An extension interface to {@link org.elasticsearch.cluster.ClusterStateUpdateTask} that allows to associate
  * a timeout.
  */
-public interface TimeoutClusterStateUpdateTask extends ProcessedClusterStateUpdateTask {
+abstract public class TimeoutClusterStateUpdateTask extends ProcessedClusterStateUpdateTask {
 
     /**
      * If the cluster state update task wasn't processed by the provided timeout, call
      * {@link #onFailure(String, Throwable)}
      */
-    TimeValue timeout();
+    abstract public TimeValue timeout();
 }
diff --git a/src/main/java/org/elasticsearch/cluster/routing/RoutingService.java b/src/main/java/org/elasticsearch/cluster/routing/RoutingService.java
index 828244494a976..555b8b3ef1b48 100644
--- a/src/main/java/org/elasticsearch/cluster/routing/RoutingService.java
+++ b/src/main/java/org/elasticsearch/cluster/routing/RoutingService.java
@@ -149,12 +149,15 @@ public ClusterState execute(ClusterState currentState) {
                     return ClusterState.builder(currentState).routingResult(routingResult).build();
                 }
 
+                @Override
+                public void onNoLongerMaster(String source) {
+                    // no biggie
+                }
+
                 @Override
                 public void onFailure(String source, Throwable t) {
-                    if (!(t instanceof ClusterService.NoLongerMasterException)) {
                         ClusterState state = clusterService.state();
                         logger.error("unexpected failure during [{}], current state:\n{}", t, source, state.prettyPrint());
-                    }
                 }
             });
             routingTableDirty = false;
diff --git a/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java b/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java
index be4f8d26df79c..c5fe004f8b9b2 100644
--- a/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java
+++ b/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java
@@ -325,9 +325,9 @@ public void run() {
             }
             logger.debug("processing [{}]: execute", source);
             ClusterState previousClusterState = clusterState;
-            if (!previousClusterState.nodes().localNodeMaster() && !(updateTask instanceof ClusterStateNonMasterUpdateTask)) {
+            if (!previousClusterState.nodes().localNodeMaster() && updateTask.runOnlyOnMaster()) {
                 logger.debug("failing [{}]: local node is no longer master", source);
-                updateTask.onFailure(source, new NoLongerMasterException("source: " + source));
+                updateTask.onNoLongerMaster(source);
                 return;
             }
             ClusterState newClusterState;
diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
index 759b011dfb034..ca7aaf4f59efa 100644
--- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -43,6 +43,7 @@
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
+import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
 import org.elasticsearch.discovery.Discovery;
 import org.elasticsearch.discovery.DiscoveryService;
 import org.elasticsearch.discovery.DiscoverySettings;
@@ -476,13 +477,14 @@ public ClusterState execute(ClusterState currentState) {
                     return ClusterState.builder(currentState).routingResult(routingResult).build();
                 }
 
+                @Override
+                public void onNoLongerMaster(String source) {
+                    // ignoring (already logged)
+                }
+
                 @Override
                 public void onFailure(String source, Throwable t) {
-                    if (t instanceof ClusterService.NoLongerMasterException) {
-                        logger.debug("not processing {} leave request as we are no longer master", node);
-                    } else {
-                        logger.error("unexpected failure during [{}]", t, source);
-                    }
+                    logger.error("unexpected failure during [{}]", t, source);
                 }
             });
         } else {
@@ -515,13 +517,14 @@ public ClusterState execute(ClusterState currentState) {
                 return ClusterState.builder(currentState).routingResult(routingResult).build();
             }
 
+            @Override
+            public void onNoLongerMaster(String source) {
+                // already logged
+            }
+
             @Override
             public void onFailure(String source, Throwable t) {
-                if (t instanceof ClusterService.NoLongerMasterException) {
-                    logger.debug("not processing [{}] as we are no longer master", source);
-                } else {
-                    logger.error("unexpected failure during [{}]", t, source);
-                }
+                logger.error("unexpected failure during [{}]", t, source);
             }
 
             @Override
@@ -552,13 +555,15 @@ public ClusterState execute(ClusterState currentState) {
                 return currentState;
             }
 
+
+            @Override
+            public void onNoLongerMaster(String source) {
+                // ignoring (already logged)
+            }
+
             @Override
             public void onFailure(String source, Throwable t) {
-                if (t instanceof ClusterService.NoLongerMasterException) {
-                    logger.debug("not processing [{}] as we are no longer master", source);
-                } else {
-                    logger.error("unexpected failure during [{}]", t, source);
-                }
+                logger.error("unexpected failure during [{}]", t, source);
             }
 
             @Override
@@ -870,17 +875,27 @@ public ClusterState execute(ClusterState currentState) {
                 }
 
                 @Override
-                public void onFailure(String source, Throwable t) {
-                    if (t instanceof ClusterService.NoLongerMasterException) {
-                        logger.debug("not processing [{}] as we are no longer master", source);
-                    } else {
-                        logger.error("unexpected failure during [{}]", t, source);
-                    }
+                public void onNoLongerMaster(String source) {
+                    Exception e = new EsRejectedExecutionException("no longer master. source: [" + source + "]");
+                    innerOnFailure(e);
+                }
+
+                void innerOnFailure(Throwable t) {
                     for (Tuple<DiscoveryNode, MembershipAction.JoinCallback> drainedTask : drainedTasks) {
-                        drainedTask.v2().onFailure(t);
+                        try {
+                            drainedTask.v2().onFailure(t);
+                        } catch (Exception e) {
+                            logger.error("error during task failure", e);
+                        }
                     }
                 }
 
+                @Override
+                public void onFailure(String source, Throwable t) {
+                    logger.error("unexpected failure during [{}]", t, source);
+                    innerOnFailure(t);
+                }
+
                 @Override
                 public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
                     for (Tuple<DiscoveryNode, MembershipAction.JoinCallback> drainedTask : drainedTasks) {
@@ -1157,13 +1172,14 @@ public ClusterState execute(ClusterState currentState) {
                     return rejoin(currentState, "received a request to rejoin the cluster from [" + request.fromNodeId + "]");
                 }
 
+                @Override
+                public void onNoLongerMaster(String source) {
+                    // already logged
+                }
+
                 @Override
                 public void onFailure(String source, Throwable t) {
-                    if (t instanceof ClusterService.NoLongerMasterException) {
-                        logger.debug("not processing [{}] as we are no longer master", source);
-                    } else {
-                        logger.error("unexpected failure during [{}]", t, source);
-                    }
+                    logger.error("unexpected failure during [{}]", t, source);
                 }
             });
         }
diff --git a/src/test/java/org/elasticsearch/cluster/ClusterServiceTests.java b/src/test/java/org/elasticsearch/cluster/ClusterServiceTests.java
index 52f00035c4ab8..1d0a20386159b 100644
--- a/src/test/java/org/elasticsearch/cluster/ClusterServiceTests.java
+++ b/src/test/java/org/elasticsearch/cluster/ClusterServiceTests.java
@@ -708,7 +708,7 @@ public void testPrioritizedTasks() throws Exception {
         }
     }
 
-    private static class BlockingTask implements ClusterStateUpdateTask {
+    private static class BlockingTask extends ClusterStateUpdateTask {
         private final CountDownLatch latch = new CountDownLatch(1);
 
         @Override
@@ -727,7 +727,7 @@ public void release() {
 
     }
 
-    private static class PrioritiezedTask implements ClusterStateUpdateTask {
+    private static class PrioritiezedTask extends ClusterStateUpdateTask {
 
         private final Priority priority;
         private final CountDownLatch latch;