Skip to content

Commit 146f1d0

Browse files
RELOCATION: Fix Indef. Block when Wait on Refresh
* Fixes the issue reproduced in the added tests: * When there are open index requests on a shard that are waiting for a refresh, relocating that shard becomes blocked until that refresh happens (which could be never, as in the test scenario). * Fixed by: * Before trying to acquire all permits for relocation, refresh if there are outstanding operations
1 parent 9c2980a commit 146f1d0

File tree

4 files changed

+138
-33
lines changed

4 files changed

+138
-33
lines changed

server/src/main/java/org/elasticsearch/index/shard/IndexShard.java

Lines changed: 26 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -609,31 +609,35 @@ public void relocated(final Consumer<ReplicationTracker.PrimaryContext> consumer
609609
throws IllegalIndexShardStateException, InterruptedException {
610610
assert shardRouting.primary() : "only primaries can be marked as relocated: " + shardRouting;
611611
try {
612-
indexShardOperationPermits.blockOperations(30, TimeUnit.MINUTES, () -> {
613-
// no shard operation permits are being held here, move state from started to relocated
614-
assert indexShardOperationPermits.getActiveOperationsCount() == 0 :
612+
indexShardOperationPermits.blockOperations(30, TimeUnit.MINUTES,
613+
() -> refresh("relocation requested"),
614+
() -> {
615+
// no shard operation permits are being held here, move state from started to relocated
616+
assert indexShardOperationPermits.getActiveOperationsCount() == 0 :
615617
"in-flight operations in progress while moving shard state to relocated";
616-
/*
617-
* We should not invoke the runnable under the mutex as the expected implementation is to handoff the primary context via a
618-
* network operation. Doing this under the mutex can implicitly block the cluster state update thread on network operations.
619-
*/
620-
verifyRelocatingState();
621-
final ReplicationTracker.PrimaryContext primaryContext = replicationTracker.startRelocationHandoff();
622-
try {
623-
consumer.accept(primaryContext);
624-
synchronized (mutex) {
625-
verifyRelocatingState();
626-
replicationTracker.completeRelocationHandoff(); // make changes to primaryMode and relocated flag only under mutex
627-
}
628-
} catch (final Exception e) {
618+
/*
619+
* We should not invoke the runnable under the mutex as the expected implementation is to handoff the primary context
620+
* via a network operation. Doing this under the mutex can implicitly block the cluster state update thread
621+
* on network operations.
622+
*/
623+
verifyRelocatingState();
624+
final ReplicationTracker.PrimaryContext primaryContext = replicationTracker.startRelocationHandoff();
629625
try {
630-
replicationTracker.abortRelocationHandoff();
631-
} catch (final Exception inner) {
632-
e.addSuppressed(inner);
626+
consumer.accept(primaryContext);
627+
synchronized (mutex) {
628+
verifyRelocatingState();
629+
// make changes to primaryMode and relocated flag only under mutex
630+
replicationTracker.completeRelocationHandoff();
631+
}
632+
} catch (final Exception e) {
633+
try {
634+
replicationTracker.abortRelocationHandoff();
635+
} catch (final Exception inner) {
636+
e.addSuppressed(inner);
637+
}
638+
throw e;
633639
}
634-
throw e;
635-
}
636-
});
640+
});
637641
} catch (TimeoutException e) {
638642
logger.warn("timed out waiting for relocation hand-off to complete");
639643
// This is really bad as ongoing replication operations are preventing this shard from completing relocation hand-off.

server/src/main/java/org/elasticsearch/index/shard/IndexShardOperationPermits.java

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -91,21 +91,28 @@ public void close() {
9191
* Wait for in-flight operations to finish and executes {@code onBlocked} under the guarantee that no new operations are started. Queues
9292
* operations that are occurring in the meanwhile and runs them once {@code onBlocked} has executed.
9393
*
94-
* @param timeout the maximum time to wait for the in-flight operations block
95-
* @param timeUnit the time unit of the {@code timeout} argument
96-
* @param onBlocked the action to run once the block has been acquired
97-
* @param <E> the type of checked exception thrown by {@code onBlocked}
94+
* @param timeout the maximum time to wait for the in-flight operations block
95+
* @param timeUnit the time unit of the {@code timeout} argument
96+
* @param onActiveOperations the action to run before trying to acquire the block if there are active operations
97+
* @param onBlocked the action to run once the block has been acquired
98+
* @param <E> the type of checked exception thrown by {@code onBlocked}
9899
* @throws InterruptedException if calling thread is interrupted
99100
* @throws TimeoutException if timed out waiting for in-flight operations to finish
100101
* @throws IndexShardClosedException if operation permit has been closed
101102
*/
102103
<E extends Exception> void blockOperations(
103104
final long timeout,
104105
final TimeUnit timeUnit,
106+
final CheckedRunnable<E> onActiveOperations,
105107
final CheckedRunnable<E> onBlocked) throws InterruptedException, TimeoutException, E {
106108
delayOperations();
107-
try (Releasable ignored = acquireAll(timeout, timeUnit)) {
108-
onBlocked.run();
109+
try {
110+
if (getActiveOperationsCount() > 0) {
111+
onActiveOperations.run();
112+
}
113+
try (Releasable ignored = acquireAll(timeout, timeUnit)) {
114+
onBlocked.run();
115+
}
109116
} finally {
110117
releaseDelayedOperations();
111118
}
@@ -211,7 +218,7 @@ private void releaseDelayedOperations() {
211218
/**
212219
* Acquires a permit whenever permit acquisition is not blocked. If the permit is directly available, the provided
213220
* {@link ActionListener} will be called on the calling thread. During calls of
214-
* {@link #blockOperations(long, TimeUnit, CheckedRunnable)}, permit acquisition can be delayed.
221+
* {@link #blockOperations(long, TimeUnit, CheckedRunnable, CheckedRunnable)}, permit acquisition can be delayed.
215222
* The {@link ActionListener#onResponse(Object)} method will then be called using the provided executor once operations are no
216223
* longer blocked. Note that the executor will not be used for {@link ActionListener#onFailure(Exception)} calls. Those will run
217224
* directly on the calling thread, which in case of delays, will be a generic thread. Callers should thus make sure
@@ -295,7 +302,7 @@ private Releasable acquire(Object debugInfo, StackTraceElement[] stackTrace) thr
295302
/**
296303
* Obtain the active operation count, or zero if all permits are held (even if there are outstanding operations in flight).
297304
*
298-
* @return the active operation count, or zero when all permits ar eheld
305+
* @return the active operation count, or zero when all permits are held
299306
*/
300307
int getActiveOperationsCount() {
301308
int availablePermits = semaphore.availablePermits();

server/src/test/java/org/elasticsearch/index/shard/IndexShardOperationPermitsTests.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ public void testOperationsIfClosed() {
199199
public void testBlockIfClosed() {
200200
permits.close();
201201
expectThrows(IndexShardClosedException.class, () -> permits.blockOperations(randomInt(10), TimeUnit.MINUTES,
202-
() -> { throw new IllegalArgumentException("fake error"); }));
202+
() -> {}, () -> { throw new IllegalArgumentException("fake error"); }));
203203
expectThrows(IndexShardClosedException.class,
204204
() -> permits.asyncBlockOperations(wrap(() -> { throw new IllegalArgumentException("fake error");}),
205205
randomInt(10), TimeUnit.MINUTES));
@@ -296,7 +296,7 @@ private Releasable blockAndWait() throws InterruptedException {
296296
IndexShardClosedException exception = new IndexShardClosedException(new ShardId("blubb", "id", 0));
297297
threadPool.generic().execute(() -> {
298298
try {
299-
permits.blockOperations(1, TimeUnit.MINUTES, () -> {
299+
permits.blockOperations(1, TimeUnit.MINUTES, () -> {}, () -> {
300300
try {
301301
blockAcquired.countDown();
302302
releaseBlock.await();
@@ -572,7 +572,7 @@ public void testTimeout() throws BrokenBarrierException, InterruptedException {
572572

573573
{
574574
final TimeoutException e =
575-
expectThrows(TimeoutException.class, () -> permits.blockOperations(1, TimeUnit.MILLISECONDS, () -> {}));
575+
expectThrows(TimeoutException.class, () -> permits.blockOperations(1, TimeUnit.MILLISECONDS, () -> {}, () -> {}));
576576
assertThat(e, hasToString(containsString("timeout while blocking operations")));
577577
}
578578

server/src/test/java/org/elasticsearch/recovery/RelocationIT.java

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,12 @@
2323
import com.carrotsearch.hppc.procedures.IntProcedure;
2424
import org.apache.lucene.index.IndexFileNames;
2525
import org.apache.lucene.util.English;
26+
import org.elasticsearch.action.ActionFuture;
2627
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
28+
import org.elasticsearch.action.admin.cluster.reroute.ClusterRerouteResponse;
2729
import org.elasticsearch.action.index.IndexRequestBuilder;
2830
import org.elasticsearch.action.search.SearchResponse;
31+
import org.elasticsearch.action.support.WriteRequest;
2932
import org.elasticsearch.client.Client;
3033
import org.elasticsearch.cluster.ClusterState;
3134
import org.elasticsearch.cluster.metadata.IndexMetaData;
@@ -506,6 +509,97 @@ public void testIndexAndRelocateConcurrently() throws ExecutionException, Interr
506509

507510
}
508511

512+
public void testRelocateWhileWaitingForRefresh() {
513+
logger.info("--> starting [node1] ...");
514+
final String node1 = internalCluster().startNode();
515+
516+
logger.info("--> creating test index ...");
517+
prepareCreate("test", Settings.builder()
518+
.put("index.number_of_shards", 1)
519+
.put("index.number_of_replicas", 0)
520+
.put("index.refresh_interval", -1) // we want to control refreshes
521+
).get();
522+
523+
logger.info("--> index 10 docs");
524+
for (int i = 0; i < 10; i++) {
525+
client().prepareIndex("test", "type", Integer.toString(i)).setSource("field", "value" + i).execute().actionGet();
526+
}
527+
logger.info("--> flush so we have an actual index");
528+
client().admin().indices().prepareFlush().execute().actionGet();
529+
logger.info("--> index more docs so we have something in the translog");
530+
for (int i = 10; i < 20; i++) {
531+
client().prepareIndex("test", "type", Integer.toString(i)).setRefreshPolicy(WriteRequest.RefreshPolicy.WAIT_UNTIL)
532+
.setSource("field", "value" + i).execute();
533+
}
534+
535+
logger.info("--> start another node");
536+
final String node2 = internalCluster().startNode();
537+
ClusterHealthResponse clusterHealthResponse = client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID)
538+
.setWaitForNodes("2").execute().actionGet();
539+
assertThat(clusterHealthResponse.isTimedOut(), equalTo(false));
540+
541+
logger.info("--> relocate the shard from node1 to node2");
542+
client().admin().cluster().prepareReroute()
543+
.add(new MoveAllocationCommand("test", 0, node1, node2))
544+
.execute().actionGet();
545+
546+
clusterHealthResponse = client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID)
547+
.setWaitForNoRelocatingShards(true).setTimeout(ACCEPTABLE_RELOCATION_TIME).execute().actionGet();
548+
assertThat(clusterHealthResponse.isTimedOut(), equalTo(false));
549+
550+
logger.info("--> verifying count");
551+
client().admin().indices().prepareRefresh().execute().actionGet();
552+
assertThat(client().prepareSearch("test").setSize(0).execute().actionGet().getHits().getTotalHits().value, equalTo(20L));
553+
}
554+
555+
public void testRelocateWhileContinuouslyIndexingAndWaitingForRefresh() {
556+
logger.info("--> starting [node1] ...");
557+
final String node1 = internalCluster().startNode();
558+
559+
logger.info("--> creating test index ...");
560+
prepareCreate("test", Settings.builder()
561+
.put("index.number_of_shards", 1)
562+
.put("index.number_of_replicas", 0)
563+
.put("index.refresh_interval", -1) // we want to control refreshes
564+
).get();
565+
566+
logger.info("--> index 10 docs");
567+
for (int i = 0; i < 10; i++) {
568+
client().prepareIndex("test", "type", Integer.toString(i)).setSource("field", "value" + i).execute().actionGet();
569+
}
570+
logger.info("--> flush so we have an actual index");
571+
client().admin().indices().prepareFlush().execute().actionGet();
572+
logger.info("--> index more docs so we have something in the translog");
573+
for (int i = 10; i < 20; i++) {
574+
client().prepareIndex("test", "type", Integer.toString(i)).setRefreshPolicy(WriteRequest.RefreshPolicy.WAIT_UNTIL)
575+
.setSource("field", "value" + i).execute();
576+
}
577+
578+
logger.info("--> start another node");
579+
final String node2 = internalCluster().startNode();
580+
ClusterHealthResponse clusterHealthResponse = client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID)
581+
.setWaitForNodes("2").execute().actionGet();
582+
assertThat(clusterHealthResponse.isTimedOut(), equalTo(false));
583+
584+
logger.info("--> relocate the shard from node1 to node2");
585+
ActionFuture<ClusterRerouteResponse> relocationListener = client().admin().cluster().prepareReroute()
586+
.add(new MoveAllocationCommand("test", 0, node1, node2))
587+
.execute();
588+
logger.info("--> index 100 docs while relocating");
589+
for (int i = 20; i < 120; i++) {
590+
client().prepareIndex("test", "type", Integer.toString(i)).setRefreshPolicy(WriteRequest.RefreshPolicy.WAIT_UNTIL)
591+
.setSource("field", "value" + i).execute();
592+
}
593+
relocationListener.actionGet();
594+
clusterHealthResponse = client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID)
595+
.setWaitForNoRelocatingShards(true).setTimeout(ACCEPTABLE_RELOCATION_TIME).execute().actionGet();
596+
assertThat(clusterHealthResponse.isTimedOut(), equalTo(false));
597+
598+
logger.info("--> verifying count");
599+
client().admin().indices().prepareRefresh().execute().actionGet();
600+
assertThat(client().prepareSearch("test").setSize(0).execute().actionGet().getHits().getTotalHits().value, equalTo(120L));
601+
}
602+
509603
class RecoveryCorruption implements StubbableTransport.SendRequestBehavior {
510604

511605
private final CountDownLatch corruptionCount;

0 commit comments

Comments
 (0)