CR: different fix

original-brownbear · original-brownbear · commit f40c6a6bc321 · 2018-12-20T11:01:09.000+01:00
diff --git a/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java b/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java
@@ -244,6 +244,12 @@ Runnable getGlobalCheckpointSyncer() {
      */
     private final RefreshListeners refreshListeners;
 
+    /**
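+     * While {@code true}, the refresh listener limit is reported as 0 so that no new refresh listeners are registered. Set only
+     * for the duration of {@link #relocated(Consumer)} so that relocation does not become blocked on operations waiting for a refresh.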
+     */
+    private final AtomicBoolean preventNewRefreshListeners = new AtomicBoolean(false);
+
     private final AtomicLong lastSearcherAccess = new AtomicLong();
     private final AtomicReference<Translog.Location> pendingRefreshLocation = new AtomicReference<>();
 
@@ -608,42 +614,44 @@ public IndexShardState markAsRecovering(String reason, RecoveryState recoverySta
     public void relocated(final Consumer<ReplicationTracker.PrimaryContext> consumer)
                                             throws IllegalIndexShardStateException, InterruptedException {
         assert shardRouting.primary() : "only primaries can be marked as relocated: " + shardRouting;
+        preventNewRefreshListeners.set(true);
         try {
-            indexShardOperationPermits.blockOperations(30, TimeUnit.MINUTES,
-                () -> refresh("relocation requested"),
-                () -> {
-                    // no shard operation permits are being held here, move state from started to relocated
-                    assert indexShardOperationPermits.getActiveOperationsCount() == 0 :
+            if (refreshListeners.refreshNeeded()) {
+                refresh("relocated");
+            }
+            indexShardOperationPermits.blockOperations(30, TimeUnit.MINUTES, () -> {
+                // no shard operation permits are being held here, move state from started to relocated
+                assert indexShardOperationPermits.getActiveOperationsCount() == 0 :
                         "in-flight operations in progress while moving shard state to relocated";
-                    /*
-                     * We should not invoke the runnable under the mutex as the expected implementation is to handoff the primary context
-                     * via a network operation. Doing this under the mutex can implicitly block the cluster state update thread
-                     * on network operations.
-                     */
-                    verifyRelocatingState();
-                    final ReplicationTracker.PrimaryContext primaryContext = replicationTracker.startRelocationHandoff();
+                /*
+                 * We should not invoke the consumer under the mutex as the expected implementation is to hand off the primary context via a
+                 * network operation. Doing this under the mutex can implicitly block the cluster state update thread on network operations.
+                 */
+                verifyRelocatingState();
+                final ReplicationTracker.PrimaryContext primaryContext = replicationTracker.startRelocationHandoff();
+                try {
+                    consumer.accept(primaryContext);
+                    synchronized (mutex) {
+                        verifyRelocatingState();
+                        replicationTracker.completeRelocationHandoff(); // make changes to primaryMode and relocated flag only under mutex
+                    }
+                } catch (final Exception e) {
                     try {
-                        consumer.accept(primaryContext);
-                        synchronized (mutex) {
-                            verifyRelocatingState();
-                            // make changes to primaryMode and relocated flag only under mutex
-                            replicationTracker.completeRelocationHandoff();
-                        }
-                    } catch (final Exception e) {
-                        try {
-                            replicationTracker.abortRelocationHandoff();
-                        } catch (final Exception inner) {
-                            e.addSuppressed(inner);
-                        }
-                        throw e;
+                        replicationTracker.abortRelocationHandoff();
+                    } catch (final Exception inner) {
+                        e.addSuppressed(inner);
                     }
-                });
+                    throw e;
+                }
+            });
         } catch (TimeoutException e) {
             logger.warn("timed out waiting for relocation hand-off to complete");
             // This is really bad as ongoing replication operations are preventing this shard from completing relocation hand-off.
             // Fail primary relocation source and target shards.
             failShard("timed out waiting for relocation hand-off to complete", null);
             throw new IndexShardClosedException(shardId(), "timed out waiting for relocation hand-off to complete");
+        } finally {
+            preventNewRefreshListeners.set(false);
         }
     }
 
@@ -2667,7 +2675,7 @@ public void onAfter() {
      */
     private RefreshListeners buildRefreshListeners() {
         return new RefreshListeners(
-            indexSettings::getMaxRefreshListeners,
+            () -> preventNewRefreshListeners.get() ? 0 : indexSettings.getMaxRefreshListeners(),
             () -> refresh("too_many_listeners"),
             threadPool.executor(ThreadPool.Names.LISTENER)::execute,
             logger, threadPool.getThreadContext());
diff --git a/server/src/main/java/org/elasticsearch/index/shard/IndexShardOperationPermits.java b/server/src/main/java/org/elasticsearch/index/shard/IndexShardOperationPermits.java
@@ -91,28 +91,21 @@ public void close() {
      * Wait for in-flight operations to finish and executes {@code onBlocked} under the guarantee that no new operations are started. Queues
      * operations that are occurring in the meanwhile and runs them once {@code onBlocked} has executed.
      *
-     * @param timeout            the maximum time to wait for the in-flight operations block
-     * @param timeUnit           the time unit of the {@code timeout} argument
-     * @param onActiveOperations the action to run before trying to acquire the block if there are active operations
-     * @param onBlocked          the action to run once the block has been acquired
-     * @param <E>                the type of checked exception thrown by {@code onBlocked}
+     * @param timeout   the maximum time to wait for the in-flight operations block
+     * @param timeUnit  the time unit of the {@code timeout} argument
+     * @param onBlocked the action to run once the block has been acquired
+     * @param <E>       the type of checked exception thrown by {@code onBlocked}
      * @throws InterruptedException      if calling thread is interrupted
      * @throws TimeoutException          if timed out waiting for in-flight operations to finish
      * @throws IndexShardClosedException if operation permit has been closed
      */
     <E extends Exception> void blockOperations(
             final long timeout,
             final TimeUnit timeUnit,
-            final CheckedRunnable<E> onActiveOperations,
             final CheckedRunnable<E> onBlocked) throws InterruptedException, TimeoutException, E {
         delayOperations();
-        try {
-            if (getActiveOperationsCount() > 0) {
-                onActiveOperations.run();
-            }
-            try (Releasable ignored = acquireAll(timeout, timeUnit)) {
-                onBlocked.run();
-            }
+        try (Releasable ignored = acquireAll(timeout, timeUnit)) {
+            onBlocked.run();
         } finally {
             releaseDelayedOperations();
         }
@@ -218,7 +211,7 @@ private void releaseDelayedOperations() {
     /**
      * Acquires a permit whenever permit acquisition is not blocked. If the permit is directly available, the provided
      * {@link ActionListener} will be called on the calling thread. During calls of
-     * {@link #blockOperations(long, TimeUnit, CheckedRunnable, CheckedRunnable)}, permit acquisition can be delayed.
+     * {@link #blockOperations(long, TimeUnit, CheckedRunnable)}, permit acquisition can be delayed.
      * The {@link ActionListener#onResponse(Object)} method will then be called using the provided executor once operations are no
      * longer blocked. Note that the executor will not be used for {@link ActionListener#onFailure(Exception)} calls. Those will run
      * directly on the calling thread, which in case of delays, will be a generic thread. Callers should thus make sure
diff --git a/server/src/test/java/org/elasticsearch/index/shard/IndexShardOperationPermitsTests.java b/server/src/test/java/org/elasticsearch/index/shard/IndexShardOperationPermitsTests.java
@@ -199,7 +199,7 @@ public void testOperationsIfClosed() {
     public void testBlockIfClosed() {
         permits.close();
         expectThrows(IndexShardClosedException.class, () -> permits.blockOperations(randomInt(10), TimeUnit.MINUTES,
-            () -> {}, () -> { throw new IllegalArgumentException("fake error"); }));
+            () -> { throw new IllegalArgumentException("fake error"); }));
         expectThrows(IndexShardClosedException.class,
             () -> permits.asyncBlockOperations(wrap(() -> { throw new IllegalArgumentException("fake error");}),
                 randomInt(10), TimeUnit.MINUTES));
@@ -296,7 +296,7 @@ private Releasable blockAndWait() throws InterruptedException {
         IndexShardClosedException exception = new IndexShardClosedException(new ShardId("blubb", "id", 0));
         threadPool.generic().execute(() -> {
                 try {
-                    permits.blockOperations(1, TimeUnit.MINUTES, () -> {}, () -> {
+                    permits.blockOperations(1, TimeUnit.MINUTES, () -> {
                         try {
                             blockAcquired.countDown();
                             releaseBlock.await();
@@ -572,7 +572,7 @@ public void testTimeout() throws BrokenBarrierException, InterruptedException {
 
         {
             final TimeoutException e =
-                    expectThrows(TimeoutException.class, () -> permits.blockOperations(1, TimeUnit.MILLISECONDS, () -> {}, () -> {}));
+                    expectThrows(TimeoutException.class, () -> permits.blockOperations(1, TimeUnit.MILLISECONDS, () -> {}));
             assertThat(e, hasToString(containsString("timeout while blocking operations")));
         }