Do not wait for advancement of checkpoint in recovery (#39006)

dnhatn · ywelsch · dnhatn · commit cb9049109051 · 2019-02-25T13:06:26.000-05:00
With this change, we won't wait for the local checkpoint to advance to the max_seq_no before starting phase2 of peer-recovery. We also remove the sequence number range check in peer-recovery. We can safely do these thanks to Yannick's finding. The replication group to be used is currently sampled after indexing into the primary (see `ReplicationOperation` class). This means that when initiating tracking of a new replica, we have to consider the following two cases: - There are operations for which the replication group has not been sampled yet. As we initiated the new replica as tracking, we know that those operations will be replicated to the new replica and follow the typical replication group semantics (e.g. marked as stale when unavailable). - There are operations for which the replication group has already been sampled. These operations will not be sent to the new replica. However, we know that those operations are already indexed into Lucene and the translog on the primary, as the sampling is happening after that. This means that by taking a snapshot of Lucene or the translog, we will be getting those ops as well. What we cannot guarantee anymore is that all ops up to `endingSeqNo` are available in the snapshot (i.e. also see comment in `RecoverySourceHandler` saying `We need to wait for all operations up to the current max to complete, otherwise we can not guarantee that all operations in the required range will be available for replaying from the translog of the source.`). This is not needed, though, as we can no longer guarantee that max seq no == local checkpoint. Relates #39000 Closes #38949 Co-authored-by: Yannick Welsch <yannick@welsch.lu>
diff --git a/server/src/main/java/org/elasticsearch/index/engine/Engine.java b/server/src/main/java/org/elasticsearch/index/engine/Engine.java
@@ -808,14 +808,6 @@ public final CommitStats commitStats() {
      */
     public abstract long getLocalCheckpoint();
 
-    /**
-     * Waits for all operations up to the provided sequence number to complete.
-     *
-     * @param seqNo the sequence number that the checkpoint must advance to before this method returns
-     * @throws InterruptedException if the thread was interrupted while blocking on the condition
-     */
-    public abstract void waitForOpsToComplete(long seqNo) throws InterruptedException;
-
     /**
      * @return a {@link SeqNoStats} object, using local state and the supplied global checkpoint
      */
diff --git a/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java b/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java
@@ -2510,11 +2510,6 @@ public long getLocalCheckpoint() {
         return localCheckpointTracker.getCheckpoint();
     }
 
-    @Override
-    public void waitForOpsToComplete(long seqNo) throws InterruptedException {
-        localCheckpointTracker.waitForOpsToComplete(seqNo);
-    }
-
     /**
      * Marks the given seq_no as seen and advances the max_seq_no of this engine to at least that value.
      */
diff --git a/server/src/main/java/org/elasticsearch/index/engine/ReadOnlyEngine.java b/server/src/main/java/org/elasticsearch/index/engine/ReadOnlyEngine.java
@@ -321,10 +321,6 @@ public long getLocalCheckpoint() {
         return seqNoStats.getLocalCheckpoint();
     }
 
-    @Override
-    public void waitForOpsToComplete(long seqNo) {
-    }
-
     @Override
     public SeqNoStats getSeqNoStats(long globalCheckpoint) {
         return new SeqNoStats(seqNoStats.getMaxSeqNo(), seqNoStats.getLocalCheckpoint(), globalCheckpoint);
diff --git a/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java b/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java
@@ -2077,16 +2077,6 @@ public void syncRetentionLeases() {
         }
     }
 
-    /**
-     * Waits for all operations up to the provided sequence number to complete.
-     *
-     * @param seqNo the sequence number that the checkpoint must advance to before this method returns
-     * @throws InterruptedException if the thread was interrupted while blocking on the condition
-     */
-    public void waitForOpsToComplete(final long seqNo) throws InterruptedException {
-        getEngine().waitForOpsToComplete(seqNo);
-    }
-
     /**
      * Called when the recovery process for a shard has opened the engine on the target shard. Ensures that the right data structures
      * have been set up locally to track local checkpoint information for the shard and that the shard is added to the replication group.
diff --git a/server/src/main/java/org/elasticsearch/indices/recovery/RecoverySourceHandler.java b/server/src/main/java/org/elasticsearch/indices/recovery/RecoverySourceHandler.java
@@ -158,14 +158,12 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
             final Closeable retentionLock = shard.acquireRetentionLock();
             resources.add(retentionLock);
             final long startingSeqNo;
-            final long requiredSeqNoRangeStart;
             final boolean isSequenceNumberBasedRecovery = request.startingSeqNo() != SequenceNumbers.UNASSIGNED_SEQ_NO &&
                 isTargetSameHistory() && shard.hasCompleteHistoryOperations("peer-recovery", request.startingSeqNo());
             final SendFileResult sendFileResult;
             if (isSequenceNumberBasedRecovery) {
                 logger.trace("performing sequence numbers based recovery. starting at [{}]", request.startingSeqNo());
                 startingSeqNo = request.startingSeqNo();
-                requiredSeqNoRangeStart = startingSeqNo;
                 sendFileResult = SendFileResult.EMPTY;
             } else {
                 final Engine.IndexCommitRef phase1Snapshot;
@@ -174,13 +172,15 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
                 } catch (final Exception e) {
                     throw new RecoveryEngineException(shard.shardId(), 1, "snapshot failed", e);
                 }
-                // We must have everything above the local checkpoint in the commit
-                requiredSeqNoRangeStart =
-                    Long.parseLong(phase1Snapshot.getIndexCommit().getUserData().get(SequenceNumbers.LOCAL_CHECKPOINT_KEY)) + 1;
                 // If soft-deletes enabled, we need to transfer only operations after the local_checkpoint of the commit to have
                 // the same history on the target. However, with translog, we need to set this to 0 to create a translog roughly
                 // according to the retention policy on the target. Note that it will still filter out legacy operations without seqNo.
-                startingSeqNo = shard.indexSettings().isSoftDeleteEnabled() ? requiredSeqNoRangeStart : 0;
+                if (shard.indexSettings().isSoftDeleteEnabled()) {
+                    startingSeqNo = Long.parseLong(
+                        phase1Snapshot.getIndexCommit().getUserData().get(SequenceNumbers.LOCAL_CHECKPOINT_KEY)) + 1;
+                } else {
+                    startingSeqNo = 0;
+                }
                 try {
                     final int estimateNumOps = shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo);
                     sendFileResult = phase1(phase1Snapshot.getIndexCommit(), () -> estimateNumOps);
@@ -195,8 +195,6 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
                 }
             }
             assert startingSeqNo >= 0 : "startingSeqNo must be non negative. got: " + startingSeqNo;
-            assert requiredSeqNoRangeStart >= startingSeqNo : "requiredSeqNoRangeStart [" + requiredSeqNoRangeStart + "] is lower than ["
-                + startingSeqNo + "]";
 
             final StepListener<TimeValue> prepareEngineStep = new StepListener<>();
             // For a sequence based recovery, the target can keep its local translog
@@ -214,13 +212,7 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
                     shardId + " initiating tracking of " + request.targetAllocationId(), shard, cancellableThreads, logger);
 
                 final long endingSeqNo = shard.seqNoStats().getMaxSeqNo();
-                /*
-                 * We need to wait for all operations up to the current max to complete, otherwise we can not guarantee that all
-                 * operations in the required range will be available for replaying from the translog of the source.
-                 */
-                cancellableThreads.execute(() -> shard.waitForOpsToComplete(endingSeqNo));
                 if (logger.isTraceEnabled()) {
-                    logger.trace("all operations up to [{}] completed, which will be used as an ending sequence number", endingSeqNo);
                     logger.trace("snapshot translog for recovery; current size is [{}]",
                         shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo));
                 }
@@ -233,15 +225,8 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
                 final long maxSeenAutoIdTimestamp = shard.getMaxSeenAutoIdTimestamp();
                 final long maxSeqNoOfUpdatesOrDeletes = shard.getMaxSeqNoOfUpdatesOrDeletes();
                 final RetentionLeases retentionLeases = shard.getRetentionLeases();
-                phase2(
-                        startingSeqNo,
-                        requiredSeqNoRangeStart,
-                        endingSeqNo,
-                        phase2Snapshot,
-                        maxSeenAutoIdTimestamp,
-                        maxSeqNoOfUpdatesOrDeletes,
-                        retentionLeases,
-                        sendSnapshotStep);
+                phase2(startingSeqNo, endingSeqNo, phase2Snapshot, maxSeenAutoIdTimestamp, maxSeqNoOfUpdatesOrDeletes,
+                    retentionLeases, sendSnapshotStep);
                 sendSnapshotStep.whenComplete(
                     r -> IOUtils.close(phase2Snapshot),
                     e -> {
@@ -519,7 +504,6 @@ void prepareTargetForTranslog(boolean fileBasedRecovery, int totalTranslogOps, A
      *
      * @param startingSeqNo              the sequence number to start recovery from, or {@link SequenceNumbers#UNASSIGNED_SEQ_NO} if all
      *                                   ops should be sent
-     * @param requiredSeqNoRangeStart    the lower sequence number of the required range (ending with endingSeqNo)
      * @param endingSeqNo                the highest sequence number that should be sent
      * @param snapshot                   a snapshot of the translog
      * @param maxSeenAutoIdTimestamp     the max auto_id_timestamp of append-only requests on the primary
@@ -528,26 +512,19 @@ void prepareTargetForTranslog(boolean fileBasedRecovery, int totalTranslogOps, A
      */
     void phase2(
             final long startingSeqNo,
-            final long requiredSeqNoRangeStart,
             final long endingSeqNo,
             final Translog.Snapshot snapshot,
             final long maxSeenAutoIdTimestamp,
             final long maxSeqNoOfUpdatesOrDeletes,
             final RetentionLeases retentionLeases,
             final ActionListener<SendSnapshotResult> listener) throws IOException {
-        assert requiredSeqNoRangeStart <= endingSeqNo + 1:
-            "requiredSeqNoRangeStart " + requiredSeqNoRangeStart + " is larger than endingSeqNo " + endingSeqNo;
-        assert startingSeqNo <= requiredSeqNoRangeStart :
-            "startingSeqNo " + startingSeqNo + " is larger than requiredSeqNoRangeStart " + requiredSeqNoRangeStart;
         if (shard.state() == IndexShardState.CLOSED) {
             throw new IndexShardClosedException(request.shardId());
         }
-        logger.trace("recovery [phase2]: sending transaction log operations (seq# from [" +  startingSeqNo  + "], " +
-            "required [" + requiredSeqNoRangeStart + ":" + endingSeqNo + "]");
+        logger.trace("recovery [phase2]: sending transaction log operations (from [" + startingSeqNo + "] to [" + endingSeqNo + "]");
 
         final AtomicInteger skippedOps = new AtomicInteger();
         final AtomicInteger totalSentOps = new AtomicInteger();
-        final LocalCheckpointTracker requiredOpsTracker = new LocalCheckpointTracker(endingSeqNo, requiredSeqNoRangeStart - 1);
         final AtomicInteger lastBatchCount = new AtomicInteger(); // used to estimate the count of the subsequent batch.
         final CheckedSupplier<List<Translog.Operation>, IOException> readNextBatch = () -> {
             // We need to synchronized Snapshot#next() because it's called by different threads through sendBatch.
@@ -569,7 +546,6 @@ void phase2(
                     ops.add(operation);
                     batchSizeInBytes += operation.estimateSize();
                     totalSentOps.incrementAndGet();
-                    requiredOpsTracker.markSeqNoAsCompleted(seqNo);
 
                     // check if this request is past bytes threshold, and if so, send it off
                     if (batchSizeInBytes >= chunkSizeInBytes) {
@@ -587,11 +563,6 @@ void phase2(
                 assert snapshot.totalOperations() == snapshot.skippedOperations() + skippedOps.get() + totalSentOps.get()
                     : String.format(Locale.ROOT, "expected total [%d], overridden [%d], skipped [%d], total sent [%d]",
                     snapshot.totalOperations(), snapshot.skippedOperations(), skippedOps.get(), totalSentOps.get());
-                if (requiredOpsTracker.getCheckpoint() < endingSeqNo) {
-                    throw new IllegalStateException("translog replay failed to cover required sequence numbers" +
-                        " (required range [" + requiredSeqNoRangeStart + ":" + endingSeqNo + "). first missing op is ["
-                        + (requiredOpsTracker.getCheckpoint() + 1) + "]");
-                }
                 stopWatch.stop();
                 final TimeValue tookTime = stopWatch.totalTime();
                 logger.trace("recovery [phase2]: took [{}]", tookTime);
diff --git a/server/src/test/java/org/elasticsearch/indices/recovery/RecoverySourceHandlerTests.java b/server/src/test/java/org/elasticsearch/indices/recovery/RecoverySourceHandlerTests.java
@@ -97,7 +97,6 @@
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.IntSupplier;
 import java.util.function.Supplier;
-import java.util.stream.Collectors;
 import java.util.zip.CRC32;
 
 import static java.util.Collections.emptyMap;
@@ -231,8 +230,7 @@ public void testSendSnapshotSendsOps() throws IOException {
             operations.add(new Translog.Index(index, new Engine.IndexResult(1, 1, i - initialNumberOfDocs, true)));
         }
         final long startingSeqNo = randomIntBetween(0, numberOfDocsWithValidSequenceNumbers - 1);
-        final long requiredStartingSeqNo = randomIntBetween((int) startingSeqNo, numberOfDocsWithValidSequenceNumbers - 1);
-        final long endingSeqNo = randomIntBetween((int) requiredStartingSeqNo - 1, numberOfDocsWithValidSequenceNumbers - 1);
+        final long endingSeqNo = randomLongBetween(startingSeqNo, numberOfDocsWithValidSequenceNumbers - 1);
 
         final List<Translog.Operation> shippedOps = new ArrayList<>();
         final AtomicLong checkpointOnTarget = new AtomicLong(SequenceNumbers.NO_OPS_PERFORMED);
@@ -247,7 +245,7 @@ public void indexTranslogOperations(List<Translog.Operation> operations, int tot
         };
         RecoverySourceHandler handler = new RecoverySourceHandler(shard, recoveryTarget, request, fileChunkSizeInBytes, between(1, 10));
         PlainActionFuture<RecoverySourceHandler.SendSnapshotResult> future = new PlainActionFuture<>();
-        handler.phase2(startingSeqNo, requiredStartingSeqNo, endingSeqNo, newTranslogSnapshot(operations, Collections.emptyList()),
+        handler.phase2(startingSeqNo, endingSeqNo, newTranslogSnapshot(operations, Collections.emptyList()),
             randomNonNegativeLong(), randomNonNegativeLong(), RetentionLeases.EMPTY, future);
         final int expectedOps = (int) (endingSeqNo - startingSeqNo + 1);
         RecoverySourceHandler.SendSnapshotResult result = future.actionGet();
@@ -258,18 +256,6 @@ public void indexTranslogOperations(List<Translog.Operation> operations, int tot
             assertThat(shippedOps.get(i), equalTo(operations.get(i + (int) startingSeqNo + initialNumberOfDocs)));
         }
         assertThat(result.targetLocalCheckpoint, equalTo(checkpointOnTarget.get()));
-        if (endingSeqNo >= requiredStartingSeqNo + 1) {
-            // check that missing ops blows up
-            List<Translog.Operation> requiredOps = operations.subList(0, operations.size() - 1).stream() // remove last null marker
-                .filter(o -> o.seqNo() >= requiredStartingSeqNo && o.seqNo() <= endingSeqNo).collect(Collectors.toList());
-            List<Translog.Operation> opsToSkip = randomSubsetOf(randomIntBetween(1, requiredOps.size()), requiredOps);
-            PlainActionFuture<RecoverySourceHandler.SendSnapshotResult> failedFuture = new PlainActionFuture<>();
-            expectThrows(IllegalStateException.class, () -> {
-                handler.phase2(startingSeqNo, requiredStartingSeqNo, endingSeqNo, newTranslogSnapshot(operations, opsToSkip),
-                    randomNonNegativeLong(), randomNonNegativeLong(), RetentionLeases.EMPTY, failedFuture);
-                failedFuture.actionGet();
-            });
-        }
     }
 
     public void testSendSnapshotStopOnError() throws Exception {
@@ -299,7 +285,7 @@ public void indexTranslogOperations(List<Translog.Operation> operations, int tot
         PlainActionFuture<RecoverySourceHandler.SendSnapshotResult> future = new PlainActionFuture<>();
         final long startingSeqNo = randomLongBetween(0, ops.size() - 1L);
         final long endingSeqNo = randomLongBetween(startingSeqNo, ops.size() - 1L);
-        handler.phase2(startingSeqNo, startingSeqNo, endingSeqNo, newTranslogSnapshot(ops, Collections.emptyList()),
+        handler.phase2(startingSeqNo, endingSeqNo, newTranslogSnapshot(ops, Collections.emptyList()),
             randomNonNegativeLong(), randomNonNegativeLong(), RetentionLeases.EMPTY, future);
         if (wasFailed.get()) {
             assertThat(expectThrows(RuntimeException.class, () -> future.actionGet()).getMessage(), equalTo("test - failed to index"));
@@ -498,11 +484,11 @@ void prepareTargetForTranslog(boolean fileBasedRecovery, int totalTranslogOps, A
             }
 
             @Override
-            void phase2(long startingSeqNo, long requiredSeqNoRangeStart, long endingSeqNo, Translog.Snapshot snapshot,
+            void phase2(long startingSeqNo, long endingSeqNo, Translog.Snapshot snapshot,
                         long maxSeenAutoIdTimestamp, long maxSeqNoOfUpdatesOrDeletes, RetentionLeases retentionLeases,
                         ActionListener<SendSnapshotResult> listener) throws IOException {
                 phase2Called.set(true);
-                super.phase2(startingSeqNo, requiredSeqNoRangeStart, endingSeqNo, snapshot,
+                super.phase2(startingSeqNo, endingSeqNo, snapshot,
                     maxSeenAutoIdTimestamp, maxSeqNoOfUpdatesOrDeletes, retentionLeases, listener);
             }
 
diff --git a/test/framework/src/main/java/org/elasticsearch/index/engine/EngineTestCase.java b/test/framework/src/main/java/org/elasticsearch/index/engine/EngineTestCase.java
@@ -1121,6 +1121,16 @@ public static Translog getTranslog(Engine engine) {
         return internalEngine.getTranslog();
     }
 
+    /**
+     * Waits for all operations up to the provided sequence number to complete in the given internal engine.
+     *
+     * @param seqNo the sequence number that the checkpoint must advance to before this method returns
+     * @throws InterruptedException if the thread was interrupted while blocking on the condition
+     */
+    public static void waitForOpsToComplete(InternalEngine engine, long seqNo) throws InterruptedException {
+        engine.getLocalCheckpointTracker().waitForOpsToComplete(seqNo);
+    }
+
     public static boolean hasSnapshottedCommits(Engine engine) {
         assert engine instanceof InternalEngine : "only InternalEngines have snapshotted commits, got: " + engine.getClass();
         InternalEngine internalEngine = (InternalEngine) engine;
diff --git a/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/IndexFollowingIT.java b/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/IndexFollowingIT.java
@@ -220,7 +220,6 @@ public void testFollowIndex() throws Exception {
         }
     }
 
-    @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/38949")
     public void testFollowIndexWithConcurrentMappingChanges() throws Exception {
         final int numberOfPrimaryShards = randomIntBetween(1, 3);
         final String leaderIndexSettings = getIndexSettings(numberOfPrimaryShards, between(0, 1),
diff --git a/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/index/engine/FollowingEngineTests.java b/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/index/engine/FollowingEngineTests.java

Original file line number	Diff line number	Diff line change
`@@ -220,7 +220,6 @@ public void testFollowIndex() throws Exception {`
`220`	`220`	`}`
`221`	`221`	`}`
`222`	`222`
`223`		`- @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/38949")`
`224`	`223`	`public void testFollowIndexWithConcurrentMappingChanges() throws Exception {`
`225`	`224`	`final int numberOfPrimaryShards = randomIntBetween(1, 3);`
`226`	`225`	`final String leaderIndexSettings = getIndexSettings(numberOfPrimaryShards, between(0, 1),`