|
33 | 33 | import org.elasticsearch.Version; |
34 | 34 | import org.elasticsearch.action.ActionListener; |
35 | 35 | import org.elasticsearch.action.StepListener; |
36 | | -import org.elasticsearch.action.support.PlainActionFuture; |
37 | 36 | import org.elasticsearch.cluster.routing.IndexShardRoutingTable; |
38 | 37 | import org.elasticsearch.cluster.routing.ShardRouting; |
| 38 | +import org.elasticsearch.common.CheckedSupplier; |
39 | 39 | import org.elasticsearch.common.StopWatch; |
40 | 40 | import org.elasticsearch.common.bytes.BytesArray; |
41 | 41 | import org.elasticsearch.common.collect.Tuple; |
|
71 | 71 | import java.util.Locale; |
72 | 72 | import java.util.concurrent.CompletableFuture; |
73 | 73 | import java.util.concurrent.CopyOnWriteArrayList; |
74 | | -import java.util.concurrent.atomic.AtomicLong; |
| 74 | +import java.util.concurrent.atomic.AtomicInteger; |
75 | 75 | import java.util.concurrent.atomic.AtomicReference; |
76 | 76 | import java.util.function.Consumer; |
77 | 77 | import java.util.function.Supplier; |
@@ -514,97 +514,94 @@ TimeValue prepareTargetForTranslog(final boolean fileBasedRecovery, final int to |
514 | 514 | */ |
515 | 515 | void phase2(long startingSeqNo, long requiredSeqNoRangeStart, long endingSeqNo, Translog.Snapshot snapshot, long maxSeenAutoIdTimestamp, |
516 | 516 | long maxSeqNoOfUpdatesOrDeletes, ActionListener<SendSnapshotResult> listener) throws IOException { |
517 | | - ActionListener.completeWith(listener, () -> sendSnapshotBlockingly( |
518 | | - startingSeqNo, requiredSeqNoRangeStart, endingSeqNo, snapshot, maxSeenAutoIdTimestamp, maxSeqNoOfUpdatesOrDeletes)); |
519 | | - } |
520 | | - |
521 | | - private SendSnapshotResult sendSnapshotBlockingly(long startingSeqNo, long requiredSeqNoRangeStart, long endingSeqNo, |
522 | | - Translog.Snapshot snapshot, long maxSeenAutoIdTimestamp, |
523 | | - long maxSeqNoOfUpdatesOrDeletes) throws IOException { |
524 | 517 | assert requiredSeqNoRangeStart <= endingSeqNo + 1: |
525 | 518 | "requiredSeqNoRangeStart " + requiredSeqNoRangeStart + " is larger than endingSeqNo " + endingSeqNo; |
526 | 519 | assert startingSeqNo <= requiredSeqNoRangeStart : |
527 | 520 | "startingSeqNo " + startingSeqNo + " is larger than requiredSeqNoRangeStart " + requiredSeqNoRangeStart; |
528 | 521 | if (shard.state() == IndexShardState.CLOSED) { |
529 | 522 | throw new IndexShardClosedException(request.shardId()); |
530 | 523 | } |
531 | | - |
532 | | - final StopWatch stopWatch = new StopWatch().start(); |
533 | | - |
534 | 524 | logger.trace("recovery [phase2]: sending transaction log operations (seq# from [" + startingSeqNo + "], " + |
535 | 525 | "required [" + requiredSeqNoRangeStart + ":" + endingSeqNo + "]"); |
536 | 526 |
|
537 | | - int ops = 0; |
538 | | - long size = 0; |
539 | | - int skippedOps = 0; |
540 | | - int totalSentOps = 0; |
541 | | - final AtomicLong targetLocalCheckpoint = new AtomicLong(SequenceNumbers.UNASSIGNED_SEQ_NO); |
542 | | - final List<Translog.Operation> operations = new ArrayList<>(); |
| 527 | + final AtomicInteger skippedOps = new AtomicInteger(); |
| 528 | + final AtomicInteger totalSentOps = new AtomicInteger(); |
543 | 529 | final LocalCheckpointTracker requiredOpsTracker = new LocalCheckpointTracker(endingSeqNo, requiredSeqNoRangeStart - 1); |
| 530 | + final AtomicInteger lastBatchCount = new AtomicInteger(); // used to estimate the count of the subsequent batch. |
| 531 | + final CheckedSupplier<List<Translog.Operation>, IOException> readNextBatch = () -> { |
| 532 | + // We need to synchronize Snapshot#next() because it's called by different threads through sendBatch. |
| 533 | + // Even though those calls are not concurrent, Snapshot#next() uses non-synchronized state and is not multi-thread-compatible. |
| 534 | + synchronized (snapshot) { |
| 535 | + final List<Translog.Operation> ops = lastBatchCount.get() > 0 ? new ArrayList<>(lastBatchCount.get()) : new ArrayList<>(); |
| 536 | + long batchSizeInBytes = 0L; |
| 537 | + Translog.Operation operation; |
| 538 | + while ((operation = snapshot.next()) != null) { |
| 539 | + if (shard.state() == IndexShardState.CLOSED) { |
| 540 | + throw new IndexShardClosedException(request.shardId()); |
| 541 | + } |
| 542 | + cancellableThreads.checkForCancel(); |
| 543 | + final long seqNo = operation.seqNo(); |
| 544 | + if (seqNo < startingSeqNo || seqNo > endingSeqNo) { |
| 545 | + skippedOps.incrementAndGet(); |
| 546 | + continue; |
| 547 | + } |
| 548 | + ops.add(operation); |
| 549 | + batchSizeInBytes += operation.estimateSize(); |
| 550 | + totalSentOps.incrementAndGet(); |
| 551 | + requiredOpsTracker.markSeqNoAsCompleted(seqNo); |
544 | 552 |
|
545 | | - final int expectedTotalOps = snapshot.totalOperations(); |
546 | | - if (expectedTotalOps == 0) { |
547 | | - logger.trace("no translog operations to send"); |
548 | | - } |
549 | | - |
550 | | - final CancellableThreads.IOInterruptible sendBatch = () -> { |
551 | | - // TODO: Make this non-blocking |
552 | | - final PlainActionFuture<Long> future = new PlainActionFuture<>(); |
553 | | - recoveryTarget.indexTranslogOperations( |
554 | | - operations, expectedTotalOps, maxSeenAutoIdTimestamp, maxSeqNoOfUpdatesOrDeletes, future); |
555 | | - targetLocalCheckpoint.set(future.actionGet()); |
556 | | - }; |
557 | | - |
558 | | - // send operations in batches |
559 | | - Translog.Operation operation; |
560 | | - while ((operation = snapshot.next()) != null) { |
561 | | - if (shard.state() == IndexShardState.CLOSED) { |
562 | | - throw new IndexShardClosedException(request.shardId()); |
563 | | - } |
564 | | - cancellableThreads.checkForCancel(); |
565 | | - |
566 | | - final long seqNo = operation.seqNo(); |
567 | | - if (seqNo < startingSeqNo || seqNo > endingSeqNo) { |
568 | | - skippedOps++; |
569 | | - continue; |
570 | | - } |
571 | | - operations.add(operation); |
572 | | - ops++; |
573 | | - size += operation.estimateSize(); |
574 | | - totalSentOps++; |
575 | | - requiredOpsTracker.markSeqNoAsCompleted(seqNo); |
576 | | - |
577 | | - // check if this request is past bytes threshold, and if so, send it off |
578 | | - if (size >= chunkSizeInBytes) { |
579 | | - cancellableThreads.executeIO(sendBatch); |
580 | | - logger.trace("sent batch of [{}][{}] (total: [{}]) translog operations", ops, new ByteSizeValue(size), expectedTotalOps); |
581 | | - ops = 0; |
582 | | - size = 0; |
583 | | - operations.clear(); |
| 553 | + // check if this request is past bytes threshold, and if so, send it off |
| 554 | + if (batchSizeInBytes >= chunkSizeInBytes) { |
| 555 | + break; |
| 556 | + } |
| 557 | + } |
| 558 | + lastBatchCount.set(ops.size()); |
| 559 | + return ops; |
584 | 560 | } |
585 | | - } |
586 | | - |
587 | | - if (!operations.isEmpty() || totalSentOps == 0) { |
588 | | - // send the leftover operations or if no operations were sent, request the target to respond with its local checkpoint |
589 | | - cancellableThreads.executeIO(sendBatch); |
590 | | - } |
| 561 | + }; |
591 | 562 |
|
592 | | - assert expectedTotalOps == snapshot.skippedOperations() + skippedOps + totalSentOps |
593 | | - : String.format(Locale.ROOT, "expected total [%d], overridden [%d], skipped [%d], total sent [%d]", |
594 | | - expectedTotalOps, snapshot.skippedOperations(), skippedOps, totalSentOps); |
| 563 | + final StopWatch stopWatch = new StopWatch().start(); |
| 564 | + final ActionListener<Long> batchedListener = ActionListener.wrap( |
| 565 | + targetLocalCheckpoint -> { |
| 566 | + assert snapshot.totalOperations() == snapshot.skippedOperations() + skippedOps.get() + totalSentOps.get() |
| 567 | + : String.format(Locale.ROOT, "expected total [%d], overridden [%d], skipped [%d], total sent [%d]", |
| 568 | + snapshot.totalOperations(), snapshot.skippedOperations(), skippedOps.get(), totalSentOps.get()); |
| 569 | + if (requiredOpsTracker.getCheckpoint() < endingSeqNo) { |
| 570 | + throw new IllegalStateException("translog replay failed to cover required sequence numbers" + |
| 571 | + " (required range [" + requiredSeqNoRangeStart + ":" + endingSeqNo + "). first missing op is [" |
| 572 | + + (requiredOpsTracker.getCheckpoint() + 1) + "]"); |
| 573 | + } |
| 574 | + stopWatch.stop(); |
| 575 | + final TimeValue tookTime = stopWatch.totalTime(); |
| 576 | + logger.trace("recovery [phase2]: took [{}]", tookTime); |
| 577 | + listener.onResponse(new SendSnapshotResult(targetLocalCheckpoint, totalSentOps.get(), tookTime)); |
| 578 | + }, |
| 579 | + listener::onFailure |
| 580 | + ); |
| 581 | + |
| 582 | + sendBatch(readNextBatch, true, SequenceNumbers.UNASSIGNED_SEQ_NO, snapshot.totalOperations(), |
| 583 | + maxSeenAutoIdTimestamp, maxSeqNoOfUpdatesOrDeletes, batchedListener); |
| 584 | + } |
595 | 585 |
|
596 | | - if (requiredOpsTracker.getCheckpoint() < endingSeqNo) { |
597 | | - throw new IllegalStateException("translog replay failed to cover required sequence numbers" + |
598 | | - " (required range [" + requiredSeqNoRangeStart + ":" + endingSeqNo + "). first missing op is [" |
599 | | - + (requiredOpsTracker.getCheckpoint() + 1) + "]"); |
| 586 | + private void sendBatch(CheckedSupplier<List<Translog.Operation>, IOException> nextBatch, boolean firstBatch, |
| 587 | + long targetLocalCheckpoint, int totalTranslogOps, long maxSeenAutoIdTimestamp, |
| 588 | + long maxSeqNoOfUpdatesOrDeletes, ActionListener<Long> listener) throws IOException { |
| 589 | + final List<Translog.Operation> operations = nextBatch.get(); |
| 590 | + // send the leftover operations or if no operations were sent, request the target to respond with its local checkpoint |
| 591 | + if (operations.isEmpty() == false || firstBatch) { |
| 592 | + cancellableThreads.execute(() -> { |
| 593 | + recoveryTarget.indexTranslogOperations(operations, totalTranslogOps, maxSeenAutoIdTimestamp, maxSeqNoOfUpdatesOrDeletes, |
| 594 | + ActionListener.wrap( |
| 595 | + newCheckpoint -> { |
| 596 | + sendBatch(nextBatch, false, SequenceNumbers.max(targetLocalCheckpoint, newCheckpoint), |
| 597 | + totalTranslogOps, maxSeenAutoIdTimestamp, maxSeqNoOfUpdatesOrDeletes, listener); |
| 598 | + }, |
| 599 | + listener::onFailure |
| 600 | + )); |
| 601 | + }); |
| 602 | + } else { |
| 603 | + listener.onResponse(targetLocalCheckpoint); |
600 | 604 | } |
601 | | - |
602 | | - logger.trace("sent final batch of [{}][{}] (total: [{}]) translog operations", ops, new ByteSizeValue(size), expectedTotalOps); |
603 | | - |
604 | | - stopWatch.stop(); |
605 | | - final TimeValue tookTime = stopWatch.totalTime(); |
606 | | - logger.trace("recovery [phase2]: took [{}]", tookTime); |
607 | | - return new SendSnapshotResult(targetLocalCheckpoint.get(), totalSentOps, tookTime); |
608 | 605 | } |
609 | 606 |
|
610 | 607 | void finalizeRecovery(final long targetLocalCheckpoint, final ActionListener<Void> listener) throws IOException { |
|
0 commit comments