@@ -197,51 +197,51 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
197197 assert requiredSeqNoRangeStart >= startingSeqNo : "requiredSeqNoRangeStart [" + requiredSeqNoRangeStart + "] is lower than ["
198198 + startingSeqNo + "]" ;
199199
200- final TimeValue prepareEngineTime ;
201- try {
202- // For a sequence based recovery, the target can keep its local translog
203- prepareEngineTime = prepareTargetForTranslog (isSequenceNumberBasedRecovery == false ,
204- shard .estimateNumberOfHistoryOperations ("peer-recovery" , startingSeqNo ));
205- } catch (final Exception e ) {
206- throw new RecoveryEngineException (shard .shardId (), 1 , "prepare target for translog failed" , e );
207- }
200+ final StepListener <TimeValue > prepareEngineStep = new StepListener <>();
201+ // For a sequence based recovery, the target can keep its local translog
202+ prepareTargetForTranslog (isSequenceNumberBasedRecovery == false ,
203+ shard .estimateNumberOfHistoryOperations ("peer-recovery" , startingSeqNo ), prepareEngineStep );
204+ final StepListener <SendSnapshotResult > sendSnapshotStep = new StepListener <>();
205+ prepareEngineStep .whenComplete (prepareEngineTime -> {
206+ /*
207+ * add shard to replication group (shard will receive replication requests from this point on) now that engine is open.
208+ * This means that any document indexed into the primary after this will be replicated to this replica as well
209+ * make sure to do this before sampling the max sequence number in the next step, to ensure that we send
210+ * all documents up to maxSeqNo in phase2.
211+ */
212+ runUnderPrimaryPermit (() -> shard .initiateTracking (request .targetAllocationId ()),
213+ shardId + " initiating tracking of " + request .targetAllocationId (), shard , cancellableThreads , logger );
208214
209- /*
210- * add shard to replication group (shard will receive replication requests from this point on) now that engine is open.
211- * This means that any document indexed into the primary after this will be replicated to this replica as well
212- * make sure to do this before sampling the max sequence number in the next step, to ensure that we send
213- * all documents up to maxSeqNo in phase2.
214- */
215- runUnderPrimaryPermit (() -> shard .initiateTracking (request .targetAllocationId ()),
216- shardId + " initiating tracking of " + request .targetAllocationId (), shard , cancellableThreads , logger );
217-
218- final long endingSeqNo = shard .seqNoStats ().getMaxSeqNo ();
219- /*
220- * We need to wait for all operations up to the current max to complete, otherwise we can not guarantee that all
221- * operations in the required range will be available for replaying from the translog of the source.
222- */
223- cancellableThreads .execute (() -> shard .waitForOpsToComplete (endingSeqNo ));
224-
225- if (logger .isTraceEnabled ()) {
226- logger .trace ("all operations up to [{}] completed, which will be used as an ending sequence number" , endingSeqNo );
227- logger .trace ("snapshot translog for recovery; current size is [{}]" ,
228- shard .estimateNumberOfHistoryOperations ("peer-recovery" , startingSeqNo ));
229- }
215+ final long endingSeqNo = shard .seqNoStats ().getMaxSeqNo ();
216+ /*
217+ * We need to wait for all operations up to the current max to complete, otherwise we can not guarantee that all
218+ * operations in the required range will be available for replaying from the translog of the source.
219+ */
220+ cancellableThreads .execute (() -> shard .waitForOpsToComplete (endingSeqNo ));
221+ if (logger .isTraceEnabled ()) {
222+ logger .trace ("all operations up to [{}] completed, which will be used as an ending sequence number" , endingSeqNo );
223+ logger .trace ("snapshot translog for recovery; current size is [{}]" ,
224+ shard .estimateNumberOfHistoryOperations ("peer-recovery" , startingSeqNo ));
225+ }
226+ final Translog .Snapshot phase2Snapshot = shard .getHistoryOperations ("peer-recovery" , startingSeqNo );
227+ resources .add (phase2Snapshot );
228+ // we can release the retention lock here because the snapshot itself will retain the required operations.
229+ retentionLock .close ();
230+ // we have to capture the max_seen_auto_id_timestamp and the max_seq_no_of_updates to make sure that these values
231+ // are at least as high as the corresponding values on the primary when any of these operations were executed on it.
232+ final long maxSeenAutoIdTimestamp = shard .getMaxSeenAutoIdTimestamp ();
233+ final long maxSeqNoOfUpdatesOrDeletes = shard .getMaxSeqNoOfUpdatesOrDeletes ();
234+ phase2 (startingSeqNo , requiredSeqNoRangeStart , endingSeqNo , phase2Snapshot , maxSeenAutoIdTimestamp ,
235+ maxSeqNoOfUpdatesOrDeletes , sendSnapshotStep );
236+ sendSnapshotStep .whenComplete (
237+ r -> IOUtils .close (phase2Snapshot ),
238+ e -> {
239+ IOUtils .closeWhileHandlingException (phase2Snapshot );
240+ onFailure .accept (new RecoveryEngineException (shard .shardId (), 2 , "phase2 failed" , e ));
241+ });
242+
243+ }, onFailure );
230244
231- final Translog .Snapshot phase2Snapshot = shard .getHistoryOperations ("peer-recovery" , startingSeqNo );
232- resources .add (phase2Snapshot );
233- // we can release the retention lock here because the snapshot itself will retain the required operations.
234- IOUtils .close (retentionLock );
235- // we have to capture the max_seen_auto_id_timestamp and the max_seq_no_of_updates to make sure that these values
236- // are at least as high as the corresponding values on the primary when any of these operations were executed on it.
237- final long maxSeenAutoIdTimestamp = shard .getMaxSeenAutoIdTimestamp ();
238- final long maxSeqNoOfUpdatesOrDeletes = shard .getMaxSeqNoOfUpdatesOrDeletes ();
239- final StepListener <SendSnapshotResult > sendSnapshotStep = new StepListener <>();
240- phase2 (startingSeqNo , requiredSeqNoRangeStart , endingSeqNo , phase2Snapshot , maxSeenAutoIdTimestamp ,
241- maxSeqNoOfUpdatesOrDeletes , sendSnapshotStep );
242- sendSnapshotStep .whenComplete (
243- r -> IOUtils .close (phase2Snapshot ),
244- e -> onFailure .accept (new RecoveryEngineException (shard .shardId (), 2 , "phase2 failed" , e )));
245245 final StepListener <Void > finalizeStep = new StepListener <>();
246246 sendSnapshotStep .whenComplete (r -> finalizeRecovery (r .targetLocalCheckpoint , finalizeStep ), onFailure );
247247
@@ -251,7 +251,7 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
251251 final RecoveryResponse response = new RecoveryResponse (sendFileResult .phase1FileNames , sendFileResult .phase1FileSizes ,
252252 sendFileResult .phase1ExistingFileNames , sendFileResult .phase1ExistingFileSizes , sendFileResult .totalSize ,
253253 sendFileResult .existingTotalSize , sendFileResult .took .millis (), phase1ThrottlingWaitTime ,
254- prepareEngineTime .millis (), sendSnapshotResult .totalOperations , sendSnapshotResult .tookTime .millis ());
254+ prepareEngineStep . result () .millis (), sendSnapshotResult .totalOperations , sendSnapshotResult .tookTime .millis ());
255255 try {
256256 wrappedListener .onResponse (response );
257257 } finally {
/**
 * Asynchronously prepares the recovery target's engine to receive translog operations.
 * On success the listener is notified with the time the remote engine start took; on
 * failure the cause is wrapped in a {@link RecoveryEngineException} for recovery phase 1.
 *
 * NOTE(review): converted from a synchronous, throwing method to callback style as part of
 * making peer recovery non-blocking — callers must now react via {@code listener} instead of
 * catching exceptions or reading a return value.
 *
 * @param fileBasedRecovery whether this is a file-based (non-sequence-number-based) recovery;
 *                          presumably controls whether the target may keep its local translog
 *                          — confirm against {@code prepareForTranslogOperations} semantics
 * @param totalTranslogOps  estimated number of history operations that will be replayed
 * @param listener          resolved with the elapsed {@link TimeValue} once the target engine
 *                          is ready, or failed with a {@link RecoveryEngineException} (phase 1)
 */
void prepareTargetForTranslog(boolean fileBasedRecovery, int totalTranslogOps, ActionListener<TimeValue> listener) {
    StopWatch stopWatch = new StopWatch().start();
    // Wrap the caller's listener so that, on success, we stop the watch and report the elapsed
    // time, and on failure we attribute the error to recovery phase 1 with the original cause
    // preserved.
    final ActionListener<Void> wrappedListener = ActionListener.wrap(
        nullVal -> {
            stopWatch.stop();
            final TimeValue tookTime = stopWatch.totalTime();
            logger.trace("recovery [phase1]: remote engine start took [{}]", tookTime);
            listener.onResponse(tookTime);
        },
        e -> listener.onFailure(new RecoveryEngineException(shard.shardId(), 1, "prepare target for translog failed", e)));
    // Send a request preparing the new shard's translog to receive operations. This ensures the shard engine is started and disables
    // garbage collection (not the JVM's GC!) of tombstone deletes.
    logger.trace("recovery [phase1]: prepare remote engine for translog");
    // Executed under cancellableThreads so an ongoing recovery cancellation interrupts the
    // request dispatch; completion itself arrives asynchronously via wrappedListener.
    cancellableThreads.execute(() ->
        recoveryTarget.prepareForTranslogOperations(fileBasedRecovery, totalTranslogOps, wrappedListener));
}
498503
499504 /**
0 commit comments