@@ -181,7 +181,7 @@ private void doRecovery(final long recoveryId) {
181181 cancellableThreads = recoveryTarget .cancellableThreads ();
182182 try {
183183 assert recoveryTarget .sourceNode () != null : "can not do a recovery without a source node" ;
184- request = getStartRecoveryRequest (recoveryTarget );
184+ request = getStartRecoveryRequest (recoveryTarget , clusterService . localNode (), logger );
185185 logger .trace ("{} preparing shard for peer recovery" , recoveryTarget .shardId ());
186186 recoveryTarget .indexShard ().prepareForIndexRecovery ();
187187 } catch (final Exception e ) {
@@ -322,7 +322,7 @@ public RecoveryResponse read(StreamInput in) throws IOException {
322322 * @param recoveryTarget the target of the recovery
323323 * @return a snapshot of the store metadata
324324 */
325- private Store .MetadataSnapshot getStoreMetadataSnapshot (final RecoveryTarget recoveryTarget ) {
325+ private static Store .MetadataSnapshot getStoreMetadataSnapshot (final RecoveryTarget recoveryTarget , final Logger logger ) {
326326 try {
327327 return recoveryTarget .indexShard ().snapshotStoreMetadata ();
328328 } catch (final org .apache .lucene .index .IndexNotFoundException e ) {
@@ -341,20 +341,23 @@ private Store.MetadataSnapshot getStoreMetadataSnapshot(final RecoveryTarget rec
341341 * @param recoveryTarget the target of the recovery
342342 * @return a start recovery request
343343 */
344- private StartRecoveryRequest getStartRecoveryRequest (final RecoveryTarget recoveryTarget ) {
344+ public static StartRecoveryRequest getStartRecoveryRequest (RecoveryTarget recoveryTarget , DiscoveryNode localNode , Logger logger ) {
345345 final StartRecoveryRequest request ;
346346 logger .trace ("{} collecting local files for [{}]" , recoveryTarget .shardId (), recoveryTarget .sourceNode ());
347347
348- final Store .MetadataSnapshot metadataSnapshot = getStoreMetadataSnapshot (recoveryTarget );
348+ Store .MetadataSnapshot metadataSnapshot = getStoreMetadataSnapshot (recoveryTarget , logger );
349349 logger .trace ("{} local file count [{}]" , recoveryTarget .shardId (), metadataSnapshot .size ());
350350
351- final long startingSeqNo ;
351+ long startingSeqNo = SequenceNumbers . UNASSIGNED_SEQ_NO ;
352352 if (metadataSnapshot .size () > 0 ) {
353- startingSeqNo = getStartingSeqNo (logger , recoveryTarget );
354- } else {
355- startingSeqNo = SequenceNumbers .UNASSIGNED_SEQ_NO ;
353+ try {
354+ startingSeqNo = getStartingSeqNo (logger , recoveryTarget );
355+ } catch (IOException | TranslogCorruptedException e ) {
356+ logger .warn (new ParameterizedMessage ("error while reading global checkpoint from translog, " +
357+ "resetting the starting sequence number from {} to unassigned and recovering as if there are none" , startingSeqNo ), e );
358+ metadataSnapshot = Store .MetadataSnapshot .EMPTY ;
359+ }
356360 }
357-
358361 if (startingSeqNo == SequenceNumbers .UNASSIGNED_SEQ_NO ) {
359362 logger .trace ("{} preparing for file-based recovery from [{}]" , recoveryTarget .shardId (), recoveryTarget .sourceNode ());
360363 } else {
@@ -369,7 +372,7 @@ private StartRecoveryRequest getStartRecoveryRequest(final RecoveryTarget recove
369372 recoveryTarget .shardId (),
370373 recoveryTarget .indexShard ().routingEntry ().allocationId ().getId (),
371374 recoveryTarget .sourceNode (),
372- clusterService . localNode () ,
375+ localNode ,
373376 metadataSnapshot ,
374377 recoveryTarget .state ().getPrimary (),
375378 recoveryTarget .recoveryId (),
@@ -384,39 +387,30 @@ private StartRecoveryRequest getStartRecoveryRequest(final RecoveryTarget recove
384387 * @return the starting sequence number or {@link SequenceNumbers#UNASSIGNED_SEQ_NO} if obtaining the starting sequence number
385388 * failed
386389 */
387- public static long getStartingSeqNo (final Logger logger , final RecoveryTarget recoveryTarget ) {
388- try {
389- final Store store = recoveryTarget .store ();
390- final String translogUUID = store .readLastCommittedSegmentsInfo ().getUserData ().get (Translog .TRANSLOG_UUID_KEY );
391- final long globalCheckpoint = Translog .readGlobalCheckpoint (recoveryTarget .translogLocation (), translogUUID );
392- final List <IndexCommit > existingCommits = DirectoryReader .listCommits (store .directory ());
393- final IndexCommit safeCommit = CombinedDeletionPolicy .findSafeCommitPoint (existingCommits , globalCheckpoint );
394- final SequenceNumbers .CommitInfo seqNoStats = Store .loadSeqNoInfo (safeCommit );
395- if (logger .isTraceEnabled ()) {
396- final StringJoiner descriptionOfExistingCommits = new StringJoiner ("," );
397- for (IndexCommit commit : existingCommits ) {
398- descriptionOfExistingCommits .add (CombinedDeletionPolicy .commitDescription (commit ));
399- }
400- logger .trace ("Calculate starting seqno based on global checkpoint [{}], safe commit [{}], existing commits [{}]" ,
401- globalCheckpoint , CombinedDeletionPolicy .commitDescription (safeCommit ), descriptionOfExistingCommits );
402- }
403- if (seqNoStats .maxSeqNo <= globalCheckpoint ) {
404- assert seqNoStats .localCheckpoint <= globalCheckpoint ;
405- /*
406- * Commit point is good for sequence-number based recovery as the maximum sequence number included in it is below the global
407- * checkpoint (i.e., it excludes any operations that may not be on the primary). Recovery will start at the first operation
408- * after the local checkpoint stored in the commit.
409- */
410- return seqNoStats .localCheckpoint + 1 ;
411- } else {
412- return SequenceNumbers .UNASSIGNED_SEQ_NO ;
390+ private static long getStartingSeqNo (final Logger logger , final RecoveryTarget recoveryTarget ) throws IOException {
391+ final Store store = recoveryTarget .store ();
392+ final String translogUUID = store .readLastCommittedSegmentsInfo ().getUserData ().get (Translog .TRANSLOG_UUID_KEY );
393+ final long globalCheckpoint = Translog .readGlobalCheckpoint (recoveryTarget .translogLocation (), translogUUID );
394+ final List <IndexCommit > existingCommits = DirectoryReader .listCommits (store .directory ());
395+ final IndexCommit safeCommit = CombinedDeletionPolicy .findSafeCommitPoint (existingCommits , globalCheckpoint );
396+ final SequenceNumbers .CommitInfo seqNoStats = Store .loadSeqNoInfo (safeCommit );
397+ if (logger .isTraceEnabled ()) {
398+ final StringJoiner descriptionOfExistingCommits = new StringJoiner ("," );
399+ for (IndexCommit commit : existingCommits ) {
400+ descriptionOfExistingCommits .add (CombinedDeletionPolicy .commitDescription (commit ));
413401 }
414- } catch (final TranslogCorruptedException | IOException e ) {
402+ logger .trace ("Calculate starting seqno based on global checkpoint [{}], safe commit [{}], existing commits [{}]" ,
403+ globalCheckpoint , CombinedDeletionPolicy .commitDescription (safeCommit ), descriptionOfExistingCommits );
404+ }
405+ if (seqNoStats .maxSeqNo <= globalCheckpoint ) {
406+ assert seqNoStats .localCheckpoint <= globalCheckpoint ;
415407 /*
416- * This can happen, for example, if a phase one of the recovery completed successfully, a network partition happens before the
417- * translog on the recovery target is opened, the recovery enters a retry loop seeing now that the index files are on disk and
418- * proceeds to attempt a sequence-number-based recovery .
408+ * Commit point is good for sequence-number based recovery as the maximum sequence number included in it is below the global
409+ * checkpoint (i.e., it excludes any operations that may not be on the primary). Recovery will start at the first operation
410+ * after the local checkpoint stored in the commit .
419411 */
412+ return seqNoStats .localCheckpoint + 1 ;
413+ } else {
420414 return SequenceNumbers .UNASSIGNED_SEQ_NO ;
421415 }
422416 }
0 commit comments