Commit 3b827b8

Ignore Lucene index in peer recovery if translog corrupted (#49114)
If the translog on a replica is corrupt, we should not perform an operation-based recovery or use sync_id, as we won't be able to open an engine in the next step. This change adds an extra validation that ensures the translog is okay when preparing a peer recovery request.
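
The gist of the change: the starting sequence number is now computed inside a try/catch, and if reading the translog's global checkpoint throws, the request falls back to file-based recovery by reporting an empty metadata snapshot. Below is a minimal, runnable sketch of that fallback shape; the class name, readStartingSeqNo, and buildStartRequest are stand-ins for illustration, not Elasticsearch API, and the int snapshotSize stands in for Store.MetadataSnapshot.

import java.io.IOException;

public class TranslogFallbackSketch {
    // Mirrors SequenceNumbers.UNASSIGNED_SEQ_NO.
    static final long UNASSIGNED_SEQ_NO = -2L;

    // Stand-in for reading the global checkpoint from the translog; Elasticsearch
    // throws TranslogCorruptedException (or IOException) when the translog is unusable.
    static long readStartingSeqNo(boolean translogCorrupted) throws IOException {
        if (translogCorrupted) {
            throw new IOException("translog checkpoint is corrupted");
        }
        return 42L;
    }

    // Returns {startingSeqNo, snapshotSize}: ops-based recovery if the seq no is
    // assigned, file-based recovery (empty snapshot) otherwise.
    static long[] buildStartRequest(int localFileCount, boolean translogCorrupted) {
        long startingSeqNo = UNASSIGNED_SEQ_NO;
        int snapshotSize = localFileCount;
        if (snapshotSize > 0) {
            try {
                startingSeqNo = readStartingSeqNo(translogCorrupted);
            } catch (IOException e) {
                // Translog unusable: ignore the local Lucene index entirely and
                // recover as if there were no files on disk.
                snapshotSize = 0;
            }
        }
        return new long[] {startingSeqNo, snapshotSize};
    }

    public static void main(String[] args) {
        System.out.println(java.util.Arrays.toString(buildStartRequest(5, false))); // [42, 5]  -> ops-based
        System.out.println(java.util.Arrays.toString(buildStartRequest(5, true)));  // [-2, 0]  -> file-based
    }
}

With an empty snapshot the source node sees no reusable local files and no sync_id, so it streams every segment file instead of attempting an operation-based recovery.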
Parent commit: 7ddf540

3 files changed: 53 additions, 57 deletions

server/src/main/java/org/elasticsearch/indices/recovery/PeerRecoveryTargetService.java

Lines changed: 34 additions & 40 deletions
@@ -181,7 +181,7 @@ private void doRecovery(final long recoveryId) {
             cancellableThreads = recoveryTarget.cancellableThreads();
             try {
                 assert recoveryTarget.sourceNode() != null : "can not do a recovery without a source node";
-                request = getStartRecoveryRequest(recoveryTarget);
+                request = getStartRecoveryRequest(recoveryTarget, clusterService.localNode(), logger);
                 logger.trace("{} preparing shard for peer recovery", recoveryTarget.shardId());
                 recoveryTarget.indexShard().prepareForIndexRecovery();
             } catch (final Exception e) {
@@ -322,7 +322,7 @@ public RecoveryResponse read(StreamInput in) throws IOException {
      * @param recoveryTarget the target of the recovery
      * @return a snapshot of the store metadata
      */
-    private Store.MetadataSnapshot getStoreMetadataSnapshot(final RecoveryTarget recoveryTarget) {
+    private static Store.MetadataSnapshot getStoreMetadataSnapshot(final RecoveryTarget recoveryTarget, final Logger logger) {
         try {
             return recoveryTarget.indexShard().snapshotStoreMetadata();
         } catch (final org.apache.lucene.index.IndexNotFoundException e) {
@@ -341,20 +341,23 @@ private Store.MetadataSnapshot getStoreMetadataSnapshot(final RecoveryTarget rec
      * @param recoveryTarget the target of the recovery
      * @return a start recovery request
      */
-    private StartRecoveryRequest getStartRecoveryRequest(final RecoveryTarget recoveryTarget) {
+    public static StartRecoveryRequest getStartRecoveryRequest(RecoveryTarget recoveryTarget, DiscoveryNode localNode, Logger logger) {
         final StartRecoveryRequest request;
         logger.trace("{} collecting local files for [{}]", recoveryTarget.shardId(), recoveryTarget.sourceNode());

-        final Store.MetadataSnapshot metadataSnapshot = getStoreMetadataSnapshot(recoveryTarget);
+        Store.MetadataSnapshot metadataSnapshot = getStoreMetadataSnapshot(recoveryTarget, logger);
         logger.trace("{} local file count [{}]", recoveryTarget.shardId(), metadataSnapshot.size());

-        final long startingSeqNo;
+        long startingSeqNo = SequenceNumbers.UNASSIGNED_SEQ_NO;
         if (metadataSnapshot.size() > 0) {
-            startingSeqNo = getStartingSeqNo(logger, recoveryTarget);
-        } else {
-            startingSeqNo = SequenceNumbers.UNASSIGNED_SEQ_NO;
+            try {
+                startingSeqNo = getStartingSeqNo(logger, recoveryTarget);
+            } catch (IOException | TranslogCorruptedException e) {
+                logger.warn(new ParameterizedMessage("error while reading global checkpoint from translog, " +
+                    "resetting the starting sequence number from {} to unassigned and recovering as if there are none", startingSeqNo), e);
+                metadataSnapshot = Store.MetadataSnapshot.EMPTY;
+            }
         }
-
         if (startingSeqNo == SequenceNumbers.UNASSIGNED_SEQ_NO) {
             logger.trace("{} preparing for file-based recovery from [{}]", recoveryTarget.shardId(), recoveryTarget.sourceNode());
         } else {
@@ -369,7 +372,7 @@ private StartRecoveryRequest getStartRecoveryRequest(final RecoveryTarget recove
             recoveryTarget.shardId(),
             recoveryTarget.indexShard().routingEntry().allocationId().getId(),
             recoveryTarget.sourceNode(),
-            clusterService.localNode(),
+            localNode,
             metadataSnapshot,
             recoveryTarget.state().getPrimary(),
             recoveryTarget.recoveryId(),
@@ -384,39 +387,30 @@ private StartRecoveryRequest getStartRecoveryRequest(final RecoveryTarget recove
      * @return the starting sequence number or {@link SequenceNumbers#UNASSIGNED_SEQ_NO} if obtaining the starting sequence number
      *         failed
      */
-    public static long getStartingSeqNo(final Logger logger, final RecoveryTarget recoveryTarget) {
-        try {
-            final Store store = recoveryTarget.store();
-            final String translogUUID = store.readLastCommittedSegmentsInfo().getUserData().get(Translog.TRANSLOG_UUID_KEY);
-            final long globalCheckpoint = Translog.readGlobalCheckpoint(recoveryTarget.translogLocation(), translogUUID);
-            final List<IndexCommit> existingCommits = DirectoryReader.listCommits(store.directory());
-            final IndexCommit safeCommit = CombinedDeletionPolicy.findSafeCommitPoint(existingCommits, globalCheckpoint);
-            final SequenceNumbers.CommitInfo seqNoStats = Store.loadSeqNoInfo(safeCommit);
-            if (logger.isTraceEnabled()) {
-                final StringJoiner descriptionOfExistingCommits = new StringJoiner(",");
-                for (IndexCommit commit : existingCommits) {
-                    descriptionOfExistingCommits.add(CombinedDeletionPolicy.commitDescription(commit));
-                }
-                logger.trace("Calculate starting seqno based on global checkpoint [{}], safe commit [{}], existing commits [{}]",
-                    globalCheckpoint, CombinedDeletionPolicy.commitDescription(safeCommit), descriptionOfExistingCommits);
-            }
-            if (seqNoStats.maxSeqNo <= globalCheckpoint) {
-                assert seqNoStats.localCheckpoint <= globalCheckpoint;
-                /*
-                 * Commit point is good for sequence-number based recovery as the maximum sequence number included in it is below the global
-                 * checkpoint (i.e., it excludes any operations that may not be on the primary). Recovery will start at the first operation
-                 * after the local checkpoint stored in the commit.
-                 */
-                return seqNoStats.localCheckpoint + 1;
-            } else {
-                return SequenceNumbers.UNASSIGNED_SEQ_NO;
+    private static long getStartingSeqNo(final Logger logger, final RecoveryTarget recoveryTarget) throws IOException {
+        final Store store = recoveryTarget.store();
+        final String translogUUID = store.readLastCommittedSegmentsInfo().getUserData().get(Translog.TRANSLOG_UUID_KEY);
+        final long globalCheckpoint = Translog.readGlobalCheckpoint(recoveryTarget.translogLocation(), translogUUID);
+        final List<IndexCommit> existingCommits = DirectoryReader.listCommits(store.directory());
+        final IndexCommit safeCommit = CombinedDeletionPolicy.findSafeCommitPoint(existingCommits, globalCheckpoint);
+        final SequenceNumbers.CommitInfo seqNoStats = Store.loadSeqNoInfo(safeCommit);
+        if (logger.isTraceEnabled()) {
+            final StringJoiner descriptionOfExistingCommits = new StringJoiner(",");
+            for (IndexCommit commit : existingCommits) {
+                descriptionOfExistingCommits.add(CombinedDeletionPolicy.commitDescription(commit));
             }
-        } catch (final TranslogCorruptedException | IOException e) {
+            logger.trace("Calculate starting seqno based on global checkpoint [{}], safe commit [{}], existing commits [{}]",
+                globalCheckpoint, CombinedDeletionPolicy.commitDescription(safeCommit), descriptionOfExistingCommits);
+        }
+        if (seqNoStats.maxSeqNo <= globalCheckpoint) {
+            assert seqNoStats.localCheckpoint <= globalCheckpoint;
             /*
-             * This can happen, for example, if a phase one of the recovery completed successfully, a network partition happens before the
-             * translog on the recovery target is opened, the recovery enters a retry loop seeing now that the index files are on disk and
-             * proceeds to attempt a sequence-number-based recovery.
+             * Commit point is good for sequence-number based recovery as the maximum sequence number included in it is below the global
+             * checkpoint (i.e., it excludes any operations that may not be on the primary). Recovery will start at the first operation
+             * after the local checkpoint stored in the commit.
              */
+            return seqNoStats.localCheckpoint + 1;
+        } else {
             return SequenceNumbers.UNASSIGNED_SEQ_NO;
         }
     }
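
For intuition on the safe-commit check kept in getStartingSeqNo above: operation-based recovery is only chosen when the safe commit's maxSeqNo is at or below the translog's global checkpoint, meaning the commit contains no operation the primary has not confirmed; replay then begins right after the commit's local checkpoint. A tiny self-contained illustration with hypothetical checkpoint values (SafeCommitCheck and startingSeqNo below are illustrative, not Elasticsearch code):

public class SafeCommitCheck {
    // Mirrors SequenceNumbers.UNASSIGNED_SEQ_NO.
    static final long UNASSIGNED_SEQ_NO = -2L;

    // Mirrors the decision above: replay from the first operation after the safe
    // commit's local checkpoint, or signal that file-based recovery is needed.
    static long startingSeqNo(long localCheckpoint, long maxSeqNo, long globalCheckpoint) {
        return maxSeqNo <= globalCheckpoint ? localCheckpoint + 1 : UNASSIGNED_SEQ_NO;
    }

    public static void main(String[] args) {
        // Commit holds ops up to seq no 42, all confirmed (global checkpoint 45): replay from 42.
        System.out.println(startingSeqNo(41, 42, 45)); // 42
        // Commit holds op 47 that may not be on the primary: fall back to file-based recovery.
        System.out.println(startingSeqNo(41, 47, 45)); // -2
    }
}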

server/src/test/java/org/elasticsearch/indices/recovery/PeerRecoveryTargetServiceTests.java

Lines changed: 18 additions & 5 deletions
@@ -52,17 +52,22 @@

 import static org.hamcrest.Matchers.empty;
 import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.greaterThan;
+import static org.hamcrest.Matchers.sameInstance;

 public class PeerRecoveryTargetServiceTests extends IndexShardTestCase {

     public void testGetStartingSeqNo() throws Exception {
         final IndexShard replica = newShard(false);
+        final DiscoveryNode rNode = getFakeDiscoNode(replica.routingEntry().currentNodeId());
         try {
             // Empty store
             {
                 recoveryEmptyReplica(replica, true);
                 final RecoveryTarget recoveryTarget = new RecoveryTarget(replica, null, null, null);
-                assertThat(PeerRecoveryTargetService.getStartingSeqNo(logger, recoveryTarget), equalTo(0L));
+                final StartRecoveryRequest request = PeerRecoveryTargetService.getStartRecoveryRequest(recoveryTarget, rNode, logger);
+                assertThat(request.startingSeqNo(), equalTo(0L));
+                assertThat(request.metadataSnapshot().size(), greaterThan(0));
                 recoveryTarget.decRef();
             }
             // Last commit is good - use it.
@@ -78,7 +83,9 @@ public void testGetStartingSeqNo() throws Exception {
                 replica.updateGlobalCheckpointOnReplica(initDocs - 1, "test");
                 replica.sync();
                 final RecoveryTarget recoveryTarget = new RecoveryTarget(replica, null, null, null);
-                assertThat(PeerRecoveryTargetService.getStartingSeqNo(logger, recoveryTarget), equalTo(initDocs));
+                final StartRecoveryRequest request = PeerRecoveryTargetService.getStartRecoveryRequest(recoveryTarget, rNode, logger);
+                assertThat(request.startingSeqNo(), equalTo(initDocs));
+                assertThat(request.metadataSnapshot().size(), greaterThan(0));
                 recoveryTarget.decRef();
             }
             // Global checkpoint does not advance, last commit is not good - use the previous commit
@@ -92,15 +99,19 @@ public void testGetStartingSeqNo() throws Exception {
                 }
                 flushShard(replica);
                 final RecoveryTarget recoveryTarget = new RecoveryTarget(replica, null, null, null);
-                assertThat(PeerRecoveryTargetService.getStartingSeqNo(logger, recoveryTarget), equalTo(initDocs));
+                final StartRecoveryRequest request = PeerRecoveryTargetService.getStartRecoveryRequest(recoveryTarget, rNode, logger);
+                assertThat(request.startingSeqNo(), equalTo(initDocs));
+                assertThat(request.metadataSnapshot().size(), greaterThan(0));
                 recoveryTarget.decRef();
             }
             // Advances the global checkpoint, a safe commit also advances
             {
                 replica.updateGlobalCheckpointOnReplica(initDocs + moreDocs - 1, "test");
                 replica.sync();
                 final RecoveryTarget recoveryTarget = new RecoveryTarget(replica, null, null, null);
-                assertThat(PeerRecoveryTargetService.getStartingSeqNo(logger, recoveryTarget), equalTo(initDocs + moreDocs));
+                final StartRecoveryRequest request = PeerRecoveryTargetService.getStartRecoveryRequest(recoveryTarget, rNode, logger);
+                assertThat(request.startingSeqNo(), equalTo(initDocs + moreDocs));
+                assertThat(request.metadataSnapshot().size(), greaterThan(0));
                 recoveryTarget.decRef();
             }
             // Different translogUUID, fallback to file-based
@@ -119,7 +130,9 @@ public void testGetStartingSeqNo() throws Exception {
                     writer.commit();
                 }
                 final RecoveryTarget recoveryTarget = new RecoveryTarget(replica, null, null, null);
-                assertThat(PeerRecoveryTargetService.getStartingSeqNo(logger, recoveryTarget), equalTo(SequenceNumbers.UNASSIGNED_SEQ_NO));
+                final StartRecoveryRequest request = PeerRecoveryTargetService.getStartRecoveryRequest(recoveryTarget, rNode, logger);
+                assertThat(request.metadataSnapshot(), sameInstance(Store.MetadataSnapshot.EMPTY));
+                assertThat(request.startingSeqNo(), equalTo(SequenceNumbers.UNASSIGNED_SEQ_NO));
                 recoveryTarget.decRef();
             }
         } finally {

test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java

Lines changed: 1 addition & 12 deletions
@@ -609,18 +609,7 @@ protected final void recoverUnstartedReplica(final IndexShard replica,
         }
         replica.prepareForIndexRecovery();
         final RecoveryTarget recoveryTarget = targetSupplier.apply(replica, pNode);
-        final String targetAllocationId = recoveryTarget.indexShard().routingEntry().allocationId().getId();
-
-        final Store.MetadataSnapshot snapshot = getMetadataSnapshotOrEmpty(replica);
-        final long startingSeqNo;
-        if (snapshot.size() > 0) {
-            startingSeqNo = PeerRecoveryTargetService.getStartingSeqNo(logger, recoveryTarget);
-        } else {
-            startingSeqNo = SequenceNumbers.UNASSIGNED_SEQ_NO;
-        }
-
-        final StartRecoveryRequest request = new StartRecoveryRequest(replica.shardId(), targetAllocationId,
-            pNode, rNode, snapshot, replica.routingEntry().primary(), 0, startingSeqNo);
+        final StartRecoveryRequest request = PeerRecoveryTargetService.getStartRecoveryRequest(recoveryTarget, rNode, logger);
         final RecoverySourceHandler recovery = new RecoverySourceHandler(
             primary, recoveryTarget, request, Math.toIntExact(ByteSizeUnit.MB.toBytes(1)), between(1, 8));
         primary.updateShardState(primary.routingEntry(), primary.getPendingPrimaryTerm(), null,
