Skip to content

Commit 5cb5d92

Browse files
authored
Fix race condition in SnapshotBasedIndexRecoveryIT (#79404)
If we don't cancel the re-location of the index to the same target node, it is possible that the recovery is retried, meaning that it's possible that the available permit is granted to indexRecoveredFromSnapshot1 instead of to indexRecoveredFromSnapshot2. Relates #79316 Closes #79420
1 parent f3b5299 commit 5cb5d92

File tree

2 files changed

+26
-4
lines changed

2 files changed

+26
-4
lines changed

server/src/main/java/org/elasticsearch/indices/recovery/PeerRecoveryTargetService.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ public void beforeIndexShardClosed(ShardId shardId, @Nullable IndexShard indexSh
169169
}
170170

171171
public void startRecovery(final IndexShard indexShard, final DiscoveryNode sourceNode, final RecoveryListener listener) {
172-
final Releasable snapshotFileDownloadsPermit = recoverySettings.tryAcquireSnapshotDownloadPermits();
172+
final Releasable snapshotFileDownloadsPermit = tryAcquireSnapshotDownloadPermits();
173173
// create a new recovery status, and process...
174174
final long recoveryId = onGoingRecoveries.startRecovery(
175175
indexShard,
@@ -258,6 +258,11 @@ private void doRecovery(final long recoveryId, final StartRecoveryRequest preExi
258258
);
259259
}
260260

261+
// Visible for testing
262+
public Releasable tryAcquireSnapshotDownloadPermits() {
263+
return recoverySettings.tryAcquireSnapshotDownloadPermits();
264+
}
265+
261266
/**
262267
* Prepare the start recovery request.
263268
*

x-pack/plugin/snapshot-based-recoveries/src/internalClusterTest/java/org/elasticsearch/xpack/snapshotbasedrecoveries/recovery/SnapshotBasedIndexRecoveryIT.java

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import org.elasticsearch.common.settings.Settings;
3232
import org.elasticsearch.common.util.BigArrays;
3333
import org.elasticsearch.core.CheckedRunnable;
34+
import org.elasticsearch.core.Releasable;
3435
import org.elasticsearch.env.Environment;
3536
import org.elasticsearch.index.IndexService;
3637
import org.elasticsearch.index.MergePolicyConfig;
@@ -85,6 +86,7 @@
8586
import java.util.concurrent.atomic.AtomicReference;
8687
import java.util.stream.Collectors;
8788

89+
import static org.elasticsearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY;
8890
import static org.elasticsearch.indices.recovery.RecoverySettings.INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING;
8991
import static org.elasticsearch.indices.recovery.RecoverySettings.INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS;
9092
import static org.elasticsearch.indices.recovery.RecoverySettings.INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS_PER_NODE;
@@ -914,7 +916,6 @@ public void testRecoveryUsingSnapshotsIsThrottledPerNode() throws Exception {
914916
);
915917
}
916918

917-
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/79420")
918919
public void testRecoveryUsingSnapshotsPermitIsReturnedAfterFailureOrCancellation() throws Exception {
919920
executeRecoveryWithSnapshotFileDownloadThrottled(
920921
(
@@ -930,7 +931,12 @@ public void testRecoveryUsingSnapshotsPermitIsReturnedAfterFailureOrCancellation
930931
client().admin()
931932
.indices()
932933
.prepareUpdateSettings(indexRecoveredFromSnapshot1)
933-
.setSettings(Settings.builder().put("index.routing.allocation.require._name", targetNode))
934+
.setSettings(
935+
Settings.builder()
936+
.put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1)
937+
.put("index.routing.allocation.require._name", (String) null)
938+
.put("index.routing.allocation.include._name", sourceNode + "," + targetNode)
939+
)
934940
.get()
935941
);
936942

@@ -963,6 +969,16 @@ public void testRecoveryUsingSnapshotsPermitIsReturnedAfterFailureOrCancellation
963969

964970
targetMockTransportService.clearAllRules();
965971
channelRef.get().sendResponse(new IOException("unable to clean files"));
972+
PeerRecoveryTargetService peerRecoveryTargetService = internalCluster().getInstance(
973+
PeerRecoveryTargetService.class,
974+
targetNode
975+
);
976+
assertBusy(() -> {
977+
// Wait until the current RecoveryTarget releases the snapshot download permit
978+
try (Releasable snapshotDownloadPermit = peerRecoveryTargetService.tryAcquireSnapshotDownloadPermits()) {
979+
assertThat(snapshotDownloadPermit, is(notNullValue()));
980+
}
981+
});
966982
}
967983

968984
String indexRecoveredFromSnapshot2 = indices.get(1);
@@ -1140,10 +1156,11 @@ private void executeRecoveryWithSnapshotFileDownloadThrottled(SnapshotBasedRecov
11401156
indexName,
11411157
Settings.builder()
11421158
.put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
1159+
.put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)
11431160
.put(MergePolicyConfig.INDEX_MERGE_ENABLED, false)
11441161
.put(IndexService.GLOBAL_CHECKPOINT_SYNC_INTERVAL_SETTING.getKey(), "1s")
11451162
.put("index.routing.allocation.require._name", dataNodes.get(0))
1146-
.put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)
1163+
.put(SETTING_ALLOCATION_MAX_RETRY.getKey(), 0)
11471164
.build()
11481165
);
11491166
indices.add(indexName);

0 commit comments

Comments
 (0)