Skip to content

Commit 3a3f5b3

Browse files
Fix Race in Concurrent Snapshot Delete and Create (#37612)
* The repo id was determined wrong when the delete picked up on an in progress snapshot * NOTE: This solution is still a best-effort fix and there's a slight chance of running into concurrency issues here when multiple create and delete requests for the same snapshot name are happening concurrently, but these require a sequence of multiple cluster state updates between the changed method reading the genId and submitting its cluster state update task * Added test reproduced the issue reliably in about 50% of runs * Closes #37581
1 parent f70ec3b commit 3a3f5b3

File tree

2 files changed

+64
-3
lines changed

2 files changed

+64
-3
lines changed

server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1118,14 +1118,20 @@ public void deleteSnapshot(final String repositoryName, final String snapshotNam
11181118
.filter(s -> s.getName().equals(snapshotName))
11191119
.findFirst();
11201120
// if nothing found by the same name, then look in the cluster state for current in progress snapshots
1121+
long repoGenId = repositoryData.getGenId();
11211122
if (matchedEntry.isPresent() == false) {
1122-
matchedEntry = currentSnapshots(repositoryName, Collections.emptyList()).stream()
1123-
.map(e -> e.snapshot().getSnapshotId()).filter(s -> s.getName().equals(snapshotName)).findFirst();
1123+
Optional<SnapshotsInProgress.Entry> matchedInProgress = currentSnapshots(repositoryName, Collections.emptyList()).stream()
1124+
.filter(s -> s.snapshot().getSnapshotId().getName().equals(snapshotName)).findFirst();
1125+
if (matchedInProgress.isPresent()) {
1126+
matchedEntry = matchedInProgress.map(s -> s.snapshot().getSnapshotId());
1127+
// Derive repository generation if a snapshot is in progress because it will increment the generation when it finishes
1128+
repoGenId = matchedInProgress.get().getRepositoryStateId() + 1L;
1129+
}
11241130
}
11251131
if (matchedEntry.isPresent() == false) {
11261132
throw new SnapshotMissingException(repositoryName, snapshotName);
11271133
}
1128-
deleteSnapshot(new Snapshot(repositoryName, matchedEntry.get()), listener, repositoryData.getGenId(), immediatePriority);
1134+
deleteSnapshot(new Snapshot(repositoryName, matchedEntry.get()), listener, repoGenId, immediatePriority);
11291135
}
11301136

11311137
/**

server/src/test/java/org/elasticsearch/snapshots/SnapshotsServiceTests.java

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@
2929
import org.elasticsearch.action.admin.cluster.repositories.put.TransportPutRepositoryAction;
3030
import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotAction;
3131
import org.elasticsearch.action.admin.cluster.snapshots.create.TransportCreateSnapshotAction;
32+
import org.elasticsearch.action.admin.cluster.snapshots.delete.DeleteSnapshotAction;
33+
import org.elasticsearch.action.admin.cluster.snapshots.delete.DeleteSnapshotRequest;
34+
import org.elasticsearch.action.admin.cluster.snapshots.delete.TransportDeleteSnapshotAction;
3235
import org.elasticsearch.action.admin.indices.create.CreateIndexAction;
3336
import org.elasticsearch.action.admin.indices.create.CreateIndexRequest;
3437
import org.elasticsearch.action.admin.indices.create.TransportCreateIndexAction;
@@ -196,6 +199,53 @@ public void testSuccessfulSnapshot() {
196199
assertEquals(0, snapshotInfo.failedShards());
197200
}
198201

202+
public void testConcurrentSnapshotCreateAndDelete() {
203+
setupTestCluster(randomFrom(1, 3, 5), randomIntBetween(2, 10));
204+
205+
String repoName = "repo";
206+
String snapshotName = "snapshot";
207+
final String index = "test";
208+
209+
final int shards = randomIntBetween(1, 10);
210+
211+
TestClusterNode masterNode =
212+
testClusterNodes.currentMaster(testClusterNodes.nodes.values().iterator().next().clusterService.state());
213+
final AtomicBoolean createdSnapshot = new AtomicBoolean();
214+
masterNode.client.admin().cluster().preparePutRepository(repoName)
215+
.setType(FsRepository.TYPE).setSettings(Settings.builder().put("location", randomAlphaOfLength(10)))
216+
.execute(
217+
assertNoFailureListener(
218+
() -> masterNode.client.admin().indices().create(
219+
new CreateIndexRequest(index).waitForActiveShards(ActiveShardCount.ALL).settings(
220+
Settings.builder()
221+
.put(IndexMetaData.INDEX_NUMBER_OF_SHARDS_SETTING.getKey(), shards)
222+
.put(IndexMetaData.INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), 0)),
223+
assertNoFailureListener(
224+
() -> masterNode.client.admin().cluster().prepareCreateSnapshot(repoName, snapshotName)
225+
.execute(assertNoFailureListener(
226+
() -> masterNode.client.admin().cluster().deleteSnapshot(
227+
new DeleteSnapshotRequest(repoName, snapshotName),
228+
assertNoFailureListener(() -> masterNode.client.admin().cluster()
229+
.prepareCreateSnapshot(repoName, snapshotName).execute(
230+
assertNoFailureListener(() -> createdSnapshot.set(true))
231+
)))))))));
232+
233+
deterministicTaskQueue.runAllRunnableTasks();
234+
235+
assertTrue(createdSnapshot.get());
236+
SnapshotsInProgress finalSnapshotsInProgress = masterNode.clusterService.state().custom(SnapshotsInProgress.TYPE);
237+
assertFalse(finalSnapshotsInProgress.entries().stream().anyMatch(entry -> entry.state().completed() == false));
238+
final Repository repository = masterNode.repositoriesService.repository(repoName);
239+
Collection<SnapshotId> snapshotIds = repository.getRepositoryData().getSnapshotIds();
240+
assertThat(snapshotIds, hasSize(1));
241+
242+
final SnapshotInfo snapshotInfo = repository.getSnapshotInfo(snapshotIds.iterator().next());
243+
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
244+
assertThat(snapshotInfo.indices(), containsInAnyOrder(index));
245+
assertEquals(shards, snapshotInfo.successfulShards());
246+
assertEquals(0, snapshotInfo.failedShards());
247+
}
248+
199249
private void startCluster() {
200250
final ClusterState initialClusterState =
201251
new ClusterState.Builder(ClusterName.DEFAULT).nodes(testClusterNodes.randomDiscoveryNodes()).build();
@@ -519,6 +569,11 @@ allocationService, new AliasValidator(), environment, indexScopedSettings,
519569
transportService, clusterService, threadPool,
520570
snapshotsService, actionFilters, indexNameExpressionResolver
521571
));
572+
actions.put(DeleteSnapshotAction.INSTANCE,
573+
new TransportDeleteSnapshotAction(
574+
transportService, clusterService, threadPool,
575+
snapshotsService, actionFilters, indexNameExpressionResolver
576+
));
522577
client.initialize(actions, () -> clusterService.localNode().getId(), transportService.getRemoteClusterService());
523578
}
524579

0 commit comments

Comments
 (0)