Skip to content

Commit 0f61ef4

Browse files
Fix Bug Causing Queued Snapshots of Deleted Indices to Never Finalize (#75942) (#76574)
We have to run the loop checking for completed snapshots if we see an index disappearing. Otherwise, we never get around to finalizing a queued snapshot stuck after a clone if the index is deleted during cloning.
1 parent 8a9035c commit 0f61ef4

File tree

3 files changed

+68
-5
lines changed

3 files changed

+68
-5
lines changed

server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java

Lines changed: 51 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1611,6 +1611,57 @@ public void testOutOfOrderCloneFinalization() throws Exception {
16111611
);
16121612
}
16131613

1614+
/**
 * Regression test for a snapshot that stays pending behind a running clone and must still be finalized
 * when one of its indices is deleted in the meantime.
 *
 * Sequence exercised: take a full source snapshot, block the master on shard-level files of index-1, start a
 * clone of both indices, start a partial snapshot ("snapshot-3") of both indices, run a snapshot ("snapshot-2")
 * of index-2 only, delete index-1, and verify that snapshot-3 completes before the master is unblocked.
 */
public void testIndexDeletedWhileSnapshotQueuedAfterClone() throws Exception {
1615+
// dedicated master node; its name is needed later to wait for and release the repository block
final String master = internalCluster().startMasterOnlyNode(LARGE_SNAPSHOT_POOL_SETTINGS);
1616+
internalCluster().startDataOnlyNode();
1617+
final String index1 = "index-1";
1618+
final String index2 = "index-2";
1619+
createIndexWithContent(index1);
1620+
createIndexWithContent(index2);
1621+
1622+
final String repository = "test-repo";
1623+
createRepository(repository, "mock");
1624+
1625+
final String sourceSnapshot = "source-snapshot";
1626+
// baseline snapshot that the clone below uses as its source
createFullSnapshot(repository, sourceSnapshot);
1627+
1628+
final IndexId index1Id = getRepositoryData(repository).resolveIndexId(index1);
1629+
// NOTE(review): presumably this stalls only the index-1 part of the clone on the master while index-2 can
// proceed - confirm against the helper, which is defined outside this view
blockMasterOnShardLevelSnapshotFile(repository, index1Id.getId());
1630+
1631+
final String cloneTarget = "target-snapshot";
1632+
final ActionFuture<AcknowledgedResponse> cloneSnapshot = clusterAdmin().prepareCloneSnapshot(
1633+
repository,
1634+
sourceSnapshot,
1635+
cloneTarget
1636+
).setIndices(index1, index2).execute();
1637+
// wait until the clone is registered as in progress and the master has reached the block
awaitNumberOfSnapshotsInProgress(1);
1638+
waitForBlock(master, repository);
1639+
1640+
// partial snapshot of both indices; asserted further down to still be pending before index-1 is deleted
final ActionFuture<CreateSnapshotResponse> snapshot3 = clusterAdmin().prepareCreateSnapshot(repository, "snapshot-3")
1641+
.setIndices(index1, index2)
1642+
.setWaitForCompletion(true)
1643+
.setPartial(true)
1644+
.execute();
1645+
// snapshot of index-2 only; expected to succeed while the master is still blocked
final ActionFuture<CreateSnapshotResponse> snapshot2 = clusterAdmin().prepareCreateSnapshot(repository, "snapshot-2")
1646+
.setIndices(index2)
1647+
.setWaitForCompletion(true)
1648+
.execute();
1649+
assertSuccessful(snapshot2);
1650+
// only the clone and snapshot-3 should remain in progress now
awaitNumberOfSnapshotsInProgress(2);
1651+
assertFalse(snapshot3.isDone());
1652+
// delete index-1 while snapshot-3 is pending; snapshot-3 must then complete even though the master
// has not been unblocked yet
assertAcked(admin().indices().prepareDelete(index1).get());
1653+
assertSuccessful(snapshot3);
1654+
// release the master so the clone can finish
unblockNode(repository, master);
1655+
1656+
assertAcked(cloneSnapshot.get());
1657+
// remove the clone target snapshot again
assertAcked(startDeleteSnapshot(repository, cloneTarget).get());
1658+
1659+
// both regular snapshots must be reported by the snapshot status API
assertThat(
1660+
clusterAdmin().prepareSnapshotStatus().setSnapshots("snapshot-2", "snapshot-3").setRepository(repository).get().getSnapshots(),
1661+
hasSize(2)
1662+
);
1663+
}
1664+
16141665
public void testQueuedAfterFailedShardSnapshot() throws Exception {
16151666
internalCluster().startMasterOnlyNode();
16161667
final String dataNode = internalCluster().startDataOnlyNode();

server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -825,6 +825,7 @@ public ImmutableOpenMap<RepositoryShardId, ShardSnapshotStatus> shardsByRepoShar
825825
}
826826

827827
/**
 * Looks up the {@link Index} stored in this entry's {@code snapshotIndices} under the given index name.
 * Must not be called on clone entries: the assertion below trips when {@code isClone()} is true.
 *
 * @param name index name to look up
 * @return the value of {@code snapshotIndices.get(name)}; NOTE(review): presumably {@code null} when the
 *         name has no mapping - confirm against the map type, which is declared outside this view
 */
public Index indexByName(String name) {
828+
assert isClone() == false : "tried to get routing index for clone entry [" + this + "]";
828829
return snapshotIndices.get(name);
829830
}
830831

server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java

Lines changed: 16 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1692,8 +1692,18 @@ private static ImmutableOpenMap<ShardId, ShardSnapshotStatus> processWaitingShar
16921692
// this shard snapshot is waiting for a previous snapshot to finish execution for this shard
16931693
final ShardSnapshotStatus knownFailure = knownFailures.get(shardId);
16941694
if (knownFailure == null) {
1695-
// if no failure is known for the shard we keep waiting
1696-
shards.put(shardId, shardStatus);
1695+
final IndexRoutingTable indexShardRoutingTable = routingTable.index(shardId.getIndex());
1696+
if (indexShardRoutingTable == null) {
1697+
// shard became unassigned while queued so we fail as missing here
1698+
assert entry.partial();
1699+
snapshotChanged = true;
1700+
logger.debug("failing snapshot of shard [{}] because index got deleted", shardId);
1701+
shards.put(shardId, ShardSnapshotStatus.MISSING);
1702+
knownFailures.put(shardId, ShardSnapshotStatus.MISSING);
1703+
} else {
1704+
// if no failure is known for the shard we keep waiting
1705+
shards.put(shardId, shardStatus);
1706+
}
16971707
} else {
16981708
// If a failure is known for an execution we waited on for this shard then we fail with the same exception here
16991709
// as well
@@ -1761,9 +1771,10 @@ private static ImmutableOpenMap<ShardId, ShardSnapshotStatus> processWaitingShar
17611771

17621772
private static boolean waitingShardsStartedOrUnassigned(SnapshotsInProgress snapshotsInProgress, ClusterChangedEvent event) {
17631773
for (SnapshotsInProgress.Entry entry : snapshotsInProgress.entries()) {
1764-
if (entry.state() == State.STARTED) {
1774+
if (entry.state() == State.STARTED && entry.isClone() == false) {
17651775
for (ObjectObjectCursor<RepositoryShardId, ShardSnapshotStatus> shardStatus : entry.shardsByRepoShardId()) {
1766-
if (shardStatus.value.state() != ShardState.WAITING) {
1776+
final ShardState state = shardStatus.value.state();
1777+
if (state != ShardState.WAITING && state != ShardState.QUEUED) {
17671778
continue;
17681779
}
17691780
final RepositoryShardId shardId = shardStatus.key;
@@ -1772,7 +1783,7 @@ private static boolean waitingShardsStartedOrUnassigned(SnapshotsInProgress snap
17721783
.getRoutingTable()
17731784
.index(entry.indexByName(shardId.indexName()));
17741785
if (indexShardRoutingTable == null) {
1775-
// index got removed concurrently and we have to fail WAITING state shards
1786+
// index got removed concurrently and we have to fail WAITING or QUEUED state shards
17761787
return true;
17771788
}
17781789
ShardRouting shardRouting = indexShardRoutingTable.shard(shardId.shardId()).primaryShard();

0 commit comments

Comments
 (0)