Skip to content

Commit b9ce649

Browse files
Cleanup Stale Root Level Blobs in Sn. Repository (#43542)
* Cleans up all root level temp., snap-%s.dat, meta-%s.dat blobs that aren't referenced by any snapshot to deal with dangling blobs left behind by delete and snapshot finalization failures * The scenario that get's us here is a snapshot failing before it was finalized or a delete failing right after it wrote the updated index-(N+1) that doesn't reference a snapshot anymore but then fails to remove that snapshot * Not deleting other dangling blobs since that don't follow the snap-, meta- or tempfile naming schemes to not accidentally delete blobs not created by the snapshot logic * Follow up to #42189 * Same safety logic, get list of all blobs before writing index-N blobs, delete things after index-N blobs was written
1 parent 7578db9 commit b9ce649

File tree

2 files changed

+61
-3
lines changed

2 files changed

+61
-3
lines changed

server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
import org.elasticsearch.common.settings.Settings;
6060
import org.elasticsearch.common.unit.ByteSizeUnit;
6161
import org.elasticsearch.common.unit.ByteSizeValue;
62+
import org.elasticsearch.common.util.set.Sets;
6263
import org.elasticsearch.common.xcontent.LoggingDeprecationHandler;
6364
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
6465
import org.elasticsearch.common.xcontent.XContentFactory;
@@ -101,6 +102,7 @@
101102
import java.util.Arrays;
102103
import java.util.Collection;
103104
import java.util.Collections;
105+
import java.util.HashSet;
104106
import java.util.List;
105107
import java.util.Map;
106108
import java.util.Optional;
@@ -139,7 +141,9 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
139141

140142
private static final String TESTS_FILE = "tests-";
141143

142-
private static final String METADATA_NAME_FORMAT = "meta-%s.dat";
144+
private static final String METADATA_PREFIX = "meta-";
145+
146+
private static final String METADATA_NAME_FORMAT = METADATA_PREFIX + "%s.dat";
143147

144148
private static final String METADATA_CODEC = "metadata";
145149

@@ -393,21 +397,24 @@ public void deleteSnapshot(SnapshotId snapshotId, long repositoryStateId, Action
393397
// Delete snapshot from the index file, since it is the maintainer of truth of active snapshots
394398
final RepositoryData updatedRepositoryData;
395399
final Map<String, BlobContainer> foundIndices;
400+
final Set<String> rootBlobs;
396401
try {
397402
final RepositoryData repositoryData = getRepositoryData();
398403
updatedRepositoryData = repositoryData.removeSnapshot(snapshotId);
399404
// Cache the indices that were found before writing out the new index-N blob so that a stuck master will never
400405
// delete an index that was created by another master node after writing this index-N blob.
401406
foundIndices = blobStore().blobContainer(basePath().add("indices")).children();
407+
rootBlobs = blobContainer().listBlobs().keySet();
402408
writeIndexGen(updatedRepositoryData, repositoryStateId);
403409
} catch (Exception ex) {
404410
listener.onFailure(new RepositoryException(metadata.name(), "failed to delete snapshot [" + snapshotId + "]", ex));
405411
return;
406412
}
407413
final SnapshotInfo finalSnapshotInfo = snapshot;
414+
final List<String> snapMetaFilesToDelete =
415+
Arrays.asList(snapshotFormat.blobName(snapshotId.getUUID()), globalMetaDataFormat.blobName(snapshotId.getUUID()));
408416
try {
409-
blobContainer().deleteBlobsIgnoringIfNotExists(
410-
Arrays.asList(snapshotFormat.blobName(snapshotId.getUUID()), globalMetaDataFormat.blobName(snapshotId.getUUID())));
417+
blobContainer().deleteBlobsIgnoringIfNotExists(snapMetaFilesToDelete);
411418
} catch (IOException e) {
412419
logger.warn(() -> new ParameterizedMessage("[{}] Unable to delete global metadata files", snapshotId), e);
413420
}
@@ -420,12 +427,56 @@ public void deleteSnapshot(SnapshotId snapshotId, long repositoryStateId, Action
420427
snapshotId,
421428
ActionListener.map(listener, v -> {
422429
cleanupStaleIndices(foundIndices, survivingIndices);
430+
cleanupStaleRootFiles(Sets.difference(rootBlobs, new HashSet<>(snapMetaFilesToDelete)), updatedRepositoryData);
423431
return null;
424432
})
425433
);
426434
}
427435
}
428436

437+
private void cleanupStaleRootFiles(Set<String> rootBlobNames, RepositoryData repositoryData) {
438+
final Set<String> allSnapshotIds =
439+
repositoryData.getAllSnapshotIds().stream().map(SnapshotId::getUUID).collect(Collectors.toSet());
440+
final List<String> blobsToDelete = rootBlobNames.stream().filter(
441+
blob -> {
442+
if (FsBlobContainer.isTempBlobName(blob)) {
443+
return true;
444+
}
445+
if (blob.endsWith(".dat")) {
446+
final String foundUUID;
447+
if (blob.startsWith(SNAPSHOT_PREFIX)) {
448+
foundUUID = blob.substring(SNAPSHOT_PREFIX.length(), blob.length() - ".dat".length());
449+
assert snapshotFormat.blobName(foundUUID).equals(blob);
450+
} else if (blob.startsWith(METADATA_PREFIX)) {
451+
foundUUID = blob.substring(METADATA_PREFIX.length(), blob.length() - ".dat".length());
452+
assert globalMetaDataFormat.blobName(foundUUID).equals(blob);
453+
} else {
454+
return false;
455+
}
456+
return allSnapshotIds.contains(foundUUID) == false;
457+
}
458+
return false;
459+
}
460+
).collect(Collectors.toList());
461+
if (blobsToDelete.isEmpty()) {
462+
return;
463+
}
464+
try {
465+
logger.info("[{}] Found stale root level blobs {}. Cleaning them up", metadata.name(), blobsToDelete);
466+
blobContainer().deleteBlobsIgnoringIfNotExists(blobsToDelete);
467+
} catch (IOException e) {
468+
logger.warn(() -> new ParameterizedMessage(
469+
"[{}] The following blobs are no longer part of any snapshot [{}] but failed to remove them",
470+
metadata.name(), blobsToDelete), e);
471+
} catch (Exception e) {
472+
// TODO: We shouldn't be blanket catching and suppressing all exceptions here and instead handle them safely upstream.
473+
// Currently this catch exists as a stop gap solution to tackle unexpected runtime exceptions from implementations
474+
// bubbling up and breaking the snapshot functionality.
475+
assert false : e;
476+
logger.warn(new ParameterizedMessage("[{}] Exception during cleanup of root level blobs", metadata.name()), e);
477+
}
478+
}
479+
429480
private void cleanupStaleIndices(Map<String, BlobContainer> foundIndices, Map<String, IndexId> survivingIndices) {
430481
try {
431482
final Set<String> survivingIndexIds = survivingIndices.values().stream()

test/framework/src/main/java/org/elasticsearch/repositories/AbstractThirdPartyRepositoryTestCase.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import org.elasticsearch.threadpool.ThreadPool;
3737

3838
import java.io.ByteArrayInputStream;
39+
import java.util.Arrays;
3940
import java.util.Collection;
4041
import java.util.Collections;
4142
import java.util.List;
@@ -237,6 +238,10 @@ protected void doRun() throws Exception {
237238
final BlobStore blobStore = repo.blobStore();
238239
blobStore.blobContainer(BlobPath.cleanPath().add("indices").add("foo"))
239240
.writeBlob("bar", new ByteArrayInputStream(new byte[0]), 0, false);
241+
for (String prefix : Arrays.asList("snap-", "meta-")) {
242+
blobStore.blobContainer(BlobPath.cleanPath())
243+
.writeBlob(prefix + "foo.dat", new ByteArrayInputStream(new byte[0]), 0, false);
244+
}
240245
future.onResponse(null);
241246
}
242247
});
@@ -257,6 +262,8 @@ protected void doRun() throws Exception {
257262
future.onResponse(
258263
blobStore.blobContainer(BlobPath.cleanPath().add("indices")).children().containsKey("foo")
259264
&& blobStore.blobContainer(BlobPath.cleanPath().add("indices").add("foo")).blobExists("bar")
265+
&& blobStore.blobContainer(BlobPath.cleanPath()).blobExists("meta-foo.dat")
266+
&& blobStore.blobContainer(BlobPath.cleanPath()).blobExists("snap-foo.dat")
260267
);
261268
}
262269
});

0 commit comments

Comments
 (0)