Skip to content

Commit f7d827d

Browse files
Add CoolDown Period to S3 Repository (#51074) (#51217)
Add a cool-down period after snapshot finalization and snapshot deletion. This prevents the eventually consistent AWS S3 from corrupting shard-level metadata for as long as the repository still uses the old shard-level metadata format.
1 parent 09c8bcf commit f7d827d

File tree

2 files changed

+169
-1
lines changed

2 files changed

+169
-1
lines changed

plugins/repository-s3/src/main/java/org/elasticsearch/repositories/s3/S3Repository.java

Lines changed: 115 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121

2222
import org.apache.logging.log4j.LogManager;
2323
import org.apache.logging.log4j.Logger;
24+
import org.elasticsearch.action.ActionListener;
25+
import org.elasticsearch.action.ActionRunnable;
26+
import org.elasticsearch.cluster.metadata.MetaData;
2427
import org.elasticsearch.cluster.metadata.RepositoryMetaData;
2528
import org.elasticsearch.cluster.service.ClusterService;
2629
import org.elasticsearch.common.Strings;
@@ -32,11 +35,23 @@
3235
import org.elasticsearch.common.settings.Setting;
3336
import org.elasticsearch.common.unit.ByteSizeUnit;
3437
import org.elasticsearch.common.unit.ByteSizeValue;
38+
import org.elasticsearch.common.unit.TimeValue;
3539
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
3640
import org.elasticsearch.monitor.jvm.JvmInfo;
3741
import org.elasticsearch.repositories.RepositoryException;
42+
import org.elasticsearch.repositories.ShardGenerations;
3843
import org.elasticsearch.repositories.blobstore.BlobStoreRepository;
39-
44+
import org.elasticsearch.snapshots.SnapshotId;
45+
import org.elasticsearch.snapshots.SnapshotInfo;
46+
import org.elasticsearch.snapshots.SnapshotShardFailure;
47+
import org.elasticsearch.snapshots.SnapshotsService;
48+
import org.elasticsearch.threadpool.Scheduler;
49+
import org.elasticsearch.threadpool.ThreadPool;
50+
51+
import java.util.List;
52+
import java.util.Map;
53+
import java.util.concurrent.TimeUnit;
54+
import java.util.concurrent.atomic.AtomicReference;
4055
import java.util.function.Function;
4156

4257
/**
@@ -142,6 +157,23 @@ class S3Repository extends BlobStoreRepository {
142157

143158
static final Setting<String> CLIENT_NAME = new Setting<>("client", "default", Function.identity());
144159

160+
/**
161+
* Artificial delay to introduce after a snapshot finalization or delete has finished so long as the repository is still using the
162+
* backwards compatible snapshot format from before
163+
* {@link org.elasticsearch.snapshots.SnapshotsService#SHARD_GEN_IN_REPO_DATA_VERSION} ({@link org.elasticsearch.Version#V_7_6_0}).
164+
* This delay is necessary so that the eventually consistent nature of AWS S3 does not randomly result in repository corruption when
165+
* doing repository operations in rapid succession on a repository in the old metadata format.
166+
* This setting should not be adjusted in production when working with an AWS S3 backed repository. Doing so risks the repository
167+
* becoming silently corrupted. To get rid of this waiting period, either create a new S3 repository or remove all snapshots older than
168+
* {@link org.elasticsearch.Version#V_7_6_0} from the repository which will trigger an upgrade of the repository metadata to the new
169+
* format and disable the cooldown period.
170+
*/
171+
static final Setting<TimeValue> COOLDOWN_PERIOD = Setting.timeSetting(
172+
"cooldown_period",
173+
new TimeValue(3, TimeUnit.MINUTES),
174+
new TimeValue(0, TimeUnit.MILLISECONDS),
175+
Setting.Property.Dynamic);
176+
145177
/**
146178
* Specifies the path within bucket to repository data. Defaults to root directory.
147179
*/
@@ -165,6 +197,12 @@ class S3Repository extends BlobStoreRepository {
165197

166198
private final RepositoryMetaData repositoryMetaData;
167199

200+
/**
201+
* Time period to delay repository operations by after finalizing or deleting a snapshot.
202+
* See {@link #COOLDOWN_PERIOD} for details.
203+
*/
204+
private final TimeValue coolDown;
205+
168206
/**
169207
* Constructs an s3 backed repository
170208
*/
@@ -211,6 +249,8 @@ class S3Repository extends BlobStoreRepository {
211249
+ "store these in named clients and the elasticsearch keystore for secure settings.");
212250
}
213251

252+
coolDown = COOLDOWN_PERIOD.get(metadata.settings());
253+
214254
logger.debug(
215255
"using bucket [{}], chunk_size [{}], server_side_encryption [{}], buffer_size [{}], cannedACL [{}], storageClass [{}]",
216256
bucket,
@@ -221,6 +261,70 @@ class S3Repository extends BlobStoreRepository {
221261
storageClass);
222262
}
223263

264+
/**
265+
* Holds a reference to delayed repository operation {@link Scheduler.Cancellable} so it can be cancelled should the repository be
266+
* closed concurrently.
267+
*/
268+
private final AtomicReference<Scheduler.Cancellable> finalizationFuture = new AtomicReference<>();
269+
270+
@Override
271+
public void finalizeSnapshot(SnapshotId snapshotId, ShardGenerations shardGenerations, long startTime, String failure, int totalShards,
272+
List<SnapshotShardFailure> shardFailures, long repositoryStateId, boolean includeGlobalState,
273+
MetaData clusterMetaData, Map<String, Object> userMetadata, boolean writeShardGens,
274+
ActionListener<SnapshotInfo> listener) {
275+
if (writeShardGens == false) {
276+
listener = delayedListener(listener);
277+
}
278+
super.finalizeSnapshot(snapshotId, shardGenerations, startTime, failure, totalShards, shardFailures, repositoryStateId,
279+
includeGlobalState, clusterMetaData, userMetadata, writeShardGens, listener);
280+
}
281+
282+
@Override
283+
public void deleteSnapshot(SnapshotId snapshotId, long repositoryStateId, boolean writeShardGens, ActionListener<Void> listener) {
284+
if (writeShardGens == false) {
285+
listener = delayedListener(listener);
286+
}
287+
super.deleteSnapshot(snapshotId, repositoryStateId, writeShardGens, listener);
288+
}
289+
290+
/**
291+
* Wraps given listener such that it is executed with a delay of {@link #coolDown} on the snapshot thread-pool after being invoked.
292+
* See {@link #COOLDOWN_PERIOD} for details.
293+
*/
294+
private <T> ActionListener<T> delayedListener(ActionListener<T> listener) {
295+
final ActionListener<T> wrappedListener = ActionListener.runBefore(listener, () -> {
296+
final Scheduler.Cancellable cancellable = finalizationFuture.getAndSet(null);
297+
assert cancellable != null;
298+
});
299+
return new ActionListener<T>() {
300+
@Override
301+
public void onResponse(T response) {
302+
logCooldownInfo();
303+
final Scheduler.Cancellable existing = finalizationFuture.getAndSet(
304+
threadPool.schedule(ActionRunnable.wrap(wrappedListener, l -> l.onResponse(response)),
305+
coolDown, ThreadPool.Names.SNAPSHOT));
306+
assert existing == null : "Already have an ongoing finalization " + finalizationFuture;
307+
}
308+
309+
@Override
310+
public void onFailure(Exception e) {
311+
logCooldownInfo();
312+
final Scheduler.Cancellable existing = finalizationFuture.getAndSet(
313+
threadPool.schedule(ActionRunnable.wrap(wrappedListener, l -> l.onFailure(e)), coolDown, ThreadPool.Names.SNAPSHOT));
314+
assert existing == null : "Already have an ongoing finalization " + finalizationFuture;
315+
}
316+
};
317+
}
318+
319+
private void logCooldownInfo() {
320+
logger.info("Sleeping for [{}] after modifying repository [{}] because it contains snapshots older than version [{}]" +
321+
" and therefore is using a backwards compatible metadata format that requires this cooldown period to avoid " +
322+
"repository corruption. To get rid of this message and move to the new repository metadata format, either remove " +
323+
"all snapshots older than version [{}] from the repository or create a new repository at an empty location.",
324+
coolDown, metadata.name(), SnapshotsService.SHARD_GEN_IN_REPO_DATA_VERSION,
325+
SnapshotsService.SHARD_GEN_IN_REPO_DATA_VERSION);
326+
}
327+
224328
@Override
225329
protected S3BlobStore createBlobStore() {
226330
return new S3BlobStore(service, bucket, serverSideEncryption, bufferSize, cannedACL, storageClass, repositoryMetaData);
@@ -241,4 +345,14 @@ public BlobPath basePath() {
241345
protected ByteSizeValue chunkSize() {
242346
return chunkSize;
243347
}
348+
349+
@Override
350+
protected void doClose() {
351+
final Scheduler.Cancellable cancellable = finalizationFuture.getAndSet(null);
352+
if (cancellable != null) {
353+
logger.debug("Repository [{}] closed during cool-down period", metadata.name());
354+
cancellable.cancel();
355+
}
356+
super.doClose();
357+
}
244358
}

plugins/repository-s3/src/test/java/org/elasticsearch/repositories/s3/S3BlobStoreRepositoryTests.java

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,30 +22,48 @@
2222
import com.sun.net.httpserver.HttpExchange;
2323
import com.sun.net.httpserver.HttpHandler;
2424
import fixture.s3.S3HttpHandler;
25+
import org.elasticsearch.action.ActionRunnable;
26+
import org.elasticsearch.action.support.PlainActionFuture;
2527
import org.elasticsearch.cluster.metadata.RepositoryMetaData;
2628
import org.elasticsearch.cluster.service.ClusterService;
2729
import org.elasticsearch.common.SuppressForbidden;
2830
import org.elasticsearch.common.blobstore.BlobContainer;
2931
import org.elasticsearch.common.blobstore.BlobPath;
3032
import org.elasticsearch.common.blobstore.BlobStore;
33+
import org.elasticsearch.common.bytes.BytesReference;
3134
import org.elasticsearch.common.settings.MockSecureSettings;
3235
import org.elasticsearch.common.settings.Setting;
3336
import org.elasticsearch.common.settings.Settings;
3437
import org.elasticsearch.common.unit.ByteSizeUnit;
38+
import org.elasticsearch.common.unit.TimeValue;
3539
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
40+
import org.elasticsearch.common.xcontent.XContentFactory;
3641
import org.elasticsearch.plugins.Plugin;
42+
import org.elasticsearch.repositories.RepositoriesService;
43+
import org.elasticsearch.repositories.RepositoryData;
44+
import org.elasticsearch.repositories.blobstore.BlobStoreRepository;
3745
import org.elasticsearch.repositories.blobstore.ESMockAPIBasedRepositoryIntegTestCase;
46+
import org.elasticsearch.snapshots.SnapshotId;
47+
import org.elasticsearch.snapshots.SnapshotsService;
3848
import org.elasticsearch.snapshots.mockstore.BlobStoreWrapper;
49+
import org.elasticsearch.threadpool.ThreadPool;
3950

51+
import java.io.IOException;
52+
import java.io.InputStream;
4053
import java.util.ArrayList;
4154
import java.util.Collection;
4255
import java.util.Collections;
4356
import java.util.List;
4457
import java.util.Map;
4558

59+
import static org.hamcrest.Matchers.greaterThan;
60+
import static org.hamcrest.Matchers.lessThan;
61+
4662
@SuppressForbidden(reason = "this test uses a HttpServer to emulate an S3 endpoint")
4763
public class S3BlobStoreRepositoryTests extends ESMockAPIBasedRepositoryIntegTestCase {
4864

65+
/**
 * Cool-down period configured on the repository created by {@code testEnforcedCooldownPeriod}; that test's timing assertions are
 * measured against this value.
 */
private static final TimeValue TEST_COOLDOWN_PERIOD = TimeValue.timeValueSeconds(5L);
66+
4967
@Override
5068
protected String repositoryType() {
5169
return S3Repository.TYPE;
@@ -82,6 +100,7 @@ protected Settings nodeSettings(int nodeOrdinal) {
82100
secureSettings.setString(S3ClientSettings.SECRET_KEY_SETTING.getConcreteSettingForNamespace("test").getKey(), "secret");
83101

84102
return Settings.builder()
103+
.put(ThreadPool.ESTIMATED_TIME_INTERVAL_SETTING.getKey(), 0) // We have tests that verify an exact wait time
85104
.put(S3ClientSettings.ENDPOINT_SETTING.getConcreteSettingForNamespace("test").getKey(), httpServerUrl())
86105
// Disable chunked encoding as it simplifies a lot the request parsing on the httpServer side
87106
.put(S3ClientSettings.DISABLE_CHUNKED_ENCODING.getConcreteSettingForNamespace("test").getKey(), true)
@@ -92,6 +111,41 @@ protected Settings nodeSettings(int nodeOrdinal) {
92111
.build();
93112
}
94113

114+
public void testEnforcedCooldownPeriod() throws IOException {
115+
final String repoName = createRepository(randomName(), Settings.builder().put(repositorySettings())
116+
.put(S3Repository.COOLDOWN_PERIOD.getKey(), TEST_COOLDOWN_PERIOD).build());
117+
118+
final SnapshotId fakeOldSnapshot = client().admin().cluster().prepareCreateSnapshot(repoName, "snapshot-old")
119+
.setWaitForCompletion(true).setIndices().get().getSnapshotInfo().snapshotId();
120+
final RepositoriesService repositoriesService = internalCluster().getCurrentMasterNodeInstance(RepositoriesService.class);
121+
final BlobStoreRepository repository = (BlobStoreRepository) repositoriesService.repository(repoName);
122+
final RepositoryData repositoryData =
123+
PlainActionFuture.get(f -> repository.threadPool().generic().execute(() -> repository.getRepositoryData(f)));
124+
final RepositoryData modifiedRepositoryData = repositoryData.withVersions(Collections.singletonMap(fakeOldSnapshot,
125+
SnapshotsService.SHARD_GEN_IN_REPO_DATA_VERSION.minimumCompatibilityVersion()));
126+
final BytesReference serialized =
127+
BytesReference.bytes(modifiedRepositoryData.snapshotsToXContent(XContentFactory.jsonBuilder(), false));
128+
PlainActionFuture.get(f -> repository.threadPool().generic().execute(ActionRunnable.run(f, () -> {
129+
try (InputStream stream = serialized.streamInput()) {
130+
repository.blobStore().blobContainer(repository.basePath()).writeBlobAtomic(
131+
BlobStoreRepository.INDEX_FILE_PREFIX + modifiedRepositoryData.getGenId(), stream, serialized.length(), true);
132+
}
133+
})));
134+
135+
final String newSnapshotName = "snapshot-new";
136+
final long beforeThrottledSnapshot = repository.threadPool().relativeTimeInNanos();
137+
client().admin().cluster().prepareCreateSnapshot(repoName, newSnapshotName).setWaitForCompletion(true).setIndices().get();
138+
assertThat(repository.threadPool().relativeTimeInNanos() - beforeThrottledSnapshot, greaterThan(TEST_COOLDOWN_PERIOD.getNanos()));
139+
140+
final long beforeThrottledDelete = repository.threadPool().relativeTimeInNanos();
141+
client().admin().cluster().prepareDeleteSnapshot(repoName, newSnapshotName).get();
142+
assertThat(repository.threadPool().relativeTimeInNanos() - beforeThrottledDelete, greaterThan(TEST_COOLDOWN_PERIOD.getNanos()));
143+
144+
final long beforeFastDelete = repository.threadPool().relativeTimeInNanos();
145+
client().admin().cluster().prepareDeleteSnapshot(repoName, fakeOldSnapshot.getName()).get();
146+
assertThat(repository.threadPool().relativeTimeInNanos() - beforeFastDelete, lessThan(TEST_COOLDOWN_PERIOD.getNanos()));
147+
}
148+
95149
/**
96150
* S3RepositoryPlugin that allows to disable chunked encoding and to set a low threshold between single upload and multipart upload.
97151
*/

0 commit comments

Comments
 (0)