Skip to content

Commit d534689

Browse files
authored
Skip SLM retention if ILM is STOPPING or STOPPED (#45869)
This adds a check to ensure we take no action during SLM retention if ILM is currently stopped or in the process of stopping. Relates to #43663
1 parent be684f8 commit d534689

File tree

3 files changed

+51
-3
lines changed

3 files changed

+51
-3
lines changed

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotLifecycleService.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ SchedulerEngine getScheduler() {
104104
/**
105105
* Returns true if ILM is in the stopping or stopped state
106106
*/
107-
private static boolean ilmStoppedOrStopping(ClusterState state) {
107+
static boolean ilmStoppedOrStopping(ClusterState state) {
108108
return Optional.ofNullable((SnapshotLifecycleMetadata) state.metaData().custom(SnapshotLifecycleMetadata.TYPE))
109109
.map(SnapshotLifecycleMetadata::getOperationMode)
110110
.map(mode -> OperationMode.STOPPING == mode || OperationMode.STOPPED == mode)

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotRetentionTask.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,13 @@ public SnapshotRetentionTask(Client client, ClusterService clusterService, LongS
8383
public void triggered(SchedulerEngine.Event event) {
8484
assert event.getJobName().equals(SnapshotRetentionService.SLM_RETENTION_JOB_ID) :
8585
"expected id to be " + SnapshotRetentionService.SLM_RETENTION_JOB_ID + " but it was " + event.getJobName();
86+
87+
final ClusterState state = clusterService.state();
88+
if (SnapshotLifecycleService.ilmStoppedOrStopping(state)) {
89+
logger.debug("skipping SLM retention as ILM is currently stopped or stopping");
90+
return;
91+
}
92+
8693
if (running.compareAndSet(false, true)) {
8794
final SnapshotLifecycleStats slmStats = new SnapshotLifecycleStats();
8895

@@ -98,7 +105,6 @@ public void triggered(SchedulerEngine.Event event) {
98105
};
99106

100107
try {
101-
final ClusterState state = clusterService.state();
102108
final TimeValue maxDeletionTime = LifecycleSettings.SLM_RETENTION_DURATION_SETTING.get(state.metaData().settings());
103109

104110
logger.info("starting SLM retention snapshot cleanup task");

x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SnapshotRetentionTaskTests.java

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -315,10 +315,52 @@ private void timeBoundedDeletion(final boolean deletionSuccess) throws Exception
315315
threadPool.shutdownNow();
316316
threadPool.awaitTermination(10, TimeUnit.SECONDS);
317317
}
318+
}
319+
320+
/** Runs the shared skip scenario with the operation mode set to {@code STOPPING}. */
public void testSkipWhileStopping() throws Exception {
321+
doTestSkipDuringMode(OperationMode.STOPPING);
322+
}
323+
324+
/** Runs the shared skip scenario with the operation mode set to {@code STOPPED}. */
public void testSkipWhileStopped() throws Exception {
325+
doTestSkipDuringMode(OperationMode.STOPPED);
326+
}
327+
328+
/**
 * Shared body for the skip tests. Installs a cluster state built by
 * {@code createState(mode, policy)} into the cluster service, then triggers a retention task
 * whose history-store callback, snapshot-retrieval lambda and deletion lambda each call
 * {@code fail(...)} if they are ever invoked.
 *
 * @param mode the operation mode passed to {@code createState} for the installed cluster state
 */
private void doTestSkipDuringMode(OperationMode mode) throws Exception {
329+
try (ThreadPool threadPool = new TestThreadPool("slm-test");
330+
ClusterService clusterService = ClusterServiceUtils.createClusterService(threadPool);
331+
Client noOpClient = new NoOpClient("slm-test")) {
332+
final String policyId = "policy";
333+
final String repoId = "repo";
334+
SnapshotLifecyclePolicy policy = new SnapshotLifecyclePolicy(policyId, "snap", "1 * * * * ?",
335+
repoId, null, new SnapshotRetentionConfiguration(TimeValue.timeValueDays(30), null, null));
336+
337+
ClusterState state = createState(mode, policy);
338+
ClusterServiceUtils.setState(clusterService, state);
339+
340+
// Every collaborator handed to the task fails the test when called.
SnapshotRetentionTask task = new MockSnapshotRetentionTask(noOpClient, clusterService,
341+
new SnapshotLifecycleTaskTests.VerifyingHistoryStore(noOpClient, ZoneOffset.UTC,
342+
(historyItem) -> fail("should never write history")),
343+
threadPool,
344+
() -> {
345+
fail("should not retrieve snapshots");
346+
return null;
347+
},
348+
(a, b, c, d, e) -> fail("should not delete snapshots"),
349+
System::nanoTime);
318350

351+
// Fire the retention job event directly, using the current wall-clock time for both timestamps.
long time = System.currentTimeMillis();
352+
task.triggered(new SchedulerEngine.Event(SnapshotRetentionService.SLM_RETENTION_JOB_ID, time, time));
353+
354+
threadPool.shutdownNow();
355+
threadPool.awaitTermination(10, TimeUnit.SECONDS);
356+
}
319357
}
320358

321359
/**
 * Convenience overload: delegates to {@code createState(OperationMode, SnapshotLifecyclePolicy...)}
 * with {@code OperationMode.RUNNING}.
 *
 * @param policies the policies forwarded unchanged to the two-argument overload
 * @return the cluster state produced by the two-argument overload
 */
public ClusterState createState(SnapshotLifecyclePolicy... policies) {
360+
return createState(OperationMode.RUNNING, policies);
361+
}
362+
363+
public ClusterState createState(OperationMode mode, SnapshotLifecyclePolicy... policies) {
322364
Map<String, SnapshotLifecyclePolicyMetadata> policyMetadataMap = Arrays.stream(policies)
323365
.map(policy -> SnapshotLifecyclePolicyMetadata.builder()
324366
.setPolicy(policy)
@@ -330,7 +372,7 @@ public ClusterState createState(SnapshotLifecyclePolicy... policies) {
330372

331373
MetaData metaData = MetaData.builder()
332374
.putCustom(SnapshotLifecycleMetadata.TYPE,
333-
new SnapshotLifecycleMetadata(policyMetadataMap, OperationMode.RUNNING, new SnapshotLifecycleStats()))
375+
new SnapshotLifecycleMetadata(policyMetadataMap, mode, new SnapshotLifecycleStats()))
334376
.build();
335377
return ClusterState.builder(new ClusterName("cluster"))
336378
.metaData(metaData)

0 commit comments

Comments
 (0)