Skip to content

Commit b576a6a

Browse files
author
David Roberts
committed
[ML] Throttle the delete-by-query of expired results (#47177)
Due to #47003, many clusters will have built up a large backlog of expired results. On upgrading to a version where that bug is fixed, users could find that the first ML daily maintenance task deletes a very large number of documents. This change introduces throttling to the delete-by-query that the ML daily maintenance uses to delete expired results, to limit it to deleting an average of 200 documents per second. (There is no throttling for state/forecast documents as these are expected to be lower volume.) Additionally, a rough time limit of 8 hours is applied to the whole delete expired data action. (This is only rough as it won't stop part way through a single operation - it only checks the timeout between operations.) Relates #47103
1 parent 6607204 commit b576a6a

File tree

14 files changed

+316
-46
lines changed

14 files changed

+316
-46
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/DeleteExpiredDataAction.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,10 @@ public void readFrom(StreamInput in) throws IOException {
7575
deleted = in.readBoolean();
7676
}
7777

78+
public boolean isDeleted() {
79+
return deleted;
80+
}
81+
7882
@Override
7983
public void writeTo(StreamOutput out) throws IOException {
8084
super.writeTo(out);

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MlDailyMaintenanceService.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,13 @@ private void triggerTasks() {
111111
LOGGER.info("triggering scheduled [ML] maintenance tasks");
112112
executeAsyncWithOrigin(client, ML_ORIGIN, DeleteExpiredDataAction.INSTANCE, new DeleteExpiredDataAction.Request(),
113113
ActionListener.wrap(
114-
response -> LOGGER.info("Successfully completed [ML] maintenance tasks"),
114+
response -> {
115+
if (response.isDeleted()) {
116+
LOGGER.info("Successfully completed [ML] maintenance tasks");
117+
} else {
118+
LOGGER.info("Halting [ML] maintenance tasks before completion as elapsed time is too great");
119+
}
120+
},
115121
e -> LOGGER.error("An error occurred during maintenance tasks execution", e)));
116122
scheduleNext();
117123
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportDeleteExpiredDataAction.java

Lines changed: 40 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,33 +27,54 @@
2727
import org.elasticsearch.xpack.ml.notifications.Auditor;
2828
import org.elasticsearch.xpack.ml.utils.VolatileCursorIterator;
2929

30+
import java.time.Clock;
31+
import java.time.Duration;
32+
import java.time.Instant;
3033
import java.util.Arrays;
3134
import java.util.Iterator;
3235
import java.util.List;
36+
import java.util.function.Supplier;
3337

3438
public class TransportDeleteExpiredDataAction extends HandledTransportAction<DeleteExpiredDataAction.Request,
3539
DeleteExpiredDataAction.Response> {
3640

41+
// TODO: make configurable in the request
42+
static final Duration MAX_DURATION = Duration.ofHours(8);
43+
44+
private final String executor;
3745
private final Client client;
3846
private final ClusterService clusterService;
47+
private final Clock clock;
3948

4049
@Inject
4150
public TransportDeleteExpiredDataAction(Settings settings, ThreadPool threadPool, TransportService transportService,
4251
ActionFilters actionFilters, IndexNameExpressionResolver indexNameExpressionResolver,
4352
Client client, ClusterService clusterService) {
53+
this(settings, threadPool, MachineLearning.UTILITY_THREAD_POOL_NAME, transportService, actionFilters, indexNameExpressionResolver,
54+
client, clusterService, Clock.systemUTC());
55+
}
56+
57+
TransportDeleteExpiredDataAction(Settings settings, ThreadPool threadPool, String executor, TransportService transportService,
58+
ActionFilters actionFilters, IndexNameExpressionResolver indexNameExpressionResolver, Client client,
59+
ClusterService clusterService, Clock clock) {
4460
super(settings, DeleteExpiredDataAction.NAME, threadPool, transportService, actionFilters, indexNameExpressionResolver,
45-
DeleteExpiredDataAction.Request::new);
61+
DeleteExpiredDataAction.Request::new, executor);
62+
this.executor = executor;
4663
this.client = ClientHelper.clientWithOrigin(client, ClientHelper.ML_ORIGIN);
4764
this.clusterService = clusterService;
65+
this.clock = clock;
4866
}
4967

5068
@Override
5169
protected void doExecute(DeleteExpiredDataAction.Request request, ActionListener<DeleteExpiredDataAction.Response> listener) {
5270
logger.info("Deleting expired data");
53-
threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME).execute(() -> deleteExpiredData(listener));
71+
Instant timeoutTime = Instant.now(clock).plus(MAX_DURATION);
72+
Supplier<Boolean> isTimedOutSupplier = () -> Instant.now(clock).isAfter(timeoutTime);
73+
threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME).execute(() -> deleteExpiredData(listener, isTimedOutSupplier));
5474
}
5575

56-
private void deleteExpiredData(ActionListener<DeleteExpiredDataAction.Response> listener) {
76+
private void deleteExpiredData(ActionListener<DeleteExpiredDataAction.Response> listener,
77+
Supplier<Boolean> isTimedOutSupplier) {
5778
Auditor auditor = new Auditor(client, clusterService.getNodeName());
5879
List<MlDataRemover> dataRemovers = Arrays.asList(
5980
new ExpiredResultsRemover(client, clusterService, auditor),
@@ -62,25 +83,32 @@ private void deleteExpiredData(ActionListener<DeleteExpiredDataAction.Response>
6283
new UnusedStateRemover(client, clusterService)
6384
);
6485
Iterator<MlDataRemover> dataRemoversIterator = new VolatileCursorIterator<>(dataRemovers);
65-
deleteExpiredData(dataRemoversIterator, listener);
86+
deleteExpiredData(dataRemoversIterator, listener, isTimedOutSupplier, true);
6687
}
6788

68-
private void deleteExpiredData(Iterator<MlDataRemover> mlDataRemoversIterator,
69-
ActionListener<DeleteExpiredDataAction.Response> listener) {
70-
if (mlDataRemoversIterator.hasNext()) {
89+
void deleteExpiredData(Iterator<MlDataRemover> mlDataRemoversIterator,
90+
ActionListener<DeleteExpiredDataAction.Response> listener,
91+
Supplier<Boolean> isTimedOutSupplier,
92+
boolean haveAllPreviousDeletionsCompleted) {
93+
if (haveAllPreviousDeletionsCompleted && mlDataRemoversIterator.hasNext()) {
7194
MlDataRemover remover = mlDataRemoversIterator.next();
7295
ActionListener<Boolean> nextListener = ActionListener.wrap(
73-
booleanResponse -> deleteExpiredData(mlDataRemoversIterator, listener), listener::onFailure);
96+
booleanResponse -> deleteExpiredData(mlDataRemoversIterator, listener, isTimedOutSupplier, booleanResponse),
97+
listener::onFailure);
7498
// Removing expired ML data and artifacts requires multiple operations.
7599
// These are queued up and executed sequentially in the action listener,
76100
// the chained calls must all run on the ML utility thread pool, NOT the thread
77101
// the previous action returned in, which in the case of a transport_client_boss
78102
// thread is a disaster.
79-
remover.remove(new ThreadedActionListener<>(logger, threadPool, MachineLearning.UTILITY_THREAD_POOL_NAME, nextListener,
80-
false));
103+
remover.remove(new ThreadedActionListener<>(logger, threadPool, executor, nextListener, false),
104+
isTimedOutSupplier);
81105
} else {
82-
logger.info("Completed deletion of expired data");
83-
listener.onResponse(new DeleteExpiredDataAction.Response(true));
106+
if (haveAllPreviousDeletionsCompleted) {
107+
logger.info("Completed deletion of expired ML data");
108+
} else {
109+
logger.info("Halted deletion of expired ML data until next invocation");
110+
}
111+
listener.onResponse(new DeleteExpiredDataAction.Response(haveAllPreviousDeletionsCompleted));
84112
}
85113
}
86114
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/persistence/JobDataDeleter.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import org.elasticsearch.index.reindex.DeleteByQueryAction;
2222
import org.elasticsearch.index.reindex.DeleteByQueryRequest;
2323
import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndex;
24+
import org.elasticsearch.xpack.core.ml.job.persistence.ElasticsearchMappings;
2425
import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSnapshot;
2526
import org.elasticsearch.xpack.core.ml.job.results.Result;
2627

@@ -79,6 +80,9 @@ public void deleteModelSnapshots(List<ModelSnapshot> modelSnapshots, ActionListe
7980
.setIndicesOptions(IndicesOptions.lenientExpandOpen())
8081
.setQuery(new IdsQueryBuilder().addIds(idsToDelete.toArray(new String[0])));
8182

83+
// _doc is the most efficient sort order and will also disable scoring
84+
deleteByQueryRequest.getSearchRequest().source().sort(ElasticsearchMappings.ES_DOC);
85+
8286
try {
8387
executeAsyncWithOrigin(client, ML_ORIGIN, DeleteByQueryAction.INSTANCE, deleteByQueryRequest, listener);
8488
} catch (Exception e) {
@@ -101,6 +105,10 @@ public void deleteResultsFromTime(long cutoffEpochMs, ActionListener<Boolean> li
101105
.filter(QueryBuilders.rangeQuery(Result.TIMESTAMP.getPreferredName()).gte(cutoffEpochMs));
102106
deleteByQueryHolder.dbqRequest.setIndicesOptions(IndicesOptions.lenientExpandOpen());
103107
deleteByQueryHolder.dbqRequest.setQuery(query);
108+
109+
// _doc is the most efficient sort order and will also disable scoring
110+
deleteByQueryHolder.dbqRequest.getSearchRequest().source().sort(ElasticsearchMappings.ES_DOC);
111+
104112
executeAsyncWithOrigin(client, ML_ORIGIN, DeleteByQueryAction.INSTANCE, deleteByQueryHolder.dbqRequest,
105113
ActionListener.wrap(r -> listener.onResponse(true), listener::onFailure));
106114
}
@@ -116,6 +124,9 @@ public void deleteInterimResults() {
116124
QueryBuilder qb = QueryBuilders.termQuery(Result.IS_INTERIM.getPreferredName(), true);
117125
deleteByQueryHolder.dbqRequest.setQuery(new ConstantScoreQueryBuilder(qb));
118126

127+
// _doc is the most efficient sort order and will also disable scoring
128+
deleteByQueryHolder.dbqRequest.getSearchRequest().source().sort(ElasticsearchMappings.ES_DOC);
129+
119130
try (ThreadContext.StoredContext ignore = client.threadPool().getThreadContext().stashWithOrigin(ML_ORIGIN)) {
120131
client.execute(DeleteByQueryAction.INSTANCE, deleteByQueryHolder.dbqRequest).get();
121132
} catch (Exception e) {

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/retention/AbstractExpiredJobDataRemover.java

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import java.util.Iterator;
2727
import java.util.List;
2828
import java.util.concurrent.TimeUnit;
29+
import java.util.function.Supplier;
2930
import java.util.stream.Collectors;
3031

3132
/**
@@ -50,11 +51,12 @@ protected Client getClient() {
5051
}
5152

5253
@Override
53-
public void remove(ActionListener<Boolean> listener) {
54-
removeData(newJobIterator(), listener);
54+
public void remove(ActionListener<Boolean> listener, Supplier<Boolean> isTimedOutSupplier) {
55+
removeData(newJobIterator(), listener, isTimedOutSupplier);
5556
}
5657

57-
private void removeData(WrappedBatchedJobsIterator jobIterator, ActionListener<Boolean> listener) {
58+
private void removeData(WrappedBatchedJobsIterator jobIterator, ActionListener<Boolean> listener,
59+
Supplier<Boolean> isTimedOutSupplier) {
5860
if (jobIterator.hasNext() == false) {
5961
listener.onResponse(true);
6062
return;
@@ -66,13 +68,19 @@ private void removeData(WrappedBatchedJobsIterator jobIterator, ActionListener<B
6668
return;
6769
}
6870

71+
if (isTimedOutSupplier.get()) {
72+
listener.onResponse(false);
73+
return;
74+
}
75+
6976
Long retentionDays = getRetentionDays(job);
7077
if (retentionDays == null) {
71-
removeData(jobIterator, listener);
78+
removeData(jobIterator, listener, isTimedOutSupplier);
7279
return;
7380
}
7481
long cutoffEpochMs = calcCutoffEpochMs(retentionDays);
75-
removeDataBefore(job, cutoffEpochMs, ActionListener.wrap(response -> removeData(jobIterator, listener), listener::onFailure));
82+
removeDataBefore(job, cutoffEpochMs,
83+
ActionListener.wrap(response -> removeData(jobIterator, listener, isTimedOutSupplier), listener::onFailure));
7684
}
7785

7886
private WrappedBatchedJobsIterator newJobIterator() {

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/retention/ExpiredForecastsRemover.java

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import org.elasticsearch.threadpool.ThreadPool;
3232
import org.elasticsearch.xpack.core.ml.job.config.Job;
3333
import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndex;
34+
import org.elasticsearch.xpack.core.ml.job.persistence.ElasticsearchMappings;
3435
import org.elasticsearch.xpack.core.ml.job.results.Forecast;
3536
import org.elasticsearch.xpack.core.ml.job.results.ForecastRequestStats;
3637
import org.elasticsearch.xpack.core.ml.job.results.Result;
@@ -43,6 +44,7 @@
4344
import java.util.ArrayList;
4445
import java.util.List;
4546
import java.util.Objects;
47+
import java.util.function.Supplier;
4648

4749
/**
4850
* Removes up to {@link #MAX_FORECASTS} forecasts (stats + forecasts docs) that have expired.
@@ -70,10 +72,10 @@ public ExpiredForecastsRemover(Client client, ThreadPool threadPool) {
7072
}
7173

7274
@Override
73-
public void remove(ActionListener<Boolean> listener) {
75+
public void remove(ActionListener<Boolean> listener, Supplier<Boolean> isTimedOutSupplier) {
7476
LOGGER.debug("Removing forecasts that expire before [{}]", cutoffEpochMs);
7577
ActionListener<SearchResponse> forecastStatsHandler = ActionListener.wrap(
76-
searchResponse -> deleteForecasts(searchResponse, listener),
78+
searchResponse -> deleteForecasts(searchResponse, listener, isTimedOutSupplier),
7779
e -> listener.onFailure(new ElasticsearchException("An error occurred while searching forecasts to delete", e)));
7880

7981
SearchSourceBuilder source = new SearchSourceBuilder();
@@ -82,13 +84,16 @@ public void remove(ActionListener<Boolean> listener) {
8284
.filter(QueryBuilders.existsQuery(ForecastRequestStats.EXPIRY_TIME.getPreferredName())));
8385
source.size(MAX_FORECASTS);
8486

87+
// _doc is the most efficient sort order and will also disable scoring
88+
source.sort(ElasticsearchMappings.ES_DOC);
89+
8590
SearchRequest searchRequest = new SearchRequest(RESULTS_INDEX_PATTERN);
8691
searchRequest.source(source);
8792
client.execute(SearchAction.INSTANCE, searchRequest, new ThreadedActionListener<>(LOGGER, threadPool,
8893
MachineLearning.UTILITY_THREAD_POOL_NAME, forecastStatsHandler, false));
8994
}
9095

91-
private void deleteForecasts(SearchResponse searchResponse, ActionListener<Boolean> listener) {
96+
private void deleteForecasts(SearchResponse searchResponse, ActionListener<Boolean> listener, Supplier<Boolean> isTimedOutSupplier) {
9297
List<ForecastRequestStats> forecastsToDelete;
9398
try {
9499
forecastsToDelete = findForecastsToDelete(searchResponse);
@@ -97,6 +102,11 @@ private void deleteForecasts(SearchResponse searchResponse, ActionListener<Boole
97102
return;
98103
}
99104

105+
if (isTimedOutSupplier.get()) {
106+
listener.onResponse(false);
107+
return;
108+
}
109+
100110
DeleteByQueryRequest request = buildDeleteByQuery(forecastsToDelete);
101111
client.execute(DeleteByQueryAction.INSTANCE, request, new ActionListener<BulkByScrollResponse>() {
102112
@Override
@@ -155,6 +165,10 @@ private DeleteByQueryRequest buildDeleteByQuery(List<ForecastRequestStats> forec
155165
}
156166
QueryBuilder query = QueryBuilders.boolQuery().filter(boolQuery);
157167
request.setQuery(query);
168+
169+
// _doc is the most efficient sort order and will also disable scoring
170+
request.getSearchRequest().source().sort(ElasticsearchMappings.ES_DOC);
171+
158172
return request;
159173
}
160174
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/retention/ExpiredModelSnapshotsRemover.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import org.elasticsearch.xpack.core.ml.action.DeleteModelSnapshotAction;
2525
import org.elasticsearch.xpack.core.ml.job.config.Job;
2626
import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndex;
27+
import org.elasticsearch.xpack.core.ml.job.persistence.ElasticsearchMappings;
2728
import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSnapshot;
2829
import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSnapshotField;
2930
import org.elasticsearch.xpack.ml.MachineLearning;
@@ -87,7 +88,7 @@ protected void removeDataBefore(Job job, long cutoffEpochMs, ActionListener<Bool
8788
.mustNot(activeSnapshotFilter)
8889
.mustNot(retainFilter);
8990

90-
searchRequest.source(new SearchSourceBuilder().query(query).size(MODEL_SNAPSHOT_SEARCH_SIZE));
91+
searchRequest.source(new SearchSourceBuilder().query(query).size(MODEL_SNAPSHOT_SEARCH_SIZE).sort(ElasticsearchMappings.ES_DOC));
9192

9293
getClient().execute(SearchAction.INSTANCE, searchRequest, new ThreadedActionListener<>(LOGGER, threadPool,
9394
MachineLearning.UTILITY_THREAD_POOL_NAME, expiredSnapshotsListener(job.getId(), listener), false));

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/retention/ExpiredResultsRemover.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import org.elasticsearch.xpack.core.ml.job.config.Job;
2020
import org.elasticsearch.xpack.core.ml.job.messages.Messages;
2121
import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndex;
22+
import org.elasticsearch.xpack.core.ml.job.persistence.ElasticsearchMappings;
2223
import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSizeStats;
2324
import org.elasticsearch.xpack.core.ml.job.results.Forecast;
2425
import org.elasticsearch.xpack.core.ml.job.results.ForecastRequestStats;
@@ -87,13 +88,22 @@ private DeleteByQueryRequest createDBQRequest(Job job, long cutoffEpochMs) {
8788
DeleteByQueryRequest request = new DeleteByQueryRequest();
8889
request.setSlices(5);
8990

91+
// Delete the documents gradually.
92+
// With batch size 1000 and 200 requests per second this implies we spread
93+
// deletion of 1 million documents over 5000 seconds ~= 83 minutes.
94+
request.setBatchSize(1000);
95+
request.setRequestsPerSecond(200);
96+
9097
request.indices(AnomalyDetectorsIndex.jobResultsAliasedName(job.getId()));
9198
QueryBuilder excludeFilter = QueryBuilders.termsQuery(Result.RESULT_TYPE.getPreferredName(),
9299
ModelSizeStats.RESULT_TYPE_VALUE, ForecastRequestStats.RESULT_TYPE_VALUE, Forecast.RESULT_TYPE_VALUE);
93100
QueryBuilder query = createQuery(job.getId(), cutoffEpochMs)
94101
.filter(QueryBuilders.existsQuery(Result.RESULT_TYPE.getPreferredName()))
95102
.mustNot(excludeFilter);
96103
request.setQuery(query);
104+
105+
// _doc is the most efficient sort order and will also disable scoring
106+
request.getSearchRequest().source().sort(ElasticsearchMappings.ES_DOC);
97107
return request;
98108
}
99109

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/retention/MlDataRemover.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88
import org.elasticsearch.action.ActionListener;
99

10+
import java.util.function.Supplier;
11+
1012
public interface MlDataRemover {
11-
void remove(ActionListener<Boolean> listener);
13+
void remove(ActionListener<Boolean> listener, Supplier<Boolean> isTimedOutSupplier);
1214
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/retention/UnusedStateRemover.java

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import java.util.Objects;
3434
import java.util.Set;
3535
import java.util.function.Function;
36+
import java.util.function.Supplier;
3637

3738
/**
3839
* If for any reason a job is deleted but some of its state documents
@@ -52,13 +53,17 @@ public UnusedStateRemover(Client client, ClusterService clusterService) {
5253
}
5354

5455
@Override
55-
public void remove(ActionListener<Boolean> listener) {
56+
public void remove(ActionListener<Boolean> listener, Supplier<Boolean> isTimedOutSupplier) {
5657
try {
5758
List<String> unusedStateDocIds = findUnusedStateDocIds();
58-
if (unusedStateDocIds.size() > 0) {
59-
executeDeleteUnusedStateDocs(unusedStateDocIds, listener);
59+
if (isTimedOutSupplier.get()) {
60+
listener.onResponse(false);
6061
} else {
61-
listener.onResponse(true);
62+
if (unusedStateDocIds.size() > 0) {
63+
executeDeleteUnusedStateDocs(unusedStateDocIds, listener);
64+
} else {
65+
listener.onResponse(true);
66+
}
6267
}
6368
} catch (Exception e) {
6469
listener.onFailure(e);
@@ -108,6 +113,10 @@ private void executeDeleteUnusedStateDocs(List<String> unusedDocIds, ActionListe
108113
.types(ElasticsearchMappings.DOC_TYPE)
109114
.setIndicesOptions(IndicesOptions.lenientExpandOpen())
110115
.setQuery(QueryBuilders.idsQuery().addIds(unusedDocIds.toArray(new String[0])));
116+
117+
// _doc is the most efficient sort order and will also disable scoring
118+
deleteByQueryRequest.getSearchRequest().source().sort(ElasticsearchMappings.ES_DOC);
119+
111120
client.execute(DeleteByQueryAction.INSTANCE, deleteByQueryRequest, ActionListener.wrap(
112121
response -> {
113122
if (response.getBulkFailures().size() > 0 || response.getSearchFailures().size() > 0) {

0 commit comments

Comments
 (0)