
Commit da24285

Introduce SNAPSHOT_META Threadpool for Fetching Repository Metadata (#73172)
Adds a new snapshot meta pool that is used to speed up the get snapshots API by loading `SnapshotInfo` in parallel. The pool is also used to load `RepositoryData`. A follow-up to this would expand the use of this pool to the snapshot status API and make it run in parallel as well.
1 parent 77d756b commit da24285

File tree

5 files changed: +102 −23 lines

docs/reference/modules/threadpool.asciidoc

Lines changed: 5 additions & 0 deletions
@@ -41,6 +41,11 @@ There are several thread pools, but the important ones include:
     keep-alive of `5m` and a max of `min(5, (`<<node.processors,
     `# of allocated processors`>>`) / 2)`.
 
+`snapshot_meta`::
+    For snapshot repository metadata read operations. Thread pool type is `scaling` with a
+    keep-alive of `5m` and a max of `min(50, (`<<node.processors,
+    `# of allocated processors`>>` pass:[ * ]3))`.
+
 `warmer`::
     For segment warm-up operations. Thread pool type is `scaling` with a
     keep-alive of `5m` and a max of `min(5, (`<<node.processors,
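The sizing rule described above is easiest to read as code. A minimal sketch of the cap, with worked values (the class and method names here are illustrative only, not part of Elasticsearch):

public class SnapshotMetaPoolSize {
    /** Max size of the snapshot_meta pool: three threads per allocated processor, capped at 50. */
    static int snapshotMetaPoolMax(int allocatedProcessors) {
        return Math.min(50, allocatedProcessors * 3);
    }

    public static void main(String[] args) {
        System.out.println(snapshotMetaPoolMax(8));  // 24: 8 * 3 stays under the cap
        System.out.println(snapshotMetaPoolMax(32)); // 50: 32 * 3 = 96 is capped at 50
    }
}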

server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/get/TransportGetSnapshotsAction.java

Lines changed: 86 additions & 20 deletions
@@ -14,7 +14,6 @@
 import org.apache.lucene.util.CollectionUtil;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.action.ActionListener;
-import org.elasticsearch.action.ActionRunnable;
 import org.elasticsearch.action.StepListener;
 import org.elasticsearch.action.admin.cluster.repositories.get.TransportGetRepositoriesAction;
 import org.elasticsearch.action.support.ActionFilters;
@@ -34,6 +33,7 @@
 import org.elasticsearch.repositories.RepositoriesService;
 import org.elasticsearch.repositories.Repository;
 import org.elasticsearch.repositories.RepositoryData;
+import org.elasticsearch.repositories.RepositoryMissingException;
 import org.elasticsearch.snapshots.SnapshotException;
 import org.elasticsearch.snapshots.SnapshotId;
 import org.elasticsearch.snapshots.SnapshotInfo;
@@ -46,12 +46,15 @@
 import org.elasticsearch.transport.TransportService;
 
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.LinkedBlockingQueue;
 import java.util.stream.Collectors;
 
 import static java.util.Collections.unmodifiableList;
@@ -211,8 +214,7 @@ private void loadSnapshotInfos(SnapshotsInProgress snapshotsInProgress, String r
         }
 
         if (verbose) {
-            threadPool.generic().execute(ActionRunnable.supply(
-                listener, () -> snapshots(snapshotsInProgress, repo, new ArrayList<>(toResolve), ignoreUnavailable, task)));
+            snapshots(snapshotsInProgress, repo, toResolve, ignoreUnavailable, task, listener);
         } else {
             final List<SnapshotInfo> snapshotInfos;
             if (repositoryData != null) {
@@ -235,12 +237,16 @@ private void loadSnapshotInfos(SnapshotsInProgress snapshotsInProgress, String r
      * @param snapshotIds snapshots for which to fetch snapshot information
      * @param ignoreUnavailable if true, snapshots that could not be read will only be logged with a warning,
      *                          if false, they will throw an error
-     * @return list of snapshots
      */
-    private List<SnapshotInfo> snapshots(SnapshotsInProgress snapshotsInProgress, String repositoryName,
-                                         List<SnapshotId> snapshotIds, boolean ignoreUnavailable, CancellableTask task) {
+    private void snapshots(SnapshotsInProgress snapshotsInProgress,
+                           String repositoryName,
+                           Collection<SnapshotId> snapshotIds,
+                           boolean ignoreUnavailable,
+                           CancellableTask task,
+                           ActionListener<List<SnapshotInfo>> listener) {
         if (task.isCancelled()) {
-            throw new TaskCancelledException("task cancelled");
+            listener.onFailure(new TaskCancelledException("task cancelled"));
+            return;
         }
         final Set<SnapshotInfo> snapshotSet = new HashSet<>();
         final Set<SnapshotId> snapshotIdsToIterate = new HashSet<>(snapshotIds);
@@ -252,28 +258,88 @@ private List<SnapshotInfo> snapshots(SnapshotsInProgress snapshotsInProgress, St
                 snapshotSet.add(new SnapshotInfo(entry));
             }
         }
-        // then, look in the repository
-        final Repository repository = repositoriesService.repository(repositoryName);
-        for (SnapshotId snapshotId : snapshotIdsToIterate) {
+        // then, look in the repository if there's any matching snapshots left
+        final List<SnapshotInfo> snapshotInfos;
+        if (snapshotIdsToIterate.isEmpty()) {
+            snapshotInfos = Collections.emptyList();
+        } else {
+            snapshotInfos = Collections.synchronizedList(new ArrayList<>());
+        }
+        final ActionListener<Collection<Void>> allDoneListener = listener.delegateFailure((l, v) -> {
+            final ArrayList<SnapshotInfo> snapshotList = new ArrayList<>(snapshotInfos);
+            snapshotList.addAll(snapshotSet);
+            CollectionUtil.timSort(snapshotList);
+            listener.onResponse(unmodifiableList(snapshotList));
+        });
+        if (snapshotIdsToIterate.isEmpty()) {
+            allDoneListener.onResponse(Collections.emptyList());
+            return;
+        }
+        // put snapshot info downloads into a task queue instead of pushing them all into the queue to not completely monopolize the
+        // snapshot meta pool for a single request
+        final int workers = Math.min(threadPool.info(ThreadPool.Names.SNAPSHOT_META).getMax(), snapshotIdsToIterate.size());
+        final BlockingQueue<SnapshotId> queue = new LinkedBlockingQueue<>(snapshotIdsToIterate);
+        final ActionListener<Void> workerDoneListener = new GroupedActionListener<>(allDoneListener, workers).delegateResponse((l, e) -> {
+            queue.clear(); // Stop fetching the remaining snapshots once we've failed fetching one since the response is an error response
+                           // anyway in this case
+            l.onFailure(e);
+        });
+        final Repository repository;
+        try {
+            repository = repositoriesService.repository(repositoryName);
+        } catch (RepositoryMissingException e) {
+            listener.onFailure(e);
+            return;
+        }
+        for (int i = 0; i < workers; i++) {
+            getOneSnapshotInfo(
+                ignoreUnavailable,
+                repository,
+                queue,
+                snapshotInfos,
+                task,
+                workerDoneListener
+            );
+        }
+    }
+
+    /**
+     * Tries to poll a {@link SnapshotId} to load {@link SnapshotInfo} for from the given {@code queue}. If it finds one in the queue,
+     * loads the snapshot info from the repository and adds it to the given {@code snapshotInfos} collection, then invokes itself again to
+     * try and poll another task from the queue.
+     * If the queue is empty resolves {@code} listener.
+     */
+    private void getOneSnapshotInfo(boolean ignoreUnavailable,
+                                    Repository repository,
+                                    BlockingQueue<SnapshotId> queue,
+                                    Collection<SnapshotInfo> snapshotInfos,
+                                    CancellableTask task,
+                                    ActionListener<Void> listener) {
+        final SnapshotId snapshotId = queue.poll();
+        if (snapshotId == null) {
+            listener.onResponse(null);
+            return;
+        }
+        threadPool.executor(ThreadPool.Names.SNAPSHOT_META).execute(() -> {
             if (task.isCancelled()) {
-                throw new TaskCancelledException("task cancelled");
+                listener.onFailure(new TaskCancelledException("task cancelled"));
+                return;
             }
             try {
-                snapshotSet.add(repository.getSnapshotInfo(snapshotId));
+                snapshotInfos.add(repository.getSnapshotInfo(snapshotId));
             } catch (Exception ex) {
                 if (ignoreUnavailable) {
                     logger.warn(() -> new ParameterizedMessage("failed to get snapshot [{}]", snapshotId), ex);
                 } else {
-                    if (ex instanceof SnapshotException) {
-                        throw ex;
-                    }
-                    throw new SnapshotException(repositoryName, snapshotId, "Snapshot could not be read", ex);
+                    listener.onFailure(
+                        ex instanceof SnapshotException
+                            ? ex
+                            : new SnapshotException(repository.getMetadata().name(), snapshotId, "Snapshot could not be read", ex)
+                    );
                 }
             }
-        }
-        final ArrayList<SnapshotInfo> snapshotList = new ArrayList<>(snapshotSet);
-        CollectionUtil.timSort(snapshotList);
-        return unmodifiableList(snapshotList);
+            getOneSnapshotInfo(ignoreUnavailable, repository, queue, snapshotInfos, task, listener);
+        });
     }
 
     private boolean isAllSnapshots(String[] snapshots) {
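The worker/queue pattern in this diff (a bounded number of workers drain a shared queue of snapshot ids, each worker re-submitting itself until the queue is empty) can be illustrated with plain JDK types. This is a simplified sketch, not the Elasticsearch implementation: `fetchAll` and `loadOne` are hypothetical stand-ins for `snapshots(...)` and `Repository#getSnapshotInfo`, it blocks for simplicity where the real code is fully asynchronous, and error handling is reduced to clearing the queue, roughly what the `GroupedActionListener#delegateResponse` wiring above does.

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.function.Function;

public class BoundedWorkerSketch {

    /** Loads one result per id while keeping at most {@code maxWorkers} tasks on the pool at a time. */
    static <I, R> List<R> fetchAll(Collection<I> ids, int maxWorkers,
                                   ExecutorService pool, Function<I, R> loadOne) throws InterruptedException {
        final Queue<I> queue = new ConcurrentLinkedQueue<>(ids);
        final List<R> results = Collections.synchronizedList(new ArrayList<>());
        final int workers = Math.min(maxWorkers, ids.size());
        final CountDownLatch done = new CountDownLatch(workers);
        for (int i = 0; i < workers; i++) {
            pool.execute(new Runnable() {
                @Override
                public void run() {
                    final I id = queue.poll();
                    if (id == null) {
                        done.countDown();   // queue drained: this worker chain is finished
                        return;
                    }
                    try {
                        results.add(loadOne.apply(id));
                    } catch (RuntimeException e) {
                        queue.clear();      // stop fetching the rest on failure (a real impl would propagate e)
                    }
                    pool.execute(this);     // re-submit rather than loop, so other requests can interleave
                }
            });
        }
        done.await();
        return List.copyOf(results);
    }

    public static void main(String[] args) throws Exception {
        ExecutorService pool = Executors.newFixedThreadPool(4);
        // Pretend each load is a slow repository read of snapshot metadata.
        List<String> infos = fetchAll(List.of(1, 2, 3, 4, 5, 6, 7, 8), 3, pool, id -> "snapshot-info-" + id);
        System.out.println(infos);
        pool.shutdown();
    }
}

Capping the concurrent tasks at the pool's maximum (or the number of ids, whichever is smaller) is what keeps a single large get-snapshots request from monopolizing the shared snapshot_meta pool.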

server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java

Lines changed: 4 additions & 2 deletions
@@ -1315,6 +1315,7 @@ public long getRestoreThrottleTimeInNanos() {
 
     protected void assertSnapshotOrGenericThread() {
         assert Thread.currentThread().getName().contains('[' + ThreadPool.Names.SNAPSHOT + ']')
+            || Thread.currentThread().getName().contains('[' + ThreadPool.Names.SNAPSHOT_META + ']')
             || Thread.currentThread().getName().contains('[' + ThreadPool.Names.GENERIC + ']') :
             "Expected current thread [" + Thread.currentThread() + "] to be the snapshot or generic thread.";
     }
@@ -1428,11 +1429,12 @@ public void getRepositoryData(ActionListener<RepositoryData> listener) {
             // Don't deduplicate repo data loading if we don't have strong consistency guarantees between the repo and the cluster state
             // Also, if we are not caching repository data (for tests) we assume that the contents of the repository data at a given
             // generation may change
+            final Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT_META);
             if (bestEffortConsistency || cacheRepositoryData == false) {
-                threadPool.generic().execute(ActionRunnable.wrap(listener, this::doGetRepositoryData));
+                executor.execute(ActionRunnable.wrap(listener, this::doGetRepositoryData));
             } else {
                 repoDataDeduplicator.executeOnce(metadata, listener, (metadata, l) ->
-                    threadPool.generic().execute(ActionRunnable.wrap(l, this::doGetRepositoryData)));
+                    executor.execute(ActionRunnable.wrap(l, this::doGetRepositoryData)));
             }
         }
     }
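The `repoDataDeduplicator.executeOnce(...)` call above lets concurrent callers share a single in-flight `RepositoryData` load, now running on the snapshot_meta executor. The following sketch shows the underlying idea with a JDK `CompletableFuture`; it is not Elasticsearch's deduplicator (which keys requests by repository metadata), and `OnceAtATime` and the `load` supplier are hypothetical names:

import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Supplier;

/** Lets concurrent callers share one in-flight computation; a fresh one starts once it completes. */
final class OnceAtATime<T> {

    private final AtomicReference<CompletableFuture<T>> inFlight = new AtomicReference<>();

    CompletableFuture<T> executeOnce(Supplier<T> load, Executor executor) {
        while (true) {
            final CompletableFuture<T> existing = inFlight.get();
            if (existing != null) {
                return existing;                    // join the load that is already running
            }
            final CompletableFuture<T> mine = new CompletableFuture<>();
            if (inFlight.compareAndSet(null, mine)) {
                executor.execute(() -> {            // e.g. the snapshot_meta executor
                    try {
                        mine.complete(load.get());  // e.g. a loadRepositoryData() supplier
                    } catch (RuntimeException e) {
                        mine.completeExceptionally(e);
                    } finally {
                        inFlight.set(null);         // allow the next generation to be loaded
                    }
                });
                return mine;
            }
            // lost the race to another caller; loop and join its future
        }
    }
}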

server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java

Lines changed: 4 additions & 0 deletions
@@ -69,6 +69,7 @@ public static class Names {
         public static final String REFRESH = "refresh";
         public static final String WARMER = "warmer";
         public static final String SNAPSHOT = "snapshot";
+        public static final String SNAPSHOT_META = "snapshot_meta";
         public static final String FORCE_MERGE = "force_merge";
         public static final String FETCH_SHARD_STARTED = "fetch_shard_started";
         public static final String FETCH_SHARD_STORE = "fetch_shard_store";
@@ -116,6 +117,7 @@ public static ThreadPoolType fromType(String type) {
         entry(Names.REFRESH, ThreadPoolType.SCALING),
         entry(Names.WARMER, ThreadPoolType.SCALING),
         entry(Names.SNAPSHOT, ThreadPoolType.SCALING),
+        entry(Names.SNAPSHOT_META, ThreadPoolType.SCALING),
         entry(Names.FORCE_MERGE, ThreadPoolType.FIXED),
         entry(Names.FETCH_SHARD_STARTED, ThreadPoolType.SCALING),
         entry(Names.FETCH_SHARD_STORE, ThreadPoolType.SCALING),
@@ -189,6 +191,8 @@ public ThreadPool(final Settings settings, final ExecutorBuilder<?>... customBui
         builders.put(Names.REFRESH, new ScalingExecutorBuilder(Names.REFRESH, 1, halfProcMaxAt10, TimeValue.timeValueMinutes(5)));
         builders.put(Names.WARMER, new ScalingExecutorBuilder(Names.WARMER, 1, halfProcMaxAt5, TimeValue.timeValueMinutes(5)));
         builders.put(Names.SNAPSHOT, new ScalingExecutorBuilder(Names.SNAPSHOT, 1, halfProcMaxAt5, TimeValue.timeValueMinutes(5)));
+        builders.put(Names.SNAPSHOT_META, new ScalingExecutorBuilder(Names.SNAPSHOT_META, 1, Math.min(allocatedProcessors * 3, 50),
+            TimeValue.timeValueSeconds(30L)));
         builders.put(Names.FETCH_SHARD_STARTED,
             new ScalingExecutorBuilder(Names.FETCH_SHARD_STARTED, 1, 2 * allocatedProcessors, TimeValue.timeValueMinutes(5)));
         builders.put(Names.FORCE_MERGE, new FixedExecutorBuilder(settings, Names.FORCE_MERGE, 1, -1, false));
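The `ScalingExecutorBuilder` registration above plugs the new pool into Elasticsearch's executor machinery. As a rough JDK analogue only (Elasticsearch's scaling pools are built differently internally, so this is an approximation, and `newScalingLikePool` is a hypothetical helper), a pool that grows on demand up to a cap and retires idle threads after the 30-second keep-alive could look like:

import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

public class ScalingPoolSketch {

    /** Approximates a scaling pool: up to {@code max} threads, idle threads retired after the keep-alive. */
    static ThreadPoolExecutor newScalingLikePool(int max, long keepAliveSeconds) {
        ThreadPoolExecutor pool = new ThreadPoolExecutor(
            max, max,                          // core == max: threads are created on demand up to the cap
            keepAliveSeconds, TimeUnit.SECONDS,
            new LinkedBlockingQueue<>());      // extra work queues up instead of being rejected
        pool.allowCoreThreadTimeOut(true);     // idle threads die after the keep-alive, like a scaling pool
        return pool;
    }

    public static void main(String[] args) {
        int allocatedProcessors = Runtime.getRuntime().availableProcessors();
        // Same bounds the commit registers for snapshot_meta: min(3 * processors, 50), 30s keep-alive.
        ThreadPoolExecutor snapshotMetaLike = newScalingLikePool(Math.min(allocatedProcessors * 3, 50), 30L);
        snapshotMetaLike.execute(() -> System.out.println("metadata read would run here"));
        snapshotMetaLike.shutdown();
    }
}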

server/src/test/java/org/elasticsearch/threadpool/ScalingThreadPoolTests.java

Lines changed: 3 additions & 1 deletion
@@ -62,7 +62,8 @@ public void testScalingThreadPoolConfiguration() throws InterruptedException {
             keepAlive = randomIntBetween(1, 300);
             builder.put("thread_pool." + threadPoolName + ".keep_alive", keepAlive + "s");
         } else {
-            keepAlive = "generic".equals(threadPoolName) ? 30 : 300; // the defaults
+            keepAlive = "generic".equals(threadPoolName) || ThreadPool.Names.SNAPSHOT_META.equals(threadPoolName)
+                ? 30 : 300; // the defaults
         }
 
         runScalingThreadPoolTest(builder.build(), (clusterSettings, threadPool) -> {
@@ -96,6 +97,7 @@ private int expectedSize(final String threadPoolName, final int numberOfProcesso
         sizes.put(ThreadPool.Names.REFRESH, ThreadPool::halfAllocatedProcessorsMaxTen);
         sizes.put(ThreadPool.Names.WARMER, ThreadPool::halfAllocatedProcessorsMaxFive);
         sizes.put(ThreadPool.Names.SNAPSHOT, ThreadPool::halfAllocatedProcessorsMaxFive);
+        sizes.put(ThreadPool.Names.SNAPSHOT_META, n -> Math.min(n * 3, 50));
         sizes.put(ThreadPool.Names.FETCH_SHARD_STARTED, ThreadPool::twiceAllocatedProcessors);
         sizes.put(ThreadPool.Names.FETCH_SHARD_STORE, ThreadPool::twiceAllocatedProcessors);
         return sizes.get(threadPoolName).apply(numberOfProcessors);
