
Commit 452f7f6

Defer reroute when starting shards (#44539)
Today we reroute the cluster as part of the process of starting a shard, which runs at `URGENT` priority. In large clusters, rerouting may take some time to complete, and this means that a mere trickle of shard-started events can cause starvation for other, lower-priority, tasks that are pending on the master. However, it isn't really necessary to perform a reroute when starting a shard, as long as one occurs eventually. This commit removes the inline reroute from the process of starting a shard and replaces it with a deferred one that runs at `NORMAL` priority, avoiding starvation of higher-priority tasks. Backport of #44433 and #44543.
1 parent 4c95cc3 commit 452f7f6
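
The heart of the change is small: the shard-started cluster-state executor stops rerouting inline and instead schedules a follow-up reroute once the new cluster state has been published, at whatever priority the new dynamic setting supplies (NORMAL by default). The following is a minimal, self-contained sketch of that shape; Priority, RerouteService and the executor below are simplified stand-ins for the Elasticsearch classes touched by this commit, not the real implementations.

import java.util.function.Supplier;

public class DeferredRerouteSketch {

    enum Priority { NORMAL, HIGH, URGENT }

    /** Stand-in for org.elasticsearch.cluster.routing.RerouteService. */
    interface RerouteService {
        void reroute(String reason, Priority priority);
    }

    /** Stand-in for ShardStateAction.ShardStartedClusterStateTaskExecutor. */
    static class ShardStartedExecutor {
        private final RerouteService rerouteService;
        private final Supplier<Priority> prioritySupplier;

        ShardStartedExecutor(RerouteService rerouteService, Supplier<Priority> prioritySupplier) {
            this.rerouteService = rerouteService;
            this.prioritySupplier = prioritySupplier;
        }

        // Invoked only after the cluster state that marks the shards as started has
        // been published; the reroute is deferred to this point rather than performed
        // inline while applying the shard-started tasks.
        void clusterStatePublished() {
            rerouteService.reroute("reroute after starting shards", prioritySupplier.get());
        }
    }

    public static void main(String[] args) {
        RerouteService rerouteService =
            (reason, priority) -> System.out.println("reroute [" + reason + "] at priority " + priority);
        ShardStartedExecutor executor = new ShardStartedExecutor(rerouteService, () -> Priority.NORMAL);
        executor.clusterStatePublished();
        // prints: reroute [reroute after starting shards] at priority NORMAL
    }
}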

63 files changed: +630 additions, -517 deletions


docs/reference/migration/migrate_7_4.asciidoc

Lines changed: 24 additions & 1 deletion
@@ -67,4 +67,27 @@ unsupported on buckets created after September 30th 2020.
 Starting in version 7.4, a `+` in a URL will be encoded as `%2B` by all REST API functionality. Prior versions handled a `+` as a single space.
 If your application requires handling `+` as a single space you can return to the old behaviour by setting the system property
 `es.rest.url_plus_as_space` to `true`. Note that this behaviour is deprecated and setting this system property to `true` will cease
-to be supported in version 8.
+to be supported in version 8.
+
+[float]
+[[breaking_74_cluster_changes]]
+=== Cluster changes
+
+[float]
+==== Rerouting after starting a shard runs at lower priority
+
+After starting each shard the elected master node must perform a reroute to
+search for other shards that could be allocated. In particular, when creating
+an index it is this task that allocates the replicas once the primaries have
+started. In versions prior to 7.4 this task runs at priority `URGENT`, but
+starting in version 7.4 its priority is reduced to `NORMAL`. In a
+well-configured cluster this reduces the amount of work the master must do, but
+means that a cluster with a master that is overloaded with other tasks at
+`HIGH` or `URGENT` priority may take longer to allocate all replicas.
+
+Additionally, before 7.4 the `GET
+_cluster/health?wait_for_no_initializing_shards` and `GET
+_cluster/health?wait_for_no_relocating_shards` APIs would return only once all
+pending reroutes had completed too, but starting in version 7.4 if you want to
+wait for the rerouting process to completely finish you should add the
+`wait_for_events=languid` query parameter when calling these APIs.
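
The last paragraph of the new migration note is actionable: callers that previously relied on the cluster health API waiting for the post-start reroutes now need to add wait_for_events=languid themselves. A hedged illustration of such a call, assuming an Elasticsearch node reachable at http://localhost:9200 and using only the JDK's HttpURLConnection:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class WaitForLanguidEvents {
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://localhost:9200/_cluster/health"
            + "?wait_for_no_initializing_shards=true&wait_for_events=languid");
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod("GET");
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
            // The response is the usual cluster health JSON; it is returned only once the
            // shards have initialized and the pending (NORMAL-priority) reroutes have drained.
            reader.lines().forEach(System.out::println);
        } finally {
            connection.disconnect();
        }
    }
}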

server/src/main/java/org/elasticsearch/cluster/action/shard/ShardStateAction.java

Lines changed: 57 additions & 5 deletions
@@ -48,6 +48,7 @@
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.common.settings.Setting;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.index.shard.ShardId;
 import org.elasticsearch.node.NodeClosedException;
@@ -72,6 +73,7 @@
 import java.util.Objects;
 import java.util.Set;
 import java.util.function.Predicate;
+import java.util.function.Supplier;
 
 import static org.elasticsearch.index.seqno.SequenceNumbers.UNASSIGNED_PRIMARY_TERM;
 
@@ -82,10 +84,34 @@ public class ShardStateAction {
     public static final String SHARD_STARTED_ACTION_NAME = "internal:cluster/shard/started";
     public static final String SHARD_FAILED_ACTION_NAME = "internal:cluster/shard/failure";
 
+    /**
+     * Adjusts the priority of the followup reroute task. NORMAL is right for reasonable clusters, but in a badly configured cluster it may
+     * be necessary to raise this higher to recover the older behaviour of rerouting after processing every shard-started task. Deliberately
+     * undocumented, since this is a last-resort escape hatch for experts rather than something we want to expose to anyone, and deprecated
+     * since we will remove it once we have confirmed from experience that this priority is appropriate in all cases.
+     */
+    public static final Setting<Priority> FOLLOW_UP_REROUTE_PRIORITY_SETTING
+        = new Setting<>("cluster.routing.allocation.shard_state.reroute.priority", Priority.NORMAL.toString(),
+            ShardStateAction::parseReroutePriority, Setting.Property.NodeScope, Setting.Property.Dynamic, Setting.Property.Deprecated);
+
+    private static Priority parseReroutePriority(String priorityString) {
+        final Priority priority = Priority.valueOf(priorityString.toUpperCase(Locale.ROOT));
+        switch (priority) {
+            case NORMAL:
+            case HIGH:
+            case URGENT:
+                return priority;
+        }
+        throw new IllegalArgumentException(
+            "priority [" + priority + "] not supported for [" + FOLLOW_UP_REROUTE_PRIORITY_SETTING.getKey() + "]");
+    }
+
     private final TransportService transportService;
     private final ClusterService clusterService;
     private final ThreadPool threadPool;
 
+    private volatile Priority followUpRerouteTaskPriority;
+
     // a list of shards that failed during replication
     // we keep track of these shards in order to avoid sending duplicate failed shard requests for a single failing shard.
     private final TransportRequestDeduplicator<FailedShardEntry> remoteFailedShardsDeduplicator = new TransportRequestDeduplicator<>();
@@ -97,11 +123,18 @@ public ShardStateAction(ClusterService clusterService, TransportService transpor
         this.clusterService = clusterService;
         this.threadPool = threadPool;
 
+        followUpRerouteTaskPriority = FOLLOW_UP_REROUTE_PRIORITY_SETTING.get(clusterService.getSettings());
+        clusterService.getClusterSettings().addSettingsUpdateConsumer(FOLLOW_UP_REROUTE_PRIORITY_SETTING,
+            this::setFollowUpRerouteTaskPriority);
+
         transportService.registerRequestHandler(SHARD_STARTED_ACTION_NAME, ThreadPool.Names.SAME, StartedShardEntry::new,
-            new ShardStartedTransportHandler(clusterService, new ShardStartedClusterStateTaskExecutor(allocationService, logger), logger));
+            new ShardStartedTransportHandler(clusterService,
+                new ShardStartedClusterStateTaskExecutor(allocationService, rerouteService, () -> followUpRerouteTaskPriority, logger),
+                logger));
         transportService.registerRequestHandler(SHARD_FAILED_ACTION_NAME, ThreadPool.Names.SAME, FailedShardEntry::new,
             new ShardFailedTransportHandler(clusterService,
-                new ShardFailedClusterStateTaskExecutor(allocationService, rerouteService, logger), logger));
+                new ShardFailedClusterStateTaskExecutor(allocationService, rerouteService, () -> followUpRerouteTaskPriority, logger),
+                logger));
     }
 
     private void sendShardAction(final String actionName, final ClusterState currentState,
@@ -218,6 +251,10 @@ public void onTimeout(TimeValue timeout) {
         }, changePredicate);
     }
 
+    private void setFollowUpRerouteTaskPriority(Priority followUpRerouteTaskPriority) {
+        this.followUpRerouteTaskPriority = followUpRerouteTaskPriority;
+    }
+
     private static class ShardFailedTransportHandler implements TransportRequestHandler<FailedShardEntry> {
         private final ClusterService clusterService;
         private final ShardFailedClusterStateTaskExecutor shardFailedClusterStateTaskExecutor;
@@ -285,11 +322,14 @@ public static class ShardFailedClusterStateTaskExecutor implements ClusterStateT
         private final AllocationService allocationService;
         private final RerouteService rerouteService;
         private final Logger logger;
+        private final Supplier<Priority> prioritySupplier;
 
-        public ShardFailedClusterStateTaskExecutor(AllocationService allocationService, RerouteService rerouteService, Logger logger) {
+        public ShardFailedClusterStateTaskExecutor(AllocationService allocationService, RerouteService rerouteService,
+                                                   Supplier<Priority> prioritySupplier, Logger logger) {
             this.allocationService = allocationService;
             this.rerouteService = rerouteService;
             this.logger = logger;
+            this.prioritySupplier = prioritySupplier;
         }
 
         @Override
@@ -383,7 +423,7 @@ public void clusterStatePublished(ClusterChangedEvent clusterChangedEvent) {
                 // assign it again, even if that means putting it back on the node on which it previously failed:
                 final String reason = String.format(Locale.ROOT, "[%d] unassigned shards after failing shards", numberOfUnassignedShards);
                 logger.trace("{}, scheduling a reroute", reason);
-                rerouteService.reroute(reason, Priority.HIGH, ActionListener.wrap(
+                rerouteService.reroute(reason, prioritySupplier.get(), ActionListener.wrap(
                     r -> logger.trace("{}, reroute completed", reason),
                     e -> logger.debug(new ParameterizedMessage("{}, reroute failed", reason), e)));
             }
@@ -520,10 +560,15 @@ public static class ShardStartedClusterStateTaskExecutor
             implements ClusterStateTaskExecutor<StartedShardEntry>, ClusterStateTaskListener {
         private final AllocationService allocationService;
         private final Logger logger;
+        private final RerouteService rerouteService;
+        private final Supplier<Priority> prioritySupplier;
 
-        public ShardStartedClusterStateTaskExecutor(AllocationService allocationService, Logger logger) {
+        public ShardStartedClusterStateTaskExecutor(AllocationService allocationService, RerouteService rerouteService,
+                                                    Supplier<Priority> prioritySupplier, Logger logger) {
             this.allocationService = allocationService;
             this.logger = logger;
+            this.rerouteService = rerouteService;
+            this.prioritySupplier = prioritySupplier;
         }
 
         @Override
@@ -598,6 +643,13 @@ public void onFailure(String source, Exception e) {
                 logger.error(() -> new ParameterizedMessage("unexpected failure during [{}]", source), e);
             }
         }
+
+        @Override
+        public void clusterStatePublished(ClusterChangedEvent clusterChangedEvent) {
+            rerouteService.reroute("reroute after starting shards", prioritySupplier.get(), ActionListener.wrap(
+                r -> logger.trace("reroute after starting shards succeeded"),
+                e -> logger.debug("reroute after starting shards failed", e)));
+        }
     }
 
     public static class StartedShardEntry extends TransportRequest {
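
Because FOLLOW_UP_REROUTE_PRIORITY_SETTING is registered as Dynamic, the follow-up reroute priority can be raised at runtime via the cluster settings API if the default NORMAL priority proves too low on a struggling cluster. A hedged sketch of such an update, assuming a node at http://localhost:9200; note that the setting is deliberately undocumented and already deprecated, so this is an expert-only escape hatch.

import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class RaiseFollowUpReroutePriority {
    public static void main(String[] args) throws Exception {
        // Only NORMAL, HIGH and URGENT are accepted; anything else is rejected by
        // parseReroutePriority above with an IllegalArgumentException.
        String body = "{\"transient\":{"
            + "\"cluster.routing.allocation.shard_state.reroute.priority\":\"HIGH\"}}";
        HttpURLConnection connection =
            (HttpURLConnection) new URL("http://localhost:9200/_cluster/settings").openConnection();
        connection.setRequestMethod("PUT");
        connection.setRequestProperty("Content-Type", "application/json");
        connection.setDoOutput(true);
        try (OutputStream out = connection.getOutputStream()) {
            out.write(body.getBytes(StandardCharsets.UTF_8));
        }
        System.out.println("HTTP " + connection.getResponseCode());
        connection.disconnect();
    }
}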

server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java

Lines changed: 1 addition & 1 deletion
@@ -109,7 +109,7 @@ public ClusterState applyStartedShards(ClusterState clusterState, List<ShardRout
         Collections.sort(startedShards, Comparator.comparing(ShardRouting::primary));
         applyStartedShards(allocation, startedShards);
         gatewayAllocator.applyStartedShards(allocation, startedShards);
-        reroute(allocation);
+        assert RoutingNodes.assertShardStats(allocation.routingNodes());
         String startedShardsAsString
             = firstListElementsToCommaDelimitedString(startedShards, s -> s.shardId().toString(), logger.isDebugEnabled());
         return buildResultAndLogHealthChange(clusterState, allocation, "shards started [" + startedShardsAsString + "]");

server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java

Lines changed: 2 additions & 0 deletions
@@ -32,6 +32,7 @@
 import org.elasticsearch.cluster.InternalClusterInfoService;
 import org.elasticsearch.cluster.NodeConnectionsService;
 import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
+import org.elasticsearch.cluster.action.shard.ShardStateAction;
 import org.elasticsearch.cluster.coordination.ClusterBootstrapService;
 import org.elasticsearch.cluster.coordination.ClusterFormationFailureHelper;
 import org.elasticsearch.cluster.coordination.Coordinator;
@@ -226,6 +227,7 @@ public void apply(Settings value, Settings current, Settings previous) {
             DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_INCLUDE_RELOCATIONS_SETTING,
             DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING,
             SameShardAllocationDecider.CLUSTER_ROUTING_ALLOCATION_SAME_HOST_SETTING,
+            ShardStateAction.FOLLOW_UP_REROUTE_PRIORITY_SETTING,
             InternalClusterInfoService.INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL_SETTING,
             InternalClusterInfoService.INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING,
             DestructiveOperations.REQUIRES_NAME_SETTING,

server/src/test/java/org/elasticsearch/action/admin/indices/shrink/TransportResizeActionTests.java

Lines changed: 5 additions & 9 deletions
@@ -25,6 +25,7 @@
 import org.elasticsearch.action.support.ActiveShardCount;
 import org.elasticsearch.cluster.ClusterName;
 import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.cluster.ESAllocationTestCase;
 import org.elasticsearch.cluster.EmptyClusterInfoService;
 import org.elasticsearch.cluster.block.ClusterBlocks;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
@@ -33,7 +34,6 @@
 import org.elasticsearch.cluster.node.DiscoveryNodeRole;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.routing.RoutingTable;
-import org.elasticsearch.cluster.routing.ShardRoutingState;
 import org.elasticsearch.cluster.routing.allocation.AllocationService;
 import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator;
 import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders;
@@ -116,8 +116,7 @@ public void testErrorCondition() {
         RoutingTable routingTable = service.reroute(clusterState, "reroute").routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
         // now we start the shard
-        routingTable = service.applyStartedShards(clusterState,
-            routingTable.index("source").shardsWithState(ShardRoutingState.INITIALIZING)).routingTable();
+        routingTable = ESAllocationTestCase.startInitializingShardsAndReroute(service, clusterState, "source").routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
 
         TransportResizeAction.prepareCreateIndexRequest(new ResizeRequest("target", "source"), clusterState,
@@ -135,8 +134,7 @@ public void testPassNumRoutingShards() {
         RoutingTable routingTable = service.reroute(clusterState, "reroute").routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
         // now we start the shard
-        routingTable = service.applyStartedShards(clusterState,
-            routingTable.index("source").shardsWithState(ShardRoutingState.INITIALIZING)).routingTable();
+        routingTable = ESAllocationTestCase.startInitializingShardsAndReroute(service, clusterState, "source").routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
 
         ResizeRequest resizeRequest = new ResizeRequest("target", "source");
@@ -165,8 +163,7 @@ public void testPassNumRoutingShardsAndFail() {
         RoutingTable routingTable = service.reroute(clusterState, "reroute").routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
         // now we start the shard
-        routingTable = service.applyStartedShards(clusterState,
-            routingTable.index("source").shardsWithState(ShardRoutingState.INITIALIZING)).routingTable();
+        routingTable = ESAllocationTestCase.startInitializingShardsAndReroute(service, clusterState, "source").routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
 
         ResizeRequest resizeRequest = new ResizeRequest("target", "source");
@@ -200,8 +197,7 @@ public void testShrinkIndexSettings() {
         RoutingTable routingTable = service.reroute(clusterState, "reroute").routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
         // now we start the shard
-        routingTable = service.applyStartedShards(clusterState,
-            routingTable.index(indexName).shardsWithState(ShardRoutingState.INITIALIZING)).routingTable();
+        routingTable = ESAllocationTestCase.startInitializingShardsAndReroute(service, clusterState, indexName).routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
         int numSourceShards = clusterState.metaData().index(indexName).getNumberOfShards();
         DocsStats stats = new DocsStats(between(0, (IndexWriter.MAX_DOCS) / numSourceShards), between(1, 1000), between(1, 10000));
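
The tests above stop calling applyStartedShards directly and go through a new ESAllocationTestCase helper, because applyStartedShards no longer performs the reroute itself. A plausible sketch of what such a helper has to do, assuming the Elasticsearch classes shown in these diffs are on the classpath; this illustrates the shape of the helper rather than reproducing the backported implementation.

import java.util.List;

import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.ShardRoutingState;
import org.elasticsearch.cluster.routing.allocation.AllocationService;

public final class StartShardsHelperSketch {

    public static ClusterState startInitializingShardsAndReroute(AllocationService allocationService,
                                                                 ClusterState clusterState, String index) {
        // Collect the INITIALIZING copies of the index, as the old test code did inline.
        List<ShardRouting> initializingShards =
            clusterState.routingTable().index(index).shardsWithState(ShardRoutingState.INITIALIZING);
        // Mark them as started; since this commit this step no longer reroutes by itself.
        ClusterState startedState = allocationService.applyStartedShards(clusterState, initializingShards);
        // Perform the reroute that production code now defers to a follow-up task.
        return allocationService.reroute(startedState, "reroute after starting shards");
    }
}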

server/src/test/java/org/elasticsearch/action/support/replication/TransportReplicationActionTests.java

Lines changed: 1 addition & 1 deletion
@@ -466,7 +466,7 @@ public void testNoRerouteOnStaleClusterState() {
         ShardRouting relocationTarget = clusterService.state().getRoutingTable().shardRoutingTable(shardId)
             .shardsWithState(ShardRoutingState.INITIALIZING).get(0);
         AllocationService allocationService = ESAllocationTestCase.createAllocationService();
-        ClusterState updatedState = allocationService.applyStartedShards(state, Collections.singletonList(relocationTarget));
+        ClusterState updatedState = ESAllocationTestCase.startShardsAndReroute(allocationService, state, relocationTarget);
 
         setState(clusterService, updatedState);
         logger.debug("--> relocation complete state:\n{}", clusterService.state());
