Commit e14300e

Remove followup reroute priority setting (#44611)
In #44433 we introduced a temporary (immediately deprecated) escape-hatch setting to control the priority of the reroute scheduled after starting a batch of shards. This commit removes this setting in `master`, fixing the followup reroute's priority at `NORMAL`.
1 parent: 802bd2b · commit: e14300e
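
For orientation, here is a minimal standalone sketch of what the change amounts to. It does not use the real Elasticsearch classes: the `Priority` enum and `RerouteService` interface below are simplified stand-ins for `org.elasticsearch.common.Priority` and the reroute service, and `FollowupRerouteSketch`, `rerouteBefore` and `rerouteAfter` are hypothetical names chosen for illustration. Previously the followup reroute's priority came from the deprecated dynamic setting via a `Supplier<Priority>`; after this commit it is always submitted at `NORMAL`.

    import java.util.function.Supplier;

    // Simplified stand-ins for illustration only; the real types are
    // org.elasticsearch.common.Priority and the cluster's RerouteService.
    enum Priority { URGENT, HIGH, NORMAL }

    interface RerouteService {
        void reroute(String reason, Priority priority);
    }

    class FollowupRerouteSketch {

        // Before this commit: the followup reroute's priority was read from the
        // (deprecated, dynamic) cluster.routing.allocation.shard_state.reroute.priority
        // setting and threaded into the executors as a Supplier<Priority>.
        static void rerouteBefore(RerouteService rerouteService, Supplier<Priority> prioritySupplier) {
            rerouteService.reroute("reroute after starting shards", prioritySupplier.get());
        }

        // After this commit: the priority is fixed at NORMAL, so the supplier is gone.
        static void rerouteAfter(RerouteService rerouteService) {
            rerouteService.reroute("reroute after starting shards", Priority.NORMAL);
        }

        public static void main(String[] args) {
            RerouteService logOnly = (reason, priority) ->
                System.out.println("reroute [" + reason + "] at priority " + priority);

            rerouteBefore(logOnly, () -> Priority.HIGH); // the old escape hatch could raise this
            rerouteAfter(logOnly);                       // now always NORMAL
        }
    }

In the actual diff below, this shows up as the removal of the `Supplier<Priority> prioritySupplier` constructor parameter from the two cluster-state task executors and the hard-coded `Priority.NORMAL` in the `rerouteService.reroute(...)` calls.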

7 files changed: +13, -197 lines changed

server/src/main/java/org/elasticsearch/cluster/action/shard/ShardStateAction.java

Lines changed: 6 additions & 46 deletions
@@ -47,7 +47,6 @@
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
-import org.elasticsearch.common.settings.Setting;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.index.shard.ShardId;
 import org.elasticsearch.node.NodeClosedException;
@@ -72,7 +71,6 @@
 import java.util.Objects;
 import java.util.Set;
 import java.util.function.Predicate;
-import java.util.function.Supplier;
 
 public class ShardStateAction {
 
@@ -81,34 +79,10 @@ public class ShardStateAction {
     public static final String SHARD_STARTED_ACTION_NAME = "internal:cluster/shard/started";
     public static final String SHARD_FAILED_ACTION_NAME = "internal:cluster/shard/failure";
 
-    /**
-     * Adjusts the priority of the followup reroute task. NORMAL is right for reasonable clusters, but in a badly configured cluster it may
-     * be necessary to raise this higher to recover the older behaviour of rerouting after processing every shard-started task. Deliberately
-     * undocumented, since this is a last-resort escape hatch for experts rather than something we want to expose to anyone, and deprecated
-     * since we will remove it once we have confirmed from experience that this priority is appropriate in all cases.
-     */
-    public static final Setting<Priority> FOLLOW_UP_REROUTE_PRIORITY_SETTING
-        = new Setting<>("cluster.routing.allocation.shard_state.reroute.priority", Priority.NORMAL.toString(),
-            ShardStateAction::parseReroutePriority, Setting.Property.NodeScope, Setting.Property.Dynamic, Setting.Property.Deprecated);
-
-    private static Priority parseReroutePriority(String priorityString) {
-        final Priority priority = Priority.valueOf(priorityString.toUpperCase(Locale.ROOT));
-        switch (priority) {
-            case NORMAL:
-            case HIGH:
-            case URGENT:
-                return priority;
-        }
-        throw new IllegalArgumentException(
-            "priority [" + priority + "] not supported for [" + FOLLOW_UP_REROUTE_PRIORITY_SETTING.getKey() + "]");
-    }
-
     private final TransportService transportService;
     private final ClusterService clusterService;
     private final ThreadPool threadPool;
 
-    private volatile Priority followUpRerouteTaskPriority;
-
     // a list of shards that failed during replication
     // we keep track of these shards in order to avoid sending duplicate failed shard requests for a single failing shard.
     private final TransportRequestDeduplicator<FailedShardEntry> remoteFailedShardsDeduplicator = new TransportRequestDeduplicator<>();
@@ -120,17 +94,13 @@ public ShardStateAction(ClusterService clusterService, TransportService transpor
         this.clusterService = clusterService;
         this.threadPool = threadPool;
 
-        followUpRerouteTaskPriority = FOLLOW_UP_REROUTE_PRIORITY_SETTING.get(clusterService.getSettings());
-        clusterService.getClusterSettings().addSettingsUpdateConsumer(FOLLOW_UP_REROUTE_PRIORITY_SETTING,
-            this::setFollowUpRerouteTaskPriority);
-
         transportService.registerRequestHandler(SHARD_STARTED_ACTION_NAME, ThreadPool.Names.SAME, StartedShardEntry::new,
             new ShardStartedTransportHandler(clusterService,
-                new ShardStartedClusterStateTaskExecutor(allocationService, rerouteService, () -> followUpRerouteTaskPriority, logger),
+                new ShardStartedClusterStateTaskExecutor(allocationService, rerouteService, logger),
                 logger));
         transportService.registerRequestHandler(SHARD_FAILED_ACTION_NAME, ThreadPool.Names.SAME, FailedShardEntry::new,
             new ShardFailedTransportHandler(clusterService,
-                new ShardFailedClusterStateTaskExecutor(allocationService, rerouteService, () -> followUpRerouteTaskPriority, logger),
+                new ShardFailedClusterStateTaskExecutor(allocationService, rerouteService, logger),
                 logger));
     }
 
@@ -248,10 +218,6 @@ public void onTimeout(TimeValue timeout) {
         }, changePredicate);
     }
 
-    private void setFollowUpRerouteTaskPriority(Priority followUpRerouteTaskPriority) {
-        this.followUpRerouteTaskPriority = followUpRerouteTaskPriority;
-    }
-
     private static class ShardFailedTransportHandler implements TransportRequestHandler<FailedShardEntry> {
         private final ClusterService clusterService;
         private final ShardFailedClusterStateTaskExecutor shardFailedClusterStateTaskExecutor;
@@ -319,14 +285,11 @@ public static class ShardFailedClusterStateTaskExecutor implements ClusterStateT
         private final AllocationService allocationService;
         private final RerouteService rerouteService;
         private final Logger logger;
-        private final Supplier<Priority> prioritySupplier;
 
-        public ShardFailedClusterStateTaskExecutor(AllocationService allocationService, RerouteService rerouteService,
-                                                   Supplier<Priority> prioritySupplier, Logger logger) {
+        public ShardFailedClusterStateTaskExecutor(AllocationService allocationService, RerouteService rerouteService, Logger logger) {
             this.allocationService = allocationService;
             this.rerouteService = rerouteService;
             this.logger = logger;
-            this.prioritySupplier = prioritySupplier;
         }
 
         @Override
@@ -420,7 +383,7 @@ public void clusterStatePublished(ClusterChangedEvent clusterChangedEvent) {
             // assign it again, even if that means putting it back on the node on which it previously failed:
             final String reason = String.format(Locale.ROOT, "[%d] unassigned shards after failing shards", numberOfUnassignedShards);
             logger.trace("{}, scheduling a reroute", reason);
-            rerouteService.reroute(reason, prioritySupplier.get(), ActionListener.wrap(
+            rerouteService.reroute(reason, Priority.NORMAL, ActionListener.wrap(
                 r -> logger.trace("{}, reroute completed", reason),
                 e -> logger.debug(new ParameterizedMessage("{}, reroute failed", reason), e)));
         }
@@ -552,14 +515,11 @@ public static class ShardStartedClusterStateTaskExecutor
         private final AllocationService allocationService;
         private final Logger logger;
         private final RerouteService rerouteService;
-        private final Supplier<Priority> prioritySupplier;
 
-        public ShardStartedClusterStateTaskExecutor(AllocationService allocationService, RerouteService rerouteService,
-                                                    Supplier<Priority> prioritySupplier, Logger logger) {
+        public ShardStartedClusterStateTaskExecutor(AllocationService allocationService, RerouteService rerouteService, Logger logger) {
             this.allocationService = allocationService;
             this.logger = logger;
             this.rerouteService = rerouteService;
-            this.prioritySupplier = prioritySupplier;
         }
 
         @Override
@@ -637,7 +597,7 @@ public void onFailure(String source, Exception e) {
 
         @Override
         public void clusterStatePublished(ClusterChangedEvent clusterChangedEvent) {
-            rerouteService.reroute("reroute after starting shards", prioritySupplier.get(), ActionListener.wrap(
+            rerouteService.reroute("reroute after starting shards", Priority.NORMAL, ActionListener.wrap(
                 r -> logger.trace("reroute after starting shards succeeded"),
                 e -> logger.debug("reroute after starting shards failed", e)));
         }

server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java

Lines changed: 0 additions & 2 deletions
@@ -31,7 +31,6 @@
 import org.elasticsearch.cluster.InternalClusterInfoService;
 import org.elasticsearch.cluster.NodeConnectionsService;
 import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
-import org.elasticsearch.cluster.action.shard.ShardStateAction;
 import org.elasticsearch.cluster.coordination.ClusterBootstrapService;
 import org.elasticsearch.cluster.coordination.ClusterFormationFailureHelper;
 import org.elasticsearch.cluster.coordination.Coordinator;
@@ -214,7 +213,6 @@ public void apply(Settings value, Settings current, Settings previous) {
             DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_INCLUDE_RELOCATIONS_SETTING,
             DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING,
             SameShardAllocationDecider.CLUSTER_ROUTING_ALLOCATION_SAME_HOST_SETTING,
-            ShardStateAction.FOLLOW_UP_REROUTE_PRIORITY_SETTING,
             InternalClusterInfoService.INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL_SETTING,
             InternalClusterInfoService.INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING,
             DestructiveOperations.REQUIRES_NAME_SETTING,

server/src/test/java/org/elasticsearch/cluster/action/shard/ShardFailedClusterStateTaskExecutorTests.java

Lines changed: 2 additions & 3 deletions
@@ -41,7 +41,6 @@
 import org.elasticsearch.cluster.routing.allocation.FailedShard;
 import org.elasticsearch.cluster.routing.allocation.StaleShard;
 import org.elasticsearch.cluster.routing.allocation.decider.ClusterRebalanceAllocationDecider;
-import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.UUIDs;
 import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.common.settings.Settings;
@@ -87,7 +86,7 @@ public void setUp() throws Exception {
             .build();
         clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
             .metaData(metaData).routingTable(routingTable).build();
-        executor = new ShardStateAction.ShardFailedClusterStateTaskExecutor(allocationService, null, () -> Priority.NORMAL, logger);
+        executor = new ShardStateAction.ShardFailedClusterStateTaskExecutor(allocationService, null, logger);
     }
 
     public void testEmptyTaskListProducesSameClusterState() throws Exception {
@@ -119,7 +118,7 @@ public void testTriviallySuccessfulTasksBatchedWithFailingTasks() throws Excepti
         List<FailedShardEntry> failingTasks = createExistingShards(currentState, reason);
         List<FailedShardEntry> nonExistentTasks = createNonExistentShards(currentState, reason);
         ShardStateAction.ShardFailedClusterStateTaskExecutor failingExecutor =
-            new ShardStateAction.ShardFailedClusterStateTaskExecutor(allocationService, null, () -> Priority.NORMAL, logger) {
+            new ShardStateAction.ShardFailedClusterStateTaskExecutor(allocationService, null, logger) {
                 @Override
                 ClusterState applyFailedShards(ClusterState currentState, List<FailedShard> failedShards, List<StaleShard> staleShards) {
                     throw new RuntimeException("simulated applyFailedShards failure");

server/src/test/java/org/elasticsearch/cluster/action/shard/ShardStartedClusterStateTaskExecutorTests.java

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ public void setUp() throws Exception {
             .put(CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), Integer.MAX_VALUE)
             .build());
         executor = new ShardStateAction.ShardStartedClusterStateTaskExecutor(allocationService,
-            ShardStartedClusterStateTaskExecutorTests::neverReroutes, () -> Priority.NORMAL, logger);
+            ShardStartedClusterStateTaskExecutorTests::neverReroutes, logger);
     }
 
     public void testEmptyTaskListProducesSameClusterState() throws Exception {

server/src/test/java/org/elasticsearch/cluster/action/shard/ShardStateActionIT.java

Lines changed: 0 additions & 139 deletions
This file was deleted.

server/src/test/java/org/elasticsearch/cluster/routing/allocation/InSyncAllocationIdTests.java

Lines changed: 2 additions & 3 deletions
@@ -32,7 +32,6 @@
 import org.elasticsearch.cluster.routing.ShardRouting;
 import org.elasticsearch.cluster.routing.allocation.command.AllocateEmptyPrimaryAllocationCommand;
 import org.elasticsearch.cluster.routing.allocation.command.AllocationCommands;
-import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.util.set.Sets;
 import org.junit.Before;
@@ -59,7 +58,7 @@ public class InSyncAllocationIdTests extends ESAllocationTestCase {
     public void setupAllocationService() {
         allocation = createAllocationService();
         failedClusterStateTaskExecutor
-            = new ShardStateAction.ShardFailedClusterStateTaskExecutor(allocation, null, () -> Priority.NORMAL, logger);
+            = new ShardStateAction.ShardFailedClusterStateTaskExecutor(allocation, null, logger);
     }
 
     public void testInSyncAllocationIdsUpdated() {
@@ -165,7 +164,7 @@ public void testDeadNodesBeforeReplicaFailed() throws Exception {
         logger.info("fail replica (for which there is no shard routing in the CS anymore)");
         assertNull(clusterState.getRoutingNodes().getByAllocationId(replicaShard.shardId(), replicaShard.allocationId().getId()));
         ShardStateAction.ShardFailedClusterStateTaskExecutor failedClusterStateTaskExecutor =
-            new ShardStateAction.ShardFailedClusterStateTaskExecutor(allocation, null, () -> Priority.NORMAL, logger);
+            new ShardStateAction.ShardFailedClusterStateTaskExecutor(allocation, null, logger);
         long primaryTerm = clusterState.metaData().index("test").primaryTerm(0);
         clusterState = failedClusterStateTaskExecutor.execute(clusterState, Arrays.asList(
             new FailedShardEntry(shardRoutingTable.shardId(), replicaShard.allocationId().getId(), primaryTerm, "dummy", null, true))
