From c0fb07cfb770626dd77ccbea2631e1ed1b25fa6d Mon Sep 17 00:00:00 2001 From: David Turner Date: Mon, 7 Nov 2022 10:01:10 +0000 Subject: [PATCH 01/15] Introduce desired-balance allocator Today, when updating the routing table (i.e. within `AllocationService#reroute()`), Elasticsearch computes the desired balance of shards and then identifies some shard movements that work towards that goal. At the end of the computation it discards the computed desired allocation and recomputes it the next time round. Recomputing the desired allocation each time is inefficient, and it makes it hard to predict how long it will take until the goal is reached. The computation also happens on the critical path for cluster state updates. With this commit we introduce a new allocator which keeps hold of the desired balance between iterations. It also computes the desired balance asynchronously, allowing other cluster state updates to happen while the computation is ongoing. Relates #88647, #83777, and many more. 
--- .../allocation/AllocationBenchmark.java | 6 +- .../DataStreamGetWriteIndexTests.java | 7 +- .../coordination/RareClusterStateIT.java | 4 +- .../gateway/ReplicaShardAllocatorIT.java | 16 +- .../index/store/CorruptedFileIT.java | 1 + .../indices/IndicesLifecycleListenerIT.java | 2 +- .../indices/recovery/IndexRecoveryIT.java | 16 +- .../TransportClusterRerouteAction.java | 70 +- .../indices/create/AutoCreateAction.java | 53 +- .../rollover/MetadataRolloverService.java | 21 +- .../rollover/TransportRolloverAction.java | 26 +- .../cluster/ClusterInfoSimulator.java | 106 ++ .../elasticsearch/cluster/ClusterModule.java | 44 +- .../org/elasticsearch/cluster/DiskUsage.java | 4 + .../MetadataCreateDataStreamService.java | 48 +- .../metadata/MetadataCreateIndexService.java | 100 +- .../metadata/MetadataDeleteIndexService.java | 8 +- .../metadata/MetadataIndexStateService.java | 64 +- .../MetadataMigrateToDataStreamService.java | 18 +- .../MetadataUpdateSettingsService.java | 47 +- .../routing/BatchedRerouteService.java | 10 +- .../routing/DelayedAllocationService.java | 5 +- .../cluster/routing/RoutingNodes.java | 20 +- .../cluster/routing/RoutingTable.java | 5 + .../routing/allocation/AllocationService.java | 81 +- .../routing/allocation/RoutingAllocation.java | 57 +- .../allocator/AllocationActionListener.java | 102 ++ .../AllocationActionMultiListener.java | 98 ++ .../allocator/ContinuousComputation.java | 101 ++ .../allocation/allocator/DesiredBalance.java | 32 + .../allocator/DesiredBalanceComputer.java | 303 +++++ .../allocator/DesiredBalanceInput.java | 67 + .../allocator/DesiredBalanceReconciler.java | 438 +++++++ .../DesiredBalanceShardsAllocator.java | 330 +++++ .../allocator/DesiredBalanceStats.java | 70 + .../allocator/NodeAllocationOrdering.java | 44 + .../allocator/PendingListenersQueue.java | 88 ++ .../allocation/allocator/ShardAssignment.java | 35 + .../allocation/allocator/ShardsAllocator.java | 38 + .../allocation/decider/AllocationDecider.java | 13 + 
.../decider/AllocationDeciders.java | 14 + .../decider/EnableAllocationDecider.java | 8 + .../decider/FilterAllocationDecider.java | 18 + .../decider/MaxRetryAllocationDecider.java | 7 +- .../decider/ResizeAllocationDecider.java | 26 + .../SnapshotInProgressAllocationDecider.java | 4 + .../decider/ThrottlingAllocationDecider.java | 4 +- .../gateway/LocalAllocateDangledIndices.java | 29 +- .../java/org/elasticsearch/node/Node.java | 12 +- .../snapshots/RestoreService.java | 18 +- .../cluster/reroute/ClusterRerouteTests.java | 5 +- .../TransportRolloverActionTests.java | 2 +- .../shrink/TransportResizeActionTests.java | 11 +- .../cluster/ClusterModuleTests.java | 45 +- ...rdFailedClusterStateTaskExecutorTests.java | 2 +- .../MetadataCreateDataStreamServiceTests.java | 51 +- .../MetadataCreateIndexServiceTests.java | 25 +- .../MetadataDeleteIndexServiceTests.java | 7 +- ...tadataMigrateToDataStreamServiceTests.java | 10 +- .../routing/BatchedRerouteServiceTests.java | 29 +- .../DelayedAllocationServiceTests.java | 9 +- .../cluster/routing/PrimaryTermsTests.java | 5 +- .../cluster/routing/RoutingTableTests.java | 3 +- .../cluster/routing/UnassignedInfoTests.java | 13 +- .../allocation/AddIncrementallyTests.java | 13 +- .../allocation/AllocationCommandsTests.java | 201 ++- .../allocation/AllocationPriorityTests.java | 5 +- .../allocation/AllocationServiceTests.java | 3 +- .../allocation/AwarenessAllocationTests.java | 135 +- .../allocation/BalanceConfigurationTests.java | 11 +- .../BalanceUnbalancedClusterTests.java | 3 +- .../allocation/CatAllocationTestCase.java | 3 +- .../ClusterRebalanceRoutingTests.java | 37 +- .../ConcurrentRebalanceRoutingTests.java | 5 +- .../allocation/DeadNodesAllocationTests.java | 19 +- .../DecisionsImpactOnClusterHealthTests.java | 3 +- ...ReplicaAsPrimaryDuringRelocationTests.java | 5 +- .../ExpectedShardSizeAllocationTests.java | 11 +- .../allocation/FailedNodeRoutingTests.java | 3 +- .../allocation/FailedShardsRoutingTests.java | 83 +- 
.../allocation/FilterRoutingTests.java | 15 +- .../allocation/InSyncAllocationIdTests.java | 12 +- .../routing/allocation/IndexBalanceTests.java | 25 +- .../MaxRetryAllocationDeciderTests.java | 9 +- .../NodeVersionAllocationDeciderTests.java | 11 +- ...alPrimariesToRelocatingPrimariesTests.java | 5 +- .../PreferPrimaryAllocationTests.java | 7 +- .../PrimaryElectionRoutingTests.java | 7 +- ...yNotRelocatedWhileBeingRecoveredTests.java | 7 +- .../RandomAllocationDeciderTests.java | 5 +- .../allocation/RebalanceAfterActiveTests.java | 5 +- .../ReplicaAllocatedAfterPrimaryTests.java | 3 +- .../ResizeAllocationDeciderTests.java | 62 +- ...ResizeSourceIndexSettingsUpdaterTests.java | 5 +- .../RetryFailedAllocationTests.java | 11 +- .../RoutingNodesIntegrityTests.java | 23 +- .../allocation/SameShardRoutingTests.java | 5 +- .../allocation/ShardVersioningTests.java | 3 +- .../ShardsLimitAllocationTests.java | 13 +- .../SingleShardNoReplicasRoutingTests.java | 21 +- .../SingleShardOneReplicaRoutingTests.java | 9 +- .../TenShardsOneReplicaRoutingTests.java | 9 +- .../allocation/ThrottlingAllocationTests.java | 23 +- .../TrackFailedAllocationNodesTests.java | 6 +- .../UpdateNumberOfReplicasTests.java | 7 +- .../AllocationActionListenerTests.java | 159 +++ .../AllocationActionMultiListenerTests.java | 168 +++ .../BalancedShardsAllocatorTests.java | 3 +- .../allocator/ClusterInfoSimulatorTests.java | 379 ++++++ .../allocator/ContinuousComputationTests.java | 147 +++ .../DesiredBalanceComputerTests.java | 876 +++++++++++++ .../DesiredBalanceReconcilerTests.java | 1166 +++++++++++++++++ .../DesiredBalanceShardsAllocatorTests.java | 466 +++++++ .../allocator/DesiredBalanceStatsTests.java | 76 ++ .../NodeAllocationOrderingTests.java | 47 + .../allocator/PendingListenersQueueTests.java | 73 ++ .../decider/AllocationDecidersTests.java | 79 ++ .../decider/DiskThresholdDeciderTests.java | 56 +- .../DiskThresholdDeciderUnitTests.java | 5 +- .../EnableAllocationShortCircuitTests.java | 
11 +- .../decider/EnableAllocationTests.java | 19 +- .../decider/FilterAllocationDeciderTests.java | 54 +- ...NodeReplacementAllocationDeciderTests.java | 19 +- .../NodeShutdownAllocationDeciderTests.java | 17 +- .../ClusterSerializationTests.java | 7 +- .../ClusterStateToStringTests.java | 5 +- .../structure/RoutingIteratorTests.java | 7 +- .../indices/cluster/ClusterStateChanges.java | 5 +- .../snapshots/SnapshotResiliencyTests.java | 3 +- .../cluster/ESAllocationTestCase.java | 47 +- .../metadata/DataStreamTestHelper.java | 2 +- .../downsample/TransportRollupAction.java | 5 +- ...chableSnapshotEnableAllocationDecider.java | 4 + 133 files changed, 7054 insertions(+), 654 deletions(-) create mode 100644 server/src/main/java/org/elasticsearch/cluster/ClusterInfoSimulator.java create mode 100644 server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/AllocationActionListener.java create mode 100644 server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/AllocationActionMultiListener.java create mode 100644 server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ContinuousComputation.java create mode 100644 server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalance.java create mode 100644 server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java create mode 100644 server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceInput.java create mode 100644 server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java create mode 100644 server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceShardsAllocator.java create mode 100644 server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceStats.java create mode 100644 
server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/NodeAllocationOrdering.java create mode 100644 server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/PendingListenersQueue.java create mode 100644 server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ShardAssignment.java create mode 100644 server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/AllocationActionListenerTests.java create mode 100644 server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/AllocationActionMultiListenerTests.java create mode 100644 server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/ClusterInfoSimulatorTests.java create mode 100644 server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/ContinuousComputationTests.java create mode 100644 server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputerTests.java create mode 100644 server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java create mode 100644 server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceShardsAllocatorTests.java create mode 100644 server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceStatsTests.java create mode 100644 server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/NodeAllocationOrderingTests.java create mode 100644 server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/PendingListenersQueueTests.java diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/routing/allocation/AllocationBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/routing/allocation/AllocationBenchmark.java index a0065662faee5..2a8bf1b91ce74 100644 --- a/benchmarks/src/main/java/org/elasticsearch/benchmark/routing/allocation/AllocationBenchmark.java +++ 
b/benchmarks/src/main/java/org/elasticsearch/benchmark/routing/allocation/AllocationBenchmark.java @@ -8,6 +8,7 @@ package org.elasticsearch.benchmark.routing.allocation; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -150,6 +151,9 @@ private int toInt(String v) { return Integer.valueOf(v.trim()); } + /** + * Once we use DesiredBalanceShardsAllocator this only measures reconciliation, not the balance calculation + */ @Benchmark public ClusterState measureAllocation() { ClusterState clusterState = initialClusterState; @@ -162,7 +166,7 @@ public ClusterState measureAllocation() { .filter(ShardRouting::initializing) .collect(Collectors.toList()) ); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); } return clusterState; } diff --git a/modules/data-streams/src/test/java/org/elasticsearch/datastreams/DataStreamGetWriteIndexTests.java b/modules/data-streams/src/test/java/org/elasticsearch/datastreams/DataStreamGetWriteIndexTests.java index 2b7cf6374e14e..a239ff07b4329 100644 --- a/modules/data-streams/src/test/java/org/elasticsearch/datastreams/DataStreamGetWriteIndexTests.java +++ b/modules/data-streams/src/test/java/org/elasticsearch/datastreams/DataStreamGetWriteIndexTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.datastreams; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.DocWriteRequest; import org.elasticsearch.action.admin.indices.create.CreateIndexRequest; import org.elasticsearch.action.admin.indices.rollover.Condition; @@ -245,7 +246,7 @@ public void setup() throws Exception { Environment env = mock(Environment.class); when(env.sharedDataFile()).thenReturn(null); AllocationService allocationService = 
mock(AllocationService.class); - when(allocationService.reroute(any(ClusterState.class), any(String.class))).then(i -> i.getArguments()[0]); + when(allocationService.reroute(any(ClusterState.class), any(String.class), any())).then(i -> i.getArguments()[0]); ShardLimitValidator shardLimitValidator = new ShardLimitValidator(Settings.EMPTY, clusterService); createIndexService = new MetadataCreateIndexService( Settings.EMPTY, @@ -277,7 +278,7 @@ public void setup() throws Exception { ); } - createDataStreamService = new MetadataCreateDataStreamService(clusterService, createIndexService); + createDataStreamService = new MetadataCreateDataStreamService(testThreadPool, clusterService, createIndexService); } @After @@ -306,7 +307,7 @@ private ClusterState createDataStream(ClusterState state, String name, Instant t TimeValue.ZERO, false ); - return createDataStreamService.createDataStream(request, state); + return createDataStreamService.createDataStream(request, state, ActionListener.noop()); } private MetadataRolloverService.RolloverResult rolloverOver(ClusterState state, String name, Instant time) throws Exception { diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/coordination/RareClusterStateIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/coordination/RareClusterStateIT.java index 5ddd3dcf4de7f..a8d12d52b50a4 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/coordination/RareClusterStateIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/coordination/RareClusterStateIT.java @@ -11,6 +11,7 @@ import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.Version; import org.elasticsearch.action.ActionFuture; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.ActionRequest; import org.elasticsearch.action.ActionRequestBuilder; import org.elasticsearch.action.ActionResponse; @@ -105,7 +106,7 @@ public ClusterState 
execute(ClusterState currentState) { routingTable.addAsRecovery(updatedState.metadata().index(index)); updatedState = ClusterState.builder(updatedState).routingTable(routingTable.build()).build(); - return allocationService.reroute(updatedState, "reroute"); + return allocationService.reroute(updatedState, "reroute", ActionListener.noop()); } @Override @@ -177,6 +178,7 @@ public void onFailure(Exception e) { return future; } + @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/88647") public void testDeleteCreateInOneBulk() throws Exception { internalCluster().startMasterOnlyNode(); String dataNode = internalCluster().startDataOnlyNode(); diff --git a/server/src/internalClusterTest/java/org/elasticsearch/gateway/ReplicaShardAllocatorIT.java b/server/src/internalClusterTest/java/org/elasticsearch/gateway/ReplicaShardAllocatorIT.java index fefb93e537975..459b5cfeb4916 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/gateway/ReplicaShardAllocatorIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/gateway/ReplicaShardAllocatorIT.java @@ -13,6 +13,9 @@ import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.routing.RoutingNodesHelper; import org.elasticsearch.cluster.routing.UnassignedInfo; +import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator; +import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceShardsAllocator; +import org.elasticsearch.cluster.routing.allocation.allocator.ShardsAllocator; import org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.Priority; @@ -425,7 +428,18 @@ public void testDoNotCancelRecoveryForBrokenNode() throws Exception { ); internalCluster().startDataOnlyNode(); newNodeStarted.countDown(); - ensureGreen(indexName); + + var allocator = internalCluster().getInstance(ShardsAllocator.class); + if 
(allocator instanceof BalancedShardsAllocator) { + // BalancedShardsAllocator will try other node once retries are exhausted + ensureGreen(indexName); + } else if (allocator instanceof DesiredBalanceShardsAllocator) { + // DesiredBalanceShardsAllocator will keep shard in the error state if it could not be allocated on the desired node + ensureYellow(indexName); + } else { + fail("Unknown allocator used"); + } + transportService.clearAllRules(); } diff --git a/server/src/internalClusterTest/java/org/elasticsearch/index/store/CorruptedFileIT.java b/server/src/internalClusterTest/java/org/elasticsearch/index/store/CorruptedFileIT.java index d9a21acbd4175..ec986d1d1f6ea 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/index/store/CorruptedFileIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/index/store/CorruptedFileIT.java @@ -616,6 +616,7 @@ public void testCorruptFileThenSnapshotAndRestore() throws InterruptedException, * nodes, so that replica won't be sneaky and allocated on a node that doesn't have a corrupted * replica. */ + @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/86429") public void testReplicaCorruption() throws Exception { int numDocs = scaledRandomIntBetween(100, 1000); internalCluster().ensureAtLeastNumDataNodes(2); diff --git a/server/src/internalClusterTest/java/org/elasticsearch/indices/IndicesLifecycleListenerIT.java b/server/src/internalClusterTest/java/org/elasticsearch/indices/IndicesLifecycleListenerIT.java index 6ddb8fdc52d83..678efc0903234 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/indices/IndicesLifecycleListenerIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/indices/IndicesLifecycleListenerIT.java @@ -114,7 +114,7 @@ public void beforeIndexCreated(Index index, Settings indexSettings) { * Tests that if an *index* structure creation fails on relocation to a new node, the shard * is not stuck but properly failed. 
*/ - public void testIndexShardFailedOnRelocation() throws Throwable { + public void testIndexShardFailedOnRelocation() { String node1 = internalCluster().startNode(); client().admin() .indices() diff --git a/server/src/internalClusterTest/java/org/elasticsearch/indices/recovery/IndexRecoveryIT.java b/server/src/internalClusterTest/java/org/elasticsearch/indices/recovery/IndexRecoveryIT.java index b9e5b2a3ee93f..c86e9db4650c8 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/indices/recovery/IndexRecoveryIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/indices/recovery/IndexRecoveryIT.java @@ -52,6 +52,9 @@ import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.cluster.routing.ShardRoutingState; import org.elasticsearch.cluster.routing.UnassignedInfo; +import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator; +import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceShardsAllocator; +import org.elasticsearch.cluster.routing.allocation.allocator.ShardsAllocator; import org.elasticsearch.cluster.routing.allocation.command.AllocateEmptyPrimaryAllocationCommand; import org.elasticsearch.cluster.routing.allocation.command.MoveAllocationCommand; import org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider; @@ -937,7 +940,18 @@ public void testDoNotInfinitelyWaitForMapping() { }); } client().admin().indices().prepareUpdateSettings("test").setSettings(Settings.builder().put("index.number_of_replicas", 1)).get(); - ensureGreen("test"); + + var allocator = internalCluster().getInstance(ShardsAllocator.class); + if (allocator instanceof BalancedShardsAllocator) { + // BalancedShardsAllocator will try other node once retries are exhausted + ensureGreen("test"); + } else if (allocator instanceof DesiredBalanceShardsAllocator) { + // DesiredBalanceShardsAllocator will keep shard in the error state if it could not be allocated on the desired node + 
ensureYellow("test"); + } else { + fail("Unknown allocator used"); + } + client().admin().indices().prepareRefresh("test").get(); assertHitCount(client().prepareSearch().get(), numDocs); } diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/reroute/TransportClusterRerouteAction.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/reroute/TransportClusterRerouteAction.java index 5c73581ffb270..b0376f2a5ca77 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/reroute/TransportClusterRerouteAction.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/reroute/TransportClusterRerouteAction.java @@ -18,8 +18,8 @@ import org.elasticsearch.action.admin.indices.shards.IndicesShardStoresResponse; import org.elasticsearch.action.support.ActionFilters; import org.elasticsearch.action.support.master.TransportMasterNodeAction; -import org.elasticsearch.cluster.AckedClusterStateUpdateTask; import org.elasticsearch.cluster.ClusterState; +import org.elasticsearch.cluster.ClusterStateAckListener; import org.elasticsearch.cluster.ClusterStateUpdateTask; import org.elasticsearch.cluster.block.ClusterBlockException; import org.elasticsearch.cluster.block.ClusterBlockLevel; @@ -27,6 +27,7 @@ import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.routing.allocation.AllocationService; import org.elasticsearch.cluster.routing.allocation.RoutingExplanations; +import org.elasticsearch.cluster.routing.allocation.allocator.AllocationActionListener; import org.elasticsearch.cluster.routing.allocation.command.AbstractAllocateAllocationCommand; import org.elasticsearch.cluster.routing.allocation.command.AllocateStalePrimaryAllocationCommand; import org.elasticsearch.cluster.routing.allocation.command.AllocationCommand; @@ -34,7 +35,9 @@ import org.elasticsearch.common.Priority; import org.elasticsearch.common.Strings; import org.elasticsearch.common.inject.Inject; +import 
org.elasticsearch.common.util.concurrent.ThreadContext; import org.elasticsearch.core.SuppressForbidden; +import org.elasticsearch.core.TimeValue; import org.elasticsearch.tasks.Task; import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.transport.TransportService; @@ -161,12 +164,18 @@ private void verifyThenSubmitUpdate( private void submitStateUpdate(final ClusterRerouteRequest request, final ActionListener listener) { submitUnbatchedTask( TASK_SOURCE, - new ClusterRerouteResponseAckedClusterStateUpdateTask(logger, allocationService, request, listener.map(response -> { - if (request.dryRun() == false) { - response.getExplanations().getYesDecisionMessages().forEach(logger::info); - } - return response; - })) + new ClusterRerouteResponseAckedClusterStateUpdateTask( + logger, + allocationService, + threadPool.getThreadContext(), + request, + listener.map(response -> { + if (request.dryRun() == false) { + response.getExplanations().getYesDecisionMessages().forEach(logger::info); + } + return response; + }) + ) ); } @@ -175,10 +184,10 @@ private void submitUnbatchedTask(@SuppressWarnings("SameParameterValue") String clusterService.submitUnbatchedStateUpdateTask(source, task); } - static class ClusterRerouteResponseAckedClusterStateUpdateTask extends AckedClusterStateUpdateTask { + static class ClusterRerouteResponseAckedClusterStateUpdateTask extends ClusterStateUpdateTask implements ClusterStateAckListener { private final ClusterRerouteRequest request; - private final ActionListener listener; + private final AllocationActionListener listener; private final Logger logger; private final AllocationService allocationService; private volatile ClusterState clusterStateToSend; @@ -187,46 +196,61 @@ static class ClusterRerouteResponseAckedClusterStateUpdateTask extends AckedClus ClusterRerouteResponseAckedClusterStateUpdateTask( Logger logger, AllocationService allocationService, + ThreadContext context, ClusterRerouteRequest request, ActionListener 
listener ) { - super(Priority.IMMEDIATE, request, listener); + super(Priority.IMMEDIATE); this.request = request; - this.listener = listener; + this.listener = new AllocationActionListener<>(listener, context); this.logger = logger; this.allocationService = allocationService; } @Override - protected ClusterRerouteResponse newResponse(boolean acknowledged) { - return new ClusterRerouteResponse(acknowledged, clusterStateToSend, explanations); + public boolean mustAck(DiscoveryNode discoveryNode) { + return true; + } + + @Override + public TimeValue ackTimeout() { + return request.ackTimeout(); + } + + @Override + public void onAllNodesAcked() { + listener.clusterStateUpdate().onResponse(new ClusterRerouteResponse(true, clusterStateToSend, explanations)); + } + + @Override + public void onAckFailure(Exception e) { + listener.clusterStateUpdate().onResponse(new ClusterRerouteResponse(false, clusterStateToSend, explanations)); } @Override public void onAckTimeout() { - listener.onResponse(new ClusterRerouteResponse(false, clusterStateToSend, new RoutingExplanations())); + listener.clusterStateUpdate().onResponse(new ClusterRerouteResponse(false, clusterStateToSend, new RoutingExplanations())); } @Override public void onFailure(Exception e) { logger.debug("failed to perform [" + TASK_SOURCE + "]", e); - super.onFailure(e); + listener.clusterStateUpdate().onFailure(e); } @Override public ClusterState execute(ClusterState currentState) { - AllocationService.CommandsResult commandsResult = allocationService.reroute( + var result = allocationService.reroute( currentState, request.getCommands(), request.explain(), - request.isRetryFailed() + request.isRetryFailed(), + request.dryRun(), + listener.reroute() ); - clusterStateToSend = commandsResult.clusterState(); - explanations = commandsResult.explanations(); - if (request.dryRun()) { - return currentState; - } - return commandsResult.clusterState(); + clusterStateToSend = result.clusterState(); + explanations = 
result.explanations(); + return request.dryRun() ? currentState : result.clusterState(); } } } diff --git a/server/src/main/java/org/elasticsearch/action/admin/indices/create/AutoCreateAction.java b/server/src/main/java/org/elasticsearch/action/admin/indices/create/AutoCreateAction.java index 5d4557b90a806..88d6ddca28af4 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/indices/create/AutoCreateAction.java +++ b/server/src/main/java/org/elasticsearch/action/admin/indices/create/AutoCreateAction.java @@ -34,6 +34,7 @@ import org.elasticsearch.cluster.metadata.MetadataIndexTemplateService; import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.routing.allocation.AllocationService; +import org.elasticsearch.cluster.routing.allocation.allocator.AllocationActionMultiListener; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.Priority; import org.elasticsearch.common.inject.Inject; @@ -53,6 +54,7 @@ import java.util.Set; import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_INDEX_HIDDEN; +import static org.elasticsearch.cluster.routing.allocation.allocator.AllocationActionListener.rerouteCompletionIsNotRequired; /** * Api that auto creates an index or data stream that originate from requests that write into an index that doesn't yet exist. 
@@ -106,13 +108,14 @@ public TransportAction( this.metadataCreateDataStreamService = metadataCreateDataStreamService; this.autoCreateIndex = autoCreateIndex; this.executor = batchExecutionContext -> { + final var listener = new AllocationActionMultiListener(threadPool.getThreadContext()); final var taskContexts = batchExecutionContext.taskContexts(); - final Map successfulRequests = Maps.newMapWithExpectedSize(taskContexts.size()); - ClusterState state = batchExecutionContext.initialState(); + final var successfulRequests = Maps.newMapWithExpectedSize(taskContexts.size()); + var state = batchExecutionContext.initialState(); for (final var taskContext : taskContexts) { final var task = taskContext.getTask(); try (var ignored = taskContext.captureResponseHeaders()) { - state = task.execute(state, successfulRequests, taskContext); + state = task.execute(state, successfulRequests, taskContext, listener); assert successfulRequests.containsKey(task.request); } catch (Exception e) { taskContext.onFailure(e); @@ -120,8 +123,10 @@ public TransportAction( } if (state != batchExecutionContext.initialState()) { try (var ignored = batchExecutionContext.dropHeadersContext()) { - state = allocationService.reroute(state, "auto-create"); + state = allocationService.reroute(state, "auto-create", listener.reroute()); } + } else { + listener.noRerouteNeeded(); } return state; }; @@ -161,7 +166,10 @@ public void onFailure(Exception e) { listener.onFailure(e); } - private ClusterStateAckListener getAckListener(String indexName) { + private ClusterStateAckListener getAckListener( + String indexName, + AllocationActionMultiListener allocationActionMultiListener + ) { return new ClusterStateAckListener() { @Override public boolean mustAck(DiscoveryNode discoveryNode) { @@ -175,18 +183,19 @@ public void onAllNodesAcked() { new String[] { indexName }, ActiveShardCount.DEFAULT, request.timeout(), - listener.map(shardsAcked -> new CreateIndexResponse(true, shardsAcked, indexName)) + 
allocationActionMultiListener.delay(listener) + .map(shardsAcked -> new CreateIndexResponse(true, shardsAcked, indexName)) ); } @Override public void onAckFailure(Exception e) { - listener.onResponse(new CreateIndexResponse(false, false, indexName)); + allocationActionMultiListener.delay(listener).onResponse(new CreateIndexResponse(false, false, indexName)); } @Override public void onAckTimeout() { - listener.onResponse(new CreateIndexResponse(false, false, indexName)); + allocationActionMultiListener.delay(listener).onResponse(new CreateIndexResponse(false, false, indexName)); } @Override @@ -204,11 +213,12 @@ public TimeValue ackTimeout() { ClusterState execute( ClusterState currentState, Map successfulRequests, - ClusterStateTaskExecutor.TaskContext taskContext + ClusterStateTaskExecutor.TaskContext taskContext, + AllocationActionMultiListener allocationActionMultiListener ) throws Exception { final var previousIndexName = successfulRequests.get(request); if (previousIndexName != null) { - taskContext.success(getAckListener(previousIndexName)); + taskContext.success(getAckListener(previousIndexName, allocationActionMultiListener)); return currentState; } @@ -237,10 +247,16 @@ ClusterState execute( request.timeout(), false ); - ClusterState clusterState = metadataCreateDataStreamService.createDataStream(createRequest, currentState); + assert createRequest.performReroute() == false + : "rerouteCompletionIsNotRequired() assumes reroute is not called by underlying service"; + ClusterState clusterState = metadataCreateDataStreamService.createDataStream( + createRequest, + currentState, + rerouteCompletionIsNotRequired() + ); final var indexName = clusterState.metadata().dataStreams().get(request.index()).getIndices().get(0).getName(); - taskContext.success(getAckListener(indexName)); + taskContext.success(getAckListener(indexName, allocationActionMultiListener)); successfulRequests.put(request, indexName); return clusterState; } else { @@ -255,7 +271,7 @@ 
ClusterState execute( if (shouldAutoCreate == false) { // The index already exists. - taskContext.success(getAckListener(indexName)); + taskContext.success(getAckListener(indexName, allocationActionMultiListener)); successfulRequests.put(request, indexName); return currentState; } @@ -293,8 +309,15 @@ ClusterState execute( updateRequest = buildUpdateRequest(indexName); } - final var clusterState = createIndexService.applyCreateIndexRequest(currentState, updateRequest, false); - taskContext.success(getAckListener(indexName)); + assert updateRequest.performReroute() == false + : "rerouteCompletionIsNotRequired() assumes reroute is not called by underlying service"; + final var clusterState = createIndexService.applyCreateIndexRequest( + currentState, + updateRequest, + false, + rerouteCompletionIsNotRequired() + ); + taskContext.success(getAckListener(indexName, allocationActionMultiListener)); successfulRequests.put(request, indexName); return clusterState; } diff --git a/server/src/main/java/org/elasticsearch/action/admin/indices/rollover/MetadataRolloverService.java b/server/src/main/java/org/elasticsearch/action/admin/indices/rollover/MetadataRolloverService.java index 447b5da508635..9ab67dd4672b3 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/indices/rollover/MetadataRolloverService.java +++ b/server/src/main/java/org/elasticsearch/action/admin/indices/rollover/MetadataRolloverService.java @@ -49,6 +49,7 @@ import static org.elasticsearch.cluster.metadata.MetadataCreateDataStreamService.lookupTemplateForDataStream; import static org.elasticsearch.cluster.metadata.MetadataIndexTemplateService.findV1Templates; import static org.elasticsearch.cluster.metadata.MetadataIndexTemplateService.findV2Template; +import static org.elasticsearch.cluster.routing.allocation.allocator.AllocationActionListener.rerouteCompletionIsNotRequired; /** * Service responsible for handling rollover requests for write aliases and data streams @@ -201,12 +202,15 @@ private 
RolloverResult rolloverAlias( return new RolloverResult(rolloverIndexName, sourceIndexName, currentState); } - CreateIndexClusterStateUpdateRequest createIndexClusterStateRequest = prepareCreateIndexRequest( - unresolvedName, - rolloverIndexName, - createIndexRequest + var createIndexClusterStateRequest = prepareCreateIndexRequest(unresolvedName, rolloverIndexName, createIndexRequest); + assert createIndexClusterStateRequest.performReroute() == false + : "rerouteCompletionIsNotRequired() assumes reroute is not called by underlying service"; + ClusterState newState = createIndexService.applyCreateIndexRequest( + currentState, + createIndexClusterStateRequest, + silent, + rerouteCompletionIsNotRequired() ); - ClusterState newState = createIndexService.applyCreateIndexRequest(currentState, createIndexClusterStateRequest, silent); newState = indexAliasesService.applyAliasActions( newState, rolloverAliasToNewIndex(sourceIndexName, rolloverIndexName, explicitWriteIndex, aliasMetadata.isHidden(), aliasName) @@ -270,7 +274,7 @@ private RolloverResult rolloverDataStream( return new RolloverResult(newWriteIndexName, originalWriteIndex.getName(), currentState); } - CreateIndexClusterStateUpdateRequest createIndexClusterStateRequest = prepareDataStreamCreateIndexRequest( + var createIndexClusterStateRequest = prepareDataStreamCreateIndexRequest( dataStreamName, newWriteIndexName, createIndexRequest, @@ -278,13 +282,16 @@ private RolloverResult rolloverDataStream( now ); createIndexClusterStateRequest.setMatchingTemplate(templateV2); + assert createIndexClusterStateRequest.performReroute() == false + : "rerouteCompletionIsNotRequired() assumes reroute is not called by underlying service"; ClusterState newState = createIndexService.applyCreateIndexRequest( currentState, createIndexClusterStateRequest, silent, (builder, indexMetadata) -> builder.put( ds.rollover(indexMetadata.getIndex(), newGeneration, metadata.isTimeSeriesTemplate(templateV2)) - ) + ), + 
rerouteCompletionIsNotRequired() ); RolloverInfo rolloverInfo = new RolloverInfo(dataStreamName, metConditions, threadPool.absoluteTimeInMillis()); diff --git a/server/src/main/java/org/elasticsearch/action/admin/indices/rollover/TransportRolloverAction.java b/server/src/main/java/org/elasticsearch/action/admin/indices/rollover/TransportRolloverAction.java index d10d607e0acaf..4ae1fe18e3ccf 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/indices/rollover/TransportRolloverAction.java +++ b/server/src/main/java/org/elasticsearch/action/admin/indices/rollover/TransportRolloverAction.java @@ -32,6 +32,7 @@ import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver; import org.elasticsearch.cluster.metadata.Metadata; import org.elasticsearch.cluster.routing.allocation.AllocationService; +import org.elasticsearch.cluster.routing.allocation.allocator.AllocationActionMultiListener; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.Priority; import org.elasticsearch.common.Strings; @@ -88,7 +89,7 @@ public TransportRolloverAction( ThreadPool.Names.SAME ); this.client = client; - this.rolloverTaskExecutor = new RolloverExecutor(clusterService, allocationService, rolloverService); + this.rolloverTaskExecutor = new RolloverExecutor(clusterService, allocationService, rolloverService, threadPool); } @Override @@ -246,16 +247,20 @@ public void onFailure(Exception e) { } } - record RolloverExecutor(ClusterService clusterService, AllocationService allocationService, MetadataRolloverService rolloverService) - implements - ClusterStateTaskExecutor { + record RolloverExecutor( + ClusterService clusterService, + AllocationService allocationService, + MetadataRolloverService rolloverService, + ThreadPool threadPool + ) implements ClusterStateTaskExecutor { @Override - public ClusterState execute(BatchExecutionContext batchExecutionContext) throws Exception { + public ClusterState execute(BatchExecutionContext 
batchExecutionContext) { + final var listener = new AllocationActionMultiListener(threadPool.getThreadContext()); final var results = new ArrayList(batchExecutionContext.taskContexts().size()); var state = batchExecutionContext.initialState(); for (final var taskContext : batchExecutionContext.taskContexts()) { try (var ignored = taskContext.captureResponseHeaders()) { - state = executeTask(state, results, taskContext); + state = executeTask(state, results, taskContext, listener); } catch (Exception e) { taskContext.onFailure(e); } @@ -272,8 +277,10 @@ public ClusterState execute(BatchExecutionContext batchExecutionCo reason ); try (var ignored = batchExecutionContext.dropHeadersContext()) { - state = allocationService.reroute(state, reason.toString()); + state = allocationService.reroute(state, reason.toString(), listener.reroute()); } + } else { + listener.noRerouteNeeded(); } return state; } @@ -281,7 +288,8 @@ public ClusterState execute(BatchExecutionContext batchExecutionCo public ClusterState executeTask( ClusterState currentState, List results, - TaskContext rolloverTaskContext + TaskContext rolloverTaskContext, + AllocationActionMultiListener allocationActionMultiListener ) throws Exception { final var rolloverTask = rolloverTaskContext.getTask(); final var rolloverRequest = rolloverTask.rolloverRequest(); @@ -342,7 +350,7 @@ public ClusterState executeTask( new String[] { rolloverIndexName }, rolloverRequest.getCreateIndexRequest().waitForActiveShards(), rolloverRequest.masterNodeTimeout(), - rolloverTask.listener() + allocationActionMultiListener.delay(rolloverTask.listener()) .map( isShardsAcknowledged -> new RolloverResponse( // Note that we use the actual rollover result for these, because even though we're single threaded, diff --git a/server/src/main/java/org/elasticsearch/cluster/ClusterInfoSimulator.java b/server/src/main/java/org/elasticsearch/cluster/ClusterInfoSimulator.java new file mode 100644 index 0000000000000..748ba9459831e --- /dev/null 
+++ b/server/src/main/java/org/elasticsearch/cluster/ClusterInfoSimulator.java @@ -0,0 +1,106 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.cluster; + +import org.elasticsearch.cluster.routing.ShardRouting; +import org.elasticsearch.index.shard.ShardId; + +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; + +public class ClusterInfoSimulator { + + private final Map leastAvailableSpaceUsage; + private final Map mostAvailableSpaceUsage; + private final Map shardSizes; + private final Map shardDataSetSizes; + private final Map dataPath; + + public ClusterInfoSimulator(ClusterInfo clusterInfo) { + this.leastAvailableSpaceUsage = new HashMap<>(clusterInfo.getNodeLeastAvailableDiskUsages()); + this.mostAvailableSpaceUsage = new HashMap<>(clusterInfo.getNodeMostAvailableDiskUsages()); + this.shardSizes = new HashMap<>(clusterInfo.shardSizes); + this.shardDataSetSizes = Map.copyOf(clusterInfo.shardDataSetSizes); + this.dataPath = Map.copyOf(clusterInfo.dataPath); + } + + /** + * This method updates disk usage to reflect shard relocations and new replica initialization. + * In case of a single data path both mostAvailableSpaceUsage and leastAvailableSpaceUsage are updated to reflect the change. + * In case of multiple data paths only mostAvailableSpaceUsage is updated, as it is used in the calculation in + * {@link org.elasticsearch.cluster.routing.allocation.decider.DiskThresholdDecider} for allocating new shards. + * This assumes the worst case (all shards are placed on a single most used disk) and prevents node overflow. + * Balance is later recalculated with a refreshed cluster info containing actual shards placement.
+ */ + public void simulate(ShardRouting shard) { + assert shard.initializing(); + + var size = getEstimatedShardSize(shard); + if (size != null && size > 0) { + if (shard.relocatingNodeId() != null) { + // relocation + modifyDiskUsage(shard.relocatingNodeId(), size); + modifyDiskUsage(shard.currentNodeId(), -size); + } else { + // new shard + modifyDiskUsage(shard.currentNodeId(), -size); + shardSizes.put(ClusterInfo.shardIdentifierFromRouting(shard), size); + } + } + } + + private Long getEstimatedShardSize(ShardRouting routing) { + if (routing.relocatingNodeId() != null) { + // relocating existing shard, get size of the source shard + return shardSizes.get(ClusterInfo.shardIdentifierFromRouting(routing)); + } else if (routing.primary() == false) { + // initializing new replica, get size of the source primary shard + return shardSizes.get(ClusterInfo.shardIdentifierFromRouting(routing.shardId(), true)); + } else { + // initializing new (empty) primary + return 0L; + } + } + + private void modifyDiskUsage(String nodeId, long delta) { + var diskUsage = mostAvailableSpaceUsage.get(nodeId); + if (diskUsage == null) { + return; + } + var path = diskUsage.getPath(); + + var leastUsage = leastAvailableSpaceUsage.get(nodeId); + if (leastUsage != null && Objects.equals(leastUsage.getPath(), path)) { + // ensure new value is within bounds + leastAvailableSpaceUsage.put(nodeId, updateWithFreeBytes(leastUsage, delta)); + } + var mostUsage = mostAvailableSpaceUsage.get(nodeId); + if (mostUsage != null && Objects.equals(mostUsage.getPath(), path)) { + // ensure new value is within bounds + mostAvailableSpaceUsage.put(nodeId, updateWithFreeBytes(mostUsage, delta)); + } + } + + private static DiskUsage updateWithFreeBytes(DiskUsage usage, long delta) { + // free bytes might go out of range when multiple data paths are used + // we might not know the exact disk used to allocate a shard and conservatively update + // most used disk on a target node and least used disk on a
source node + var freeBytes = withinRange(0, usage.getTotalBytes(), usage.freeBytes() + delta); + return usage.copyWithFreeBytes(freeBytes); + } + + private static long withinRange(long min, long max, long value) { + return Math.max(min, Math.min(max, value)); + } + + public ClusterInfo getClusterInfo() { + return new ClusterInfo(leastAvailableSpaceUsage, mostAvailableSpaceUsage, shardSizes, shardDataSetSizes, dataPath, Map.of()); + } +} diff --git a/server/src/main/java/org/elasticsearch/cluster/ClusterModule.java b/server/src/main/java/org/elasticsearch/cluster/ClusterModule.java index b6ce59ea6a976..d25e3d04bd29b 100644 --- a/server/src/main/java/org/elasticsearch/cluster/ClusterModule.java +++ b/server/src/main/java/org/elasticsearch/cluster/ClusterModule.java @@ -25,9 +25,13 @@ import org.elasticsearch.cluster.metadata.NodesShutdownMetadata; import org.elasticsearch.cluster.metadata.RepositoriesMetadata; import org.elasticsearch.cluster.routing.DelayedAllocationService; +import org.elasticsearch.cluster.routing.RerouteService; import org.elasticsearch.cluster.routing.allocation.AllocationService; import org.elasticsearch.cluster.routing.allocation.ExistingShardsAllocator; +import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator; +import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceShardsAllocator; +import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceShardsAllocator.DesiredBalanceReconcilerAction; import org.elasticsearch.cluster.routing.allocation.allocator.ShardsAllocator; import org.elasticsearch.cluster.routing.allocation.decider.AllocationDecider; import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders; @@ -58,7 +62,6 @@ import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.settings.Setting.Property; import org.elasticsearch.common.settings.Settings; -import 
org.elasticsearch.common.util.concurrent.ThreadContext; import org.elasticsearch.gateway.GatewayAllocator; import org.elasticsearch.health.metadata.HealthMetadataService; import org.elasticsearch.health.node.selection.HealthNodeTaskExecutor; @@ -71,6 +74,7 @@ import org.elasticsearch.snapshots.SnapshotsInfoService; import org.elasticsearch.tasks.Task; import org.elasticsearch.tasks.TaskResultsService; +import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.upgrades.FeatureMigrationResults; import org.elasticsearch.xcontent.NamedXContentRegistry; import org.elasticsearch.xcontent.ParseField; @@ -82,6 +86,7 @@ import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.function.Consumer; import java.util.function.Function; import java.util.function.Supplier; @@ -91,9 +96,10 @@ public class ClusterModule extends AbstractModule { public static final String BALANCED_ALLOCATOR = "balanced"; // default + public static final String DESIRED_BALANCE_ALLOCATOR = "desired_balance"; public static final Setting SHARDS_ALLOCATOR_TYPE_SETTING = new Setting<>( "cluster.routing.allocation.type", - BALANCED_ALLOCATOR, + DESIRED_BALANCE_ALLOCATOR, Function.identity(), Property.NodeScope ); @@ -114,19 +120,31 @@ public ClusterModule( List clusterPlugins, ClusterInfoService clusterInfoService, SnapshotsInfoService snapshotsInfoService, - ThreadContext threadContext, - SystemIndices systemIndices + ThreadPool threadPool, + SystemIndices systemIndices, + Supplier rerouteServiceSupplier ) { this.clusterPlugins = clusterPlugins; this.deciderList = createAllocationDeciders(settings, clusterService.getClusterSettings(), clusterPlugins); this.allocationDeciders = new AllocationDeciders(deciderList); - this.shardsAllocator = createShardsAllocator(settings, clusterService.getClusterSettings(), clusterPlugins); + this.shardsAllocator = createShardsAllocator( + settings, + clusterService.getClusterSettings(), + threadPool, + clusterPlugins, + 
clusterService, + this::reconcile + ); this.clusterService = clusterService; - this.indexNameExpressionResolver = new IndexNameExpressionResolver(threadContext, systemIndices); + this.indexNameExpressionResolver = new IndexNameExpressionResolver(threadPool.getThreadContext(), systemIndices); this.allocationService = new AllocationService(allocationDeciders, shardsAllocator, clusterInfoService, snapshotsInfoService); this.metadataDeleteIndexService = new MetadataDeleteIndexService(settings, clusterService, allocationService); } + private ClusterState reconcile(ClusterState clusterState, Consumer routingAllocationConsumer) { + return allocationService.executeWithRoutingAllocation(clusterState, "reconcile-desired-balance", routingAllocationConsumer); + } + public static List getNamedWriteables() { List entries = new ArrayList<>(); // Cluster State @@ -323,10 +341,22 @@ private static void addAllocationDecider(Map, AllocationDecider> decide private static ShardsAllocator createShardsAllocator( Settings settings, ClusterSettings clusterSettings, - List clusterPlugins + ThreadPool threadPool, + List clusterPlugins, + ClusterService clusterService, + DesiredBalanceReconcilerAction reconciler ) { Map> allocators = new HashMap<>(); allocators.put(BALANCED_ALLOCATOR, () -> new BalancedShardsAllocator(settings, clusterSettings)); + allocators.put( + DESIRED_BALANCE_ALLOCATOR, + () -> new DesiredBalanceShardsAllocator( + new BalancedShardsAllocator(settings, clusterSettings), + threadPool, + clusterService, + reconciler + ) + ); for (ClusterPlugin plugin : clusterPlugins) { plugin.getShardsAllocators(settings, clusterSettings).forEach((k, v) -> { diff --git a/server/src/main/java/org/elasticsearch/cluster/DiskUsage.java b/server/src/main/java/org/elasticsearch/cluster/DiskUsage.java index e3422860b1b89..1d606737edf3a 100644 --- a/server/src/main/java/org/elasticsearch/cluster/DiskUsage.java +++ b/server/src/main/java/org/elasticsearch/cluster/DiskUsage.java @@ -119,6 +119,10 
@@ public String toString() { + "]"; } + public DiskUsage copyWithFreeBytes(long freeBytes) { + return new DiskUsage(nodeId, nodeName, path, totalBytes, freeBytes); + } + /** * Finds the path with the least available disk space and returns its disk usage. It returns null if there is no * file system data in the NodeStats or if the total bytes are a negative number. diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataCreateDataStreamService.java b/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataCreateDataStreamService.java index 900fea320e717..21df6b1b61209 100644 --- a/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataCreateDataStreamService.java +++ b/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataCreateDataStreamService.java @@ -21,6 +21,7 @@ import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ClusterStateUpdateTask; import org.elasticsearch.cluster.ack.ClusterStateUpdateRequest; +import org.elasticsearch.cluster.routing.allocation.allocator.AllocationActionListener; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.Priority; import org.elasticsearch.common.Strings; @@ -34,6 +35,7 @@ import org.elasticsearch.indices.SystemDataStreamDescriptor; import org.elasticsearch.indices.SystemIndexDescriptor; import org.elasticsearch.rest.RestStatus; +import org.elasticsearch.threadpool.ThreadPool; import java.io.IOException; import java.util.ArrayList; @@ -48,10 +50,16 @@ public class MetadataCreateDataStreamService { private static final Logger logger = LogManager.getLogger(MetadataCreateDataStreamService.class); + private final ThreadPool threadPool; private final ClusterService clusterService; private final MetadataCreateIndexService metadataCreateIndexService; - public MetadataCreateDataStreamService(ClusterService clusterService, MetadataCreateIndexService metadataCreateIndexService) { + public MetadataCreateDataStreamService( + 
ThreadPool threadPool, + ClusterService clusterService, + MetadataCreateIndexService metadataCreateIndexService + ) { + this.threadPool = threadPool; this.clusterService = clusterService; this.metadataCreateIndexService = metadataCreateIndexService; } @@ -73,14 +81,18 @@ public void createDataStream(CreateDataStreamClusterStateUpdateRequest request, finalListener.onResponse(AcknowledgedResponse.FALSE); } }, finalListener::onFailure); - submitUnbatchedTask("create-data-stream [" + request.name + "]", new AckedClusterStateUpdateTask(Priority.HIGH, request, listener) { - @Override - public ClusterState execute(ClusterState currentState) throws Exception { - ClusterState clusterState = createDataStream(metadataCreateIndexService, currentState, request); - firstBackingIndexRef.set(clusterState.metadata().dataStreams().get(request.name).getIndices().get(0).getName()); - return clusterState; + var delegate = new AllocationActionListener<>(listener, threadPool.getThreadContext()); + submitUnbatchedTask( + "create-data-stream [" + request.name + "]", + new AckedClusterStateUpdateTask(Priority.HIGH, request, delegate.clusterStateUpdate()) { + @Override + public ClusterState execute(ClusterState currentState) throws Exception { + ClusterState clusterState = createDataStream(metadataCreateIndexService, currentState, request, delegate.reroute()); + firstBackingIndexRef.set(clusterState.metadata().dataStreams().get(request.name).getIndices().get(0).getName()); + return clusterState; + } } - }); + ); } @SuppressForbidden(reason = "legacy usage of unbatched task") // TODO add support for batching here @@ -88,8 +100,12 @@ private void submitUnbatchedTask(@SuppressWarnings("SameParameterValue") String clusterService.submitUnbatchedStateUpdateTask(source, task); } - public ClusterState createDataStream(CreateDataStreamClusterStateUpdateRequest request, ClusterState current) throws Exception { - return createDataStream(metadataCreateIndexService, current, request); + public 
ClusterState createDataStream( + CreateDataStreamClusterStateUpdateRequest request, + ClusterState current, + ActionListener listener + ) throws Exception { + return createDataStream(metadataCreateIndexService, current, request, listener); } public static final class CreateDataStreamClusterStateUpdateRequest extends ClusterStateUpdateRequest< @@ -146,9 +162,10 @@ public SystemDataStreamDescriptor getSystemDataStreamDescriptor() { static ClusterState createDataStream( MetadataCreateIndexService metadataCreateIndexService, ClusterState currentState, - CreateDataStreamClusterStateUpdateRequest request + CreateDataStreamClusterStateUpdateRequest request, + ActionListener listener ) throws Exception { - return createDataStream(metadataCreateIndexService, currentState, request, List.of(), null); + return createDataStream(metadataCreateIndexService, currentState, request, List.of(), null, listener); } /** @@ -166,7 +183,8 @@ static ClusterState createDataStream( ClusterState currentState, CreateDataStreamClusterStateUpdateRequest request, List backingIndices, - IndexMetadata writeIndex + IndexMetadata writeIndex, + ActionListener listener ) throws Exception { String dataStreamName = request.name; SystemDataStreamDescriptor systemDataStreamDescriptor = request.getSystemDataStreamDescriptor(); @@ -221,7 +239,7 @@ static ClusterState createDataStream( } try { - currentState = metadataCreateIndexService.applyCreateIndexRequest(currentState, createIndexRequest, false); + currentState = metadataCreateIndexService.applyCreateIndexRequest(currentState, createIndexRequest, false, listener); } catch (ResourceAlreadyExistsException e) { // Rethrow as ElasticsearchStatusException, so that bulk transport action doesn't ignore it during // auto index/data stream creation. 
@@ -234,6 +252,8 @@ static ClusterState createDataStream( ); } writeIndex = currentState.metadata().index(firstBackingIndexName); + } else { + listener.onResponse(null); } assert writeIndex != null; assert writeIndex.mapping() != null : "no mapping found for backing index [" + writeIndex.getIndex().getName() + "]"; diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataCreateIndexService.java b/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataCreateIndexService.java index 563319df226d9..99585895005ec 100644 --- a/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataCreateIndexService.java +++ b/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataCreateIndexService.java @@ -34,6 +34,7 @@ import org.elasticsearch.cluster.routing.ShardRoutingState; import org.elasticsearch.cluster.routing.allocation.AllocationService; import org.elasticsearch.cluster.routing.allocation.DataTier; +import org.elasticsearch.cluster.routing.allocation.allocator.AllocationActionListener; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.Priority; import org.elasticsearch.common.Strings; @@ -129,6 +130,7 @@ public class MetadataCreateIndexService { private final ShardLimitValidator shardLimitValidator; private final boolean forbidPrivateIndexSettings; private final Set indexSettingProviders; + private final ThreadPool threadPool; public MetadataCreateIndexService( final Settings settings, @@ -155,6 +157,7 @@ public MetadataCreateIndexService( this.forbidPrivateIndexSettings = forbidPrivateIndexSettings; this.shardLimitValidator = shardLimitValidator; this.indexSettingProviders = indexSettingProviders.getIndexSettingProviders(); + this.threadPool = threadPool; } /** @@ -284,13 +287,15 @@ public void createIndex(final CreateIndexClusterStateUpdateRequest request, fina private void onlyCreateIndex(final CreateIndexClusterStateUpdateRequest request, final ActionListener listener) { 
normalizeRequestSetting(request); + + var delegate = new AllocationActionListener<>(listener, threadPool.getThreadContext()); submitUnbatchedTask( "create-index [" + request.index() + "], cause [" + request.cause() + "]", - new AckedClusterStateUpdateTask(Priority.URGENT, request, listener) { + new AckedClusterStateUpdateTask(Priority.URGENT, request, delegate.clusterStateUpdate()) { @Override public ClusterState execute(ClusterState currentState) throws Exception { - return applyCreateIndexRequest(currentState, request, false); + return applyCreateIndexRequest(currentState, request, false, null, delegate.reroute()); } @Override @@ -328,7 +333,8 @@ public ClusterState applyCreateIndexRequest( ClusterState currentState, CreateIndexClusterStateUpdateRequest request, boolean silent, - BiConsumer metadataTransformer + BiConsumer metadataTransformer, + ActionListener rerouteListener ) throws Exception { normalizeRequestSetting(request); @@ -342,21 +348,28 @@ public ClusterState applyCreateIndexRequest( if (sourceMetadata != null) { // If source metadata was provided, it means we're recovering from an existing index, // in which case templates don't apply, so create the index from the source metadata - return applyCreateIndexRequestWithExistingMetadata(currentState, request, silent, sourceMetadata, metadataTransformer); + return applyCreateIndexRequestWithExistingMetadata( + currentState, + request, + silent, + sourceMetadata, + metadataTransformer, + rerouteListener + ); } else { // The backing index may have a different name or prefix than the data stream name. final String name = request.dataStreamName() != null ? 
request.dataStreamName() : request.index(); // The index being created is for a system data stream, so the backing index will also be a system index if (request.systemDataStreamDescriptor() != null) { - return applyCreateIndexRequestForSystemDataStream(currentState, request, silent, metadataTransformer); + return applyCreateIndexRequestForSystemDataStream(currentState, request, silent, metadataTransformer, rerouteListener); } SystemIndexDescriptor descriptor = systemIndices.findMatchingDescriptor(request.index()); // ignore all templates for all system indices that do not allow templates. // Essentially, all but .kibana indices, see KibanaPlugin.java. if (Objects.nonNull(descriptor) && descriptor.allowsTemplates() == false) { - return applyCreateIndexRequestForSystemIndex(currentState, request, silent, descriptor.getIndexPattern()); + return applyCreateIndexRequestForSystemIndex(currentState, request, silent, descriptor.getIndexPattern(), rerouteListener); } // Hidden indices apply templates slightly differently (ignoring wildcard '*' @@ -376,7 +389,14 @@ public ClusterState applyCreateIndexRequest( if (v2Template != null) { // If a v2 template was found, it takes precedence over all v1 templates, so create // the index using that template and the request's specified settings - return applyCreateIndexRequestWithV2Template(currentState, request, silent, v2Template, metadataTransformer); + return applyCreateIndexRequestWithV2Template( + currentState, + request, + silent, + v2Template, + metadataTransformer, + rerouteListener + ); } else { // A v2 template wasn't found, check the v1 templates, in the event no templates are // found creation still works using the request's specified index settings @@ -396,14 +416,25 @@ public ClusterState applyCreateIndexRequest( ); } - return applyCreateIndexRequestWithV1Templates(currentState, request, silent, v1Templates, metadataTransformer); + return applyCreateIndexRequestWithV1Templates( + currentState, + request, + silent, + 
v1Templates, + metadataTransformer, + rerouteListener + ); } } } - public ClusterState applyCreateIndexRequest(ClusterState currentState, CreateIndexClusterStateUpdateRequest request, boolean silent) - throws Exception { - return applyCreateIndexRequest(currentState, request, silent, null); + public ClusterState applyCreateIndexRequest( + ClusterState currentState, + CreateIndexClusterStateUpdateRequest request, + boolean silent, + ActionListener rerouteListener + ) throws Exception { + return applyCreateIndexRequest(currentState, request, silent, null, rerouteListener); } /** @@ -431,7 +462,8 @@ private ClusterState applyCreateIndexWithTemporaryService( final List mappings, final Function> aliasSupplier, final List templatesApplied, - final BiConsumer metadataTransformer + final BiConsumer metadataTransformer, + final ActionListener rerouteListener ) throws Exception { // create the index here (on the master) to validate it can be created, as well as adding the mapping return indicesService.withTempIndexService(temporaryIndexMeta, indexService -> { @@ -471,10 +503,12 @@ private ClusterState applyCreateIndexWithTemporaryService( ); indexService.getIndexEventListener().beforeIndexAddedToCluster(indexMetadata.getIndex(), indexMetadata.getSettings()); - BiFunction rerouteFunction = request.performReroute() - ? 
allocationService::reroute - : (cs, reason) -> cs; - return clusterStateCreateIndex(currentState, request.blocks(), indexMetadata, rerouteFunction, metadataTransformer); + + ClusterState updated = clusterStateCreateIndex(currentState, request.blocks(), indexMetadata, metadataTransformer); + if (request.performReroute()) { + updated = allocationService.reroute(updated, "index [" + indexMetadata.getIndex().getName() + "] created", rerouteListener); + } + return updated; }); } @@ -514,7 +548,8 @@ private ClusterState applyCreateIndexRequestWithV1Templates( final CreateIndexClusterStateUpdateRequest request, final boolean silent, final List templates, - final BiConsumer metadataTransformer + final BiConsumer metadataTransformer, + final ActionListener rerouteListener ) throws Exception { logger.debug( "applying create index request using legacy templates {}", @@ -568,7 +603,8 @@ private ClusterState applyCreateIndexRequestWithV1Templates( systemIndices::isSystemName ), templates.stream().map(IndexTemplateMetadata::getName).collect(toList()), - metadataTransformer + metadataTransformer, + rerouteListener ); } @@ -577,7 +613,8 @@ private ClusterState applyCreateIndexRequestWithV2Template( final CreateIndexClusterStateUpdateRequest request, final boolean silent, final String templateName, - final BiConsumer metadataTransformer + final BiConsumer metadataTransformer, + final ActionListener rerouteListener ) throws Exception { logger.debug("applying create index request using composable template [{}]", templateName); @@ -635,7 +672,8 @@ private ClusterState applyCreateIndexRequestWithV2Template( systemIndices::isSystemName ), Collections.singletonList(templateName), - metadataTransformer + metadataTransformer, + rerouteListener ); } @@ -643,7 +681,8 @@ private ClusterState applyCreateIndexRequestForSystemIndex( final ClusterState currentState, final CreateIndexClusterStateUpdateRequest request, final boolean silent, - final String indexPattern + final String indexPattern, + 
final ActionListener rerouteListener ) throws Exception { logger.debug("applying create index request for system index [{}] matching pattern [{}]", request.index(), indexPattern); @@ -681,7 +720,8 @@ private ClusterState applyCreateIndexRequestForSystemIndex( systemIndices::isSystemName ), List.of(), - null + null, + rerouteListener ); } @@ -689,7 +729,8 @@ private ClusterState applyCreateIndexRequestForSystemDataStream( final ClusterState currentState, final CreateIndexClusterStateUpdateRequest request, final boolean silent, - final BiConsumer metadataTransformer + final BiConsumer metadataTransformer, + final ActionListener rerouteListener ) throws Exception { Objects.requireNonNull(request.systemDataStreamDescriptor()); logger.debug("applying create index request for system data stream [{}]", request.systemDataStreamDescriptor()); @@ -738,7 +779,8 @@ private ClusterState applyCreateIndexRequestForSystemDataStream( systemIndices::isSystemName ), List.of(), - metadataTransformer + metadataTransformer, + rerouteListener ); } @@ -788,7 +830,8 @@ private ClusterState applyCreateIndexRequestWithExistingMetadata( final CreateIndexClusterStateUpdateRequest request, final boolean silent, final IndexMetadata sourceMetadata, - final BiConsumer metadataTransformer + final BiConsumer metadataTransformer, + final ActionListener rerouteListener ) throws Exception { logger.info("applying create index request using existing index [{}] metadata", sourceMetadata.getIndex().getName()); @@ -833,7 +876,8 @@ private ClusterState applyCreateIndexRequestWithExistingMetadata( systemIndices::isSystemName ), List.of(), - metadataTransformer + metadataTransformer, + rerouteListener ); } @@ -1175,7 +1219,6 @@ static ClusterState clusterStateCreateIndex( ClusterState currentState, Set clusterBlocks, IndexMetadata indexMetadata, - BiFunction rerouteRoutingTable, BiConsumer metadataTransformer ) { final Metadata newMetadata; @@ -1195,8 +1238,7 @@ static ClusterState clusterStateCreateIndex( 
RoutingTable.Builder routingTableBuilder = RoutingTable.builder(updatedState.routingTable()) .addAsNew(updatedState.metadata().index(indexName)); - updatedState = ClusterState.builder(updatedState).routingTable(routingTableBuilder.build()).build(); - return rerouteRoutingTable.apply(updatedState, "index [" + indexName + "] created"); + return ClusterState.builder(updatedState).routingTable(routingTableBuilder.build()).build(); } static IndexMetadata buildIndexMetadata( diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataDeleteIndexService.java b/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataDeleteIndexService.java index fcf07ae6a014b..773ebff7d1622 100644 --- a/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataDeleteIndexService.java +++ b/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataDeleteIndexService.java @@ -38,6 +38,8 @@ import java.util.Map; import java.util.Set; +import static org.elasticsearch.cluster.routing.allocation.allocator.AllocationActionListener.rerouteCompletionIsNotRequired; + /** * Deletes indices. 
*/ @@ -67,7 +69,11 @@ public Tuple executeTask( @Override public ClusterState afterBatchExecution(ClusterState clusterState, boolean clusterStateChanged) { if (clusterStateChanged) { - return allocationService.reroute(clusterState, "deleted indices"); + return allocationService.reroute( + clusterState, + "deleted indices", + rerouteCompletionIsNotRequired() // it is not required to balance shard to report index deletion success + ); } return clusterState; } diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataIndexStateService.java b/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataIndexStateService.java index 3caf0b0fd6af8..8d6805902f54e 100644 --- a/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataIndexStateService.java +++ b/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataIndexStateService.java @@ -46,6 +46,7 @@ import org.elasticsearch.cluster.routing.IndexShardRoutingTable; import org.elasticsearch.cluster.routing.RoutingTable; import org.elasticsearch.cluster.routing.allocation.AllocationService; +import org.elasticsearch.cluster.routing.allocation.allocator.AllocationActionMultiListener; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.Priority; import org.elasticsearch.common.Strings; @@ -215,8 +216,9 @@ private class CloseIndicesExecutor implements ClusterStateTaskExecutor batchExecutionContext) throws Exception { - ClusterState state = batchExecutionContext.initialState(); + public ClusterState execute(BatchExecutionContext batchExecutionContext) { + var listener = new AllocationActionMultiListener(threadPool.getThreadContext()); + var state = batchExecutionContext.initialState(); for (final var taskContext : batchExecutionContext.taskContexts()) { final var task = taskContext.getTask(); try { @@ -243,7 +245,7 @@ public ClusterState execute(BatchExecutionContext batchExecuti waitForIndices, task.request.waitForActiveShards(), 
task.request.ackTimeout(), - task.listener().map(shardsAcknowledged -> { + listener.delay(task.listener()).map(shardsAcknowledged -> { if (shardsAcknowledged == false) { logger.debug( () -> format( @@ -261,7 +263,7 @@ public ClusterState execute(BatchExecutionContext batchExecuti }) ); } else { - task.listener().onResponse(new CloseIndexResponse(acknowledged, false, indices)); + listener.delay(task.listener()).onResponse(new CloseIndexResponse(acknowledged, false, indices)); } }); } catch (Exception e) { @@ -271,7 +273,7 @@ public ClusterState execute(BatchExecutionContext batchExecuti try (var ignored = batchExecutionContext.dropHeadersContext()) { // reroute may encounter deprecated features but the resulting warnings are not associated with any particular task - return allocationService.reroute(state, "indices closed"); + return allocationService.reroute(state, "indices closed", listener.reroute()); } } } @@ -1053,7 +1055,8 @@ private class OpenIndicesExecutor implements ClusterStateTaskExecutor batchExecutionContext) { - ClusterState state = batchExecutionContext.initialState(); + var listener = new AllocationActionMultiListener(threadPool.getThreadContext()); + var state = batchExecutionContext.initialState(); try (var ignored = batchExecutionContext.dropHeadersContext()) { // we may encounter deprecated settings but they are not directly related to opening the indices, nor are they really @@ -1070,11 +1073,11 @@ public ClusterState execute(BatchExecutionContext batchExecutio state = openIndices(indices, state); // do a final reroute - state = allocationService.reroute(state, "indices opened"); + state = allocationService.reroute(state, "indices opened", listener.reroute()); for (final var taskContext : batchExecutionContext.taskContexts()) { final var task = taskContext.getTask(); - taskContext.success(task); + taskContext.success(task.getAckListener(listener)); } } catch (Exception e) { for (final var taskContext : batchExecutionContext.taskContexts()) { @@ 
-1160,37 +1163,40 @@ private ClusterState openIndices(final Index[] indices, final ClusterState curre private record OpenIndicesTask(OpenIndexClusterStateUpdateRequest request, ActionListener listener) implements - ClusterStateTaskListener, - ClusterStateAckListener { + ClusterStateTaskListener { @Override public void onFailure(Exception e) { listener.onFailure(e); } - @Override - public boolean mustAck(DiscoveryNode discoveryNode) { - return true; - } + public ClusterStateAckListener getAckListener(AllocationActionMultiListener multiListener) { + return new ClusterStateAckListener() { + @Override + public boolean mustAck(DiscoveryNode discoveryNode) { + return true; + } - @Override - public void onAllNodesAcked() { - listener.onResponse(AcknowledgedResponse.of(true)); - } + @Override + public void onAllNodesAcked() { + multiListener.delay(listener).onResponse(AcknowledgedResponse.of(true)); + } - @Override - public void onAckFailure(Exception e) { - listener.onResponse(AcknowledgedResponse.of(false)); - } + @Override + public void onAckFailure(Exception e) { + multiListener.delay(listener).onResponse(AcknowledgedResponse.of(false)); + } - @Override - public void onAckTimeout() { - listener.onResponse(AcknowledgedResponse.FALSE); - } + @Override + public void onAckTimeout() { + multiListener.delay(listener).onResponse(AcknowledgedResponse.FALSE); + } - @Override - public TimeValue ackTimeout() { - return request.ackTimeout(); + @Override + public TimeValue ackTimeout() { + return request.ackTimeout(); + } + }; } } } diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataMigrateToDataStreamService.java b/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataMigrateToDataStreamService.java index 80e92a2c6857d..e428d70a9d2f2 100644 --- a/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataMigrateToDataStreamService.java +++ b/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataMigrateToDataStreamService.java @@ 
-19,6 +19,7 @@ import org.elasticsearch.cluster.ClusterStateUpdateTask; import org.elasticsearch.cluster.ack.ClusterStateUpdateRequest; import org.elasticsearch.cluster.metadata.MetadataCreateDataStreamService.CreateDataStreamClusterStateUpdateRequest; +import org.elasticsearch.cluster.routing.allocation.allocator.AllocationActionListener; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.Priority; import org.elasticsearch.common.Strings; @@ -96,9 +97,10 @@ public void migrateToDataStream( finalListener.onResponse(AcknowledgedResponse.FALSE); } }, finalListener::onFailure); + var delegate = new AllocationActionListener<>(listener, threadContext); submitUnbatchedTask( "migrate-to-data-stream [" + request.aliasName + "]", - new AckedClusterStateUpdateTask(Priority.HIGH, request, listener) { + new AckedClusterStateUpdateTask(Priority.HIGH, request, delegate.clusterStateUpdate()) { @Override public ClusterState execute(ClusterState currentState) throws Exception { @@ -108,7 +110,7 @@ public ClusterState execute(ClusterState currentState) throws Exception { } catch (IOException e) { throw new IllegalStateException(e); } - }, request, metadataCreateIndexService); + }, request, metadataCreateIndexService, delegate.reroute()); writeIndexRef.set(clusterState.metadata().dataStreams().get(request.aliasName).getWriteIndex().getName()); return clusterState; } @@ -125,7 +127,8 @@ static ClusterState migrateToDataStream( ClusterState currentState, Function mapperSupplier, MigrateToDataStreamClusterStateUpdateRequest request, - MetadataCreateIndexService metadataCreateIndexService + MetadataCreateIndexService metadataCreateIndexService, + ActionListener listener ) throws Exception { validateRequest(currentState, request); IndexAbstraction.Alias alias = (IndexAbstraction.Alias) currentState.metadata().getIndicesLookup().get(request.aliasName); @@ -149,7 +152,14 @@ static ClusterState migrateToDataStream( logger.info("submitting request to 
migrate alias [{}] to a data stream", request.aliasName); CreateDataStreamClusterStateUpdateRequest req = new CreateDataStreamClusterStateUpdateRequest(request.aliasName); - return createDataStream(metadataCreateIndexService, currentState, req, backingIndices, currentState.metadata().index(writeIndex)); + return createDataStream( + metadataCreateIndexService, + currentState, + req, + backingIndices, + currentState.metadata().index(writeIndex), + listener + ); } // package-visible for testing diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataUpdateSettingsService.java b/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataUpdateSettingsService.java index 9fdc144e76283..0c138087e6e52 100644 --- a/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataUpdateSettingsService.java +++ b/server/src/main/java/org/elasticsearch/cluster/metadata/MetadataUpdateSettingsService.java @@ -19,12 +19,12 @@ import org.elasticsearch.cluster.ClusterStateTaskConfig; import org.elasticsearch.cluster.ClusterStateTaskExecutor; import org.elasticsearch.cluster.ClusterStateTaskListener; -import org.elasticsearch.cluster.SimpleBatchedAckListenerTaskExecutor; import org.elasticsearch.cluster.block.ClusterBlock; import org.elasticsearch.cluster.block.ClusterBlocks; import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.routing.RoutingTable; import org.elasticsearch.cluster.routing.allocation.AllocationService; +import org.elasticsearch.cluster.routing.allocation.allocator.AllocationActionMultiListener; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.Priority; import org.elasticsearch.common.regex.Regex; @@ -32,11 +32,11 @@ import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.core.TimeValue; -import org.elasticsearch.core.Tuple; import org.elasticsearch.index.Index; import 
org.elasticsearch.index.IndexSettings; import org.elasticsearch.indices.IndicesService; import org.elasticsearch.indices.ShardLimitValidator; +import org.elasticsearch.threadpool.ThreadPool; import java.io.IOException; import java.util.Arrays; @@ -65,27 +65,38 @@ public MetadataUpdateSettingsService( AllocationService allocationService, IndexScopedSettings indexScopedSettings, IndicesService indicesService, - ShardLimitValidator shardLimitValidator + ShardLimitValidator shardLimitValidator, + ThreadPool threadPool ) { this.clusterService = clusterService; this.allocationService = allocationService; this.indexScopedSettings = indexScopedSettings; this.indicesService = indicesService; this.shardLimitValidator = shardLimitValidator; - this.executor = new SimpleBatchedAckListenerTaskExecutor<>() { - @Override - public Tuple executeTask(UpdateSettingsTask task, ClusterState clusterState) { - return Tuple.tuple(task.execute(clusterState), task.getAckListener()); - } + this.executor = batchExecutionContext -> { + var listener = new AllocationActionMultiListener(threadPool.getThreadContext()); + var state = batchExecutionContext.initialState(); + for (final var taskContext : batchExecutionContext.taskContexts()) { + try { + final var task = taskContext.getTask(); + try (var ignored = taskContext.captureResponseHeaders()) { + state = task.execute(state); + } + taskContext.success(task.getAckListener(listener)); + } catch (Exception e) { + taskContext.onFailure(e); + } - @Override - public ClusterState afterBatchExecution(ClusterState clusterState, boolean clusterStateChanged) { - if (clusterStateChanged) { - // reroute in case things change that require it (like number of replicas) - return allocationService.reroute(clusterState, "settings update"); + } + if (state != batchExecutionContext.initialState()) { + // reroute in case things change that require it (like number of replicas) + try (var ignored = batchExecutionContext.dropHeadersContext()) { + state = 
allocationService.reroute(state, "settings update", listener.reroute()); } - return clusterState; + } else { + listener.noRerouteNeeded(); } + return state; }; } @@ -98,7 +109,7 @@ private UpdateSettingsTask(UpdateSettingsClusterStateUpdateRequest request, Acti this.listener = listener; } - private ClusterStateAckListener getAckListener() { + private ClusterStateAckListener getAckListener(AllocationActionMultiListener multiListener) { return new ClusterStateAckListener() { @Override public boolean mustAck(DiscoveryNode discoveryNode) { @@ -107,17 +118,17 @@ public boolean mustAck(DiscoveryNode discoveryNode) { @Override public void onAllNodesAcked() { - listener.onResponse(AcknowledgedResponse.of(true)); + multiListener.delay(listener).onResponse(AcknowledgedResponse.of(true)); } @Override public void onAckFailure(Exception e) { - listener.onFailure(e); + multiListener.delay(listener).onFailure(e); } @Override public void onAckTimeout() { - listener.onResponse(AcknowledgedResponse.of(false)); + multiListener.delay(listener).onResponse(AcknowledgedResponse.of(false)); } @Override diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/BatchedRerouteService.java b/server/src/main/java/org/elasticsearch/cluster/routing/BatchedRerouteService.java index 8cc59648d1dcb..bab3ca3a48ff3 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/BatchedRerouteService.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/BatchedRerouteService.java @@ -19,6 +19,7 @@ import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.cluster.service.MasterService; import org.elasticsearch.common.Priority; +import org.elasticsearch.common.util.concurrent.ListenableFuture; import org.elasticsearch.core.Nullable; import org.elasticsearch.core.SuppressForbidden; @@ -46,7 +47,7 @@ public class BatchedRerouteService implements RerouteService { private Priority pendingTaskPriority = Priority.LANGUID; public interface RerouteAction { - 
ClusterState reroute(ClusterState state, String reason); + ClusterState reroute(ClusterState state, String reason, ActionListener listener); } /** @@ -101,6 +102,7 @@ public final void reroute(String reason, Priority priority, ActionListener(); final String source = CLUSTER_UPDATE_TASK_SOURCE + "(" + reason + ")"; submitUnbatchedTask(source, new ClusterStateUpdateTask(priority) { @@ -117,9 +119,11 @@ public ClusterState execute(ClusterState currentState) { } if (currentListenersArePending) { logger.trace("performing batched reroute [{}]", reason); - return reroute.reroute(currentState, reason); + return reroute.reroute(currentState, reason, future); } else { logger.trace("batched reroute [{}] was promoted", reason); + // reroute was batched and completed in other branch + future.onResponse(null); return currentState; } } @@ -148,7 +152,7 @@ public void onFailure(Exception e) { @Override public void clusterStateProcessed(ClusterState oldState, ClusterState newState) { - ActionListener.onResponse(currentListeners, newState); + future.addListener(ActionListener.wrap(() -> ActionListener.onResponse(currentListeners, newState))); } }); } catch (Exception e) { diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/DelayedAllocationService.java b/server/src/main/java/org/elasticsearch/cluster/routing/DelayedAllocationService.java index 0e2f709a55038..f251c6b9fe8d9 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/DelayedAllocationService.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/DelayedAllocationService.java @@ -29,6 +29,8 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; +import static org.elasticsearch.cluster.routing.allocation.allocator.AllocationActionListener.rerouteCompletionIsNotRequired; + /** * The {@link DelayedAllocationService} listens to cluster state changes and checks * if there are unassigned shards with delayed allocation (unassigned shards that 
have @@ -100,7 +102,8 @@ public void onFailure(Exception e) { @Override public ClusterState execute(ClusterState currentState) throws Exception { removeIfSameTask(this); - return allocationService.reroute(currentState, "assign delayed unassigned shards"); + // rerouteCompletionIsNotRequired() as this update is scheduled and is not triggered by user request + return allocationService.reroute(currentState, "assign delayed unassigned shards", rerouteCompletionIsNotRequired()); } @Override diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java b/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java index cb112ffca16ae..c5e90c2a0c112 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java @@ -18,7 +18,6 @@ import org.elasticsearch.cluster.node.DiscoveryNodes; import org.elasticsearch.cluster.routing.UnassignedInfo.AllocationStatus; import org.elasticsearch.cluster.routing.allocation.ExistingShardsAllocator; -import org.elasticsearch.cluster.service.MasterService; import org.elasticsearch.common.collect.Iterators; import org.elasticsearch.common.util.Maps; import org.elasticsearch.core.Nullable; @@ -97,7 +96,7 @@ private RoutingNodes(RoutingTable routingTable, DiscoveryNodes discoveryNodes, b final int indexCount = routingTable.indicesRouting().size(); this.assignedShards = Maps.newMapWithExpectedSize(indexCount); this.unassignedShards = new UnassignedShards(this); - this.attributeValuesByAttribute = new HashMap<>(); + this.attributeValuesByAttribute = Collections.synchronizedMap(new HashMap<>()); nodesToShards = Maps.newMapWithExpectedSize(discoveryNodes.getDataNodes().size()); // fill in the nodeToShards with the "live" nodes @@ -166,7 +165,7 @@ private RoutingNodes(RoutingNodes routingNodes) { this.inactivePrimaryCount = routingNodes.inactivePrimaryCount; this.inactiveShardCount = 
routingNodes.inactiveShardCount; this.relocatingShards = routingNodes.relocatingShards; - this.attributeValuesByAttribute = Maps.copyOf(routingNodes.attributeValuesByAttribute, HashSet::new); + this.attributeValuesByAttribute = Collections.synchronizedMap(Maps.copyOf(routingNodes.attributeValuesByAttribute, HashSet::new)); this.recoveriesPerNode = Maps.copyOf(routingNodes.recoveriesPerNode, Recoveries::copy); } @@ -269,8 +268,6 @@ public RoutingNode node(String nodeId) { } public Set getAttributeValues(String attributeName) { - // Only ever accessed on the master service thread so no need for synchronization - assert MasterService.assertMasterUpdateOrTestThread(); return attributeValuesByAttribute.computeIfAbsent( attributeName, ignored -> stream().map(r -> r.node().getAttributes().get(attributeName)).filter(Objects::nonNull).collect(Collectors.toSet()) @@ -843,6 +840,13 @@ public int size() { return nodesToShards.size(); } + /** + * @return collection of {@link ShardRouting}s, keyed by shard ID. 
+ */ + public Map> getAssignedShards() { + return Collections.unmodifiableMap(assignedShards); + } + @Override public boolean equals(Object o) { if (this == o) { @@ -991,6 +995,12 @@ public void ignoreShard(ShardRouting shard, AllocationStatus allocationStatus, R ignored.add(shard); } + public void resetIgnored() { + assert unassigned.size() == 0; // every unassigned shard should be ignored before resetting + unassigned.addAll(ignored); + ignored.clear(); + } + public class UnassignedIterator implements Iterator, ExistingShardsAllocator.UnassignedAllocationHandler { private final ListIterator iterator; diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/RoutingTable.java b/server/src/main/java/org/elasticsearch/cluster/routing/RoutingTable.java index 56eb57121e439..dc53023ed11de 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/RoutingTable.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/RoutingTable.java @@ -595,6 +595,11 @@ public Builder version(long version) { return this; } + public Builder incrementVersion() { + this.version++; + return this; + } + /** * Builds the routing table. Note that once this is called the builder * must be thrown away. 
If you need to build a new RoutingTable as a diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java index 266261d004562..6038bb25ca166 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java @@ -10,6 +10,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterInfoService; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.RestoreInProgress; @@ -28,6 +29,7 @@ import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.cluster.routing.UnassignedInfo; import org.elasticsearch.cluster.routing.UnassignedInfo.AllocationStatus; +import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator; import org.elasticsearch.cluster.routing.allocation.allocator.ShardsAllocator; import org.elasticsearch.cluster.routing.allocation.command.AllocationCommands; import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders; @@ -50,12 +52,14 @@ import java.util.Map; import java.util.Objects; import java.util.Set; +import java.util.function.Consumer; import java.util.function.Function; import java.util.function.Supplier; import java.util.stream.Collectors; import static org.elasticsearch.cluster.health.ClusterShardHealth.getInactivePrimaryHealth; import static org.elasticsearch.cluster.routing.UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING; +import static org.elasticsearch.cluster.routing.allocation.allocator.AllocationActionListener.rerouteCompletionIsNotRequired; /** * This service manages the node allocation of a cluster. 
For this reason the @@ -238,7 +242,14 @@ public ClusterState applyFailedShards( allocator.applyFailedShards(failedShards, allocation); } - reroute(allocation); + reroute( + allocation, + routingAllocation -> shardsAllocator.allocate( + routingAllocation, + rerouteCompletionIsNotRequired() /* this is not triggered by a user request */ + ) + ); + String failedShardsAsString = firstListElementsToCommaDelimitedString( failedShards, s -> s.routingEntry().shardId().toString(), @@ -261,7 +272,7 @@ public ClusterState disassociateDeadNodes(ClusterState clusterState, boolean rer clusterState = buildResultAndLogHealthChange(clusterState, allocation, reason); } if (reroute) { - return reroute(clusterState, reason); + return reroute(clusterState, reason, rerouteCompletionIsNotRequired());// this is not triggered by a user request } else { return clusterState; } @@ -375,26 +386,23 @@ public static String firstListElementsToCommaDelimitedString( } } - public CommandsResult reroute(final ClusterState clusterState, AllocationCommands commands, boolean explain, boolean retryFailed) { - // we don't shuffle the unassigned shards here, to try and get as close as possible to - // a consistent result of the effect the commands have on the routing - // this allows systems to dry run the commands, see the resulting cluster state, and act on it + public CommandsResult reroute( + ClusterState clusterState, + AllocationCommands commands, + boolean explain, + boolean retryFailed, + boolean dryRun, + ActionListener reroute + ) { RoutingAllocation allocation = createRoutingAllocation(clusterState, currentNanoTime()); - // don't short circuit deciders, we want a full explanation - allocation.debugDecision(true); - // we ignore disable allocation, because commands are explicit - allocation.ignoreDisable(true); - - if (retryFailed) { - allocation.routingNodes().resetFailedCounter(allocation.changes()); - } - - RoutingExplanations explanations = commands.execute(allocation, explain); - // we revert 
the ignore disable flag, since when rerouting, we want the original setting to take place - allocation.ignoreDisable(false); + var explanations = shardsAllocator.execute(allocation, commands, explain, retryFailed); // the assumption is that commands will move / act on shards (or fail through exceptions) // so, there will always be shard "movements", so no need to check on reroute - reroute(allocation); + if (dryRun == false) { + reroute(allocation, routingAllocation -> shardsAllocator.allocate(routingAllocation, reroute)); + } else { + reroute.onResponse(null); + } return new CommandsResult(explanations, buildResultAndLogHealthChange(clusterState, allocation, "reroute commands")); } @@ -408,10 +416,32 @@ public CommandsResult reroute(final ClusterState clusterState, AllocationCommand * * @return an updated cluster state, or the same instance that was passed as an argument if no changes were made. */ - public ClusterState reroute(ClusterState clusterState, String reason) { + public ClusterState reroute(ClusterState clusterState, String reason, ActionListener listener) { + return executeWithRoutingAllocation( + clusterState, + reason, + routingAllocation -> shardsAllocator.allocate(routingAllocation, listener) + ); + } + + /** + * Computes the next step towards a fully allocated and balanced cluster and records this step in the routing table of the returned + * state. Should be called after every change to the cluster that affects the routing table and/or the balance of shards. + *

+ * This method is expensive in larger clusters. Wherever possible you should invoke this method asynchronously using + * {@link RerouteService#reroute} to batch up invocations rather than calling the method directly. The node's reroute service is + * typically obtained from {@link ClusterService#getRerouteService}. + * + * @return an updated cluster state, or the same instance that was passed as an argument if no changes were made. + */ + public ClusterState executeWithRoutingAllocation( + ClusterState clusterState, + String reason, + Consumer routingAllocationConsumer + ) { ClusterState fixedClusterState = adaptAutoExpandReplicas(clusterState); RoutingAllocation allocation = createRoutingAllocation(fixedClusterState, currentNanoTime()); - reroute(allocation); + reroute(allocation, routingAllocationConsumer); if (fixedClusterState == clusterState && allocation.routingNodesChanged() == false) { return clusterState; } @@ -476,7 +506,7 @@ private static boolean hasDeadNodes(RoutingAllocation allocation) { return false; } - private void reroute(RoutingAllocation allocation) { + private void reroute(RoutingAllocation allocation, Consumer routingAllocationConsumer) { assert hasDeadNodes(allocation) == false : "dead nodes should be explicitly cleaned up. 
See disassociateDeadNodes"; assert AutoExpandReplicas.getAutoExpandReplicaChanges(allocation.metadata(), () -> allocation).isEmpty() : "auto-expand replicas out of sync with number of nodes in the cluster"; @@ -485,7 +515,7 @@ private void reroute(RoutingAllocation allocation) { removeDelayMarkers(allocation); allocateExistingUnassignedShards(allocation); // try to allocate existing shard copies first - shardsAllocator.allocate(allocation); + routingAllocationConsumer.accept(allocation); assert RoutingNodes.assertShardStats(allocation.routingNodes()); } @@ -645,6 +675,11 @@ private boolean assertInitialized() { return true; } + // exposed for tests whose behaviour depends on this + boolean isBalancedShardsAllocator() { + return shardsAllocator instanceof BalancedShardsAllocator; + } + private static class NotFoundAllocator implements ExistingShardsAllocator { private final String allocatorName; diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/RoutingAllocation.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/RoutingAllocation.java index 5d2d185f453e0..ecc6fb295feae 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/RoutingAllocation.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/RoutingAllocation.java @@ -50,7 +50,7 @@ public class RoutingAllocation { private final ClusterState clusterState; - private final ClusterInfo clusterInfo; + private ClusterInfo clusterInfo; private final SnapshotShardSizeInfo shardSizeInfo; @@ -63,6 +63,7 @@ public class RoutingAllocation { private boolean hasPendingAsyncFetch = false; private final long currentNanoTime; + private final boolean isSimulating; private final IndexMetadataUpdater indexMetadataUpdater = new IndexMetadataUpdater(); private final RoutingNodesChangedObserver nodesChangedObserver = new RoutingNodesChangedObserver(); @@ -109,6 +110,26 @@ public RoutingAllocation( @Nullable ClusterInfo clusterInfo, 
SnapshotShardSizeInfo shardSizeInfo, long currentNanoTime + ) { + this(deciders, routingNodes, clusterState, clusterInfo, shardSizeInfo, currentNanoTime, false); + } + + /** + * Creates a new {@link RoutingAllocation} + * @param deciders {@link AllocationDeciders} used to make decisions for routing allocations + * @param routingNodes Routing nodes in the current cluster or {@code null} if using those in the given cluster state + * @param clusterState cluster state before rerouting + * @param currentNanoTime the nano time to use for all delay allocation calculation (typically {@link System#nanoTime()}) + * @param isSimulating {@code true} if "transient" deciders should be ignored because we are simulating the final allocation + */ + private RoutingAllocation( + AllocationDeciders deciders, + @Nullable RoutingNodes routingNodes, + ClusterState clusterState, + ClusterInfo clusterInfo, + SnapshotShardSizeInfo shardSizeInfo, + long currentNanoTime, + boolean isSimulating ) { this.deciders = deciders; this.routingNodes = routingNodes; @@ -116,6 +137,7 @@ public RoutingAllocation( this.clusterInfo = clusterInfo; this.shardSizeInfo = shardSizeInfo; this.currentNanoTime = currentNanoTime; + this.isSimulating = isSimulating; this.nodeReplacementTargets = nodeReplacementTargets(clusterState); this.desiredNodes = DesiredNodes.latestFromClusterState(clusterState); this.unaccountedSearchableSnapshotSizes = unaccountedSearchableSnapshotSizes(clusterState, clusterInfo); @@ -202,6 +224,10 @@ public DiscoveryNodes nodes() { return clusterState.nodes(); } + public ClusterState getClusterState() { + return clusterState; + } + public ClusterInfo clusterInfo() { return clusterInfo; } @@ -367,6 +393,35 @@ public long unaccountedSearchableSnapshotSize(RoutingNode routingNode) { return unaccountedSearchableSnapshotSizes.getOrDefault(routingNode.nodeId(), 0L); } + /** + * @return {@code true} if this allocation computation is trying to simulate the final allocation and therefore
"transient" allocation + * blockers should be ignored. + */ + public boolean isSimulating() { + return isSimulating; + } + + public void setSimulatedClusterInfo(ClusterInfo clusterInfo) { + assert isSimulating : "Should be called only while simulating"; + this.clusterInfo = clusterInfo; + } + + public RoutingAllocation immutableClone() { + return new RoutingAllocation(deciders, clusterState, clusterInfo, shardSizeInfo, currentNanoTime); + } + + public RoutingAllocation mutableCloneForSimulation() { + return new RoutingAllocation( + deciders, + clusterState.mutableRoutingNodes(), + clusterState, + clusterInfo, + shardSizeInfo, + currentNanoTime, + true + ); + } + public enum DebugMode { /** * debug mode is off diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/AllocationActionListener.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/AllocationActionListener.java new file mode 100644 index 0000000000000..6ac159353dd1f --- /dev/null +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/AllocationActionListener.java @@ -0,0 +1,102 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. 
+ */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.apache.lucene.util.SetOnce; +import org.elasticsearch.action.ActionListener; +import org.elasticsearch.common.util.concurrent.ThreadContext; + +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Supplier; + +public class AllocationActionListener { + + private final ActionListener delegate; + private final SetOnce response = new SetOnce<>(); + private final AtomicInteger listenersExecuted = new AtomicInteger(2); + private final ThreadContext context; + private final Supplier original; + private final SetOnce>> additionalResponseHeaders = new SetOnce<>(); + + /** + * This listener could be used when reroute completion (such as even balancing shards across the cluster) is not required for the + * completion of the caller operation. + * + * For example, it is required to compute the desired balance to properly allocate a newly created index, but it is not when deleting one.
+ */ + public static ActionListener rerouteCompletionIsNotRequired() { + return ActionListener.noop(); + } + + public AllocationActionListener(ActionListener delegate, ThreadContext context) { + this.delegate = delegate; + this.context = context; + this.original = context.newRestorableContext(false); + } + + private void notifyListenerExecuted() { + if (listenersExecuted.decrementAndGet() == 0) { + executeInContext(() -> delegate.onResponse(AllocationActionListener.this.response.get())); + } + } + + private void notifyListenerFailed(Exception e) { + executeInContext(() -> delegate.onFailure(e)); + } + + private void executeInContext(Runnable action) { + try (ThreadContext.StoredContext ignore2 = original.get()) { + appendAdditionalResponseHeaders(context, additionalResponseHeaders.get()); + action.run(); + } + } + + private static void appendAdditionalResponseHeaders(ThreadContext context, Map> additionalHeaders) { + if (additionalHeaders != null) { + for (var entry : additionalHeaders.entrySet()) { + for (String header : entry.getValue()) { + context.addResponseHeader(entry.getKey(), header); + } + } + } + } + + public ActionListener clusterStateUpdate() { + return new ActionListener<>() { + @Override + public void onResponse(T response) { + AllocationActionListener.this.response.set(response); + additionalResponseHeaders.set(context.getResponseHeaders()); + notifyListenerExecuted(); + } + + @Override + public void onFailure(Exception e) { + additionalResponseHeaders.set(context.getResponseHeaders()); + notifyListenerFailed(e); + } + }; + } + + public ActionListener reroute() { + return new ActionListener<>() { + @Override + public void onResponse(Void unused) { + notifyListenerExecuted(); + } + + @Override + public void onFailure(Exception e) { + notifyListenerFailed(e); + } + }; + } +} diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/AllocationActionMultiListener.java 
b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/AllocationActionMultiListener.java new file mode 100644 index 0000000000000..72494eb0b7617 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/AllocationActionMultiListener.java @@ -0,0 +1,98 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.elasticsearch.action.ActionListener; +import org.elasticsearch.common.util.concurrent.ThreadContext; + +import java.util.ArrayList; +import java.util.List; + +import static org.elasticsearch.action.support.ContextPreservingActionListener.wrapPreservingContext; + +/** + * This event listener might be needed to delay execution of multiple distinct tasks until the follow-up reroute is complete.
+ */ +public class AllocationActionMultiListener { + + private volatile boolean complete = false; + private final List> delayed = new ArrayList<>(); + private final ThreadContext context; + + public AllocationActionMultiListener(ThreadContext context) { + this.context = context; + } + + public ActionListener delay(ActionListener delegate) { + final var wrappedDelegate = wrapPreservingContext(delegate, context); + return new ActionListener() { + @Override + public void onResponse(T response) { + if (tryDelayListener(wrappedDelegate, response) == false) { + wrappedDelegate.onResponse(response); + } + } + + @Override + public void onFailure(Exception e) { + // there is no need to delay listener in case of failure + wrappedDelegate.onFailure(e); + } + }; + } + + public ActionListener reroute() { + return new ActionListener<>() { + @Override + public void onResponse(Void unused) { + for (var listener : completeAndGetDelayedListeners()) { + listener.listener.onResponse(listener.response); + } + } + + @Override + public void onFailure(Exception e) { + for (var listener : completeAndGetDelayedListeners()) { + listener.listener.onFailure(e); + } + } + }; + } + + public void noRerouteNeeded() { + for (var listener : completeAndGetDelayedListeners()) { + listener.listener.onResponse(listener.response); + } + } + + /** + * @return {@code true} if listener should be delayed or {@code false} if it needs to be completed immediately + */ + private synchronized boolean tryDelayListener(ActionListener listener, T response) { + if (complete) { + return false; + } else { + delayed.add(new DelayedListener<>(listener, response)); + return true; + } + } + + /** + * Completes a delay and returns a list of all delayed listeners + */ + private synchronized List> completeAndGetDelayedListeners() { + assert complete == false : "Should only complete once"; + complete = true; + var listeners = List.copyOf(delayed); + delayed.clear(); + return listeners; + } + + private record DelayedListener 
(ActionListener listener, T response) {} +} diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ContinuousComputation.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ContinuousComputation.java new file mode 100644 index 0000000000000..853451bdb986f --- /dev/null +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ContinuousComputation.java @@ -0,0 +1,101 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.elasticsearch.common.util.concurrent.AbstractRunnable; + +import java.util.Objects; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.atomic.AtomicReference; + +/** + * Asynchronously runs some computation using at most one thread but expects the input value changes over time as it's running. Newer input + * values are assumed to be fresher and trigger a recomputation. If a computation never starts before a fresher value arrives then it is + * skipped. + */ +public abstract class ContinuousComputation { + + private static final Logger logger = LogManager.getLogger(ContinuousComputation.class); + + private final ExecutorService executorService; + private final AtomicReference enqueuedInput = new AtomicReference<>(); + private final Processor processor = new Processor(); + + /** + * @param executorService the background executor service to use to run the computations. No more than one task is executed at once. 
+ */ + public ContinuousComputation(ExecutorService executorService) { + this.executorService = executorService; + } + + /** + * Called when the input value has changed. If no newer value is received then eventually the computation will run on this value. + */ + public void onNewInput(T input) { + assert input != null; + if (enqueuedInput.getAndSet(Objects.requireNonNull(input)) == null) { + executorService.execute(processor); + } + } + + /** + * @return {@code false} iff there are no active/enqueued computations + */ + // exposed for tests + boolean isActive() { + return enqueuedInput.get() != null; + } + + /** + * @return {@code true} iff the given {@code input} is the latest known input. + */ + protected boolean isFresh(T input) { + return enqueuedInput.get() == input; + } + + /** + * Process the given input. + * + * @param input the value that was last received by {@link #onNewInput} before invocation. + */ + protected abstract void processInput(T input); + + private class Processor extends AbstractRunnable { + + @Override + public void onFailure(Exception e) { + assert false : e; + } + + @Override + public void onRejection(Exception e) { + // shutting down, just give up + logger.debug("rejected", e); + } + + @Override + protected void doRun() throws Exception { + final T input = enqueuedInput.get(); + assert input != null; + + processInput(input); + + if (enqueuedInput.compareAndSet(input, null) == false) { + executorService.execute(this); + } + } + + @Override + public String toString() { + return "ContinuousComputation$Processor[" + ContinuousComputation.this + "]"; + } + } +} diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalance.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalance.java new file mode 100644 index 0000000000000..7bfaddc49fbb3 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalance.java @@
-0,0 +1,32 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.elasticsearch.index.shard.ShardId; + +import java.util.Map; +import java.util.Objects; + +/** + * The desired balance of the cluster, indicating which nodes should hold a copy of each shard. + * + * @param assignments a set of the (persistent) node IDs to which each {@link ShardId} should be allocated + */ +public record DesiredBalance(long lastConvergedIndex, Map assignments) { + + public ShardAssignment getAssignment(ShardId shardId) { + return assignments.get(shardId); + } + + public static boolean hasChanges(DesiredBalance a, DesiredBalance b) { + return Objects.equals(a.assignments, b.assignments) == false; + } + + public static DesiredBalance INITIAL = new DesiredBalance(-1, Map.of()); +} diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java new file mode 100644 index 0000000000000..0b291d1ed304b --- /dev/null +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java @@ -0,0 +1,303 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. 
+ */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.elasticsearch.cluster.ClusterInfoSimulator; +import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.cluster.routing.RoutingNodes; +import org.elasticsearch.cluster.routing.ShardRouting; +import org.elasticsearch.cluster.routing.UnassignedInfo; +import org.elasticsearch.cluster.routing.allocation.command.MoveAllocationCommand; +import org.elasticsearch.common.metrics.MeanMetric; +import org.elasticsearch.index.shard.ShardId; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +import java.util.TreeSet; +import java.util.function.Predicate; + +import static java.util.stream.Collectors.toSet; + +/** + * Holds the desired balance and updates it as the cluster evolves. 
+ */ +public class DesiredBalanceComputer { + + private static final Logger logger = LogManager.getLogger(DesiredBalanceComputer.class); + + private final ShardsAllocator delegateAllocator; + + protected final MeanMetric iterations = new MeanMetric(); + + public DesiredBalanceComputer(ShardsAllocator delegateAllocator) { + this.delegateAllocator = delegateAllocator; + } + + public DesiredBalance compute( + DesiredBalance previousDesiredBalance, + DesiredBalanceInput desiredBalanceInput, + Queue> pendingDesiredBalanceMoves, + Predicate isFresh + ) { + + logger.debug("Recomputing desired balance for [{}]", desiredBalanceInput.index()); + + final var routingAllocation = desiredBalanceInput.routingAllocation().mutableCloneForSimulation(); + final var routingNodes = routingAllocation.routingNodes(); + final var changes = routingAllocation.changes(); + final var ignoredShards = desiredBalanceInput.ignoredShards(); + final var knownNodeIds = routingAllocation.nodes().stream().map(DiscoveryNode::getId).collect(toSet()); + final var clusterInfoSimulator = new ClusterInfoSimulator(routingAllocation.clusterInfo()); + + if (routingNodes.isEmpty()) { + return new DesiredBalance(desiredBalanceInput.index(), Map.of()); + } + + // we assume that all ongoing recoveries will complete + for (final var routingNode : routingNodes) { + for (final var shardRouting : routingNode) { + if (shardRouting.initializing()) { + clusterInfoSimulator.simulate(shardRouting); + routingNodes.startShard(logger, shardRouting, changes, 0L); + } + } + } + + // we are not responsible for allocating unassigned primaries of existing shards, and we're only responsible for allocating + // unassigned replicas if the ReplicaShardAllocator gives up, so we must respect these ignored shards + final var unassignedPrimaries = new HashSet(); + final var shardRoutings = new HashMap(); + for (final var primary : new boolean[] { true, false }) { + final RoutingNodes.UnassignedShards unassigned = 
routingNodes.unassigned(); + for (final var iterator = unassigned.iterator(); iterator.hasNext();) { + final var shardRouting = iterator.next(); + if (shardRouting.primary() == primary) { + var lastAllocatedNodeId = shardRouting.unassignedInfo().getLastAllocatedNodeId(); + if (knownNodeIds.contains(lastAllocatedNodeId) || ignoredShards.contains(shardRouting) == false) { + shardRoutings.computeIfAbsent(shardRouting.shardId(), ShardRoutings::new).unassigned().add(shardRouting); + } else { + iterator.removeAndIgnore(UnassignedInfo.AllocationStatus.NO_ATTEMPT, changes); + if (shardRouting.primary()) { + unassignedPrimaries.add(shardRouting.shardId()); + } + } + } + } + } + + for (final var assigned : routingNodes.getAssignedShards().entrySet()) { + shardRoutings.computeIfAbsent(assigned.getKey(), ShardRoutings::new).assigned().addAll(assigned.getValue()); + } + + // we can assume that all possible shards will be allocated/relocated to one of their desired locations + final var unassignedShardsToInitialize = new HashMap>(); + for (final var entry : shardRoutings.entrySet()) { + final var shardId = entry.getKey(); + final var routings = entry.getValue(); + + // treesets so that we are consistent about the order of future relocations + final var shardsToRelocate = new TreeSet<>(Comparator.comparing(ShardRouting::currentNodeId)); + final var assignment = previousDesiredBalance.getAssignment(shardId); + + final var targetNodes = assignment != null ? 
new TreeSet<>(assignment.nodeIds()) : new TreeSet(); + targetNodes.retainAll(knownNodeIds); + // preserving last known shard location as a starting point to avoid unnecessary relocations + for (ShardRouting shardRouting : routings.unassigned()) { + var lastAllocatedNodeId = shardRouting.unassignedInfo().getLastAllocatedNodeId(); + if (knownNodeIds.contains(lastAllocatedNodeId)) { + targetNodes.add(lastAllocatedNodeId); + } + } + + for (final var shardRouting : routings.assigned()) { + assert shardRouting.started(); + if (targetNodes.remove(shardRouting.currentNodeId()) == false) { + shardsToRelocate.add(shardRouting); + } + } + + final var targetNodesIterator = targetNodes.iterator(); + + // Here existing shards are moved to desired locations before initializing unassigned shards because we prefer not to leave + // immovable shards allocated to undesirable locations (e.g. a node that is shutting down). In contrast, reconciliation prefers + // to initialize the unassigned shards first. + for (final var shardRouting : shardsToRelocate) { + assert shardRouting.started(); + if (targetNodesIterator.hasNext()) { + ShardRouting shardToRelocate = routingNodes.relocateShard(shardRouting, targetNodesIterator.next(), 0L, changes).v2(); + clusterInfoSimulator.simulate(shardToRelocate); + routingNodes.startShard(logger, shardToRelocate, changes, 0L); + } else { + break; + } + } + + for (final var shardRouting : routings.unassigned()) { + assert shardRouting.unassigned(); + if (targetNodesIterator.hasNext()) { + unassignedShardsToInitialize.computeIfAbsent(shardRouting, ignored -> new LinkedList<>()) + .add(targetNodesIterator.next()); + } else { + break; + } + } + } + + final var unassignedPrimaryIterator = routingNodes.unassigned().iterator(); + while (unassignedPrimaryIterator.hasNext()) { + final var shardRouting = unassignedPrimaryIterator.next(); + if (shardRouting.primary()) { + final var nodeIds = unassignedShardsToInitialize.get(shardRouting); + if (nodeIds != null && 
nodeIds.isEmpty() == false) { + final String nodeId = nodeIds.removeFirst(); + ShardRouting shardToInitialized = unassignedPrimaryIterator.initialize(nodeId, null, 0L, changes); + clusterInfoSimulator.simulate(shardToInitialized); + routingNodes.startShard(logger, shardToInitialized, changes, 0L); + } + } + } + + final var unassignedReplicaIterator = routingNodes.unassigned().iterator(); + while (unassignedReplicaIterator.hasNext()) { + final var shardRouting = unassignedReplicaIterator.next(); + if (unassignedPrimaries.contains(shardRouting.shardId()) == false) { + final var nodeIds = unassignedShardsToInitialize.get(shardRouting); + if (nodeIds != null && nodeIds.isEmpty() == false) { + final String nodeId = nodeIds.removeFirst(); + ShardRouting shardToInitialize = unassignedReplicaIterator.initialize(nodeId, null, 0L, changes); + clusterInfoSimulator.simulate(shardToInitialize); + routingNodes.startShard(logger, shardToInitialize, changes, 0L); + } + } + } + + List commands; + while ((commands = pendingDesiredBalanceMoves.poll()) != null) { + for (MoveAllocationCommand command : commands) { + try { + command.execute(routingAllocation, false); + } catch (RuntimeException e) { + logger.debug( + () -> "move shard [" + + command.index() + + ":" + + command.shardId() + + "] command failed during applying it to the desired balance", + e + ); + } + } + } + + int i = 0; + boolean hasChanges = false; + while (true) { + if (hasChanges) { + // Not the first iteration, so every remaining unassigned shard has been ignored, perhaps due to throttling. We must bring + // them all back out of the ignored list to give the allocator another go... + routingNodes.unassigned().resetIgnored(); + // ... 
but not if they're ignored because they're out of scope for allocation + for (final var iterator = routingNodes.unassigned().iterator(); iterator.hasNext();) { + final var shardRouting = iterator.next(); + if (ignoredShards.contains(shardRouting)) { + iterator.removeAndIgnore(UnassignedInfo.AllocationStatus.NO_ATTEMPT, changes); + } + } + } + + routingAllocation.setSimulatedClusterInfo(clusterInfoSimulator.getClusterInfo()); + logger.trace("running delegate allocator"); + delegateAllocator.allocate(routingAllocation); + assert routingNodes.unassigned().size() == 0; // any unassigned shards should now be ignored + + hasChanges = false; + for (final var routingNode : routingNodes) { + for (final var shardRouting : routingNode) { + if (shardRouting.initializing()) { + hasChanges = true; + clusterInfoSimulator.simulate(shardRouting); + routingNodes.startShard(logger, shardRouting, changes, 0L); + logger.trace("starting shard {}", shardRouting); + } + } + } + + i++; + if (hasChanges == false) { + logger.debug("Desired balance computation converged after {} iterations", i); + break; + } + if (isFresh.test(desiredBalanceInput) == false) { + // we run at least one iteration, but if another reroute happened meanwhile + // then publish the interim state and restart the calculation + logger.debug("Newer cluster state received, publishing incomplete desired balance and restarting computation"); + break; + } + if (i % 100 == 0) { + // TODO this warning should be time based, iteration count should be proportional to the number of shards + logger.debug("Desired balance computation is still not converged after {} iterations", i); + } + } + iterations.inc(i); + + final var assignments = new HashMap(); + for (var shardAndAssignments : routingNodes.getAssignedShards().entrySet()) { + assignments.put(shardAndAssignments.getKey(), ShardAssignment.of(shardAndAssignments.getValue())); + } + + for (var ignored : routingNodes.unassigned().ignored()) { + var info = ignored.unassignedInfo(); 
+ assert info != null + && (info.getLastAllocationStatus() == UnassignedInfo.AllocationStatus.DECIDERS_NO + || info.getLastAllocationStatus() == UnassignedInfo.AllocationStatus.NO_ATTEMPT + || info.getLastAllocationStatus() == UnassignedInfo.AllocationStatus.DECIDERS_THROTTLED) + : "Unexpected stats in: " + info; + + if (hasChanges == false && info.getLastAllocationStatus() == UnassignedInfo.AllocationStatus.DECIDERS_THROTTLED) { + // Simulation could not progress due to missing information in any of the deciders. + // Currently, this could happen if `HasFrozenCacheAllocationDecider` is still fetching the data. + // Progress would be made after the followup reroute call. + hasChanges = true; + } + + var unassigned = ignored.unassignedInfo().getLastAllocationStatus() == UnassignedInfo.AllocationStatus.DECIDERS_NO; + assignments.compute( + ignored.shardId(), + (key, oldValue) -> oldValue == null + ? new ShardAssignment(Set.of(), 1, 1, unassigned ? 0 : 1) + : new ShardAssignment( + oldValue.nodeIds(), + oldValue.total() + 1, + oldValue.unassigned() + 1, + oldValue.ignored() + (unassigned ? 0 : 1) + ) + ); + + } + + long lastConvergedIndex = hasChanges ? previousDesiredBalance.lastConvergedIndex() : desiredBalanceInput.index(); + return new DesiredBalance(lastConvergedIndex, assignments); + } + + private record ShardRoutings(List unassigned, List assigned) { + + private ShardRoutings(ShardId ignored) { + this(new ArrayList<>(), new ArrayList<>()); + } + } +} diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceInput.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceInput.java new file mode 100644 index 0000000000000..9b4cb4c14cd46 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceInput.java @@ -0,0 +1,67 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. 
under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.elasticsearch.cluster.routing.ShardRouting; +import org.elasticsearch.cluster.routing.UnassignedInfo; +import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; + +import java.util.Set; +import java.util.stream.Stream; + +import static java.util.stream.Collectors.toUnmodifiableSet; + +/** + * The input to the desired balance computation. + * + * @param routingAllocation a copy of (the immutable parts of) the context for the allocation decision process + * @param ignoredShards a list of the shards for which earlier allocators have claimed responsibility + */ +public record DesiredBalanceInput(long index, RoutingAllocation routingAllocation, Set ignoredShards) { + + public static DesiredBalanceInput create(long index, RoutingAllocation routingAllocation) { + return new DesiredBalanceInput( + index, + routingAllocation.immutableClone(), + getIgnoredShardsWithDiscardedAllocationStatus(routingAllocation) + ); + } + + private static Set getIgnoredShardsWithDiscardedAllocationStatus(RoutingAllocation routingAllocation) { + return routingAllocation.routingNodes() + .unassigned() + .ignored() + .stream() + .flatMap( + shardRouting -> Stream.of( + shardRouting, + shardRouting.updateUnassigned(discardAllocationStatus(shardRouting.unassignedInfo()), shardRouting.recoverySource()) + ) + ) + .collect(toUnmodifiableSet()); + } + + /** + * AllocationStatus is discarded as it might come from GatewayAllocator and not be present in corresponding routing table + */ + private static UnassignedInfo discardAllocationStatus(UnassignedInfo info) { + return new UnassignedInfo( + info.getReason(), + info.getMessage(), 
+ info.getFailure(), + info.getNumFailedAllocations(), + info.getUnassignedTimeInNanos(), + info.getUnassignedTimeInMillis(), + info.isDelayed(), + UnassignedInfo.AllocationStatus.NO_ATTEMPT, + info.getFailedNodeIds(), + info.getLastAllocatedNodeId() + ); + } +} diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java new file mode 100644 index 0000000000000..4ad4a4a61e543 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -0,0 +1,438 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. 
+ */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.lucene.util.ArrayUtil; +import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata; +import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.cluster.routing.RoutingNode; +import org.elasticsearch.cluster.routing.RoutingNodes; +import org.elasticsearch.cluster.routing.ShardRouting; +import org.elasticsearch.cluster.routing.UnassignedInfo; +import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; +import org.elasticsearch.cluster.routing.allocation.decider.Decision; +import org.elasticsearch.cluster.routing.allocation.decider.DiskThresholdDecider; +import org.elasticsearch.core.Tuple; +import org.elasticsearch.gateway.PriorityComparator; +import org.elasticsearch.index.shard.ShardId; + +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.function.BiFunction; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Given the current allocation of shards and the desired balance, performs the next (legal) shard movements towards the goal. 
+ */ +public class DesiredBalanceReconciler { + + private static final Logger logger = LogManager.getLogger(DesiredBalanceReconciler.class); + + private final DesiredBalance desiredBalance; + private final RoutingAllocation allocation; // name chosen to align with code in BalancedShardsAllocator but TODO rename + private final RoutingNodes routingNodes; + private final NodeAllocationOrdering allocationOrdering; + + DesiredBalanceReconciler( + DesiredBalance desiredBalance, + RoutingAllocation routingAllocation, + NodeAllocationOrdering allocationOrdering + ) { + this.desiredBalance = desiredBalance; + this.allocation = routingAllocation; + this.routingNodes = routingAllocation.routingNodes(); + this.allocationOrdering = allocationOrdering; + } + + void run() { + + logger.debug("Reconciling desired balance for [{}]", desiredBalance.lastConvergedIndex()); + + if (routingNodes.size() == 0) { + // no data nodes, so fail allocation to report red health + failAllocationOfNewPrimaries(allocation); + logger.trace("no nodes available, nothing to reconcile"); + return; + } + + if (desiredBalance.assignments().isEmpty()) { + // no desired state yet but it is on its way and we'll reroute again when it is ready + logger.trace("desired balance is empty, nothing to reconcile"); + return; + } + + // compute next moves towards current desired balance: + + // 1. allocate unassigned shards first + logger.trace("Reconciler#allocateUnassigned"); + allocateUnassigned(); + assert allocateUnassignedInvariant(); + + // 2. move any shards that cannot remain where they are + logger.trace("Reconciler#moveShards"); + moveShards(); + // 3. 
move any other shards that are desired elsewhere + logger.trace("Reconciler#balance"); + balance(); + + logger.debug("Reconciliation is complete"); + } + + private boolean allocateUnassignedInvariant() { + // after allocateUnassigned, every shard must be either assigned or ignored + + assert routingNodes.unassigned().isEmpty(); + + final var shardCounts = allocation.metadata() + .stream() + .flatMap( + indexMetadata -> IntStream.range(0, indexMetadata.getNumberOfShards()) + .mapToObj( + shardId -> Tuple.tuple(new ShardId(indexMetadata.getIndex(), shardId), indexMetadata.getNumberOfReplicas() + 1) + ) + ) + .collect(Collectors.toMap(Tuple::v1, Tuple::v2)); + + for (final var shardRouting : routingNodes.unassigned().ignored()) { + shardCounts.computeIfPresent(shardRouting.shardId(), (ignored, count) -> count == 1 ? null : count - 1); + } + + for (final var routingNode : routingNodes) { + for (final var shardRouting : routingNode) { + shardCounts.computeIfPresent(shardRouting.shardId(), (ignored, count) -> count == 1 ? 
null : count - 1); + } + } + + assert shardCounts.isEmpty() : shardCounts; + + return true; + } + + private void failAllocationOfNewPrimaries(RoutingAllocation allocation) { + RoutingNodes routingNodes = allocation.routingNodes(); + assert routingNodes.size() == 0 : routingNodes; + final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator = routingNodes.unassigned().iterator(); + while (unassignedIterator.hasNext()) { + final ShardRouting shardRouting = unassignedIterator.next(); + final UnassignedInfo unassignedInfo = shardRouting.unassignedInfo(); + if (shardRouting.primary() && unassignedInfo.getLastAllocationStatus() == UnassignedInfo.AllocationStatus.NO_ATTEMPT) { + unassignedIterator.updateUnassigned( + new UnassignedInfo( + unassignedInfo.getReason(), + unassignedInfo.getMessage(), + unassignedInfo.getFailure(), + unassignedInfo.getNumFailedAllocations(), + unassignedInfo.getUnassignedTimeInNanos(), + unassignedInfo.getUnassignedTimeInMillis(), + unassignedInfo.isDelayed(), + UnassignedInfo.AllocationStatus.DECIDERS_NO, + unassignedInfo.getFailedNodeIds(), + unassignedInfo.getLastAllocatedNodeId() + ), + shardRouting.recoverySource(), + allocation.changes() + ); + } + } + } + + private void allocateUnassigned() { + RoutingNodes.UnassignedShards unassigned = routingNodes.unassigned(); + if (logger.isTraceEnabled()) { + logger.trace("Start allocating unassigned shards"); + } + if (unassigned.isEmpty()) { + return; + } + + /* + * TODO: We could be smarter here and group the shards by index and then + * use the sorter to save some iterations. + */ + final PriorityComparator secondaryComparator = PriorityComparator.getAllocationComparator(allocation); + final Comparator comparator = (o1, o2) -> { + if (o1.primary() ^ o2.primary()) { + return o1.primary() ? 
-1 : 1; + } + if (o1.getIndexName().compareTo(o2.getIndexName()) == 0) { + return o1.getId() - o2.getId(); + } + // this comparator is more expensive than all the others up there + // that's why it's added last even though it could be easier to read + // if we'd apply it earlier. this comparator will only differentiate across + // indices all shards of the same index is treated equally. + final int secondary = secondaryComparator.compare(o1, o2); + assert secondary != 0 : "Index names are equal, should be returned early."; + return secondary; + }; + /* + * we use 2 arrays and move replicas to the second array once we allocated an identical + * replica in the current iteration to make sure all indices get allocated in the same manner. + * The arrays are sorted by primaries first and then by index and shard ID so a 2 indices with + * 2 replica and 1 shard would look like: + * [(0,P,IDX1), (0,P,IDX2), (0,R,IDX1), (0,R,IDX1), (0,R,IDX2), (0,R,IDX2)] + * if we allocate for instance (0, R, IDX1) we move the second replica to the secondary array and proceed with + * the next replica. If we could not find a node to allocate (0,R,IDX1) we move all it's replicas to ignoreUnassigned. 
+ */ + ShardRouting[] primary = unassigned.drain(); + ShardRouting[] secondary = new ShardRouting[primary.length]; + int secondaryLength = 0; + int primaryLength = primary.length; + ArrayUtil.timSort(primary, comparator); + + do { + nextShard: for (int i = 0; i < primaryLength; i++) { + final var shard = primary[i]; + final var assignment = desiredBalance.getAssignment(shard.shardId()); + final var isThrottled = new AtomicBoolean(false); + if (assignment != null) { + + for (final var nodeIdIterator : List.of( + getDesiredNodesIds(shard, assignment), + getFallbackNodeIds(shard, isThrottled) + )) { + for (final var desiredNodeId : nodeIdIterator) { + final var routingNode = routingNodes.node(desiredNodeId); + if (routingNode == null) { + // desired node no longer exists + continue; + } + final var decision = allocation.deciders().canAllocate(shard, routingNode, allocation); + switch (decision.type()) { + case YES -> { + if (logger.isTraceEnabled()) { + logger.trace("Assigned shard [{}] to [{}]", shard, desiredNodeId); + } + final long shardSize = DiskThresholdDecider.getExpectedShardSize( + shard, + ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE, + allocation.clusterInfo(), + allocation.snapshotShardSizeInfo(), + allocation.metadata(), + allocation.routingTable() + ); + routingNodes.initializeShard(shard, desiredNodeId, null, shardSize, allocation.changes()); + allocationOrdering.recordAllocation(desiredNodeId); + if (shard.primary() == false) { + // copy over the same replica shards to the secondary array so they will get allocated + // in a subsequent iteration, allowing replicas of other shards to be allocated first + while (i < primaryLength - 1 && comparator.compare(primary[i], primary[i + 1]) == 0) { + secondary[secondaryLength++] = primary[++i]; + } + } + continue nextShard; + } + case THROTTLE -> isThrottled.set(true); + case NO -> { + if (logger.isTraceEnabled()) { + logger.trace("Couldn't assign shard [{}] to [{}]", shard.shardId(), desiredNodeId); + } + } + 
} + } + } + } + + if (logger.isTraceEnabled()) { + logger.trace("No eligible node found to assign shard [{}] amongst [{}]", shard, assignment); + } + + final UnassignedInfo.AllocationStatus allocationStatus; + if (assignment == null || assignment.isIgnored(shard.primary())) { + allocationStatus = UnassignedInfo.AllocationStatus.NO_ATTEMPT; + } else if (isThrottled.get()) { + allocationStatus = UnassignedInfo.AllocationStatus.DECIDERS_THROTTLED; + } else { + allocationStatus = UnassignedInfo.AllocationStatus.DECIDERS_NO; + } + + unassigned.ignoreShard(shard, allocationStatus, allocation.changes()); + if (shard.primary() == false) { + // we could not allocate it and we are a replica - check if we can ignore the other replicas + while (i < primaryLength - 1 && comparator.compare(primary[i], primary[i + 1]) == 0) { + unassigned.ignoreShard(primary[++i], allocationStatus, allocation.changes()); + } + } + } + primaryLength = secondaryLength; + ShardRouting[] tmp = primary; + primary = secondary; + secondary = tmp; + secondaryLength = 0; + } while (primaryLength > 0); + } + + private Iterable getDesiredNodesIds(ShardRouting shard, ShardAssignment assignment) { + return allocationOrdering.sort(allocation.deciders().getForcedInitialShardAllocationToNodes(shard, allocation).map(forced -> { + if (logger.isDebugEnabled()) { + logger.debug("Shard [{}] assignment is ignored. Initial allocation forced to {}", shard.shardId(), forced); + } + return forced; + }).orElse(assignment.nodeIds())); + } + + private Iterable getFallbackNodeIds(ShardRouting shard, AtomicBoolean isThrottled) { + return () -> { + if (shard.primary() && isThrottled.get() == false) { + var fallbackNodeIds = allocation.routingNodes().stream().map(RoutingNode::nodeId).toList(); + if (logger.isDebugEnabled()) { + logger.trace("Shard [{}] assignment is temporary not possible. 
Falling back to {}", shard.shardId(), fallbackNodeIds); + } + return allocationOrdering.sort(fallbackNodeIds).iterator(); + } else { + return Collections.emptyIterator(); + } + }; + } + + private void moveShards() { + // Iterate over the started shards interleaving between nodes, and check if they can remain. In the presence of throttling + // shard movements, the goal of this iteration order is to achieve a fairer movement of shards from the nodes that are + // offloading the shards. + for (final var iterator = routingNodes.nodeInterleavedShardIterator(); iterator.hasNext();) { + final var shardRouting = iterator.next(); + + if (shardRouting.started() == false) { + // can only move started shards + continue; + } + + final var assignment = desiredBalance.getAssignment(shardRouting.shardId()); + if (assignment == null) { + // balance is not computed + continue; + } + + if (assignment.nodeIds().contains(shardRouting.currentNodeId())) { + // shard is already on a desired node + continue; + } + + if (allocation.deciders().canAllocate(shardRouting, allocation).type() != Decision.Type.YES) { + // cannot allocate anywhere, no point in looking for a target node + continue; + } + + final var routingNode = routingNodes.node(shardRouting.currentNodeId()); + final var canRemainDecision = allocation.deciders().canRemain(shardRouting, routingNode, allocation); + if (canRemainDecision.type() != Decision.Type.NO) { + // it's desired elsewhere but technically it can remain on its current node. Defer its movement until later on to give + // priority to shards that _must_ move. 
+ continue; + } + + final var moveTarget = findRelocationTarget(shardRouting, assignment.nodeIds()); + if (moveTarget != null) { + routingNodes.relocateShard( + shardRouting, + moveTarget.getId(), + allocation.clusterInfo().getShardSize(shardRouting, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE), + allocation.changes() + ); + } + } + } + + private void balance() { + if (allocation.deciders().canRebalance(allocation).type() != Decision.Type.YES) { + return; + } + + // Iterate over the started shards interleaving between nodes, and try to move any which are on undesired nodes. In the presence of + // throttling shard movements, the goal of this iteration order is to achieve a fairer movement of shards from the nodes that are + // offloading the shards. + for (final var iterator = routingNodes.nodeInterleavedShardIterator(); iterator.hasNext();) { + final var shardRouting = iterator.next(); + + if (shardRouting.started() == false) { + // can only rebalance started shards + continue; + } + + final var assignment = desiredBalance.getAssignment(shardRouting.shardId()); + if (assignment == null) { + // balance is not computed + continue; + } + + if (assignment.nodeIds().contains(shardRouting.currentNodeId())) { + // shard is already on a desired node + continue; + } + + if (allocation.deciders().canRebalance(shardRouting, allocation).type() != Decision.Type.YES) { + // rebalancing disabled for this shard + continue; + } + + if (allocation.deciders().canAllocate(shardRouting, allocation).type() != Decision.Type.YES) { + // cannot allocate anywhere, no point in looking for a target node + continue; + } + + final var rebalanceTarget = findRelocationTarget(shardRouting, assignment.nodeIds(), this::decideCanAllocate); + if (rebalanceTarget != null) { + routingNodes.relocateShard( + shardRouting, + rebalanceTarget.getId(), + allocation.clusterInfo().getShardSize(shardRouting, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE), + allocation.changes() + ); + } + } + } + + private 
DiscoveryNode findRelocationTarget(final ShardRouting shardRouting, Set desiredNodeIds) { + final var moveDecision = findRelocationTarget(shardRouting, desiredNodeIds, this::decideCanAllocate); + if (moveDecision != null) { + return moveDecision; + } + + final var shutdown = allocation.metadata().nodeShutdowns().get(shardRouting.currentNodeId()); + final var shardsOnReplacedNode = shutdown != null && shutdown.getType().equals(SingleNodeShutdownMetadata.Type.REPLACE); + if (shardsOnReplacedNode) { + return findRelocationTarget(shardRouting, desiredNodeIds, this::decideCanForceAllocateForVacate); + } + return null; + } + + private DiscoveryNode findRelocationTarget( + ShardRouting shardRouting, + Set desiredNodeIds, + BiFunction canAllocateDecider + ) { + for (final var nodeId : desiredNodeIds) { + // TODO consider ignored nodes here too? + if (nodeId.equals(shardRouting.currentNodeId()) == false) { + final var currentNode = routingNodes.node(nodeId); + final var decision = canAllocateDecider.apply(shardRouting, currentNode); + logger.trace("relocate {} to {}: {}", shardRouting, nodeId, decision); + if (decision.type() == Decision.Type.YES) { + return currentNode.node(); + } + } + } + + return null; + } + + private Decision decideCanAllocate(ShardRouting shardRouting, RoutingNode target) { + return allocation.deciders().canAllocate(shardRouting, target, allocation); + } + + private Decision decideCanForceAllocateForVacate(ShardRouting shardRouting, RoutingNode target) { + return allocation.deciders().canForceAllocateDuringReplace(shardRouting, target, allocation); + } +} diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceShardsAllocator.java new file mode 100644 index 0000000000000..36244d067d4e1 --- /dev/null +++ 
b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceShardsAllocator.java @@ -0,0 +1,330 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.elasticsearch.action.ActionListener; +import org.elasticsearch.cluster.ClusterState; +import org.elasticsearch.cluster.ClusterStateTaskConfig; +import org.elasticsearch.cluster.ClusterStateTaskExecutor; +import org.elasticsearch.cluster.ClusterStateTaskListener; +import org.elasticsearch.cluster.routing.RoutingNode; +import org.elasticsearch.cluster.routing.RoutingNodes; +import org.elasticsearch.cluster.routing.ShardRouting; +import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; +import org.elasticsearch.cluster.routing.allocation.RoutingExplanations; +import org.elasticsearch.cluster.routing.allocation.ShardAllocationDecision; +import org.elasticsearch.cluster.routing.allocation.command.AllocationCommand; +import org.elasticsearch.cluster.routing.allocation.command.AllocationCommands; +import org.elasticsearch.cluster.routing.allocation.command.MoveAllocationCommand; +import org.elasticsearch.cluster.service.ClusterService; +import org.elasticsearch.cluster.service.MasterService; +import org.elasticsearch.common.Priority; +import org.elasticsearch.common.metrics.CounterMetric; +import org.elasticsearch.common.util.set.Sets; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.threadpool.ThreadPool; + +import java.util.ArrayList; +import java.util.Comparator; +import 
java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Consumer; + +import static java.util.stream.Collectors.toSet; + +/** + * A {@link ShardsAllocator} which asynchronously refreshes the desired balance held by the {@link DesiredBalanceComputer} and then takes + * steps towards the desired balance using the {@link DesiredBalanceReconciler}. + */ +public class DesiredBalanceShardsAllocator implements ShardsAllocator { + + private static final Logger logger = LogManager.getLogger(DesiredBalanceShardsAllocator.class); + + private final ShardsAllocator delegateAllocator; + private final ThreadPool threadPool; + private final ClusterService clusterService; + private final DesiredBalanceReconcilerAction reconciler; + private final DesiredBalanceComputer desiredBalanceComputer; + private final ContinuousComputation desiredBalanceComputation; + private final PendingListenersQueue queue; + private final AtomicLong indexGenerator = new AtomicLong(-1); + private final ConcurrentLinkedQueue> pendingDesiredBalanceMoves = new ConcurrentLinkedQueue<>(); + private final ReconcileDesiredBalanceExecutor executor = new ReconcileDesiredBalanceExecutor(); + private final NodeAllocationOrdering allocationOrdering = new NodeAllocationOrdering(); + private volatile DesiredBalance currentDesiredBalance = DesiredBalance.INITIAL; + + // stats + protected final CounterMetric computationsSubmitted = new CounterMetric(); + protected final CounterMetric computationsExecuted = new CounterMetric(); + protected final CounterMetric computationsConverged = new CounterMetric(); + protected final CounterMetric cumulativeComputationTime = new CounterMetric(); + protected final CounterMetric cumulativeReconciliationTime = new CounterMetric(); + + @FunctionalInterface + public interface DesiredBalanceReconcilerAction { + ClusterState apply(ClusterState clusterState, 
Consumer routingAllocationAction); + } + + public DesiredBalanceShardsAllocator( + ShardsAllocator delegateAllocator, + ThreadPool threadPool, + ClusterService clusterService, + DesiredBalanceReconcilerAction reconciler + ) { + this(delegateAllocator, threadPool, clusterService, new DesiredBalanceComputer(delegateAllocator), reconciler); + } + + public DesiredBalanceShardsAllocator( + ShardsAllocator delegateAllocator, + ThreadPool threadPool, + ClusterService clusterService, + DesiredBalanceComputer desiredBalanceComputer, + DesiredBalanceReconcilerAction reconciler + ) { + this.delegateAllocator = delegateAllocator; + this.threadPool = threadPool; + this.clusterService = clusterService; + this.reconciler = reconciler; + this.desiredBalanceComputer = desiredBalanceComputer; + this.desiredBalanceComputation = new ContinuousComputation<>(threadPool.generic()) { + + @Override + protected void processInput(DesiredBalanceInput desiredBalanceInput) { + + long index = desiredBalanceInput.index(); + logger.debug("Starting desired balance computation for [{}]", index); + + recordTime( + cumulativeComputationTime, + () -> setCurrentDesiredBalance( + desiredBalanceComputer.compute( + currentDesiredBalance, + desiredBalanceInput, + pendingDesiredBalanceMoves, + this::isFresh + ) + ) + ); + computationsExecuted.inc(); + if (isFresh(desiredBalanceInput)) { + logger.debug("Desired balance computation for [{}] is completed, scheduling reconciliation", index); + computationsConverged.inc(); + submitReconcileTask(currentDesiredBalance); + } else { + logger.debug("Desired balance computation for [{}] is discarded as newer one is submitted", index); + } + } + + @Override + public String toString() { + return "DesiredBalanceShardsAllocator#updateDesiredBalanceAndReroute"; + } + }; + this.queue = new PendingListenersQueue(threadPool); + } + + @Override + public ShardAllocationDecision decideShardAllocation(ShardRouting shard, RoutingAllocation allocation) { + return 
delegateAllocator.decideShardAllocation(shard, allocation); + } + + @Override + public void allocate(RoutingAllocation allocation) { + throw new UnsupportedOperationException(); + } + + @Override + public void allocate(RoutingAllocation allocation, ActionListener listener) { + assert MasterService.assertMasterUpdateOrTestThread() : Thread.currentThread().getName(); + assert allocation.ignoreDisable() == false; + + computationsSubmitted.inc(); + + var index = indexGenerator.incrementAndGet(); + logger.debug("Executing allocate for [{}]", index); + queue.add(index, listener); + desiredBalanceComputation.onNewInput(DesiredBalanceInput.create(index, allocation)); + + // Starts reconciliation towards desired balance that might have not been updated with a recent calculation yet. + // This is fine as balance should have incremental rather than radical changes. + // This should speed up achieving the desired balance in cases current state is still different from it (due to THROTTLING). + reconcile(currentDesiredBalance, allocation); + } + + @Override + public RoutingExplanations execute(RoutingAllocation allocation, AllocationCommands commands, boolean explain, boolean retryFailed) { + var explanations = ShardsAllocator.super.execute(allocation, commands, explain, retryFailed); + var moves = getMoveCommands(commands); + if (moves.isEmpty() == false) { + pendingDesiredBalanceMoves.add(moves); + } + return explanations; + } + + private static List getMoveCommands(AllocationCommands commands) { + var moves = new ArrayList(); + for (AllocationCommand command : commands.commands()) { + if (command instanceof MoveAllocationCommand move) { + moves.add(move); + } + } + return moves; + } + + private void setCurrentDesiredBalance(DesiredBalance newDesiredBalance) { + if (logger.isTraceEnabled()) { + var diff = DesiredBalance.hasChanges(currentDesiredBalance, newDesiredBalance) + ? 
"Diff: " + diff(currentDesiredBalance, newDesiredBalance) + : "No changes"; + logger.trace("Desired balance updated: {}. {}", newDesiredBalance, diff); + } else { + logger.debug("Desired balance updated for [{}]", newDesiredBalance.lastConvergedIndex()); + } + currentDesiredBalance = newDesiredBalance; + } + + protected void submitReconcileTask(DesiredBalance desiredBalance) { + clusterService.submitStateUpdateTask( + "reconcile-desired-balance", + new ReconcileDesiredBalanceTask(desiredBalance), + ClusterStateTaskConfig.build(Priority.URGENT), + executor + ); + } + + protected void reconcile(DesiredBalance desiredBalance, RoutingAllocation allocation) { + if (logger.isTraceEnabled()) { + logger.trace("Reconciling desired balance: {}", desiredBalance); + } else { + logger.debug("Reconciling desired balance for [{}]", desiredBalance.lastConvergedIndex()); + } + allocationOrdering.retainNodes(getNodeIds(allocation.routingNodes())); + recordTime(cumulativeReconciliationTime, new DesiredBalanceReconciler(desiredBalance, allocation, allocationOrdering)::run); + } + + public DesiredBalance getDesiredBalance() { + return currentDesiredBalance; + } + + public DesiredBalanceStats getStats() { + return new DesiredBalanceStats( + currentDesiredBalance.lastConvergedIndex(), + desiredBalanceComputation.isActive(), + computationsSubmitted.count(), + computationsExecuted.count(), + computationsConverged.count(), + desiredBalanceComputer.iterations.sum(), + cumulativeComputationTime.count(), + cumulativeReconciliationTime.count() + ); + } + + private void onNoLongerMaster() { + if (indexGenerator.getAndSet(-1) != -1) { + currentDesiredBalance = DesiredBalance.INITIAL; + queue.completeAllAsNotMaster(); + pendingDesiredBalanceMoves.clear(); + allocationOrdering.clear(); + } + } + + private final class ReconcileDesiredBalanceTask implements ClusterStateTaskListener { + private final DesiredBalance desiredBalance; + + private ReconcileDesiredBalanceTask(DesiredBalance desiredBalance) 
{ + this.desiredBalance = desiredBalance; + } + + @Override + public void onFailure(Exception e) { + assert MasterService.isPublishFailureException(e) : e; + onNoLongerMaster(); + } + } + + private final class ReconcileDesiredBalanceExecutor implements ClusterStateTaskExecutor { + + @Override + public ClusterState execute(BatchExecutionContext batchExecutionContext) { + var latest = findLatest(batchExecutionContext.taskContexts()); + var newState = applyBalance(batchExecutionContext, latest); + discardSupersededTasks(batchExecutionContext.taskContexts(), latest); + return newState; + } + + private TaskContext findLatest(List> taskContexts) { + return taskContexts.stream().max(Comparator.comparing(context -> context.getTask().desiredBalance.lastConvergedIndex())).get(); + } + + private ClusterState applyBalance( + BatchExecutionContext batchExecutionContext, + TaskContext latest + ) { + try (var ignored = batchExecutionContext.dropHeadersContext()) { + var newState = reconciler.apply( + batchExecutionContext.initialState(), + routingAllocation -> reconcile(latest.getTask().desiredBalance, routingAllocation) + ); + latest.success(() -> queue.complete(latest.getTask().desiredBalance.lastConvergedIndex())); + return newState; + } + } + + private void discardSupersededTasks( + List> taskContexts, + TaskContext latest + ) { + for (TaskContext taskContext : taskContexts) { + if (taskContext != latest) { + taskContext.success(() -> {}); + } + } + } + } + + private void recordTime(CounterMetric metric, Runnable action) { + final long started = threadPool.relativeTimeInMillis(); + try { + action.run(); + } finally { + final long finished = threadPool.relativeTimeInMillis(); + metric.inc(finished - started); + } + } + + private static String diff(DesiredBalance old, DesiredBalance updated) { + var intersection = Sets.intersection(old.assignments().keySet(), updated.assignments().keySet()); + var diff = Sets.difference(Sets.union(old.assignments().keySet(), 
updated.assignments().keySet()), intersection); + + var newLine = System.lineSeparator(); + var builder = new StringBuilder(); + for (ShardId shardId : intersection) { + var oldAssignment = old.getAssignment(shardId); + var updatedAssignment = updated.getAssignment(shardId); + if (Objects.equals(oldAssignment, updatedAssignment) == false) { + builder.append(newLine).append(shardId).append(": ").append(oldAssignment).append(" --> ").append(updatedAssignment); + } + } + for (ShardId shardId : diff) { + var oldAssignment = old.getAssignment(shardId); + var updatedAssignment = updated.getAssignment(shardId); + builder.append(newLine).append(shardId).append(": ").append(oldAssignment).append(" --> ").append(updatedAssignment); + } + return builder.append(newLine).toString(); + } + + private static Set getNodeIds(RoutingNodes nodes) { + return nodes.stream().map(RoutingNode::nodeId).collect(toSet()); + } +} diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceStats.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceStats.java new file mode 100644 index 0000000000000..ce7b24d8e60c4 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceStats.java @@ -0,0 +1,70 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. 
+ */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.core.TimeValue; +import org.elasticsearch.xcontent.ToXContentFragment; +import org.elasticsearch.xcontent.XContentBuilder; + +import java.io.IOException; + +public record DesiredBalanceStats( + long lastConvergedIndex, + boolean computationActive, + long computationSubmitted, + long computationExecuted, + long computationConverged, + long computationIterations, + long cumulativeComputationTime, + long cumulativeReconciliationTime +) implements Writeable, ToXContentFragment { + + public static DesiredBalanceStats readFrom(StreamInput in) throws IOException { + return new DesiredBalanceStats( + in.readVLong(), + in.readBoolean(), + in.readVLong(), + in.readVLong(), + in.readVLong(), + in.readVLong(), + in.readVLong(), + in.readVLong() + ); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeVLong(lastConvergedIndex); + out.writeBoolean(computationActive); + out.writeVLong(computationSubmitted); + out.writeVLong(computationExecuted); + out.writeVLong(computationConverged); + out.writeVLong(computationIterations); + out.writeVLong(cumulativeComputationTime); + out.writeVLong(cumulativeReconciliationTime); + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + + builder.field("computation_active", computationActive); + builder.field("computation_submitted", computationSubmitted); + builder.field("computation_executed", computationExecuted); + builder.field("computation_converged", computationConverged); + builder.field("computation_iterations", computationIterations); + builder.field("computation_converged_index", lastConvergedIndex); + builder.humanReadableField("computation_time_in_millis", 
"computation_time", new TimeValue(cumulativeComputationTime)); + builder.humanReadableField("reconciliation_time_in_millis", "reconciliation_time", new TimeValue(cumulativeReconciliationTime)); + + return builder; + } +} diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/NodeAllocationOrdering.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/NodeAllocationOrdering.java new file mode 100644 index 0000000000000..60ad1d3480372 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/NodeAllocationOrdering.java @@ -0,0 +1,44 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.atomic.AtomicLong; + +public class NodeAllocationOrdering { + + private final AtomicLong order = new AtomicLong(0); + private final Map recentAllocations = new HashMap<>(); + private final Comparator comparator = Comparator.comparing(nodeId -> recentAllocations.getOrDefault(nodeId, 0L)); + + public void recordAllocation(String nodeId) { + recentAllocations.put(nodeId, order.incrementAndGet()); + } + + public List sort(Collection nodeIds) { + var list = new ArrayList<>(nodeIds); + list.sort(comparator); + return list; + } + + public void retainNodes(Set retainedNodeIds) { + recentAllocations.keySet().retainAll(retainedNodeIds); + } + + public void clear() { + order.set(0L); + recentAllocations.clear(); + 
} +} diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/PendingListenersQueue.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/PendingListenersQueue.java new file mode 100644 index 0000000000000..309edc8c77094 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/PendingListenersQueue.java @@ -0,0 +1,88 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.elasticsearch.action.ActionListener; +import org.elasticsearch.cluster.NotMasterException; +import org.elasticsearch.threadpool.ThreadPool; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedList; +import java.util.Queue; + +public class PendingListenersQueue { + + private static final Logger logger = LogManager.getLogger(PendingListenersQueue.class); + + private record PendingListener(long index, ActionListener listener) {} + + private final ThreadPool threadPool; + private final Queue pendingListeners = new LinkedList<>(); + private volatile long completedIndex = -1; + + public PendingListenersQueue(ThreadPool threadPool) { + this.threadPool = threadPool; + } + + public void add(long index, ActionListener listener) { + synchronized (pendingListeners) { + pendingListeners.add(new PendingListener(index, listener)); + } + } + + public void complete(long index) { + advance(index); + executeListeners(completedIndex, true); + } + + public void completeAllAsNotMaster() { + completedIndex = -1; + 
executeListeners(Long.MAX_VALUE, false); + } + + public long getCompletedIndex() { + return completedIndex; + } + + private void executeListeners(long convergedIndex, boolean isMaster) { + var listeners = pollListeners(convergedIndex); + if (listeners.isEmpty() == false) { + threadPool.generic().execute(() -> { + if (isMaster) { + ActionListener.onResponse(listeners, null); + } else { + ActionListener.onFailure(listeners, new NotMasterException("no longer master")); + } + }); + } + } + + private void advance(long index) { + synchronized (pendingListeners) { + if (index > completedIndex) { + completedIndex = index; + } + } + } + + private Collection> pollListeners(long maxIndex) { + var listeners = new ArrayList>(); + PendingListener listener; + synchronized (pendingListeners) { + while ((listener = pendingListeners.peek()) != null && listener.index <= maxIndex) { + listeners.add(pendingListeners.poll().listener); + } + logger.trace("Polled listeners up to [{}]. Poll {}, remaining {}", maxIndex, listeners, pendingListeners); + } + return listeners; + } +} diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ShardAssignment.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ShardAssignment.java new file mode 100644 index 0000000000000..e11b02b4e4759 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ShardAssignment.java @@ -0,0 +1,35 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. 
+ */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.elasticsearch.cluster.routing.ShardRouting; + +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; + +import static java.util.Collections.unmodifiableSet; +import static java.util.stream.Collectors.toCollection; + +public record ShardAssignment(Set nodeIds, int total, int unassigned, int ignored) { + + public ShardAssignment { + assert total > 0 : "Shard assignment should not be empty"; + assert nodeIds.size() + unassigned == total : "Shard assignment should account for all shards"; + } + + public boolean isIgnored(boolean primary) { + return primary ? total == ignored : ignored > 0; + } + + public static ShardAssignment of(List routings) { + var nodeIds = routings.stream().map(ShardRouting::currentNodeId).collect(toCollection(LinkedHashSet::new)); + return new ShardAssignment(unmodifiableSet(nodeIds), routings.size(), 0, 0); + } +} diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ShardsAllocator.java index 231546b9e50a9..b8d647782e0ce 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ShardsAllocator.java @@ -8,11 +8,14 @@ package org.elasticsearch.cluster.routing.allocation.allocator; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.cluster.routing.allocation.AllocateUnassignedDecision; import org.elasticsearch.cluster.routing.allocation.MoveDecision; import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; +import org.elasticsearch.cluster.routing.allocation.RoutingExplanations; import org.elasticsearch.cluster.routing.allocation.ShardAllocationDecision; +import 
org.elasticsearch.cluster.routing.allocation.command.AllocationCommands; /** *

@@ -33,6 +36,41 @@ public interface ShardsAllocator { */ void allocate(RoutingAllocation allocation); + /** + * Allocates shards to nodes in the cluster. An implementation of this method should: + * - assign unassigned shards + * - relocate shards that cannot stay on a node anymore + * - relocate shards to find a good shard balance in the cluster + * + * @param allocation current node allocation + * @param listener listener to be executed once async allocation is completed + */ + default void allocate(RoutingAllocation allocation, ActionListener listener) { + allocate(allocation); + listener.onResponse(null); + } + + /** + * Execute allocation commands + */ + default RoutingExplanations execute(RoutingAllocation allocation, AllocationCommands commands, boolean explain, boolean retryFailed) { + var originalDebugMode = allocation.getDebugMode(); + allocation.debugDecision(true); + // we ignore disable allocation, because commands are explicit + allocation.ignoreDisable(true); + + try { + if (retryFailed) { + allocation.routingNodes().resetFailedCounter(allocation.changes()); + } + return commands.execute(allocation, explain); + } finally { + // revert the ignore disable flag, since when rerouting, we want the original setting to take place + allocation.ignoreDisable(false); + allocation.setDebugMode(originalDebugMode); + } + } + /** * Returns the decision for where a shard should reside in the cluster. If the shard is unassigned, * then the {@link AllocateUnassignedDecision} will be non-null. 
If the shard is not in the unassigned diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AllocationDecider.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AllocationDecider.java index 1b5cf0805a821..b083ca1617bc4 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AllocationDecider.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AllocationDecider.java @@ -15,6 +15,9 @@ import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; import org.elasticsearch.cluster.routing.allocation.decider.Decision.Type; +import java.util.Optional; +import java.util.Set; + /** * {@link AllocationDecider} is an abstract base class that allows to make * dynamic cluster- or index-wide shard allocation decisions on a per-node @@ -139,4 +142,14 @@ public Decision canForceAllocateDuringReplace(ShardRouting shardRouting, Routing public Decision canAllocateReplicaWhenThereIsRetentionLease(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { return canAllocate(shardRouting, node, allocation); } + + /** + * Returns {@code Optional.empty()} if the shard could be initially allocated anywhere, or {@code Optional.of(Set.of(nodeIds))} if the + * shard could be initially allocated only on a subset of nodes. + * + * This might be required for splitting or shrinking index as resulting shards have to be on the same node as a source shard. 
+ */ + public Optional> getForcedInitialShardAllocationToNodes(ShardRouting shardRouting, RoutingAllocation allocation) { + return Optional.empty(); + } } diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AllocationDeciders.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AllocationDeciders.java index 63761297bc602..d80c51d9740fd 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AllocationDeciders.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AllocationDeciders.java @@ -15,8 +15,11 @@ import org.elasticsearch.cluster.routing.RoutingNode; import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; +import org.elasticsearch.common.util.set.Sets; import java.util.Collection; +import java.util.Optional; +import java.util.Set; /** * Combines the decision of multiple {@link AllocationDecider} implementations into a single allocation decision. @@ -282,4 +285,15 @@ private static void addDecision(Decision.Multi ret, Decision decision, RoutingAl ret.add(decision); } } + + public Optional> getForcedInitialShardAllocationToNodes(ShardRouting shardRouting, RoutingAllocation allocation) { + var result = Optional.>empty(); + for (AllocationDecider allocationDecider : allocations) { + var r = allocationDecider.getForcedInitialShardAllocationToNodes(shardRouting, allocation); + if (r.isPresent()) { + result = result.isEmpty() ? 
r : Optional.of(Sets.intersection(result.get(), r.get())); + } + } + return result; + } } diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/EnableAllocationDecider.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/EnableAllocationDecider.java index df8e8bcd10a65..19b2e3d986ebd 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/EnableAllocationDecider.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/EnableAllocationDecider.java @@ -114,6 +114,10 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingAllocation allocat ); } + if (allocation.isSimulating()) { + return allocation.decision(Decision.YES, NAME, "allocation is always enabled when simulating"); + } + final IndexMetadata indexMetadata = allocation.metadata().getIndexSafe(shardRouting.index()); final Allocation enable; final boolean usedIndexSetting; @@ -149,6 +153,10 @@ public Decision canRebalance(RoutingAllocation allocation) { return allocation.decision(Decision.YES, NAME, "allocation is explicitly ignoring any disabling of rebalancing"); } + if (allocation.isSimulating()) { + return allocation.decision(Decision.YES, NAME, "allocation is always enabled when simulating"); + } + if (enableRebalance == Rebalance.NONE) { for (IndexMetadata indexMetadata : allocation.metadata()) { if (INDEX_ROUTING_REBALANCE_ENABLE_SETTING.exists(indexMetadata.getSettings()) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/FilterAllocationDecider.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/FilterAllocationDecider.java index e32fc14bc617b..c46686b89b1df 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/FilterAllocationDecider.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/FilterAllocationDecider.java @@ -22,7 +22,10 @@ import 
java.util.List; import java.util.Map; +import java.util.Optional; +import java.util.Set; +import static java.util.stream.Collectors.toUnmodifiableSet; import static org.elasticsearch.cluster.node.DiscoveryNodeFilters.OpType.AND; import static org.elasticsearch.cluster.node.DiscoveryNodeFilters.OpType.OR; import static org.elasticsearch.cluster.node.DiscoveryNodeFilters.validateIpValue; @@ -225,4 +228,19 @@ private void setClusterIncludeFilters(Map> filters) { private void setClusterExcludeFilters(Map> filters) { clusterExcludeFilters = DiscoveryNodeFilters.trimTier(DiscoveryNodeFilters.buildFromKeyValues(OR, filters)); } + + @Override + public Optional> getForcedInitialShardAllocationToNodes(ShardRouting shardRouting, RoutingAllocation allocation) { + if (shardRouting.unassigned() && shardRouting.recoverySource().getType() == RecoverySource.Type.LOCAL_SHARDS) { + var indexMetadata = allocation.metadata().getIndexSafe(shardRouting.index()); + var initialRecoveryFilters = DiscoveryNodeFilters.trimTier(indexMetadata.getInitialRecoveryFilters()); + + if (initialRecoveryFilters != null) { + return Optional.of( + allocation.nodes().stream().filter(initialRecoveryFilters::match).map(DiscoveryNode::getId).collect(toUnmodifiableSet()) + ); + } + } + return super.getForcedInitialShardAllocationToNodes(shardRouting, allocation); + } } diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java index 3ee48759c6593..d244c38b59137 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java @@ -38,10 +38,15 @@ public class MaxRetryAllocationDecider extends AllocationDecider { private static final Decision YES_NO_FAILURES = 
Decision.single(Decision.Type.YES, NAME, "shard has no previous failures"); + private static final Decision YES_SIMULATING = Decision.single(Decision.Type.YES, NAME, "previous failures ignored when simulating"); + @Override public Decision canAllocate(ShardRouting shardRouting, RoutingAllocation allocation) { - final int maxRetries = SETTING_ALLOCATION_MAX_RETRY.get(allocation.metadata().getIndexSafe(shardRouting.index()).getSettings()); + if (allocation.isSimulating()) { + return YES_SIMULATING; + } + final int maxRetries = SETTING_ALLOCATION_MAX_RETRY.get(allocation.metadata().getIndexSafe(shardRouting.index()).getSettings()); final var unassignedInfo = shardRouting.unassignedInfo(); final int numFailedAllocations = unassignedInfo == null ? 0 : unassignedInfo.getNumFailedAllocations(); if (numFailedAllocations > 0) { diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/ResizeAllocationDecider.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/ResizeAllocationDecider.java index 7e4100b390905..437d006fee53d 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/ResizeAllocationDecider.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/ResizeAllocationDecider.java @@ -16,6 +16,9 @@ import org.elasticsearch.index.Index; import org.elasticsearch.index.shard.ShardId; +import java.util.Optional; +import java.util.Set; + /** * An allocation decider that ensures we allocate the shards of a target index for resize operations next to the source primaries */ @@ -74,4 +77,27 @@ public Decision canForceAllocatePrimary(ShardRouting shardRouting, RoutingNode n public Decision canForceAllocateDuringReplace(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { return canAllocate(shardRouting, node, allocation); } + + @Override + public Optional> getForcedInitialShardAllocationToNodes(ShardRouting shardRouting, RoutingAllocation 
allocation) { + if (shardRouting.unassignedInfo() != null && shardRouting.recoverySource().getType() == RecoverySource.Type.LOCAL_SHARDS) { + var targetIndexMetadata = allocation.metadata().getIndexSafe(shardRouting.index()); + var sourceIndexMetadata = allocation.metadata().index(targetIndexMetadata.getResizeSourceIndex()); + if (sourceIndexMetadata == null) { + return Optional.of(Set.of());// source index not found + } + if (targetIndexMetadata.getNumberOfShards() < sourceIndexMetadata.getNumberOfShards()) { + return Optional.empty(); + } + var shardId = targetIndexMetadata.getNumberOfShards() == sourceIndexMetadata.getNumberOfShards() + ? IndexMetadata.selectCloneShard(shardRouting.id(), sourceIndexMetadata, targetIndexMetadata.getNumberOfShards()) + : IndexMetadata.selectSplitShard(shardRouting.id(), sourceIndexMetadata, targetIndexMetadata.getNumberOfShards()); + var activePrimary = allocation.routingNodes().activePrimary(shardId); + if (activePrimary == null) { + return Optional.of(Set.of());// source primary is not active + } + return Optional.of(Set.of(activePrimary.currentNodeId())); + } + return super.getForcedInitialShardAllocationToNodes(shardRouting, allocation); + } } diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/SnapshotInProgressAllocationDecider.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/SnapshotInProgressAllocationDecider.java index 75099d067db57..b5d3276be4b9e 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/SnapshotInProgressAllocationDecider.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/SnapshotInProgressAllocationDecider.java @@ -50,6 +50,10 @@ public Decision canForceAllocateDuringReplace(ShardRouting shardRouting, Routing private static final Decision YES_NOT_SNAPSHOTTED = Decision.single(Decision.Type.YES, NAME, "the shard is not being snapshotted"); private static Decision canMove(ShardRouting 
shardRouting, RoutingAllocation allocation) { + if (allocation.isSimulating()) { + return allocation.decision(Decision.YES, NAME, "allocation is always enabled when simulating"); + } + if (shardRouting.primary() == false) { // Only primary shards are snapshotted return YES_NOT_SNAPSHOTTED; diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/ThrottlingAllocationDecider.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/ThrottlingAllocationDecider.java index e13abaf4811eb..3952330ba8299 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/ThrottlingAllocationDecider.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/ThrottlingAllocationDecider.java @@ -135,7 +135,9 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing primariesInRecovery++; } } - if (primariesInRecovery >= primariesInitialRecoveries) { + if (allocation.isSimulating()) { + return allocation.decision(Decision.YES, NAME, "primary allocation is not throttled when simulating"); + } else if (primariesInRecovery >= primariesInitialRecoveries) { // TODO: Should index creation not be throttled for primary shards? 
return allocation.decision( THROTTLE, diff --git a/server/src/main/java/org/elasticsearch/gateway/LocalAllocateDangledIndices.java b/server/src/main/java/org/elasticsearch/gateway/LocalAllocateDangledIndices.java index 5add10992882e..509d29f885b7f 100644 --- a/server/src/main/java/org/elasticsearch/gateway/LocalAllocateDangledIndices.java +++ b/server/src/main/java/org/elasticsearch/gateway/LocalAllocateDangledIndices.java @@ -13,6 +13,7 @@ import org.elasticsearch.Version; import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.ActionListenerResponseHandler; +import org.elasticsearch.action.support.ChannelActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ClusterStateUpdateTask; import org.elasticsearch.cluster.block.ClusterBlocks; @@ -22,6 +23,7 @@ import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.routing.RoutingTable; import org.elasticsearch.cluster.routing.allocation.AllocationService; +import org.elasticsearch.cluster.routing.allocation.allocator.AllocationActionListener; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.UUIDs; import org.elasticsearch.common.inject.Inject; @@ -52,11 +54,8 @@ public class LocalAllocateDangledIndices { public static final String ACTION_NAME = "internal:gateway/local/allocate_dangled"; private final TransportService transportService; - private final ClusterService clusterService; - private final AllocationService allocationService; - private final IndexMetadataVerifier indexMetadataVerifier; @Inject @@ -105,6 +104,12 @@ public void messageReceived(final AllocateDangledRequest request, final Transpor indexNames[i] = request.indices[i].getIndex().getName(); } final String source = "allocation dangled indices " + Arrays.toString(indexNames); + + var listener = new AllocationActionListener( + new ChannelActionListener<>(channel, task.getAction(), request), + 
transportService.getThreadPool().getThreadContext() + ); + submitUnbatchedTask(source, new ClusterStateUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { @@ -199,6 +204,7 @@ public ClusterState execute(ClusterState currentState) { sb.append("[").append(newIndexMetadata.getIndex()).append("/").append(newIndexMetadata.getState()).append("]"); } if (importNeeded == false) { + listener.reroute().onResponse(null); return currentState; } logger.info("importing dangled indices {} from [{}]", sb.toString(), request.fromNode); @@ -213,28 +219,19 @@ public ClusterState execute(ClusterState currentState) { // now, reroute return allocationService.reroute( ClusterState.builder(updatedState).routingTable(routingTable).build(), - "dangling indices allocated" + "dangling indices allocated", + listener.reroute() ); } @Override public void onFailure(Exception e) { - logger.error(() -> "unexpected failure during [" + source + "]", e); - try { - channel.sendResponse(e); - } catch (Exception inner) { - inner.addSuppressed(e); - logger.warn("failed send response for allocating dangled", inner); - } + listener.clusterStateUpdate().onFailure(e); } @Override public void clusterStateProcessed(ClusterState oldState, ClusterState newState) { - try { - channel.sendResponse(new AllocateDangledResponse()); - } catch (IOException e) { - logger.warn("failed send response for allocating dangled", e); - } + listener.clusterStateUpdate().onResponse(new AllocateDangledResponse()); } }); } diff --git a/server/src/main/java/org/elasticsearch/node/Node.java b/server/src/main/java/org/elasticsearch/node/Node.java index 93c8a927f7436..0cb75947dfc36 100644 --- a/server/src/main/java/org/elasticsearch/node/Node.java +++ b/server/src/main/java/org/elasticsearch/node/Node.java @@ -572,8 +572,9 @@ protected Node( clusterPlugins, clusterInfoService, snapshotsInfoService, - threadPool.getThreadContext(), - systemIndices + threadPool, + systemIndices, + rerouteServiceReference::get 
); modules.add(clusterModule); IndicesModule indicesModule = new IndicesModule(pluginsService.filterPlugins(MapperPlugin.class)); @@ -694,6 +695,7 @@ protected Node( ); final MetadataCreateDataStreamService metadataCreateDataStreamService = new MetadataCreateDataStreamService( + threadPool, clusterService, metadataCreateIndexService ); @@ -704,7 +706,8 @@ protected Node( clusterModule.getAllocationService(), settingsModule.getIndexScopedSettings(), indicesService, - shardLimitValidator + shardLimitValidator, + threadPool ); Collection pluginComponents = pluginsService.flatMap( @@ -868,7 +871,8 @@ protected Node( shardLimitValidator, systemIndices, indicesService, - fileSettingsService + fileSettingsService, + threadPool ); final DiskThresholdMonitor diskThresholdMonitor = new DiskThresholdMonitor( settings, diff --git a/server/src/main/java/org/elasticsearch/snapshots/RestoreService.java b/server/src/main/java/org/elasticsearch/snapshots/RestoreService.java index 7d9f859b6ce82..dae877937fc9f 100644 --- a/server/src/main/java/org/elasticsearch/snapshots/RestoreService.java +++ b/server/src/main/java/org/elasticsearch/snapshots/RestoreService.java @@ -44,6 +44,7 @@ import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.cluster.routing.UnassignedInfo; import org.elasticsearch.cluster.routing.allocation.AllocationService; +import org.elasticsearch.cluster.routing.allocation.allocator.AllocationActionListener; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.cluster.service.MasterService; import org.elasticsearch.common.Priority; @@ -78,6 +79,7 @@ import org.elasticsearch.repositories.RepositoryData; import org.elasticsearch.repositories.blobstore.BlobStoreRepository; import org.elasticsearch.reservedstate.service.FileSettingsService; +import org.elasticsearch.threadpool.ThreadPool; import java.io.IOException; import java.util.ArrayList; @@ -186,6 +188,8 @@ public class RestoreService implements 
ClusterStateApplier { private final FileSettingsService fileSettingsService; + private final ThreadPool threadPool; + private volatile boolean refreshRepositoryUuidOnRestore; public RestoreService( @@ -198,7 +202,8 @@ public RestoreService( ShardLimitValidator shardLimitValidator, SystemIndices systemIndices, IndicesService indicesService, - FileSettingsService fileSettingsService + FileSettingsService fileSettingsService, + ThreadPool threadPool ) { this.clusterService = clusterService; this.repositoriesService = repositoriesService; @@ -214,6 +219,7 @@ public RestoreService( this.systemIndices = systemIndices; this.indicesService = indicesService; this.fileSettingsService = fileSettingsService; + this.threadPool = threadPool; this.refreshRepositoryUuidOnRestore = REFRESH_REPO_UUID_ON_RESTORE_SETTING.get(clusterService.getSettings()); clusterService.getClusterSettings() .addSettingsUpdateConsumer(REFRESH_REPO_UUID_ON_RESTORE_SETTING, this::setRefreshRepositoryUuidOnRestore); @@ -1228,7 +1234,7 @@ private final class RestoreSnapshotStateTask extends ClusterStateUpdateTask { private final BiConsumer updater; - private final ActionListener listener; + private final AllocationActionListener listener; @Nullable private RestoreInfo restoreInfo; @@ -1253,7 +1259,7 @@ private final class RestoreSnapshotStateTask extends ClusterStateUpdateTask { this.metadata = metadata; this.dataStreamsToRestore = dataStreamsToRestore; this.updater = updater; - this.listener = listener; + this.listener = new AllocationActionListener<>(listener, threadPool.getThreadContext()); } @Override @@ -1412,7 +1418,7 @@ && isSystemIndex(snapshotIndexMetadata) == false) { if (searchableSnapshotsIndices.isEmpty() == false) { ensureSearchableSnapshotsRestorable(updatedClusterState, snapshotInfo, searchableSnapshotsIndices); } - return allocationService.reroute(updatedClusterState, "restored snapshot [" + snapshot + "]"); + return allocationService.reroute(updatedClusterState, "restored snapshot [" + 
snapshot + "]", listener.reroute()); } private void applyDataStreamRestores(ClusterState currentState, Metadata.Builder mdBuilder) { @@ -1574,7 +1580,7 @@ private void validateExistingClosedIndex( @Override public void onFailure(Exception e) { logger.warn(() -> "[" + snapshot + "] failed to restore snapshot", e); - listener.onFailure(e); + listener.clusterStateUpdate().onFailure(e); } @Override @@ -1585,7 +1591,7 @@ public void clusterStateProcessed(ClusterState oldState, ClusterState newState) snapshot, snapshotInfo.indices() ); - listener.onResponse(new RestoreCompletionResponse(restoreUUID, snapshot, restoreInfo)); + listener.clusterStateUpdate().onResponse(new RestoreCompletionResponse(restoreUUID, snapshot, restoreInfo)); } } diff --git a/server/src/test/java/org/elasticsearch/action/admin/cluster/reroute/ClusterRerouteTests.java b/server/src/test/java/org/elasticsearch/action/admin/cluster/reroute/ClusterRerouteTests.java index 50b2ef14fade8..19a699a5eaee3 100644 --- a/server/src/test/java/org/elasticsearch/action/admin/cluster/reroute/ClusterRerouteTests.java +++ b/server/src/test/java/org/elasticsearch/action/admin/cluster/reroute/ClusterRerouteTests.java @@ -31,6 +31,7 @@ import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.network.NetworkModule; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.concurrent.ThreadContext; import org.elasticsearch.core.TimeValue; import org.elasticsearch.snapshots.EmptySnapshotsInfoService; import org.elasticsearch.test.gateway.TestGatewayAllocator; @@ -88,6 +89,7 @@ public void testClusterStateUpdateTaskInDryRun() { var task = new TransportClusterRerouteAction.ClusterRerouteResponseAckedClusterStateUpdateTask( logger, allocationService, + new ThreadContext(Settings.EMPTY), request, responseActionListener ); @@ -112,6 +114,7 @@ public void testClusterStateUpdateTask() { var task = new 
TransportClusterRerouteAction.ClusterRerouteResponseAckedClusterStateUpdateTask( logger, allocationService, + new ThreadContext(Settings.EMPTY), req, ActionListener.noop() ); @@ -175,7 +178,7 @@ private ClusterState createInitialClusterState(AllocationService service) { .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); RoutingTable prevRoutingTable = routingTable; - routingTable = service.reroute(clusterState, "reroute").routingTable(); + routingTable = service.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); assertEquals(prevRoutingTable.index("idx").size(), 1); diff --git a/server/src/test/java/org/elasticsearch/action/admin/indices/rollover/TransportRolloverActionTests.java b/server/src/test/java/org/elasticsearch/action/admin/indices/rollover/TransportRolloverActionTests.java index 58413140f177c..346b58c31cfbf 100644 --- a/server/src/test/java/org/elasticsearch/action/admin/indices/rollover/TransportRolloverActionTests.java +++ b/server/src/test/java/org/elasticsearch/action/admin/indices/rollover/TransportRolloverActionTests.java @@ -347,7 +347,7 @@ public void testConditionEvaluationWhenAliasToWriteAndReadIndicesConsidersOnlyPr .metadata(Metadata.builder().put(indexMetadata).put(indexMetadata2)) .build(); - when(mockCreateIndexService.applyCreateIndexRequest(any(), any(), anyBoolean())).thenReturn(stateBefore); + when(mockCreateIndexService.applyCreateIndexRequest(any(), any(), anyBoolean(), any())).thenReturn(stateBefore); when(mdIndexAliasesService.applyAliasActions(any(), any())).thenReturn(stateBefore); MetadataRolloverService rolloverService = new MetadataRolloverService( mockThreadPool, diff --git a/server/src/test/java/org/elasticsearch/action/admin/indices/shrink/TransportResizeActionTests.java b/server/src/test/java/org/elasticsearch/action/admin/indices/shrink/TransportResizeActionTests.java index 
02f9349a0320d..132a9c08f208a 100644 --- a/server/src/test/java/org/elasticsearch/action/admin/indices/shrink/TransportResizeActionTests.java +++ b/server/src/test/java/org/elasticsearch/action/admin/indices/shrink/TransportResizeActionTests.java @@ -10,6 +10,7 @@ import org.apache.lucene.index.IndexWriter; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.admin.indices.create.CreateIndexClusterStateUpdateRequest; import org.elasticsearch.action.support.ActiveShardCount; import org.elasticsearch.cluster.ClusterName; @@ -137,7 +138,7 @@ public void testErrorCondition() { EmptySnapshotsInfoService.INSTANCE ); - RoutingTable routingTable = service.reroute(clusterState, "reroute").routingTable(); + RoutingTable routingTable = service.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); // now we start the shard routingTable = ESAllocationTestCase.startInitializingShardsAndReroute(service, clusterState, "source").routingTable(); @@ -166,7 +167,7 @@ public void testPassNumRoutingShards() { EmptySnapshotsInfoService.INSTANCE ); - RoutingTable routingTable = service.reroute(clusterState, "reroute").routingTable(); + RoutingTable routingTable = service.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); // now we start the shard routingTable = ESAllocationTestCase.startInitializingShardsAndReroute(service, clusterState, "source").routingTable(); @@ -208,7 +209,7 @@ public void testPassNumRoutingShardsAndFail() { EmptySnapshotsInfoService.INSTANCE ); - RoutingTable routingTable = service.reroute(clusterState, "reroute").routingTable(); + RoutingTable routingTable = service.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = 
ClusterState.builder(clusterState).routingTable(routingTable).build(); // now we start the shard routingTable = ESAllocationTestCase.startInitializingShardsAndReroute(service, clusterState, "source").routingTable(); @@ -255,7 +256,7 @@ public void testShrinkIndexSettings() { EmptySnapshotsInfoService.INSTANCE ); - RoutingTable routingTable = service.reroute(clusterState, "reroute").routingTable(); + RoutingTable routingTable = service.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); // now we start the shard routingTable = ESAllocationTestCase.startInitializingShardsAndReroute(service, clusterState, indexName).routingTable(); @@ -319,7 +320,7 @@ public void testShrinkWithMaxPrimaryShardSize() { EmptySnapshotsInfoService.INSTANCE ); - RoutingTable routingTable = service.reroute(clusterState, "reroute").routingTable(); + RoutingTable routingTable = service.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); // now we start the shard routingTable = ESAllocationTestCase.startInitializingShardsAndReroute(service, clusterState, "source").routingTable(); diff --git a/server/src/test/java/org/elasticsearch/cluster/ClusterModuleTests.java b/server/src/test/java/org/elasticsearch/cluster/ClusterModuleTests.java index 4c01ad34527b4..3ab20566ab74c 100644 --- a/server/src/test/java/org/elasticsearch/cluster/ClusterModuleTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/ClusterModuleTests.java @@ -8,6 +8,7 @@ package org.elasticsearch.cluster; +import org.elasticsearch.cluster.routing.RerouteService; import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.cluster.routing.allocation.ExistingShardsAllocator; import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; @@ -40,12 +41,15 @@ import 
org.elasticsearch.common.settings.Setting.Property; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.SettingsModule; -import org.elasticsearch.common.util.concurrent.ThreadContext; import org.elasticsearch.gateway.GatewayAllocator; import org.elasticsearch.indices.EmptySystemIndices; import org.elasticsearch.plugins.ClusterPlugin; import org.elasticsearch.tasks.TaskManager; import org.elasticsearch.test.gateway.TestGatewayAllocator; +import org.elasticsearch.threadpool.TestThreadPool; +import org.elasticsearch.threadpool.ThreadPool; +import org.junit.AfterClass; +import org.junit.BeforeClass; import java.util.Arrays; import java.util.Collection; @@ -53,17 +57,28 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.concurrent.TimeUnit; import java.util.function.Supplier; public class ClusterModuleTests extends ModuleTestCase { private ClusterInfoService clusterInfoService = EmptyClusterInfoService.INSTANCE; private ClusterService clusterService; - private ThreadContext threadContext; + private static ThreadPool threadPool; + + @BeforeClass + public static void createThreadPool() { + threadPool = new TestThreadPool("test"); + } + + @AfterClass + public static void terminateThreadPool() { + assertTrue(ThreadPool.terminate(threadPool, 10, TimeUnit.SECONDS)); + threadPool = null; + } @Override public void setUp() throws Exception { super.setUp(); - threadContext = new ThreadContext(Settings.EMPTY); clusterService = new ClusterService( Settings.EMPTY, new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS), @@ -140,7 +155,7 @@ public void testRegisterAllocationDeciderDuplicate() { public Collection createAllocationDeciders(Settings settings, ClusterSettings clusterSettings) { return Collections.singletonList(new EnableAllocationDecider(settings, clusterSettings)); } - }), clusterInfoService, null, threadContext, EmptySystemIndices.INSTANCE) + }), clusterInfoService, 
null, threadPool, EmptySystemIndices.INSTANCE, ClusterModuleTests::getFakeRerouteService) ); assertEquals(e.getMessage(), "Cannot specify allocation decider [" + EnableAllocationDecider.class.getName() + "] twice"); } @@ -151,7 +166,7 @@ public void testRegisterAllocationDecider() { public Collection createAllocationDeciders(Settings settings, ClusterSettings clusterSettings) { return Collections.singletonList(new FakeAllocationDecider()); } - }), clusterInfoService, null, threadContext, EmptySystemIndices.INSTANCE); + }), clusterInfoService, null, threadPool, EmptySystemIndices.INSTANCE, ClusterModuleTests::getFakeRerouteService); assertTrue(module.deciderList.stream().anyMatch(d -> d.getClass().equals(FakeAllocationDecider.class))); } @@ -161,7 +176,7 @@ private ClusterModule newClusterModuleWithShardsAllocator(Settings settings, Str public Map> getShardsAllocators(Settings settings, ClusterSettings clusterSettings) { return Collections.singletonMap(name, supplier); } - }), clusterInfoService, null, threadContext, EmptySystemIndices.INSTANCE); + }), clusterInfoService, null, threadPool, EmptySystemIndices.INSTANCE, ClusterModuleTests::getFakeRerouteService); } public void testRegisterShardsAllocator() { @@ -188,8 +203,9 @@ public void testUnknownShardsAllocator() { Collections.emptyList(), clusterInfoService, null, - threadContext, - EmptySystemIndices.INSTANCE + threadPool, + EmptySystemIndices.INSTANCE, + ClusterModuleTests::getFakeRerouteService ) ); assertEquals("Unknown ShardsAllocator [dne]", e.getMessage()); @@ -245,8 +261,9 @@ public void testRejectsReservedExistingShardsAllocatorName() { List.of(existingShardsAllocatorPlugin(GatewayAllocator.ALLOCATOR_NAME)), clusterInfoService, null, - threadContext, - EmptySystemIndices.INSTANCE + threadPool, + EmptySystemIndices.INSTANCE, + ClusterModuleTests::getFakeRerouteService ); expectThrows(IllegalArgumentException.class, () -> clusterModule.setExistingShardsAllocators(new TestGatewayAllocator())); } @@ -258,8 
+275,9 @@ public void testRejectsDuplicateExistingShardsAllocatorName() { List.of(existingShardsAllocatorPlugin("duplicate"), existingShardsAllocatorPlugin("duplicate")), clusterInfoService, null, - threadContext, - EmptySystemIndices.INSTANCE + threadPool, + EmptySystemIndices.INSTANCE, + ClusterModuleTests::getFakeRerouteService ); expectThrows(IllegalArgumentException.class, () -> clusterModule.setExistingShardsAllocators(new TestGatewayAllocator())); } @@ -273,4 +291,7 @@ public Map getExistingShardsAllocators() { }; } + private static RerouteService getFakeRerouteService() { + return (s, p, r) -> { throw new AssertionError("should not be called"); }; + } } diff --git a/server/src/test/java/org/elasticsearch/cluster/action/shard/ShardFailedClusterStateTaskExecutorTests.java b/server/src/test/java/org/elasticsearch/cluster/action/shard/ShardFailedClusterStateTaskExecutorTests.java index 72f4fa2474676..aa7f21825aaae 100644 --- a/server/src/test/java/org/elasticsearch/cluster/action/shard/ShardFailedClusterStateTaskExecutorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/action/shard/ShardFailedClusterStateTaskExecutorTests.java @@ -220,7 +220,7 @@ private ClusterState createClusterStateWithStartedShards(String reason) { DiscoveryNodes.Builder nodes = DiscoveryNodes.builder(); IntStream.rangeClosed(1, numberOfNodes).mapToObj(node -> newNode("node" + node)).forEach(nodes::add); ClusterState stateAfterAddingNode = ClusterState.builder(clusterState).nodes(nodes).build(); - ClusterState stateWithInitializingPrimary = allocationService.reroute(stateAfterAddingNode, reason); + ClusterState stateWithInitializingPrimary = allocationService.reroute(stateAfterAddingNode, reason, ActionListener.noop()); ClusterState stateWithStartedPrimary = startInitializingShardsAndReroute(allocationService, stateWithInitializingPrimary); final boolean secondReroute = randomBoolean(); ClusterState resultingState = secondReroute diff --git 
a/server/src/test/java/org/elasticsearch/cluster/metadata/MetadataCreateDataStreamServiceTests.java b/server/src/test/java/org/elasticsearch/cluster/metadata/MetadataCreateDataStreamServiceTests.java index 79c17f838711b..c8400ac9197d2 100644 --- a/server/src/test/java/org/elasticsearch/cluster/metadata/MetadataCreateDataStreamServiceTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/metadata/MetadataCreateDataStreamServiceTests.java @@ -9,6 +9,7 @@ import org.elasticsearch.ResourceAlreadyExistsException; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.admin.indices.create.CreateIndexClusterStateUpdateRequest; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; @@ -55,7 +56,12 @@ public void testCreateDataStream() throws Exception { .metadata(Metadata.builder().put("template", template).build()) .build(); CreateDataStreamClusterStateUpdateRequest req = new CreateDataStreamClusterStateUpdateRequest(dataStreamName); - ClusterState newState = MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req); + ClusterState newState = MetadataCreateDataStreamService.createDataStream( + metadataCreateIndexService, + cs, + req, + ActionListener.noop() + ); assertThat(newState.metadata().dataStreams().size(), equalTo(1)); assertThat(newState.metadata().dataStreams().get(dataStreamName).getName(), equalTo(dataStreamName)); assertThat(newState.metadata().dataStreams().get(dataStreamName).isSystem(), is(false)); @@ -86,7 +92,12 @@ public void testCreateDataStreamWithAliasFromTemplate() throws Exception { .metadata(Metadata.builder().put("template", template).build()) .build(); CreateDataStreamClusterStateUpdateRequest req = new CreateDataStreamClusterStateUpdateRequest(dataStreamName); - ClusterState newState = MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req); + ClusterState newState = 
MetadataCreateDataStreamService.createDataStream( + metadataCreateIndexService, + cs, + req, + ActionListener.noop() + ); assertThat(newState.metadata().dataStreams().size(), equalTo(1)); assertThat(newState.metadata().dataStreams().get(dataStreamName).getName(), equalTo(dataStreamName)); assertThat(newState.metadata().dataStreams().get(dataStreamName).isSystem(), is(false)); @@ -130,7 +141,6 @@ public void testCreateDataStreamWithAliasFromComponentTemplate() throws Exceptio List ctNames = new ArrayList<>(); List> allAliases = new ArrayList<>(); var metadataBuilder = Metadata.builder(); - final List componentTemplates = new ArrayList<>(componentTemplateCount); for (int k = 0; k < componentTemplateCount; k++) { final String ctName = randomAlphaOfLength(5); ctNames.add(ctName); @@ -156,7 +166,12 @@ public void testCreateDataStreamWithAliasFromComponentTemplate() throws Exceptio .metadata(metadataBuilder.put("template", template).build()) .build(); CreateDataStreamClusterStateUpdateRequest req = new CreateDataStreamClusterStateUpdateRequest(dataStreamName); - ClusterState newState = MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req); + ClusterState newState = MetadataCreateDataStreamService.createDataStream( + metadataCreateIndexService, + cs, + req, + ActionListener.noop() + ); assertThat(newState.metadata().dataStreams().size(), equalTo(1)); assertThat(newState.metadata().dataStreams().get(dataStreamName).getName(), equalTo(dataStreamName)); assertThat(newState.metadata().dataStreams().get(dataStreamName).isSystem(), is(false)); @@ -203,7 +218,12 @@ public void testCreateSystemDataStream() throws Exception { TimeValue.ZERO, true ); - ClusterState newState = MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req); + ClusterState newState = MetadataCreateDataStreamService.createDataStream( + metadataCreateIndexService, + cs, + req, + ActionListener.noop() + ); 
assertThat(newState.metadata().dataStreams().size(), equalTo(1)); assertThat(newState.metadata().dataStreams().get(dataStreamName).getName(), equalTo(dataStreamName)); assertThat(newState.metadata().dataStreams().get(dataStreamName).isSystem(), is(true)); @@ -229,7 +249,7 @@ public void testCreateDuplicateDataStream() throws Exception { ResourceAlreadyExistsException e = expectThrows( ResourceAlreadyExistsException.class, - () -> MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req) + () -> MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req, ActionListener.noop()) ); assertThat(e.getMessage(), containsString("data_stream [" + dataStreamName + "] already exists")); } @@ -241,7 +261,7 @@ public void testCreateDataStreamWithInvalidName() throws Exception { CreateDataStreamClusterStateUpdateRequest req = new CreateDataStreamClusterStateUpdateRequest(dataStreamName); IllegalArgumentException e = expectThrows( IllegalArgumentException.class, - () -> MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req) + () -> MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req, ActionListener.noop()) ); assertThat(e.getMessage(), containsString("must not contain the following characters")); } @@ -253,7 +273,7 @@ public void testCreateDataStreamWithUppercaseCharacters() throws Exception { CreateDataStreamClusterStateUpdateRequest req = new CreateDataStreamClusterStateUpdateRequest(dataStreamName); IllegalArgumentException e = expectThrows( IllegalArgumentException.class, - () -> MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req) + () -> MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req, ActionListener.noop()) ); assertThat(e.getMessage(), containsString("data_stream [" + dataStreamName + "] must be lowercase")); } @@ -265,7 +285,7 @@ public void testCreateDataStreamStartingWithPeriod() 
throws Exception { CreateDataStreamClusterStateUpdateRequest req = new CreateDataStreamClusterStateUpdateRequest(dataStreamName); IllegalArgumentException e = expectThrows( IllegalArgumentException.class, - () -> MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req) + () -> MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req, ActionListener.noop()) ); assertThat(e.getMessage(), containsString("data_stream [" + dataStreamName + "] must not start with '.ds-'")); } @@ -277,7 +297,7 @@ public void testCreateDataStreamNoTemplate() throws Exception { CreateDataStreamClusterStateUpdateRequest req = new CreateDataStreamClusterStateUpdateRequest(dataStreamName); Exception e = expectThrows( IllegalArgumentException.class, - () -> MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req) + () -> MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req, ActionListener.noop()) ); assertThat(e.getMessage(), equalTo("no matching index template found for data stream [my-data-stream]")); } @@ -292,7 +312,7 @@ public void testCreateDataStreamNoValidTemplate() throws Exception { CreateDataStreamClusterStateUpdateRequest req = new CreateDataStreamClusterStateUpdateRequest(dataStreamName); Exception e = expectThrows( IllegalArgumentException.class, - () -> MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req) + () -> MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req, ActionListener.noop()) ); assertThat( e.getMessage(), @@ -309,14 +329,14 @@ public static ClusterState createDataStream(final String dataStreamName) throws .metadata(Metadata.builder().put("template", template).build()) .build(); CreateDataStreamClusterStateUpdateRequest req = new CreateDataStreamClusterStateUpdateRequest(dataStreamName); - return MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req); + 
return MetadataCreateDataStreamService.createDataStream(metadataCreateIndexService, cs, req, ActionListener.noop()); } private static MetadataCreateIndexService getMetadataCreateIndexService() throws Exception { MetadataCreateIndexService s = mock(MetadataCreateIndexService.class); when(s.getSystemIndices()).thenReturn(getSystemIndices()); - when(s.applyCreateIndexRequest(any(ClusterState.class), any(CreateIndexClusterStateUpdateRequest.class), anyBoolean())).thenAnswer( - mockInvocation -> { + when(s.applyCreateIndexRequest(any(ClusterState.class), any(CreateIndexClusterStateUpdateRequest.class), anyBoolean(), any())) + .thenAnswer(mockInvocation -> { ClusterState currentState = (ClusterState) mockInvocation.getArguments()[0]; CreateIndexClusterStateUpdateRequest request = (CreateIndexClusterStateUpdateRequest) mockInvocation.getArguments()[1]; @@ -337,8 +357,7 @@ private static MetadataCreateIndexService getMetadataCreateIndexService() throws false ); return ClusterState.builder(currentState).metadata(b.build()).build(); - } - ); + }); return s; } diff --git a/server/src/test/java/org/elasticsearch/cluster/metadata/MetadataCreateIndexServiceTests.java b/server/src/test/java/org/elasticsearch/cluster/metadata/MetadataCreateIndexServiceTests.java index 7fb08c86192ad..e9dc4a6f6f495 100644 --- a/server/src/test/java/org/elasticsearch/cluster/metadata/MetadataCreateIndexServiceTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/metadata/MetadataCreateIndexServiceTests.java @@ -11,6 +11,7 @@ import org.elasticsearch.ExceptionsHelper; import org.elasticsearch.ResourceAlreadyExistsException; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.admin.indices.alias.Alias; import org.elasticsearch.action.admin.indices.create.CreateIndexClusterStateUpdateRequest; import org.elasticsearch.action.admin.indices.shrink.ResizeType; @@ -71,9 +72,7 @@ import java.util.Optional; import java.util.Set; import 
java.util.UUID; -import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.BiConsumer; -import java.util.function.BiFunction; import java.util.function.Consumer; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -271,7 +270,7 @@ public void testValidateShrinkIndex() { EmptySnapshotsInfoService.INSTANCE ); - RoutingTable routingTable = service.reroute(clusterState, "reroute").routingTable(); + RoutingTable routingTable = service.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); // now we start the shard routingTable = ESAllocationTestCase.startInitializingShardsAndReroute(service, clusterState, "source").routingTable(); @@ -373,7 +372,7 @@ public void testValidateSplitIndex() { EmptySnapshotsInfoService.INSTANCE ); - RoutingTable routingTable = service.reroute(clusterState, "reroute").routingTable(); + RoutingTable routingTable = service.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); // now we start the shard routingTable = ESAllocationTestCase.startInitializingShardsAndReroute(service, clusterState, "source").routingTable(); @@ -521,7 +520,7 @@ private void runPrepareResizeIndexSettingsTest( EmptySnapshotsInfoService.INSTANCE ); - final RoutingTable initialRoutingTable = service.reroute(initialClusterState, "reroute").routingTable(); + final RoutingTable initialRoutingTable = service.reroute(initialClusterState, "reroute", ActionListener.noop()).routingTable(); final ClusterState routingTableClusterState = ClusterState.builder(initialClusterState).routingTable(initialRoutingTable).build(); // now we start the shard @@ -986,10 +985,8 @@ public void testClusterStateCreateIndexThrowsWriteIndexValidationException() thr .build(); assertThat( - expectThrows( - IllegalStateException.class, - () -> 
clusterStateCreateIndex(currentClusterState, Set.of(), newIndex, (state, reason) -> state, null) - ).getMessage(), + expectThrows(IllegalStateException.class, () -> clusterStateCreateIndex(currentClusterState, Set.of(), newIndex, null)) + .getMessage(), startsWith("alias [alias1] has more than one write index [") ); } @@ -1004,23 +1001,14 @@ public void testClusterStateCreateIndex() { .putAlias(AliasMetadata.builder("alias1").writeIndex(true).build()) .build(); - // used as a value container, not for the concurrency and visibility guarantees - AtomicBoolean allocationRerouted = new AtomicBoolean(false); - BiFunction rerouteRoutingTable = (clusterState, reason) -> { - allocationRerouted.compareAndSet(false, true); - return clusterState; - }; - ClusterState updatedClusterState = clusterStateCreateIndex( currentClusterState, Set.of(INDEX_READ_ONLY_BLOCK), newIndexMetadata, - rerouteRoutingTable, null ); assertThat(updatedClusterState.blocks().getIndexBlockWithId("test", INDEX_READ_ONLY_BLOCK.id()), is(INDEX_READ_ONLY_BLOCK)); assertThat(updatedClusterState.routingTable().index("test"), is(notNullValue())); - assertThat(allocationRerouted.get(), is(true)); Metadata metadata = updatedClusterState.metadata(); IndexAbstraction alias = metadata.getIndicesLookup().get("alias1"); @@ -1062,7 +1050,6 @@ public void testClusterStateCreateIndexWithMetadataTransaction() { currentClusterState, Set.of(INDEX_READ_ONLY_BLOCK), newIndexMetadata, - (clusterState, y) -> clusterState, metadataTransformer ); assertTrue(updatedClusterState.metadata().findAllAliases(new String[] { "my-index" }).containsKey("my-index")); diff --git a/server/src/test/java/org/elasticsearch/cluster/metadata/MetadataDeleteIndexServiceTests.java b/server/src/test/java/org/elasticsearch/cluster/metadata/MetadataDeleteIndexServiceTests.java index e39e9b743022c..e26a970d3e1a7 100644 --- a/server/src/test/java/org/elasticsearch/cluster/metadata/MetadataDeleteIndexServiceTests.java +++ 
b/server/src/test/java/org/elasticsearch/cluster/metadata/MetadataDeleteIndexServiceTests.java @@ -43,6 +43,7 @@ import static org.hamcrest.Matchers.notNullValue; import static org.hamcrest.Matchers.nullValue; import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; @@ -56,7 +57,7 @@ public class MetadataDeleteIndexServiceTests extends ESTestCase { public void setUp() throws Exception { super.setUp(); allocationService = mock(AllocationService.class); - when(allocationService.reroute(any(ClusterState.class), any(String.class))).thenAnswer( + when(allocationService.reroute(any(ClusterState.class), any(String.class), any())).thenAnswer( mockInvocation -> mockInvocation.getArguments()[0] ); service = new MetadataDeleteIndexService(Settings.EMPTY, null, allocationService); @@ -109,7 +110,7 @@ public void testDeleteUnassigned() throws Exception { ClusterState before = clusterState(index); // Mock the built reroute - when(allocationService.reroute(any(ClusterState.class), any(String.class))).then(i -> i.getArguments()[0]); + when(allocationService.reroute(any(ClusterState.class), anyString(), any())).then(i -> i.getArguments()[0]); // Remove it final ClusterState after = ClusterStateTaskExecutorUtils.executeAndAssertSuccessful( @@ -128,7 +129,7 @@ public void testDeleteUnassigned() throws Exception { assertNull(after.blocks().indices().get(index)); // Make sure we actually attempted to reroute - verify(allocationService).reroute(any(ClusterState.class), any(String.class)); + verify(allocationService).reroute(any(ClusterState.class), any(String.class), any()); } public void testDeleteIndexWithAnAlias() { diff --git a/server/src/test/java/org/elasticsearch/cluster/metadata/MetadataMigrateToDataStreamServiceTests.java 
b/server/src/test/java/org/elasticsearch/cluster/metadata/MetadataMigrateToDataStreamServiceTests.java index 2dc01e85a5844..49beb3ed58011 100644 --- a/server/src/test/java/org/elasticsearch/cluster/metadata/MetadataMigrateToDataStreamServiceTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/metadata/MetadataMigrateToDataStreamServiceTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.metadata; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.common.Strings; @@ -299,7 +300,8 @@ public void testCreateDataStreamWithSuppliedWriteIndex() throws Exception { TimeValue.ZERO, TimeValue.ZERO ), - getMetadataCreateIndexService() + getMetadataCreateIndexService(), + ActionListener.noop() ); IndexAbstraction ds = newState.metadata().getIndicesLookup().get(dataStreamName); assertThat(ds, notNullValue()); @@ -360,7 +362,8 @@ public void testCreateDataStreamHidesBackingIndicesAndRemovesAlias() throws Exce TimeValue.ZERO, TimeValue.ZERO ), - getMetadataCreateIndexService() + getMetadataCreateIndexService(), + ActionListener.noop() ); IndexAbstraction ds = newState.metadata().getIndicesLookup().get(dataStreamName); assertThat(ds, notNullValue()); @@ -423,7 +426,8 @@ public void testCreateDataStreamWithoutSuppliedWriteIndex() { TimeValue.ZERO, TimeValue.ZERO ), - getMetadataCreateIndexService() + getMetadataCreateIndexService(), + ActionListener.noop() ) ); assertThat(e.getMessage(), containsString("alias [" + dataStreamName + "] must specify a write index")); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/BatchedRerouteServiceTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/BatchedRerouteServiceTests.java index c4540bb45ccc9..45bf533c4bddb 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/BatchedRerouteServiceTests.java +++ 
b/server/src/test/java/org/elasticsearch/cluster/routing/BatchedRerouteServiceTests.java @@ -57,8 +57,9 @@ public void afterTest() { public void testReroutesWhenRequested() throws InterruptedException { final AtomicLong rerouteCount = new AtomicLong(); - final BatchedRerouteService batchedRerouteService = new BatchedRerouteService(clusterService, (s, r) -> { + final BatchedRerouteService batchedRerouteService = new BatchedRerouteService(clusterService, (s, r, l) -> { rerouteCount.incrementAndGet(); + l.onResponse(null); return s; }); @@ -98,8 +99,9 @@ public void onFailure(Exception e) { cyclicBarrier.await(); // wait for master thread to be blocked final AtomicBoolean rerouteExecuted = new AtomicBoolean(); - final BatchedRerouteService batchedRerouteService = new BatchedRerouteService(clusterService, (s, r) -> { + final BatchedRerouteService batchedRerouteService = new BatchedRerouteService(clusterService, (s, r, l) -> { assertTrue(rerouteExecuted.compareAndSet(false, true)); // only called once + l.onResponse(null); return s; }); @@ -188,27 +190,26 @@ public void clusterStateProcessed(ClusterState oldState, ClusterState newState) public void testNotifiesOnFailure() throws InterruptedException { - final BatchedRerouteService batchedRerouteService = new BatchedRerouteService(clusterService, (s, r) -> { + final BatchedRerouteService batchedRerouteService = new BatchedRerouteService(clusterService, (s, r, l) -> { if (rarely()) { throw new ElasticsearchException("simulated"); } + l.onResponse(null); return randomBoolean() ? 
s : ClusterState.builder(s).build(); }); final int iterations = between(1, 100); final CountDownLatch countDownLatch = new CountDownLatch(iterations); for (int i = 0; i < iterations; i++) { - batchedRerouteService.reroute("iteration " + i, randomFrom(EnumSet.allOf(Priority.class)), ActionListener.wrap(r -> { - countDownLatch.countDown(); - if (rarely()) { - throw new ElasticsearchException("failure during notification"); - } - }, e -> { - countDownLatch.countDown(); - if (randomBoolean()) { - throw new ElasticsearchException("failure during failure notification", e); - } - })); + batchedRerouteService.reroute( + "iteration " + i, + randomFrom(EnumSet.allOf(Priority.class)), + ActionListener.runAfter(ActionListener.wrap(r -> { + if (rarely()) { + throw new ElasticsearchException("failure during notification"); + } + }, e -> {}), countDownLatch::countDown) + ); if (rarely()) { clusterService.getMasterService() .setClusterStatePublisher( diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/DelayedAllocationServiceTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/DelayedAllocationServiceTests.java index eff3977ce560c..2e7a60fcdb843 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/DelayedAllocationServiceTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/DelayedAllocationServiceTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.routing; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterChangedEvent; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; @@ -86,7 +87,7 @@ public void testNoDelayedUnassigned() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")).localNodeId("node1").masterNodeId("node1")) .build(); - clusterState = allocationService.reroute(clusterState, "reroute"); + clusterState = 
allocationService.reroute(clusterState, "reroute", ActionListener.noop()); // starting primaries clusterState = startInitializingShardsAndReroute(allocationService, clusterState); // starting replicas @@ -141,7 +142,7 @@ public void testDelayedUnassignedScheduleReroute() throws Exception { .build(); final long baseTimestampNanos = System.nanoTime(); allocationService.setNanoTimeOverride(baseTimestampNanos); - clusterState = allocationService.reroute(clusterState, "reroute"); + clusterState = allocationService.reroute(clusterState, "reroute", ActionListener.noop()); // starting primaries clusterState = startInitializingShardsAndReroute(allocationService, clusterState); // starting replicas @@ -255,7 +256,7 @@ public void testDelayedUnassignedScheduleRerouteAfterDelayedReroute() throws Exc ) .build(); // allocate shards - clusterState = allocationService.reroute(clusterState, "reroute"); + clusterState = allocationService.reroute(clusterState, "reroute", ActionListener.noop()); // start primaries clusterState = startInitializingShardsAndReroute(allocationService, clusterState); // start replicas @@ -442,7 +443,7 @@ public void testDelayedUnassignedScheduleRerouteRescheduledOnShorterDelay() { .build(); final long nodeLeftTimestampNanos = System.nanoTime(); allocationService.setNanoTimeOverride(nodeLeftTimestampNanos); - clusterState = allocationService.reroute(clusterState, "reroute"); + clusterState = allocationService.reroute(clusterState, "reroute", ActionListener.noop()); // starting primaries clusterState = startInitializingShardsAndReroute(allocationService, clusterState); // starting replicas diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/PrimaryTermsTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/PrimaryTermsTests.java index cc5190b993703..1e5e40b16354f 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/PrimaryTermsTests.java +++ 
b/server/src/test/java/org/elasticsearch/cluster/routing/PrimaryTermsTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.routing; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.health.ClusterStateHealth; @@ -83,7 +84,7 @@ private void initPrimaries() { discoBuilder = discoBuilder.add(newNode("node" + i)); } this.clusterState = ClusterState.builder(clusterState).nodes(discoBuilder).build(); - ClusterState rerouteResult = allocationService.reroute(clusterState, "reroute"); + ClusterState rerouteResult = allocationService.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(rerouteResult, not(equalTo(this.clusterState))); applyRerouteResult(rerouteResult); primaryTermsPerIndex.keySet().forEach(this::incrementPrimaryTerm); @@ -154,7 +155,7 @@ private void addNodes() { nodesBuilder.add(newNode("extra_" + i)); } this.clusterState = ClusterState.builder(clusterState).nodes(nodesBuilder).build(); - applyRerouteResult(allocationService.reroute(this.clusterState, "nodes added")); + applyRerouteResult(allocationService.reroute(this.clusterState, "nodes added", ActionListener.noop())); } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/RoutingTableTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/RoutingTableTests.java index bb5efaca795cd..d3855023e6165 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/RoutingTableTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/RoutingTableTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.routing; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.Diff; import org.elasticsearch.cluster.ESAllocationTestCase; @@ -95,7 +96,7 @@ private void initPrimaries() 
{ discoBuilder = discoBuilder.add(newNode("node" + i)); } this.clusterState = ClusterState.builder(clusterState).nodes(discoBuilder).build(); - ClusterState rerouteResult = ALLOCATION_SERVICE.reroute(clusterState, "reroute"); + ClusterState rerouteResult = ALLOCATION_SERVICE.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(rerouteResult, not(equalTo(this.clusterState))); this.clusterState = rerouteResult; } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/UnassignedInfoTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/UnassignedInfoTests.java index 91f3d7edd7615..04757b2eb032b 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/UnassignedInfoTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/UnassignedInfoTests.java @@ -11,6 +11,7 @@ import com.carrotsearch.randomizedtesting.generators.RandomPicks; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; @@ -280,7 +281,7 @@ public void testReplicaAdded() { .routingTable(RoutingTable.builder().addAsNew(metadata.index(index)).build()) .build(); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); // starting primaries clusterState = startInitializingShardsAndReroute(allocation, clusterState); IndexRoutingTable.Builder builder = IndexRoutingTable.builder(index); @@ -337,7 +338,7 @@ public void testNodeLeave() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = 
allocation.reroute(clusterState, "reroute", ActionListener.noop()); // starting primaries clusterState = startInitializingShardsAndReroute(allocation, clusterState); // starting replicas @@ -375,7 +376,7 @@ public void testFailedShard() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); // starting primaries clusterState = startInitializingShardsAndReroute(allocation, clusterState); // starting replicas @@ -586,7 +587,7 @@ public void testNumberOfDelayedUnassigned() throws Exception { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(UnassignedInfo.getNumberOfDelayedUnassigned(clusterState), equalTo(0)); // starting primaries clusterState = startInitializingShardsAndReroute(allocation, clusterState); @@ -627,7 +628,7 @@ public void testFindNextDelayedAllocation() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(UnassignedInfo.getNumberOfDelayedUnassigned(clusterState), equalTo(0)); // starting primaries clusterState = startInitializingShardsAndReroute(allocation, clusterState); @@ -644,7 +645,7 @@ public void testFindNextDelayedAllocation() { if (delta > 0) { allocation.setNanoTimeOverride(baseTime + delta); - clusterState = allocation.reroute(clusterState, "time moved"); + clusterState = allocation.reroute(clusterState, "time moved", 
ActionListener.noop()); } assertThat(UnassignedInfo.findNextDelayedAllocation(baseTime + delta, clusterState), equalTo(expectMinDelaySettingsNanos - delta)); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AddIncrementallyTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AddIncrementallyTests.java index d89c4c24fc97b..f2186b83564e5 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AddIncrementallyTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AddIncrementallyTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -106,7 +107,7 @@ public void testMinimalRelocations() { nodes.add(newNode("node2")); clusterState = ClusterState.builder(clusterState).nodes(nodes.build()).build(); - clusterState = service.reroute(clusterState, "reroute"); + clusterState = service.reroute(clusterState, "reroute", ActionListener.noop()); RoutingNodes routingNodes = clusterState.getRoutingNodes(); assertThat(routingNodes.node("node2").shardsWithState(INITIALIZING).size(), equalTo(2)); @@ -171,7 +172,7 @@ public void testMinimalRelocationsNoLimit() { nodes.add(newNode("node2")); clusterState = ClusterState.builder(clusterState).nodes(nodes.build()).build(); - clusterState = service.reroute(clusterState, "reroute"); + clusterState = service.reroute(clusterState, "reroute", ActionListener.noop()); RoutingNodes routingNodes = clusterState.getRoutingNodes(); assertThat(routingNodes.node("node2").shardsWithState(INITIALIZING).size(), equalTo(2)); @@ -242,7 +243,7 @@ private ClusterState addNodes(ClusterState clusterState, AllocationService servi clusterState = 
ClusterState.builder(clusterState).nodes(nodes.build()).build(); - clusterState = service.reroute(clusterState, "reroute"); + clusterState = service.reroute(clusterState, "reroute", ActionListener.noop()); // move initializing to started return applyStartedShardsUntilNoChange(clusterState, service); @@ -282,7 +283,7 @@ private ClusterState initCluster( ClusterState clusterState = ClusterState.builder( org.elasticsearch.cluster.ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY) ).nodes(nodes).metadata(metadata).routingTable(initialRoutingTable).build(); - clusterState = service.reroute(clusterState, "reroute"); + clusterState = service.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("restart all the primary shards, replicas will start initializing"); clusterState = startInitializingShardsAndReroute(service, clusterState); @@ -314,7 +315,7 @@ private ClusterState addIndex( Metadata metadata = metadataBuilder.build(); clusterState = ClusterState.builder(clusterState).metadata(metadata).routingTable(routingTableBuilder.build()).build(); - clusterState = service.reroute(clusterState, "reroute"); + clusterState = service.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("restart all the primary shards, replicas will start initializing"); clusterState = startInitializingShardsAndReroute(service, clusterState); @@ -349,7 +350,7 @@ private ClusterState removeNodes(ClusterState clusterState, AllocationService se clusterState = startInitializingShardsAndReroute(service, clusterState); logger.info("rebalancing"); - clusterState = service.reroute(clusterState, "reroute"); + clusterState = service.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("complete rebalancing"); clusterState = applyStartedShardsUntilNoChange(clusterState, service); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AllocationCommandsTests.java 
b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AllocationCommandsTests.java index d73df1ce0aa3e..80fe61f1abf32 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AllocationCommandsTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AllocationCommandsTests.java @@ -11,7 +11,9 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterInfo; +import org.elasticsearch.cluster.ClusterModule; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; @@ -56,6 +58,8 @@ import static java.util.Collections.emptyMap; import static java.util.Collections.singleton; +import static org.elasticsearch.cluster.ClusterModule.BALANCED_ALLOCATOR; +import static org.elasticsearch.cluster.ClusterModule.DESIRED_BALANCE_ALLOCATOR; import static org.elasticsearch.cluster.routing.RoutingNodesHelper.shardsWithState; import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING; import static org.elasticsearch.cluster.routing.ShardRoutingState.RELOCATING; @@ -72,7 +76,10 @@ public class AllocationCommandsTests extends ESAllocationTestCase { public void testMoveShardCommand() { AllocationService allocation = createAllocationService( - Settings.builder().put("cluster.routing.allocation.node_concurrent_recoveries", 10).build() + Settings.builder() + .put("cluster.routing.allocation.node_concurrent_recoveries", 10) + .put(ClusterModule.SHARDS_ALLOCATOR_TYPE_SETTING.getKey(), randomShardsAllocator()) + .build() ); logger.info("creating an index with 1 shard, no replica"); @@ -88,7 +95,7 @@ public void testMoveShardCommand() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - 
clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("start primary shard"); clusterState = startInitializingShardsAndReroute(allocation, clusterState); @@ -105,7 +112,9 @@ public void testMoveShardCommand() { clusterState, new AllocationCommands(new MoveAllocationCommand("test", 0, existingNodeId, toNodeId)), false, - false + false, + false, + ActionListener.noop() ).clusterState(); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -132,6 +141,7 @@ public void testAllocateCommand() { Settings.builder() .put(EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE_SETTING.getKey(), "none") .put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), "none") + .put(ClusterModule.SHARDS_ALLOCATOR_TYPE_SETTING.getKey(), randomShardsAllocator()) .build() ); final String index = "test"; @@ -165,12 +175,19 @@ public void testAllocateCommand() { .add(newNode("node4", singleton(DiscoveryNodeRole.MASTER_ROLE))) ) .build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(0)); logger.info("--> allocating to non-existent node, should fail"); try { - allocation.reroute(clusterState, new AllocationCommands(randomAllocateCommand(index, shardId.id(), "node42")), false, false); + allocation.reroute( + clusterState, + new AllocationCommands(randomAllocateCommand(index, shardId.id(), "node42")), + false, + false, + false, + ActionListener.noop() + ); fail("expected IllegalArgumentException when allocating to non-existing node"); } catch (IllegalArgumentException e) { assertThat(e.getMessage(), containsString("failed to resolve [node42], no matching nodes")); @@ -178,7 +195,14 @@ public void testAllocateCommand() { logger.info("--> allocating to 
non-data node, should fail"); try { - allocation.reroute(clusterState, new AllocationCommands(randomAllocateCommand(index, shardId.id(), "node4")), false, false); + allocation.reroute( + clusterState, + new AllocationCommands(randomAllocateCommand(index, shardId.id(), "node4")), + false, + false, + false, + ActionListener.noop() + ); fail("expected IllegalArgumentException when allocating to non-data node"); } catch (IllegalArgumentException e) { assertThat(e.getMessage(), containsString("allocation can only be done on data nodes")); @@ -186,7 +210,14 @@ public void testAllocateCommand() { logger.info("--> allocating non-existing shard, should fail"); try { - allocation.reroute(clusterState, new AllocationCommands(randomAllocateCommand("test", 1, "node2")), false, false); + allocation.reroute( + clusterState, + new AllocationCommands(randomAllocateCommand("test", 1, "node2")), + false, + false, + false, + ActionListener.noop() + ); fail("expected ShardNotFoundException when allocating non-existing shard"); } catch (ShardNotFoundException e) { assertThat(e.getMessage(), containsString("no such shard")); @@ -194,7 +225,14 @@ public void testAllocateCommand() { logger.info("--> allocating non-existing index, should fail"); try { - allocation.reroute(clusterState, new AllocationCommands(randomAllocateCommand("test2", 0, "node2")), false, false); + allocation.reroute( + clusterState, + new AllocationCommands(randomAllocateCommand("test2", 0, "node2")), + false, + false, + false, + ActionListener.noop() + ); fail("expected ShardNotFoundException when allocating non-existing index"); } catch (IndexNotFoundException e) { assertThat(e.getMessage(), containsString("no such index [test2]")); @@ -206,7 +244,9 @@ public void testAllocateCommand() { clusterState, new AllocationCommands(new AllocateEmptyPrimaryAllocationCommand("test", 0, "node1", false)), false, - false + false, + false, + ActionListener.noop() ); fail("expected IllegalArgumentException when allocating empty 
primary with acceptDataLoss flag set to false"); } catch (IllegalArgumentException e) { @@ -226,7 +266,9 @@ public void testAllocateCommand() { clusterState, new AllocationCommands(new AllocateStalePrimaryAllocationCommand(index, shardId.id(), "node1", false)), false, - false + false, + false, + ActionListener.noop() ); fail("expected IllegalArgumentException when allocating stale primary with acceptDataLoss flag set to false"); } catch (IllegalArgumentException e) { @@ -245,7 +287,9 @@ public void testAllocateCommand() { clusterState, new AllocationCommands(new AllocateEmptyPrimaryAllocationCommand("test", 0, "node1", true)), false, - false + false, + false, + ActionListener.noop() ).clusterState(); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -265,7 +309,9 @@ public void testAllocateCommand() { clusterState, new AllocationCommands(new AllocateReplicaAllocationCommand("test", 0, "node1")), false, - false + false, + false, + ActionListener.noop() ); fail("expected IllegalArgumentException when allocating replica shard on the primary shard node"); } catch (IllegalArgumentException e) {} @@ -275,7 +321,9 @@ public void testAllocateCommand() { clusterState, new AllocationCommands(new AllocateReplicaAllocationCommand("test", 0, "node2")), false, - false + false, + false, + ActionListener.noop() ).clusterState(); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -293,7 +341,14 @@ public void testAllocateCommand() { logger.info("--> verify that we fail when there are no unassigned shards"); try { - allocation.reroute(clusterState, new AllocationCommands(randomAllocateCommand("test", 0, "node3")), false, false); + allocation.reroute( + clusterState, + new AllocationCommands(randomAllocateCommand("test", 0, "node3")), + false, + false, + false, + ActionListener.noop() + ); fail("expected IllegalArgumentException when allocating shard while no unassigned shard available"); } catch (IllegalArgumentException e) {} } 
@@ -303,6 +358,7 @@ public void testAllocateStalePrimaryCommand() { Settings.builder() .put(EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE_SETTING.getKey(), "none") .put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), "none") + .put(ClusterModule.SHARDS_ALLOCATOR_TYPE_SETTING.getKey(), randomShardsAllocator()) .build() ); final String index = "test"; @@ -329,7 +385,7 @@ public void testAllocateStalePrimaryCommand() { final String node1 = "node1"; final String node2 = "node2"; clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode(node1)).add(newNode(node2))).build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); // mark all shards as stale final List shardRoutings = shardsWithState(clusterState.getRoutingNodes(), UNASSIGNED); @@ -340,7 +396,9 @@ public void testAllocateStalePrimaryCommand() { clusterState, new AllocationCommands(new AllocateStalePrimaryAllocationCommand(index, 0, node1, true)), false, - false + false, + false, + ActionListener.noop() ).clusterState(); RoutingNode routingNode1 = clusterState.getRoutingNodes().node(node1); assertThat(routingNode1.size(), equalTo(1)); @@ -362,6 +420,7 @@ public void testCancelCommand() { Settings.builder() .put(EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE_SETTING.getKey(), "none") .put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), "none") + .put(ClusterModule.SHARDS_ALLOCATOR_TYPE_SETTING.getKey(), randomShardsAllocator()) .build() ); @@ -378,7 +437,7 @@ public void testCancelCommand() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")).add(newNode("node3"))) .build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); 
assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(0)); logger.info("--> allocating empty primary shard with accept_data_loss flag set to true"); @@ -386,7 +445,9 @@ public void testCancelCommand() { clusterState, new AllocationCommands(new AllocateEmptyPrimaryAllocationCommand("test", 0, "node1", true)), false, - false + false, + false, + ActionListener.noop() ).clusterState(); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -396,7 +457,14 @@ public void testCancelCommand() { logger.info("--> cancel primary allocation, make sure it fails..."); try { - allocation.reroute(clusterState, new AllocationCommands(new CancelAllocationCommand("test", 0, "node1", false)), false, false); + allocation.reroute( + clusterState, + new AllocationCommands(new CancelAllocationCommand("test", 0, "node1", false)), + false, + false, + false, + ActionListener.noop() + ); fail(); } catch (IllegalArgumentException e) {} @@ -408,7 +476,14 @@ public void testCancelCommand() { logger.info("--> cancel primary allocation, make sure it fails..."); try { - allocation.reroute(clusterState, new AllocationCommands(new CancelAllocationCommand("test", 0, "node1", false)), false, false); + allocation.reroute( + clusterState, + new AllocationCommands(new CancelAllocationCommand("test", 0, "node1", false)), + false, + false, + false, + ActionListener.noop() + ); fail(); } catch (IllegalArgumentException e) {} @@ -417,7 +492,9 @@ public void testCancelCommand() { clusterState, new AllocationCommands(new AllocateReplicaAllocationCommand("test", 0, "node2")), false, - false + false, + false, + ActionListener.noop() ).clusterState(); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -431,7 +508,9 @@ public void testCancelCommand() { clusterState, new AllocationCommands(new CancelAllocationCommand("test", 0, "node2", false)), false, - false + false, + false, + ActionListener.noop() ).clusterState(); 
assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -445,7 +524,9 @@ public void testCancelCommand() { clusterState, new AllocationCommands(new AllocateReplicaAllocationCommand("test", 0, "node2")), false, - false + false, + false, + ActionListener.noop() ).clusterState(); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -456,7 +537,14 @@ public void testCancelCommand() { logger.info("--> cancel the primary being replicated, make sure it fails"); try { - allocation.reroute(clusterState, new AllocationCommands(new CancelAllocationCommand("test", 0, "node1", false)), false, false); + allocation.reroute( + clusterState, + new AllocationCommands(new CancelAllocationCommand("test", 0, "node1", false)), + false, + false, + false, + ActionListener.noop() + ); fail(); } catch (IllegalArgumentException e) {} @@ -472,7 +560,9 @@ public void testCancelCommand() { clusterState, new AllocationCommands(new CancelAllocationCommand("test", 0, "node2", false)), false, - false + false, + false, + ActionListener.noop() ).clusterState(); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -486,7 +576,9 @@ public void testCancelCommand() { clusterState, new AllocationCommands(new AllocateReplicaAllocationCommand("test", 0, "node2")), false, - false + false, + false, + ActionListener.noop() ).clusterState(); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -506,7 +598,9 @@ public void testCancelCommand() { clusterState, new AllocationCommands(new MoveAllocationCommand("test", 0, "node2", "node3")), false, - false + false, + false, + ActionListener.noop() ).clusterState(); assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(1)); assertThat(clusterState.getRoutingNodes().node("node1").shardsWithState(STARTED).size(), equalTo(1)); @@ -521,7 +615,9 @@ public void testCancelCommand() { clusterState, new AllocationCommands(new CancelAllocationCommand("test", 0, 
"node1", true)), false, - false + false, + false, + ActionListener.noop() ).clusterState(); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -534,7 +630,9 @@ public void testCancelCommand() { clusterState, new AllocationCommands(new CancelAllocationCommand("test", 0, "node3", false)), false, - false + false, + false, + ActionListener.noop() ).clusterState(); assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(1)); assertThat(clusterState.getRoutingNodes().node("node1").shardsWithState(STARTED).size(), equalTo(1)); @@ -546,7 +644,9 @@ public void testCancelCommand() { clusterState, new AllocationCommands(new MoveAllocationCommand("test", 0, "node2", "node3")), false, - false + false, + false, + ActionListener.noop() ).clusterState(); assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(1)); assertThat(clusterState.getRoutingNodes().node("node1").shardsWithState(STARTED).size(), equalTo(1)); @@ -560,7 +660,9 @@ public void testCancelCommand() { clusterState, new AllocationCommands(new CancelAllocationCommand("test", 0, "node2", false)), false, - false + false, + false, + ActionListener.noop() ).clusterState(); assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(1)); assertThat(clusterState.getRoutingNodes().node("node1").shardsWithState(STARTED).size(), equalTo(1)); @@ -581,7 +683,9 @@ public void testCancelCommand() { clusterState, new AllocationCommands(new CancelAllocationCommand("test", 0, "node1", true)), false, - false + false, + false, + ActionListener.noop() ).clusterState(); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -721,7 +825,10 @@ protected NamedXContentRegistry xContentRegistry() { public void testMoveShardToNonDataNode() { AllocationService allocation = createAllocationService( - Settings.builder().put("cluster.routing.allocation.node_concurrent_recoveries", 10).build() + Settings.builder() + 
.put("cluster.routing.allocation.node_concurrent_recoveries", 10) + .put(ClusterModule.SHARDS_ALLOCATOR_TYPE_SETTING.getKey(), randomShardsAllocator()) + .build() ); logger.info("creating an index with 1 shard, no replica"); @@ -789,7 +896,10 @@ public void testMoveShardToNonDataNode() { public void testMoveShardFromNonDataNode() { AllocationService allocation = createAllocationService( - Settings.builder().put("cluster.routing.allocation.node_concurrent_recoveries", 10).build() + Settings.builder() + .put("cluster.routing.allocation.node_concurrent_recoveries", 10) + .put(ClusterModule.SHARDS_ALLOCATOR_TYPE_SETTING.getKey(), randomShardsAllocator()) + .build() ); logger.info("creating an index with 1 shard, no replica"); @@ -859,6 +969,7 @@ public void testConflictingCommandsInSingleRequest() { Settings.builder() .put(EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE_SETTING.getKey(), "none") .put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), "none") + .put(ClusterModule.SHARDS_ALLOCATOR_TYPE_SETTING.getKey(), randomShardsAllocator()) .build() ); @@ -905,7 +1016,7 @@ public void testConflictingCommandsInSingleRequest() { final String node1 = "node1"; final String node2 = "node2"; clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode(node1)).add(newNode(node2))).build(); - final ClusterState finalClusterState = allocation.reroute(clusterState, "reroute"); + final ClusterState finalClusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("--> allocating same index primary in multiple commands should fail"); assertThat(expectThrows(IllegalArgumentException.class, () -> { @@ -916,7 +1027,9 @@ public void testConflictingCommandsInSingleRequest() { new AllocateStalePrimaryAllocationCommand(index1, 0, node2, true) ), false, - false + false, + false, + ActionListener.noop() ); }).getMessage(), containsString("primary [" + index1 + "][0] is already assigned")); 
@@ -928,7 +1041,9 @@ public void testConflictingCommandsInSingleRequest() { new AllocateEmptyPrimaryAllocationCommand(index2, 0, node2, true) ), false, - false + false, + false, + ActionListener.noop() ); }).getMessage(), containsString("primary [" + index2 + "][0] is already assigned")); @@ -936,7 +1051,9 @@ public void testConflictingCommandsInSingleRequest() { clusterState, new AllocationCommands(new AllocateEmptyPrimaryAllocationCommand(index3, 0, node1, true)), false, - false + false, + false, + ActionListener.noop() ).clusterState(); clusterState = startInitializingShardsAndReroute(allocation, clusterState); @@ -952,8 +1069,14 @@ public void testConflictingCommandsInSingleRequest() { new AllocateReplicaAllocationCommand(index3, 0, node2) ), false, - false + false, + false, + ActionListener.noop() ); }).getMessage(), containsString("all copies of [" + index3 + "][0] are already assigned. Use the move allocation command instead")); } + + private static String randomShardsAllocator() { + return randomFrom(BALANCED_ALLOCATOR, DESIRED_BALANCE_ALLOCATOR); + } } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AllocationPriorityTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AllocationPriorityTests.java index 675e6eee89ed3..320ceaf603cac 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AllocationPriorityTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AllocationPriorityTests.java @@ -8,6 +8,7 @@ package org.elasticsearch.cluster.routing.allocation; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -75,9 +76,9 @@ public void testPrioritizedIndicesAllocatedFirst() { clusterState = ClusterState.builder(clusterState) 
.nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); assertEquals(2, shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size()); assertEquals(highPriorityName, shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).get(0).getIndexName()); assertEquals(highPriorityName, shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).get(1).getIndexName()); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AllocationServiceTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AllocationServiceTests.java index 34ca77e4e4996..4afb957ea98b3 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AllocationServiceTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AllocationServiceTests.java @@ -8,6 +8,7 @@ package org.elasticsearch.cluster.routing.allocation; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterInfo; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; @@ -364,7 +365,7 @@ public int getNumberOfInFlightFetches() { } private static ClusterState rerouteAndStartShards(final AllocationService allocationService, final ClusterState clusterState) { - final ClusterState reroutedState = allocationService.reroute(clusterState, "test"); + final ClusterState reroutedState = allocationService.reroute(clusterState, "test", ActionListener.noop()); return allocationService.applyStartedShards( reroutedState, shardsWithState(reroutedState.getRoutingNodes(), ShardRoutingState.INITIALIZING) diff --git 
a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AwarenessAllocationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AwarenessAllocationTests.java index 54ececbc06980..2f2129ca9e6a5 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AwarenessAllocationTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AwarenessAllocationTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; @@ -81,7 +82,7 @@ public void testMoveShardOnceNewNodeWithAttributeAdded1() { .add(newNode("node2", singletonMap("rack_id", "1"))) ) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(1)); logger.info("--> start the shards (primaries)"); @@ -96,7 +97,7 @@ public void testMoveShardOnceNewNodeWithAttributeAdded1() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3", singletonMap("rack_id", "2")))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(1)); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.RELOCATING).size(), equalTo(1)); @@ -111,13 +112,16 @@ public void testMoveShardOnceNewNodeWithAttributeAdded1() { assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(2)); 
logger.info("--> do another reroute, make sure nothing moves"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + assertThat( + strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(), + sameInstance(clusterState.routingTable()) + ); logger.info("--> add another node with a new rack, make sure nothing moves"); clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node4", singletonMap("rack_id", "3")))) .build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(2)); } @@ -152,7 +156,7 @@ public void testMoveShardOnceNewNodeWithAttributeAdded2() { .add(newNode("node3", singletonMap("rack_id", "1"))) ) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(1)); logger.info("--> start the shards (primaries)"); @@ -167,7 +171,7 @@ public void testMoveShardOnceNewNodeWithAttributeAdded2() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node4", singletonMap("rack_id", "2")))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(1)); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.RELOCATING).size(), equalTo(1)); @@ -182,13 +186,16 @@ public void testMoveShardOnceNewNodeWithAttributeAdded2() { 
assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(2)); logger.info("--> do another reroute, make sure nothing moves"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + assertThat( + strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(), + sameInstance(clusterState.routingTable()) + ); logger.info("--> add another node with a new rack, make sure nothing moves"); clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node5", singletonMap("rack_id", "3")))) .build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(2)); } @@ -227,7 +234,7 @@ public void testMoveShardOnceNewNodeWithAttributeAdded3() { .add(newNode("node2", singletonMap("rack_id", "1"))) ) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("Initializing shards: {}", shardsWithState(clusterState.getRoutingNodes(), INITIALIZING)); logger.info("Started shards: {}", shardsWithState(clusterState.getRoutingNodes(), STARTED)); @@ -248,7 +255,7 @@ public void testMoveShardOnceNewNodeWithAttributeAdded3() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3", singletonMap("rack_id", "2")))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(5)); 
assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.RELOCATING).size(), equalTo(5)); @@ -267,13 +274,16 @@ public void testMoveShardOnceNewNodeWithAttributeAdded3() { assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(10)); logger.info("--> do another reroute, make sure nothing moves"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + assertThat( + strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(), + sameInstance(clusterState.routingTable()) + ); logger.info("--> add another node with a new rack, some more relocation should happen"); clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node4", singletonMap("rack_id", "3")))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), RELOCATING).size(), greaterThan(0)); logger.info("--> complete relocation"); @@ -282,7 +292,10 @@ public void testMoveShardOnceNewNodeWithAttributeAdded3() { assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(10)); logger.info("--> do another reroute, make sure nothing moves"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + assertThat( + strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(), + sameInstance(clusterState.routingTable()) + ); } public void testMoveShardOnceNewNodeWithAttributeAdded4() { @@ -320,7 +333,7 @@ public void testMoveShardOnceNewNodeWithAttributeAdded4() { .add(newNode("node2", singletonMap("rack_id", "1"))) ) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", 
ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(10)); logger.info("--> start the shards (primaries)"); @@ -335,7 +348,7 @@ public void testMoveShardOnceNewNodeWithAttributeAdded4() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3", singletonMap("rack_id", "2")))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(10)); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.RELOCATING).size(), equalTo(10)); @@ -358,13 +371,16 @@ public void testMoveShardOnceNewNodeWithAttributeAdded4() { assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(5)); logger.info("--> do another reroute, make sure nothing moves"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + assertThat( + strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(), + sameInstance(clusterState.routingTable()) + ); logger.info("--> add another node with a new rack, some more relocation should happen"); clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node4", singletonMap("rack_id", "3")))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), RELOCATING).size(), greaterThan(0)); logger.info("--> complete relocation"); @@ -379,7 +395,10 @@ public void testMoveShardOnceNewNodeWithAttributeAdded4() { assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(5)); logger.info("--> do another reroute, make sure 
nothing moves"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + assertThat( + strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(), + sameInstance(clusterState.routingTable()) + ); } public void testMoveShardOnceNewNodeWithAttributeAdded5() { @@ -411,7 +430,7 @@ public void testMoveShardOnceNewNodeWithAttributeAdded5() { .add(newNode("node2", singletonMap("rack_id", "1"))) ) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(1)); logger.info("--> start the shards (primaries)"); @@ -426,7 +445,7 @@ public void testMoveShardOnceNewNodeWithAttributeAdded5() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3", singletonMap("rack_id", "2")))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(2)); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.INITIALIZING).size(), equalTo(1)); @@ -441,13 +460,16 @@ public void testMoveShardOnceNewNodeWithAttributeAdded5() { assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(3)); logger.info("--> do another reroute, make sure nothing moves"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + assertThat( + strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(), + sameInstance(clusterState.routingTable()) + ); logger.info("--> add another node with a new rack, we will have another relocation"); clusterState = 
ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node4", singletonMap("rack_id", "3")))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(2)); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.RELOCATING).size(), equalTo(1)); assertThat( @@ -461,7 +483,10 @@ public void testMoveShardOnceNewNodeWithAttributeAdded5() { assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(3)); logger.info("--> make sure another reroute does not move things"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + assertThat( + strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(), + sameInstance(clusterState.routingTable()) + ); } public void testMoveShardOnceNewNodeWithAttributeAdded6() { @@ -495,7 +520,7 @@ public void testMoveShardOnceNewNodeWithAttributeAdded6() { .add(newNode("node4", singletonMap("rack_id", "1"))) ) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(1)); logger.info("--> start the shards (primaries)"); @@ -510,7 +535,7 @@ public void testMoveShardOnceNewNodeWithAttributeAdded6() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node5", singletonMap("rack_id", "2")))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), 
equalTo(3)); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.RELOCATING).size(), equalTo(1)); @@ -525,13 +550,16 @@ public void testMoveShardOnceNewNodeWithAttributeAdded6() { assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(4)); logger.info("--> do another reroute, make sure nothing moves"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + assertThat( + strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(), + sameInstance(clusterState.routingTable()) + ); logger.info("--> add another node with a new rack, we will have another relocation"); clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node6", singletonMap("rack_id", "3")))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(3)); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.RELOCATING).size(), equalTo(1)); assertThat( @@ -545,7 +573,10 @@ public void testMoveShardOnceNewNodeWithAttributeAdded6() { assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(4)); logger.info("--> make sure another reroute does not move things"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + assertThat( + strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(), + sameInstance(clusterState.routingTable()) + ); } public void testFullAwareness1() { @@ -578,7 +609,7 @@ public void testFullAwareness1() { .add(newNode("node2", singletonMap("rack_id", "1"))) ) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = 
strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(1)); logger.info("--> start the shards (primaries)"); @@ -592,7 +623,7 @@ public void testFullAwareness1() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3", singletonMap("rack_id", "2")))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(1)); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.INITIALIZING).size(), equalTo(1)); @@ -607,13 +638,16 @@ public void testFullAwareness1() { assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(2)); logger.info("--> do another reroute, make sure nothing moves"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + assertThat( + strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(), + sameInstance(clusterState.routingTable()) + ); logger.info("--> add another node with a new rack, make sure nothing moves"); clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node4", singletonMap("rack_id", "3")))) .build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(2)); } @@ -649,7 +683,7 @@ public void testFullAwareness2() { .add(newNode("node3", singletonMap("rack_id", "1"))) ) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); 
+ clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(1)); logger.info("--> start the shards (primaries)"); @@ -663,7 +697,7 @@ public void testFullAwareness2() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node4", singletonMap("rack_id", "2")))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(1)); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.INITIALIZING).size(), equalTo(1)); @@ -678,13 +712,16 @@ public void testFullAwareness2() { assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(2)); logger.info("--> do another reroute, make sure nothing moves"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + assertThat( + strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(), + sameInstance(clusterState.routingTable()) + ); logger.info("--> add another node with a new rack, make sure nothing moves"); clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node5", singletonMap("rack_id", "3")))) .build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(2)); } @@ -728,7 +765,7 @@ public void testFullAwareness3() { .add(newNode("node2", singletonMap("rack_id", "1"))) ) .build(); - clusterState = 
strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(10)); logger.info("--> start the shards (primaries)"); @@ -740,7 +777,7 @@ public void testFullAwareness3() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3", singletonMap("rack_id", "2")))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(10)); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.INITIALIZING).size(), equalTo(10)); @@ -758,13 +795,16 @@ public void testFullAwareness3() { assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(20)); logger.info("--> do another reroute, make sure nothing moves"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + assertThat( + strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(), + sameInstance(clusterState.routingTable()) + ); logger.info("--> add another node with a new rack, some more relocation should happen"); clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node4", singletonMap("rack_id", "3")))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), RELOCATING).size(), greaterThan(0)); logger.info("--> complete relocation"); @@ -773,7 +813,10 @@ public void testFullAwareness3() { assertThat(shardsWithState(clusterState.getRoutingNodes(), 
ShardRoutingState.STARTED).size(), equalTo(20)); logger.info("--> do another reroute, make sure nothing moves"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + assertThat( + strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(), + sameInstance(clusterState.routingTable()) + ); } public void testUnbalancedZones() { @@ -804,7 +847,7 @@ public void testUnbalancedZones() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("A-0", singletonMap("zone", "a"))).add(newNode("B-0", singletonMap("zone", "b")))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(0)); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(5)); @@ -822,7 +865,7 @@ public void testUnbalancedZones() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("A-1", singletonMap("zone", "a")))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(8)); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.INITIALIZING).size(), equalTo(2)); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.INITIALIZING).get(0).currentNodeId(), equalTo("A-1")); @@ -873,7 +916,7 @@ public void testUnassignedShardsWithUnbalancedZones() { .add(newNode("A-4", singletonMap("zone", "a"))) ) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); 
assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(0)); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(1)); @@ -899,7 +942,7 @@ public void testUnassignedShardsWithUnbalancedZones() { } commands.add(new MoveAllocationCommand("test", 0, primaryNode, "A-4")); - clusterState = strategy.reroute(clusterState, commands, false, false).clusterState(); + clusterState = strategy.reroute(clusterState, commands, false, false, false, ActionListener.noop()).clusterState(); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(0)); assertThat(shardsWithState(clusterState.getRoutingNodes(), RELOCATING).size(), equalTo(1)); @@ -939,7 +982,7 @@ public void testMultipleAwarenessAttributes() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("A-0", nodeAAttributes)).add(newNode("B-0", nodeBAttributes))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(0)); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(1)); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/BalanceConfigurationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/BalanceConfigurationTests.java index 197ada93b862d..eeed2324dec75 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/BalanceConfigurationTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/BalanceConfigurationTests.java @@ -12,6 +12,7 @@ import org.apache.logging.log4j.Logger; import org.apache.lucene.util.ArrayUtil; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import 
org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.EmptyClusterInfoService; @@ -175,7 +176,7 @@ private ClusterState initCluster(AllocationService strategy) { ClusterState clusterState = ClusterState.builder( org.elasticsearch.cluster.ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY) ).nodes(nodes).metadata(metadata).routingTable(initialRoutingTable).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("restart all the primary shards, replicas will start initializing"); clusterState = startInitializingShardsAndReroute(strategy, clusterState); @@ -193,7 +194,7 @@ private ClusterState addNode(ClusterState clusterState, AllocationService strate .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node" + numberOfNodes))) .build(); - RoutingTable routingTable = strategy.reroute(clusterState, "reroute").routingTable(); + RoutingTable routingTable = strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); // move initializing to started @@ -222,7 +223,7 @@ private ClusterState removeNodes(ClusterState clusterState, AllocationService st clusterState = startInitializingShardsAndReroute(strategy, clusterState); logger.info("rebalancing"); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("complete rebalancing"); return applyStartedShardsUntilNoChange(clusterState, strategy); @@ -422,7 +423,7 @@ public ShardAllocationDecision decideShardAllocation(ShardRouting shard, Routing ClusterState clusterState = ClusterState.builder( org.elasticsearch.cluster.ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY) ).nodes(nodes).metadata(metadata).routingTable(routingTable).build(); - routingTable = 
strategy.reroute(clusterState, "reroute").routingTable(); + routingTable = strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); RoutingNodes routingNodes = clusterState.getRoutingNodes(); @@ -455,7 +456,7 @@ public ShardAllocationDecision decideShardAllocation(ShardRouting shard, Routing } logger.info("rebalancing"); - routingTable = strategy.reroute(clusterState, "reroute").routingTable(); + routingTable = strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); routingNodes = clusterState.getRoutingNodes(); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/BalanceUnbalancedClusterTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/BalanceUnbalancedClusterTests.java index 84f86c095c01c..bbb1b8998084c 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/BalanceUnbalancedClusterTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/BalanceUnbalancedClusterTests.java @@ -9,6 +9,7 @@ import org.apache.lucene.tests.util.TestUtil; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -53,7 +54,7 @@ protected ClusterState allocateNew(ClusterState state) { RoutingTable initialRoutingTable = RoutingTable.builder(state.routingTable()).addAsNew(metadata.index(index)).build(); ClusterState clusterState = ClusterState.builder(state).metadata(metadata).routingTable(initialRoutingTable).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); while 
(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).isEmpty() == false) { clusterState = ESAllocationTestCase.startInitializingShardsAndReroute(strategy, clusterState); } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/CatAllocationTestCase.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/CatAllocationTestCase.java index 4783414153457..cc8673a0a3564 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/CatAllocationTestCase.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/CatAllocationTestCase.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.routing.allocation; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -145,7 +146,7 @@ protected boolean balanceFirst() { private ClusterState rebalance(ClusterState clusterState) { AllocationService strategy = createAllocationService(Settings.builder().build()); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); int numRelocations = 0; while (true) { List initializing = shardsWithState(clusterState.getRoutingNodes(), INITIALIZING); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ClusterRebalanceRoutingTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ClusterRebalanceRoutingTests.java index 65becb023721d..c904100a5a80b 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ClusterRebalanceRoutingTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ClusterRebalanceRoutingTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import 
org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -66,7 +67,7 @@ public void testAlways() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")).localNodeId("node1").masterNodeId("node1")) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); for (int i = 0; i < clusterState.routingTable().index("test1").size(); i++) { assertThat(clusterState.routingTable().index("test1").shard(i).size(), equalTo(2)); @@ -114,7 +115,7 @@ public void testAlways() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3")).add(newNode("node4"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); RoutingNodes routingNodes = clusterState.getRoutingNodes(); final var newNodesIterator = Iterators.concat(routingNodes.node("node3").iterator(), routingNodes.node("node4").iterator()); @@ -160,7 +161,7 @@ public void testClusterPrimariesActive1() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); for (int i = 0; i < clusterState.routingTable().index("test1").size(); i++) { assertThat(clusterState.routingTable().index("test1").shard(i).size(), equalTo(2)); assertThat(clusterState.routingTable().index("test1").shard(i).primaryShard().state(), equalTo(INITIALIZING)); @@ -220,7 +221,7 @@ public void testClusterPrimariesActive1() { logger.info("now, 
start 1 more node, check that rebalancing happen (for test1) because we set it to primaries_active"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); RoutingNodes routingNodes = clusterState.getRoutingNodes(); assertThat(routingNodes.node("node3").size(), equalTo(1)); @@ -256,7 +257,7 @@ public void testClusterPrimariesActive2() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); for (int i = 0; i < clusterState.routingTable().index("test1").size(); i++) { assertThat(clusterState.routingTable().index("test1").shard(i).size(), equalTo(2)); @@ -302,7 +303,7 @@ public void testClusterPrimariesActive2() { logger.info("now, start 1 more node, check that rebalancing will not happen (for test1) because we set it to primaries_active"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); RoutingNodes routingNodes = clusterState.getRoutingNodes(); assertThat(routingNodes.node("node3").isEmpty(), equalTo(true)); } @@ -335,7 +336,7 @@ public void testClusterAllActive1() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); for (int i = 0; i < 
clusterState.routingTable().index("test1").size(); i++) { assertThat(clusterState.routingTable().index("test1").shard(i).size(), equalTo(2)); @@ -411,7 +412,7 @@ public void testClusterAllActive1() { logger.info("now, start 1 more node, check that rebalancing happen (for test1) because we set it to all_active"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); RoutingNodes routingNodes = clusterState.getRoutingNodes(); assertThat(routingNodes.node("node3").size(), equalTo(1)); @@ -446,7 +447,7 @@ public void testClusterAllActive2() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); for (int i = 0; i < clusterState.routingTable().index("test1").size(); i++) { assertThat(clusterState.routingTable().index("test1").shard(i).size(), equalTo(2)); @@ -492,7 +493,7 @@ public void testClusterAllActive2() { logger.info("now, start 1 more node, check that rebalancing will not happen (for test1) because we set it to all_active"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); RoutingNodes routingNodes = clusterState.getRoutingNodes(); assertThat(routingNodes.node("node3").isEmpty(), equalTo(true)); @@ -526,7 +527,7 @@ public void testClusterAllActive3() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - 
clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); for (int i = 0; i < clusterState.routingTable().index("test1").size(); i++) { assertThat(clusterState.routingTable().index("test1").shard(i).size(), equalTo(2)); @@ -587,7 +588,7 @@ public void testClusterAllActive3() { logger.info("now, start 1 more node, check that rebalancing will not happen (for test1) because we set it to all_active"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); RoutingNodes routingNodes = clusterState.getRoutingNodes(); assertThat(routingNodes.node("node3").isEmpty(), equalTo(true)); @@ -627,7 +628,7 @@ public void allocateUnassigned( logger.info("start two nodes"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); for (int i = 0; i < clusterState.routingTable().index("test").size(); i++) { assertThat(clusterState.routingTable().index("test").shard(i).size(), equalTo(1)); @@ -644,7 +645,7 @@ public void allocateUnassigned( logger.debug("now, start 1 more node, check that rebalancing will not happen since we unassigned shards"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node2"))).build(); logger.debug("reroute and check that nothing has changed"); - ClusterState resultingState = strategy.reroute(clusterState, "reroute"); + ClusterState resultingState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(resultingState, equalTo(clusterState)); for (int i = 0; i < 
clusterState.routingTable().index("test").size(); i++) { @@ -657,7 +658,7 @@ public void allocateUnassigned( } logger.debug("now set allocateTest1 to true and reroute we should see the [test1] index initializing"); allocateTest1.set(true); - resultingState = strategy.reroute(clusterState, "reroute"); + resultingState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(resultingState, not(equalTo(clusterState))); clusterState = resultingState; for (int i = 0; i < clusterState.routingTable().index("test1").size(); i++) { @@ -734,7 +735,7 @@ public void beforeAllocation(RoutingAllocation allocation) { logger.info("start two nodes"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); for (int i = 0; i < clusterState.routingTable().index("test").size(); i++) { assertThat(clusterState.routingTable().index("test").shard(i).size(), equalTo(1)); @@ -751,7 +752,7 @@ public void beforeAllocation(RoutingAllocation allocation) { logger.debug("now, start 1 more node, check that rebalancing will not happen since we have shard sync going on"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node2"))).build(); logger.debug("reroute and check that nothing has changed"); - ClusterState resultState = strategy.reroute(clusterState, "reroute"); + ClusterState resultState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(resultState, equalTo(clusterState)); for (int i = 0; i < clusterState.routingTable().index("test").size(); i++) { @@ -764,7 +765,7 @@ public void beforeAllocation(RoutingAllocation allocation) { } logger.debug("now set hasFetches to true and reroute we should now see exactly one relocating shard"); hasFetches.set(false); - resultState = 
strategy.reroute(clusterState, "reroute"); + resultState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(resultState, not(equalTo(clusterState))); clusterState = resultState; int numStarted = 0; diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ConcurrentRebalanceRoutingTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ConcurrentRebalanceRoutingTests.java index d754587e3176f..9bd237e68d298 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ConcurrentRebalanceRoutingTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ConcurrentRebalanceRoutingTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -63,7 +64,7 @@ public void testClusterConcurrentRebalance() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); for (int i = 0; i < clusterState.routingTable().index("test").size(); i++) { assertThat(clusterState.routingTable().index("test").shard(i).size(), equalTo(2)); @@ -94,7 +95,7 @@ public void testClusterConcurrentRebalance() { .add(newNode("node10")) ) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); for (int i = 0; i < clusterState.routingTable().index("test").size(); i++) { assertThat(clusterState.routingTable().index("test").shard(i).size(), equalTo(2)); diff --git 
a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/DeadNodesAllocationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/DeadNodesAllocationTests.java index ba4ab83313368..737b09fcac87d 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/DeadNodesAllocationTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/DeadNodesAllocationTests.java @@ -12,6 +12,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; @@ -56,7 +57,7 @@ public void testSimpleDeadNodeOnStartedPrimaryShard() { .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); // starting primaries clusterState = startInitializingShardsAndReroute(allocation, clusterState); @@ -149,7 +150,7 @@ public void testDeadNodeWhileRelocatingOnToNode() { .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); // starting primaries clusterState = startInitializingShardsAndReroute(allocation, clusterState); @@ -164,7 +165,7 @@ public void testDeadNodeWhileRelocatingOnToNode() { logger.info("--> adding additional node"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); 
assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(1)); assertThat(clusterState.getRoutingNodes().node("node1").iterator().next().state(), equalTo(STARTED)); @@ -187,7 +188,9 @@ public void testDeadNodeWhileRelocatingOnToNode() { ) ), false, - false + false, + false, + ActionListener.noop() ); assertThat(commandsResult.clusterState(), not(equalTo(clusterState))); clusterState = commandsResult.clusterState(); @@ -226,7 +229,7 @@ public void testDeadNodeWhileRelocatingOnFromNode() { .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); // starting primaries clusterState = startInitializingShardsAndReroute(allocation, clusterState); @@ -241,7 +244,7 @@ public void testDeadNodeWhileRelocatingOnFromNode() { logger.info("--> adding additional node"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(1)); assertThat(clusterState.getRoutingNodes().node("node1").iterator().next().state(), equalTo(STARTED)); @@ -264,7 +267,9 @@ public void testDeadNodeWhileRelocatingOnFromNode() { ) ), false, - false + false, + false, + ActionListener.noop() ); assertThat(commandsResult.clusterState(), not(equalTo(clusterState))); clusterState = commandsResult.clusterState(); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/DecisionsImpactOnClusterHealthTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/DecisionsImpactOnClusterHealthTests.java index 120727690f7e8..44eba74bcae30 100644 --- 
a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/DecisionsImpactOnClusterHealthTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/DecisionsImpactOnClusterHealthTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.routing.allocation; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; @@ -122,7 +123,7 @@ private ClusterState runAllocationTest( clusterState = ClusterState.builder(clusterState).nodes(discoveryNodes).build(); logger.info("--> do the reroute"); - routingTable = allocationService.reroute(clusterState, "reroute").routingTable(); + routingTable = allocationService.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); logger.info("--> assert cluster health"); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ElectReplicaAsPrimaryDuringRelocationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ElectReplicaAsPrimaryDuringRelocationTests.java index 5edcaf56eb25f..c805426c91ef4 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ElectReplicaAsPrimaryDuringRelocationTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ElectReplicaAsPrimaryDuringRelocationTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -49,7 +50,7 @@ public void testElectReplicaAsPrimaryDuringRelocation() { clusterState = 
ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("Start the primary shards"); clusterState = startInitializingShardsAndReroute(strategy, clusterState); @@ -66,7 +67,7 @@ public void testElectReplicaAsPrimaryDuringRelocation() { logger.info("Start another node and perform rerouting"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("find the replica shard that gets relocated"); IndexShardRoutingTable indexShardRoutingTable = null; diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ExpectedShardSizeAllocationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ExpectedShardSizeAllocationTests.java index d2cfe87fefd5f..edd5ced7e9860 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ExpectedShardSizeAllocationTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ExpectedShardSizeAllocationTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterInfo; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; @@ -62,7 +63,7 @@ public Long getShardSize(ShardRouting shardRouting) { ).metadata(metadata).routingTable(routingTable).build(); logger.info("Adding one node and performing rerouting"); clusterState = 
ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertEquals(1, clusterState.getRoutingNodes().node("node1").numberOfShardsWithState(ShardRoutingState.INITIALIZING)); assertEquals( @@ -77,7 +78,7 @@ public Long getShardSize(ShardRouting shardRouting) { logger.info("Add another one node and reroute"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node2"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertEquals(1, clusterState.getRoutingNodes().node("node2").numberOfShardsWithState(ShardRoutingState.INITIALIZING)); assertEquals( @@ -110,7 +111,7 @@ public Long getShardSize(ShardRouting shardRouting) { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("start primary shard"); clusterState = startInitializingShardsAndReroute(allocation, clusterState); @@ -127,7 +128,9 @@ public Long getShardSize(ShardRouting shardRouting) { clusterState, new AllocationCommands(new MoveAllocationCommand("test", 0, existingNodeId, toNodeId)), false, - false + false, + false, + ActionListener.noop() ); assertThat(commandsResult.clusterState(), not(equalTo(clusterState))); clusterState = commandsResult.clusterState(); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/FailedNodeRoutingTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/FailedNodeRoutingTests.java index e870719594e45..9c47c227972df 100644 --- 
a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/FailedNodeRoutingTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/FailedNodeRoutingTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.admin.cluster.reroute.ClusterRerouteRequest; import org.elasticsearch.action.admin.indices.create.CreateIndexRequest; import org.elasticsearch.action.support.ActiveShardCount; @@ -79,7 +80,7 @@ public void testSimpleFailedNodeTest() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")).add(newNode("node3")).add(newNode("node4"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("start all the primary shards, replicas will start initializing"); clusterState = startInitializingShardsAndReroute(strategy, clusterState); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/FailedShardsRoutingTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/FailedShardsRoutingTests.java index bc3eb037d1c51..7f814c4dd866e 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/FailedShardsRoutingTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/FailedShardsRoutingTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -70,7 +71,7 @@ public void testFailedShardPrimaryRelocatingToAndFrom() { 
.nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); // starting primaries clusterState = startInitializingShardsAndReroute(allocation, clusterState); @@ -85,7 +86,7 @@ public void testFailedShardPrimaryRelocatingToAndFrom() { logger.info("--> adding additional node"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(1)); assertThat(clusterState.getRoutingNodes().node("node1").iterator().next().state(), equalTo(STARTED)); @@ -108,7 +109,9 @@ public void testFailedShardPrimaryRelocatingToAndFrom() { ) ), false, - false + false, + false, + ActionListener.noop() ); assertThat(commandsResult.clusterState(), not(equalTo(clusterState))); clusterState = commandsResult.clusterState(); @@ -133,7 +136,9 @@ public void testFailedShardPrimaryRelocatingToAndFrom() { ) ), false, - false + false, + false, + ActionListener.noop() ); assertThat(commandsResult.clusterState(), not(equalTo(clusterState))); clusterState = commandsResult.clusterState(); @@ -177,7 +182,7 @@ public void testFailPrimaryStartedCheckReplicaElected() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("Start the shards (primaries)"); ClusterState newState = startInitializingShardsAndReroute(strategy, clusterState); @@ -265,7 +270,7 @@ public void testFirstAllocationFailureSingleNode() { 
logger.info("Adding single node and performing rerouting"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -352,9 +357,11 @@ public void testSingleShardMultipleAllocationFailures() { } } - for (String failedNode : failedNodes) { - if (routingNodes.node(failedNode).isEmpty() == false) { - fail("shard was re-assigned to failed node " + failedNode); + if (strategy.isBalancedShardsAllocator()) { + for (String failedNode : failedNodes) { + if (routingNodes.node(failedNode).isEmpty() == false) { + fail("shard was re-assigned to failed node " + failedNode); + } } } } @@ -384,7 +391,7 @@ public void testFirstAllocationFailureTwoNodes() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, not(clusterState)); clusterState = newState; final String nodeHoldingPrimary = clusterState.routingTable().index("test").shard(0).primaryShard().currentNodeId(); @@ -442,7 +449,7 @@ public void testRebalanceFailure() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")).localNodeId("node1").masterNodeId("node1")) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("Start the shards (primaries)"); ClusterState newState = startInitializingShardsAndReroute(strategy, clusterState); @@ -488,7 +495,7 @@ public void testRebalanceFailure() { 
logger.info("Adding third node and reroute"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; RoutingNodes routingNodes = clusterState.getRoutingNodes(); @@ -512,9 +519,21 @@ public void testRebalanceFailure() { assertThat(routingNodes.node("node1").numberOfShardsWithState(STARTED), lessThan(3)); assertThat(routingNodes.node("node2").numberOfShardsWithState(STARTED, RELOCATING), equalTo(2)); assertThat(routingNodes.node("node2").numberOfShardsWithState(STARTED), lessThan(3)); - assertThat(routingNodes.node("node3").numberOfShardsWithState(INITIALIZING), equalTo(1)); - // make sure the failedShard is not INITIALIZING again on node3 - assertThat(routingNodes.node("node3").iterator().next().shardId(), not(equalTo(shardToFail.shardId()))); + + if (strategy.isBalancedShardsAllocator()) { + assertThat(routingNodes.node("node3").numberOfShardsWithState(INITIALIZING), equalTo(1)); + // make sure the failedShard is not INITIALIZING again on node3 + assertThat(routingNodes.node("node3").iterator().next().shardId(), not(equalTo(shardToFail.shardId()))); + } else { + // failing a shard doesn't affect the desired balance, but we do not retry on the first reroute ... + assertFalse(routingNodes.node("node3").iterator().hasNext()); + + // ... 
however the next reroute will retry allocating the same shard to this node + clusterState = strategy.reroute(clusterState, "test", ActionListener.noop()); + routingNodes = clusterState.getRoutingNodes(); + assertThat(routingNodes.node("node3").numberOfShardsWithState(INITIALIZING), equalTo(1)); + assertThat(routingNodes.node("node3").iterator().next().shardId(), equalTo(shardToFail.shardId())); + } } public void testFailAllReplicasInitializingOnPrimaryFail() { @@ -545,7 +564,9 @@ public void testFailAllReplicasInitializingOnPrimaryFail() { .masterNodeId("node1") ) .build(); - clusterState = ClusterState.builder(clusterState).routingTable(allocation.reroute(clusterState, "reroute").routingTable()).build(); + clusterState = ClusterState.builder(clusterState) + .routingTable(allocation.reroute(clusterState, "reroute", ActionListener.noop()).routingTable()) + .build(); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(1)); assertThat(shardsWithState(clusterState.getRoutingNodes(), UNASSIGNED).size(), equalTo(2)); // start primary shards @@ -569,16 +590,18 @@ public void testFailAllReplicasInitializingOnPrimaryFail() { assertThat(newState, not(equalTo(clusterState))); clusterState = newState; - // The started replica gets promoted to primary and the initializing replica is reset. The other replica will be assigned. + // The started replica gets promoted to primary and the initializing replica is reset. The other replica will be assigned by the + // balanced shards allocator but not the desired balance allocator because the only remaining desired node is ignored this time + final var expectedInitializingShards = allocation.isBalancedShardsAllocator() ? 
2 : 1; assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(1)); - assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(2)); + assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(expectedInitializingShards)); ShardRouting newPrimaryShard = clusterState.routingTable().index("test").shard(0).primaryShard(); assertThat(newPrimaryShard, not(equalTo(primaryShardToFail))); assertThat(newPrimaryShard.allocationId(), equalTo(startedReplica.allocationId())); - // Another reroute changes nothing - clusterState = allocation.reroute(clusterState, "test"); + // The unassigned replica is assigned the next time round if the desired node was ignored on the previous attempt + clusterState = allocation.reroute(clusterState, "test", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(1)); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(2)); } @@ -601,7 +624,7 @@ public void testFailAllReplicasInitializingOnPrimaryFailWhileHavingAReplicaToEle clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")).add(newNode("node3")).add(newNode("node4"))) .build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(1)); assertThat(shardsWithState(clusterState.getRoutingNodes(), UNASSIGNED).size(), equalTo(2)); // start primary shards @@ -624,15 +647,17 @@ public void testFailAllReplicasInitializingOnPrimaryFailWhileHavingAReplicaToEle assertThat(newState, not(equalTo(clusterState))); clusterState = newState; - // The started replica gets promoted to primary and the initializing replica is reset. The other replica will be assigned. 
+ // The started replica gets promoted to primary and the initializing replica is reset. The other replica will be assigned by the + // balanced shards allocator but not the desired balance allocator because the only remaining desired node is ignored this time + final var expectedInitializingShards = allocation.isBalancedShardsAllocator() ? 2 : 1; assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(1)); - assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(2)); + assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(expectedInitializingShards)); ShardRouting newPrimaryShard = clusterState.routingTable().index("test").shard(0).primaryShard(); assertThat(newPrimaryShard, not(equalTo(primaryShardToFail))); - // Another reroute changes nothing - clusterState = allocation.reroute(clusterState, "test"); + // The unassigned replica is assigned the next time round if the desired node was ignored on the previous attempt + clusterState = allocation.reroute(clusterState, "test", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(1)); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(2)); } @@ -657,7 +682,9 @@ public void testReplicaOnNewestVersionIsPromoted() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1-5.x", Version.fromId(5060099)))) .build(); - clusterState = ClusterState.builder(clusterState).routingTable(allocation.reroute(clusterState, "reroute").routingTable()).build(); + clusterState = ClusterState.builder(clusterState) + .routingTable(allocation.reroute(clusterState, "reroute", ActionListener.noop()).routingTable()) + .build(); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(1)); assertThat(shardsWithState(clusterState.getRoutingNodes(), UNASSIGNED).size(), equalTo(3)); @@ 
-672,7 +699,7 @@ public void testReplicaOnNewestVersionIsPromoted() { .build(); // start the shards, should have 1 primary and 1 replica available - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(1)); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(1)); clusterState = startInitializingShardsAndReroute(allocation, clusterState); @@ -698,7 +725,7 @@ public void testReplicaOnNewestVersionIsPromoted() { .build(); // start all the replicas - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(2)); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(2)); clusterState = startInitializingShardsAndReroute(allocation, clusterState); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/FilterRoutingTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/FilterRoutingTests.java index 748dc5e488445..028b9b476652b 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/FilterRoutingTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/FilterRoutingTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.routing.allocation; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -201,7 +202,7 @@ private void testClusterFilters(Settings.Builder allocationServiceSettings, Disc .build(); logger.info("--> rerouting"); - clusterState = strategy.reroute(clusterState, 
"reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(2)); logger.info("--> start the shards (primaries)"); @@ -318,7 +319,7 @@ private void testIndexFilters(Settings.Builder initialIndexSettings, Settings.Bu .build(); logger.info("--> rerouting"); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(2)); logger.info("--> start the shards (primaries)"); @@ -345,7 +346,7 @@ private void testIndexFilters(Settings.Builder initialIndexSettings, Settings.Bu .build(); clusterState = ClusterState.builder(clusterState).metadata(updatedMetadata).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(2)); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.RELOCATING).size(), equalTo(2)); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.INITIALIZING).size(), equalTo(2)); @@ -383,7 +384,7 @@ public void testConcurrentRecoveriesAfterShardsCannotRemainOnNode() { DiscoveryNode node1 = newNode("node1", singletonMap("tag1", "value1")); DiscoveryNode node2 = newNode("node2", singletonMap("tag1", "value2")); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(node1).add(node2)).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(clusterState.getRoutingNodes().node(node1.getId()).numberOfShardsWithState(INITIALIZING), equalTo(2)); 
assertThat(clusterState.getRoutingNodes().node(node2.getId()).numberOfShardsWithState(INITIALIZING), equalTo(2)); @@ -402,7 +403,7 @@ public void testConcurrentRecoveriesAfterShardsCannotRemainOnNode() { ); logger.info("--> move shards from node1 to node2"); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("--> check that concurrent recoveries only allows 1 shard to move"); assertThat(clusterState.getRoutingNodes().node(node1.getId()).numberOfShardsWithState(STARTED), equalTo(1)); assertThat(clusterState.getRoutingNodes().node(node2.getId()).numberOfShardsWithState(INITIALIZING), equalTo(1)); @@ -412,14 +413,14 @@ public void testConcurrentRecoveriesAfterShardsCannotRemainOnNode() { clusterState = startInitializingShardsAndReroute(strategy, clusterState); logger.info("--> move second shard from node1 to node2"); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(clusterState.getRoutingNodes().node(node2.getId()).numberOfShardsWithState(INITIALIZING), equalTo(1)); assertThat(clusterState.getRoutingNodes().node(node2.getId()).numberOfShardsWithState(STARTED), equalTo(3)); logger.info("--> start the shards (only primaries)"); clusterState = startInitializingShardsAndReroute(strategy, clusterState); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(clusterState.getRoutingNodes().node(node2.getId()).numberOfShardsWithState(STARTED), equalTo(4)); } } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/InSyncAllocationIdTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/InSyncAllocationIdTests.java index 7d074f23ad09e..7892061ecba4f 100644 --- 
a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/InSyncAllocationIdTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/InSyncAllocationIdTests.java @@ -75,7 +75,7 @@ public void testInSyncAllocationIdsUpdated() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")).add(newNode("node3"))) .build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(clusterState.metadata().index("test").inSyncAllocationIds(0).size(), equalTo(0)); assertThat(clusterState.metadata().index("test-old").inSyncAllocationIds(0), equalTo(Set.of("x", "y"))); @@ -119,7 +119,9 @@ public void testInSyncAllocationIdsUpdated() { clusterState, new AllocationCommands(new AllocateEmptyPrimaryAllocationCommand("test", 0, "node1", true)), false, - false + false, + false, + ActionListener.noop() ).clusterState(); // check that in-sync allocation ids are reset by forcing an empty primary @@ -269,7 +271,7 @@ public void testInSyncIdsNotGrowingWithoutBounds() throws Exception { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode(replicaShard.currentNodeId()))) .build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("start replica shards"); clusterState = startInitializingShardsAndReroute(allocation, clusterState); @@ -328,7 +330,7 @@ public void testInSyncIdsNotTrimmedWhenNotGrowing() throws Exception { logger.info("add back node 1"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); 
assertThat(clusterState.routingTable().index("test").shard(0).assignedShards().size(), equalTo(1)); // in-sync allocation ids should not be updated @@ -393,7 +395,7 @@ private ClusterState createOnePrimaryOneReplicaClusterState(AllocationService al clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(clusterState.metadata().index("test").inSyncAllocationIds(0).size(), equalTo(0)); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/IndexBalanceTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/IndexBalanceTests.java index 9c4f970b2b51b..b814aba17c69b 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/IndexBalanceTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/IndexBalanceTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -80,7 +81,7 @@ public void testBalanceAllNodesStarted() { .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")).add(newNode("node3"))) .build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -95,7 +96,7 @@ public void testBalanceAllNodesStarted() { logger.info("Another round of rebalancing"); clusterState = 
ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())).build(); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); newState = startInitializingShardsAndReroute(strategy, clusterState); @@ -113,7 +114,7 @@ public void testBalanceAllNodesStarted() { } logger.info("Reroute, nothing should change"); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); logger.info("Start the more shards"); @@ -197,7 +198,7 @@ public void testBalanceIncrementallyStartNodes() { logger.info("Adding one node and performing rerouting"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; assertThat(clusterState.routingTable().index("test").size(), equalTo(3)); @@ -211,7 +212,7 @@ public void testBalanceIncrementallyStartNodes() { logger.info("Add another node and perform rerouting, nothing will happen since primary not started"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node2"))).build(); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); logger.info("Start the primary shard"); @@ -230,7 +231,7 @@ public void testBalanceIncrementallyStartNodes() { } logger.info("Reroute, nothing should change"); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", 
ActionListener.noop()); assertThat(newState, equalTo(clusterState)); logger.info("Start the backup shard"); @@ -309,7 +310,7 @@ public void testBalanceAllNodesStartedAddIndex() { .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")).add(newNode("node3"))) .build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; assertThat(clusterState.routingTable().index("test").size(), equalTo(3)); @@ -323,7 +324,7 @@ public void testBalanceAllNodesStartedAddIndex() { logger.info("Another round of rebalancing"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())).build(); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); newState = startInitializingShardsAndReroute(strategy, clusterState); @@ -341,7 +342,7 @@ public void testBalanceAllNodesStartedAddIndex() { } logger.info("Reroute, nothing should change"); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); logger.info("Start the more shards"); @@ -381,7 +382,7 @@ public void testBalanceAllNodesStartedAddIndex() { assertThat(clusterState.routingTable().index("test1").size(), equalTo(3)); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; assertThat(clusterState.routingTable().index("test1").size(), equalTo(3)); @@ -395,7 +396,7 @@ public void testBalanceAllNodesStartedAddIndex() { logger.info("Another round of rebalancing"); clusterState = 
ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())).build(); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); newState = startInitializingShardsAndReroute(strategy, clusterState); @@ -413,7 +414,7 @@ public void testBalanceAllNodesStartedAddIndex() { } logger.info("Reroute, nothing should change"); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); logger.info("Start the more shards"); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/MaxRetryAllocationDeciderTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/MaxRetryAllocationDeciderTests.java index f995acc35c6cf..f2d69ceb30967 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/MaxRetryAllocationDeciderTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/MaxRetryAllocationDeciderTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.routing.allocation; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterInfo; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; @@ -66,7 +67,7 @@ private ClusterState createInitialClusterState() { assertEquals(clusterState.routingTable().index("idx").size(), 1); assertEquals(clusterState.routingTable().index("idx").shard(0).shard(0).state(), UNASSIGNED); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertEquals(clusterState.routingTable().index("idx").size(), 1); assertEquals(clusterState.routingTable().index("idx").shard(0).shard(0).state(), INITIALIZING); @@ -99,7 
+100,7 @@ public void testSingleRetryOnIgnore() { assertThat(routingTable.index("idx").shard(0).shard(0).unassignedInfo().getMessage(), containsString("boom")); // manual resetting of retry count - newState = strategy.reroute(clusterState, new AllocationCommands(), false, true).clusterState(); + newState = strategy.reroute(clusterState, new AllocationCommands(), false, true, false, ActionListener.noop()).clusterState(); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; routingTable = newState.routingTable(); @@ -191,7 +192,7 @@ public void testFailedAllocation() { .build() ) .build(); - ClusterState newState = strategy.reroute(clusterState, "settings changed"); + ClusterState newState = strategy.reroute(clusterState, "settings changed", ActionListener.noop()); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; routingTable = newState.routingTable(); @@ -264,7 +265,7 @@ public void testFailedRelocation() { }); // manually reset retry count - clusterState = strategy.reroute(clusterState, new AllocationCommands(), false, true).clusterState(); + clusterState = strategy.reroute(clusterState, new AllocationCommands(), false, true, false, ActionListener.noop()).clusterState(); // shard could be relocated again withRoutingAllocation(clusterState, allocation -> { diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/NodeVersionAllocationDeciderTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/NodeVersionAllocationDeciderTests.java index edf6ca103598e..cf1d56dfc8721 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/NodeVersionAllocationDeciderTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/NodeVersionAllocationDeciderTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; 
import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; @@ -112,7 +113,7 @@ public void testDoNotAllocateFromPrimary() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); for (int i = 0; i < clusterState.routingTable().index("test").size(); i++) { assertThat(clusterState.routingTable().index("test").shard(i).size(), equalTo(3)); @@ -143,7 +144,7 @@ public void testDoNotAllocateFromPrimary() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3", VersionUtils.getPreviousVersion()))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); for (int i = 0; i < clusterState.routingTable().index("test").size(); i++) { assertThat(clusterState.routingTable().index("test").shard(i).size(), equalTo(3)); @@ -153,7 +154,7 @@ public void testDoNotAllocateFromPrimary() { } clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node4"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); for (int i = 0; i < clusterState.routingTable().index("test").size(); i++) { assertThat(clusterState.routingTable().index("test").shard(i).size(), equalTo(3)); @@ -418,7 +419,7 @@ public void testRebalanceDoesNotAllocatePrimaryAndReplicasOnDifferentVersionNode EmptyClusterInfoService.INSTANCE, EmptySnapshotsInfoService.INSTANCE ); - state = strategy.reroute(state, new AllocationCommands(), true, false).clusterState(); + state = strategy.reroute(state, new 
AllocationCommands(), true, false, false, ActionListener.noop()).clusterState(); // the two indices must stay as is, the replicas cannot move to oldNode2 because versions don't match assertThat(state.routingTable().index(shard2.getIndex()).shardsWithState(ShardRoutingState.RELOCATING).size(), equalTo(0)); assertThat(state.routingTable().index(shard1.getIndex()).shardsWithState(ShardRoutingState.RELOCATING).size(), equalTo(0)); @@ -489,7 +490,7 @@ public void testRestoreDoesNotAllocateSnapshotOnOlderNodes() { EmptyClusterInfoService.INSTANCE, () -> new SnapshotShardSizeInfo(snapshotShardSizes) ); - state = strategy.reroute(state, new AllocationCommands(), true, false).clusterState(); + state = strategy.reroute(state, new AllocationCommands(), true, false, false, ActionListener.noop()).clusterState(); // Make sure that primary shards are only allocated on the new node for (int i = 0; i < numberOfShards; i++) { diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/PreferLocalPrimariesToRelocatingPrimariesTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/PreferLocalPrimariesToRelocatingPrimariesTests.java index cab6f57bee9fb..54b4dab4642ce 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/PreferLocalPrimariesToRelocatingPrimariesTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/PreferLocalPrimariesToRelocatingPrimariesTests.java @@ -8,6 +8,7 @@ package org.elasticsearch.cluster.routing.allocation; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -66,7 +67,7 @@ public void testPreferLocalPrimaryAllocationOverFiltered() { .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, 
"reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); while (shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).isEmpty() == false) { clusterState = startInitializingShardsAndReroute(strategy, clusterState); @@ -109,7 +110,7 @@ public void testPreferLocalPrimaryAllocationOverFiltered() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node1", singletonMap("tag1", "value1")))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); while (shardsWithState(clusterState.getRoutingNodes(), STARTED).size() < totalNumberOfShards) { int localInitializations = 0; diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/PreferPrimaryAllocationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/PreferPrimaryAllocationTests.java index 1c53033f8dfe2..b640cba42058c 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/PreferPrimaryAllocationTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/PreferPrimaryAllocationTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -57,7 +58,7 @@ public void testPreferPrimaryAllocationOverReplicas() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); while (shardsWithState(clusterState.getRoutingNodes(), 
INITIALIZING).isEmpty() == false) { clusterState = startInitializingShardsAndReroute(strategy, clusterState); @@ -69,7 +70,7 @@ public void testPreferPrimaryAllocationOverReplicas() { metadata = Metadata.builder(clusterState.metadata()).updateNumberOfReplicas(1, indices).build(); clusterState = ClusterState.builder(clusterState).routingTable(updatedRoutingTable).metadata(metadata).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("2 replicas should be initializing now for the existing indices (we throttle to 1)"); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(2)); @@ -84,7 +85,7 @@ public void testPreferPrimaryAllocationOverReplicas() { clusterState = ClusterState.builder(clusterState).metadata(metadata).routingTable(updatedRoutingTable).build(); logger.info("reroute, verify that primaries for the new index primary shards are allocated"); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(clusterState.routingTable().index("new_index").shardsWithState(INITIALIZING).size(), equalTo(2)); } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/PrimaryElectionRoutingTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/PrimaryElectionRoutingTests.java index d85c7694a55a8..2dbcf8f62cab5 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/PrimaryElectionRoutingTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/PrimaryElectionRoutingTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import 
org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -49,10 +50,10 @@ public void testBackupElectionToPrimaryWhenPrimaryCanBeAllocatedToAnotherNode() logger.info("Adding two nodes and performing rerouting"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node2"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("Start the primary shard (on node1)"); RoutingNodes routingNodes = clusterState.getRoutingNodes(); @@ -103,7 +104,7 @@ public void testRemovingInitializingReplicasIfPrimariesFails() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = allocation.reroute(clusterState, "reroute"); + clusterState = allocation.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("Start the primary shards"); clusterState = startInitializingShardsAndReroute(allocation, clusterState); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/PrimaryNotRelocatedWhileBeingRecoveredTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/PrimaryNotRelocatedWhileBeingRecoveredTests.java index 7861f3d6217db..6a3fdc331afb0 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/PrimaryNotRelocatedWhileBeingRecoveredTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/PrimaryNotRelocatedWhileBeingRecoveredTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import 
org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -51,7 +52,7 @@ public void testPrimaryNotRelocatedWhileBeingRecoveredFrom() { logger.info("Adding two nodes and performing rerouting"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("Start the primary shard (on node1)"); RoutingNodes routingNodes = clusterState.getRoutingNodes(); @@ -61,14 +62,14 @@ public void testPrimaryNotRelocatedWhileBeingRecoveredFrom() { logger.info("start another node, replica will start recovering form primary"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node2"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(5)); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(5)); logger.info("start another node, make sure the primary is not relocated"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(5)); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(5)); diff --git 
a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RandomAllocationDeciderTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RandomAllocationDeciderTests.java index 76b1aba8a39cf..d8c2beb40688e 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RandomAllocationDeciderTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RandomAllocationDeciderTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.routing.allocation; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.EmptyClusterInfoService; @@ -137,7 +138,7 @@ public void testRandomDecisions() { if (nodesRemoved) { clusterState = strategy.disassociateDeadNodes(clusterState, true, "reroute"); } else { - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); } if (shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size() > 0) { clusterState = startInitializingShardsAndReroute(strategy, clusterState); @@ -160,7 +161,7 @@ public void testRandomDecisions() { int iterations = 0; do { iterations++; - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); if (shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size() > 0) { clusterState = startInitializingShardsAndReroute(strategy, clusterState); } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RebalanceAfterActiveTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RebalanceAfterActiveTests.java index a77f489a71383..9d47e84bafc55 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RebalanceAfterActiveTests.java +++ 
b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RebalanceAfterActiveTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterInfo; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; @@ -83,7 +84,7 @@ public Long getShardSize(ShardRouting shardRouting) { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); for (int i = 0; i < clusterState.routingTable().index("test").size(); i++) { assertThat(clusterState.routingTable().index("test").shard(i).size(), equalTo(2)); @@ -115,7 +116,7 @@ public Long getShardSize(ShardRouting shardRouting) { .add(newNode("node10")) ) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); for (int i = 0; i < clusterState.routingTable().index("test").size(); i++) { assertThat(clusterState.routingTable().index("test").shard(i).size(), equalTo(2)); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ReplicaAllocatedAfterPrimaryTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ReplicaAllocatedAfterPrimaryTests.java index 40731e511d081..2934c2a1c7903 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ReplicaAllocatedAfterPrimaryTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ReplicaAllocatedAfterPrimaryTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import 
org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -60,7 +61,7 @@ public void testBackupIsAllocatedAfterPrimary() { .build(); RoutingTable prevRoutingTable = routingTable; - routingTable = strategy.reroute(clusterState, "reroute").routingTable(); + routingTable = strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); final String nodeHoldingPrimary = routingTable.index("test").shard(0).primaryShard().currentNodeId(); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ResizeAllocationDeciderTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ResizeAllocationDeciderTests.java index d4749a2e19a3c..52057b0383d55 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ResizeAllocationDeciderTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ResizeAllocationDeciderTests.java @@ -8,6 +8,7 @@ package org.elasticsearch.cluster.routing.allocation; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; @@ -15,11 +16,13 @@ import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.cluster.metadata.Metadata; import org.elasticsearch.cluster.node.DiscoveryNodes; +import org.elasticsearch.cluster.routing.IndexRoutingTable; import org.elasticsearch.cluster.routing.RecoverySource; import org.elasticsearch.cluster.routing.RoutingTable; import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.cluster.routing.ShardRoutingState; import org.elasticsearch.cluster.routing.TestShardRouting; +import 
org.elasticsearch.cluster.routing.UnassignedInfo; import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator; import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders; import org.elasticsearch.cluster.routing.allocation.decider.Decision; @@ -31,10 +34,14 @@ import org.elasticsearch.test.gateway.TestGatewayAllocator; import java.util.Collections; +import java.util.List; +import java.util.Optional; +import java.util.Set; import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING; import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED; import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED; +import static org.hamcrest.Matchers.equalTo; public class ResizeAllocationDeciderTests extends ESAllocationTestCase { @@ -74,7 +81,7 @@ private ClusterState createInitialClusterState(boolean startShards) { .nodes(DiscoveryNodes.builder().add(newNode("node1", Version.CURRENT)).add(newNode("node2", Version.CURRENT))) .build(); RoutingTable prevRoutingTable = routingTable; - routingTable = strategy.reroute(clusterState, "reroute").routingTable(); + routingTable = strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); assertEquals(prevRoutingTable.index("source").size(), 2); @@ -291,4 +298,57 @@ public void testSourcePrimaryActive() { ); } } + + public void testGetForcedInitialShardAllocationToNodes() { + var source = IndexMetadata.builder("source") + .settings( + Settings.builder() + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0) + .put(IndexMetadata.SETTING_INDEX_UUID, "uuid-1") + .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT) + ) + .build(); + var target = IndexMetadata.builder("target") + .settings( + Settings.builder() + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + 
.put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0) + .put(IndexMetadata.INDEX_RESIZE_SOURCE_NAME.getKey(), "source") + .put(IndexMetadata.INDEX_RESIZE_SOURCE_UUID.getKey(), "uuid-1") + .put(IndexMetadata.SETTING_INDEX_UUID, "uuid-2") + .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT) + ) + .build(); + var clusterState = ClusterState.builder(new ClusterName("test-cluster")) + .nodes(DiscoveryNodes.builder().add(newNode("node-1")).add(newNode("node-2"))) + .metadata(Metadata.builder().put(source, false).put(target, false)) + .routingTable( + RoutingTable.builder() + .add( + IndexRoutingTable.builder(source.getIndex()) + .addShard(TestShardRouting.newShardRouting(new ShardId(source.getIndex(), 0), "node-1", true, STARTED, null)) + ) + ) + .build(); + + var decider = new ResizeAllocationDecider(); + var allocation = new RoutingAllocation(new AllocationDeciders(List.of(decider)), clusterState, null, null, 0); + + var localRecoveryShard = ShardRouting.newUnassigned( + new ShardId(target.getIndex(), 0), + true, + RecoverySource.LocalShardsRecoverySource.INSTANCE, + new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, "index created") + ); + assertThat(decider.getForcedInitialShardAllocationToNodes(localRecoveryShard, allocation), equalTo(Optional.of(Set.of("node-1")))); + + var newShard = ShardRouting.newUnassigned( + new ShardId(target.getIndex(), 0), + true, + RecoverySource.EmptyStoreRecoverySource.INSTANCE, + new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, "index created") + ); + assertThat(decider.getForcedInitialShardAllocationToNodes(newShard, allocation), equalTo(Optional.empty())); + } } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ResizeSourceIndexSettingsUpdaterTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ResizeSourceIndexSettingsUpdaterTests.java index 8eb83c36736b9..84c581dad3cff 100644 --- 
a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ResizeSourceIndexSettingsUpdaterTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ResizeSourceIndexSettingsUpdaterTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.routing.allocation; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; @@ -77,7 +78,7 @@ public void testResizeIndexSettingsRemovedAfterStart() { .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 16) .build() ); - clusterState = allocationService.reroute(clusterState, "reroute"); + clusterState = allocationService.reroute(clusterState, "reroute", ActionListener.noop()); { IndexRoutingTable sourceRoutingTable = clusterState.routingTable().index(sourceIndex); @@ -134,7 +135,7 @@ public void testResizeIndexSettingsRemovedAfterStart() { } } - clusterState = allocationService.reroute(clusterState, "reroute"); + clusterState = allocationService.reroute(clusterState, "reroute", ActionListener.noop()); { IndexMetadata targetIndexMetadata = clusterState.metadata().index(targetIndex); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java index d63fecfec64dd..41b41dda25f09 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RetryFailedAllocationTests.java @@ -10,6 +10,7 @@ import org.elasticsearch.ElasticsearchException; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterName; import 
org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; @@ -61,7 +62,7 @@ private ShardRouting getReplica() { public void testRetryFailedResetForAllocationCommands() { final int retries = MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY.get(Settings.EMPTY); - clusterState = strategy.reroute(clusterState, "initial allocation"); + clusterState = strategy.reroute(clusterState, "initial allocation", ActionListener.noop()); clusterState = startShardsAndReroute(strategy, clusterState, getPrimary()); // Exhaust all replica allocation attempts with shard failures @@ -70,10 +71,10 @@ public void testRetryFailedResetForAllocationCommands() { new FailedShard(getReplica(), "failing-shard::attempt-" + i, new ElasticsearchException("simulated"), randomBoolean()) ); clusterState = strategy.applyFailedShards(clusterState, failedShards, List.of()); - clusterState = strategy.reroute(clusterState, "allocation retry attempt-" + i); + clusterState = strategy.reroute(clusterState, "allocation retry attempt-" + i, ActionListener.noop()); } assertThat("replica should not be assigned", getReplica().state(), equalTo(ShardRoutingState.UNASSIGNED)); - assertThat("reroute should be a no-op", strategy.reroute(clusterState, "test"), sameInstance(clusterState)); + assertThat("reroute should be a no-op", strategy.reroute(clusterState, "test", ActionListener.noop()), sameInstance(clusterState)); // Now allocate replica with retry_failed flag set AllocationService.CommandsResult result = strategy.reroute( @@ -82,7 +83,9 @@ public void testRetryFailedResetForAllocationCommands() { new AllocateReplicaAllocationCommand(INDEX_NAME, 0, getPrimary().currentNodeId().equals("node1") ? 
"node2" : "node1") ), false, - true + true, + false, + ActionListener.noop() ); clusterState = result.clusterState(); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RoutingNodesIntegrityTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RoutingNodesIntegrityTests.java index 2bccc6a94c3bd..b6100e1844a3f 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RoutingNodesIntegrityTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/RoutingNodesIntegrityTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; @@ -85,7 +86,7 @@ public void testBalanceAllNodesStarted() { assertThat(routingNodes.hasInactivePrimaries(), equalTo(false)); assertThat(routingNodes.hasUnassignedPrimaries(), equalTo(true)); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); routingNodes = clusterState.getRoutingNodes(); assertThat(assertShardStats(routingNodes), equalTo(true)); @@ -95,12 +96,12 @@ public void testBalanceAllNodesStarted() { logger.info("Another round of rebalancing"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); clusterState = startInitializingShardsAndReroute(strategy, clusterState); logger.info("Reroute, nothing should change"); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); 
assertThat(newState, equalTo(clusterState)); logger.info("Start the more shards"); @@ -143,11 +144,11 @@ public void testBalanceIncrementallyStartNodes() { logger.info("Adding node-1 and performing reroute"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("Add node-2 and perform reroute, nothing will happen since primary not started"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node2"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("Start the all shards"); clusterState = startInitializingShardsAndReroute(strategy, clusterState); // primaries @@ -155,7 +156,7 @@ public void testBalanceIncrementallyStartNodes() { logger.info("Add node-3 and perform reroute, relocate shards to new node"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("Await all shards reallocate"); clusterState = applyStartedShardsUntilNoChange(clusterState, strategy); @@ -210,7 +211,7 @@ public void testBalanceAllNodesStartedAddIndex() { assertThat(routingNodes.hasInactivePrimaries(), equalTo(false)); assertThat(routingNodes.hasUnassignedPrimaries(), equalTo(true)); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); routingNodes = clusterState.getRoutingNodes(); assertThat(assertShardStats(routingNodes), equalTo(true)); @@ -220,7 +221,7 @@ public void 
testBalanceAllNodesStartedAddIndex() { logger.info("Another round of rebalancing"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())).build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); routingNodes = clusterState.getRoutingNodes(); @@ -240,7 +241,7 @@ public void testBalanceAllNodesStartedAddIndex() { assertThat(routingNodes.node("node3").numberOfShardsWithState(STARTED), equalTo(1)); logger.info("Reroute, nothing should change"); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); logger.info("Start the more shards"); @@ -282,11 +283,11 @@ public void testBalanceAllNodesStartedAddIndex() { assertThat(clusterState.routingTable().index("test1").size(), equalTo(3)); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("Reroute, assign"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())).build(); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); routingNodes = clusterState.getRoutingNodes(); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SameShardRoutingTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SameShardRoutingTests.java index 4b815555f95b7..e078934adce61 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SameShardRoutingTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SameShardRoutingTests.java @@ -9,6 +9,7 @@ package 
org.elasticsearch.cluster.routing.allocation; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.support.replication.ClusterStateCreationUtils; import org.elasticsearch.cluster.ClusterInfo; import org.elasticsearch.cluster.ClusterState; @@ -97,7 +98,7 @@ public void testSameHost() { ) ) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(numberOfShardsOfType(clusterState.getRoutingNodes(), ShardRoutingState.INITIALIZING), equalTo(2)); @@ -126,7 +127,7 @@ public void testSameHost() { ) ) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(numberOfShardsOfType(clusterState.getRoutingNodes(), ShardRoutingState.STARTED), equalTo(2)); assertThat(numberOfShardsOfType(clusterState.getRoutingNodes(), ShardRoutingState.INITIALIZING), equalTo(2)); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ShardVersioningTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ShardVersioningTests.java index a14afd4b79ba6..77578a7a16dbc 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ShardVersioningTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ShardVersioningTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -53,7 +54,7 @@ public void testSimple() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); 
- routingTable = strategy.reroute(clusterState, "reroute").routingTable(); + routingTable = strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); for (int i = 0; i < routingTable.index("test1").size(); i++) { diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ShardsLimitAllocationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ShardsLimitAllocationTests.java index 568d2247ba29a..ae5b6c38a1362 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ShardsLimitAllocationTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ShardsLimitAllocationTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -57,7 +58,7 @@ public void testIndexLevelShardsLimitAllocate() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(clusterState.getRoutingNodes().node("node1").numberOfShardsWithState(ShardRoutingState.INITIALIZING), equalTo(2)); assertThat(clusterState.getRoutingNodes().node("node2").numberOfShardsWithState(ShardRoutingState.INITIALIZING), equalTo(2)); @@ -104,7 +105,7 @@ public void testClusterLevelShardsLimitAllocate() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + 
clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(clusterState.getRoutingNodes().node("node1").numberOfShardsWithState(ShardRoutingState.INITIALIZING), equalTo(1)); assertThat(clusterState.getRoutingNodes().node("node2").numberOfShardsWithState(ShardRoutingState.INITIALIZING), equalTo(1)); @@ -125,7 +126,7 @@ public void testClusterLevelShardsLimitAllocate() { ); logger.info("Do another reroute, make sure shards are now allocated"); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(clusterState.getRoutingNodes().node("node1").numberOfShardsWithState(ShardRoutingState.INITIALIZING), equalTo(1)); assertThat(clusterState.getRoutingNodes().node("node2").numberOfShardsWithState(ShardRoutingState.INITIALIZING), equalTo(1)); @@ -168,7 +169,7 @@ public void testIndexLevelShardsLimitRemain() { ).metadata(metadata).routingTable(initialRoutingTable).build(); logger.info("Adding one node and reroute"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("Start the primary shards"); clusterState = startInitializingShardsAndReroute(strategy, clusterState); @@ -191,7 +192,7 @@ public void testIndexLevelShardsLimitRemain() { logger.info("Add another one node and reroute"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node2"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); clusterState = startInitializingShardsAndReroute(strategy, clusterState); @@ -219,7 +220,7 @@ public void testIndexLevelShardsLimitRemain() { clusterState = 
ClusterState.builder(clusterState).metadata(metadata).build(); logger.info("reroute after setting"); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(clusterState.getRoutingNodes().node("node1").numberOfShardsWithState(STARTED), equalTo(3)); assertThat(clusterState.getRoutingNodes().node("node1").numberOfShardsWithState(RELOCATING), equalTo(2)); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SingleShardNoReplicasRoutingTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SingleShardNoReplicasRoutingTests.java index ff3d1df0337d3..d646ef7750c60 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SingleShardNoReplicasRoutingTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SingleShardNoReplicasRoutingTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -67,7 +68,7 @@ public void testSingleIndexStartedShard() { logger.info("Adding one node and performing rerouting"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(clusterState.routingTable().index("test").size(), equalTo(1)); assertThat(clusterState.routingTable().index("test").shard(0).size(), equalTo(1)); @@ -76,7 +77,7 @@ public void testSingleIndexStartedShard() { logger.info("Rerouting again, nothing should change"); clusterState = ClusterState.builder(clusterState).build(); - 
ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); clusterState = newState; @@ -93,7 +94,7 @@ public void testSingleIndexStartedShard() { logger.info("Starting another node and making sure nothing changed"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node2"))).build(); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); clusterState = newState; @@ -117,7 +118,7 @@ public void testSingleIndexStartedShard() { logger.info("Bring node1 back, and see it's assinged"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node1"))).build(); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -128,7 +129,7 @@ public void testSingleIndexStartedShard() { logger.info("Start another node, make sure that things remain the same (shard is in node2 and initializing)"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); logger.info("Start the shard on node 1"); @@ -167,7 +168,7 @@ public void testSingleIndexShardFailed() { logger.info("Adding one node and rerouting"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState 
newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -235,7 +236,7 @@ public void testMultiIndexEvenDistribution() { nodesBuilder.add(newNode("node" + i)); } clusterState = ClusterState.builder(clusterState).nodes(nodesBuilder).build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -271,7 +272,7 @@ public void testMultiIndexEvenDistribution() { nodesBuilder.add(newNode("node" + i)); } clusterState = ClusterState.builder(clusterState).nodes(nodesBuilder).build(); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); logger.info("Marking the shard as started"); @@ -339,7 +340,7 @@ public void testMultiIndexUnevenNodes() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")).add(newNode("node3"))) .build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -358,7 +359,7 @@ public void testMultiIndexUnevenNodes() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node4")).add(newNode("node5"))) .build(); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); clusterState = newState; diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SingleShardOneReplicaRoutingTests.java 
b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SingleShardOneReplicaRoutingTests.java index 1a820034b13ff..75a99105c79bb 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SingleShardOneReplicaRoutingTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SingleShardOneReplicaRoutingTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -57,7 +58,7 @@ public void testSingleIndexFirstStartPrimaryThenBackups() { logger.info("Adding one node and performing rerouting"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -71,7 +72,7 @@ public void testSingleIndexFirstStartPrimaryThenBackups() { logger.info("Add another node and perform rerouting, nothing will happen since primary shards not started"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node2"))).build(); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); logger.info("Start the primary shard (on node1)"); @@ -91,7 +92,7 @@ public void testSingleIndexFirstStartPrimaryThenBackups() { assertThat(clusterState.routingTable().index("test").shard(0).replicaShards().get(0).currentNodeId(), equalTo("node2")); logger.info("Reroute, nothing should change"); - 
newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); logger.info("Start the backup shard"); @@ -128,7 +129,7 @@ public void testSingleIndexFirstStartPrimaryThenBackups() { logger.info("Start another node, backup shard should start initializing"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/TenShardsOneReplicaRoutingTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/TenShardsOneReplicaRoutingTests.java index 60b43bdf3065b..f03683dd36755 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/TenShardsOneReplicaRoutingTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/TenShardsOneReplicaRoutingTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -71,7 +72,7 @@ public void testSingleIndexFirstStartPrimaryThenBackups() { logger.info("Adding one node and performing rerouting"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, not(equalTo(clusterState))); 
clusterState = newState; @@ -88,7 +89,7 @@ public void testSingleIndexFirstStartPrimaryThenBackups() { logger.info("Add another node and perform rerouting, nothing will happen since primary not started"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node2"))).build(); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); logger.info("Start the primary shard (on node1)"); @@ -111,7 +112,7 @@ public void testSingleIndexFirstStartPrimaryThenBackups() { } logger.info("Reroute, nothing should change"); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); logger.info("Start the backup shard"); @@ -136,7 +137,7 @@ public void testSingleIndexFirstStartPrimaryThenBackups() { logger.info("Add another node and perform rerouting"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; routingNodes = clusterState.getRoutingNodes(); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ThrottlingAllocationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ThrottlingAllocationTests.java index 4077c878d3c4e..f7bb70b037f20 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ThrottlingAllocationTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/ThrottlingAllocationTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import 
org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.RestoreInProgress; @@ -81,7 +82,7 @@ public void testPrimaryRecoveryThrottling() { logger.info("start one node, do reroute, only 3 should initialize"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(0)); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(3)); @@ -138,7 +139,7 @@ public void testReplicaAndPrimaryRecoveryThrottling() { ClusterState clusterState = createRecoveryStateAndInitializeAllocations(metadata, gatewayAllocator, snapshotsInfoService); logger.info("with one node, do reroute, only 3 should initialize"); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(0)); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(3)); @@ -160,7 +161,7 @@ public void testReplicaAndPrimaryRecoveryThrottling() { logger.info("start another node, replicas should start being allocated"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node2"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(5)); 
assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(3)); @@ -199,7 +200,7 @@ public void testThrottleIncomingAndOutgoing() { ClusterState clusterState = createRecoveryStateAndInitializeAllocations(metadata, gatewayAllocator, snapshotsInfoService); logger.info("with one node, do reroute, only 5 should initialize"); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(0)); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(5)); assertThat(shardsWithState(clusterState.getRoutingNodes(), UNASSIGNED).size(), equalTo(4)); @@ -218,7 +219,7 @@ public void testThrottleIncomingAndOutgoing() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node2")).add(newNode("node3"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(4)); assertThat(shardsWithState(clusterState.getRoutingNodes(), RELOCATING).size(), equalTo(5)); @@ -260,7 +261,7 @@ public void testOutgoingThrottlesAllocation() { ClusterState clusterState = createRecoveryStateAndInitializeAllocations(metadata, gatewayAllocator, snapshotsInfoService); logger.info("with one node, do reroute, only 1 should initialize"); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(0)); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(1)); @@ -275,7 +276,7 @@ public void testOutgoingThrottlesAllocation() { logger.info("start one 
more node, first non-primary should start being allocated"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node2"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(1)); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(1)); @@ -291,7 +292,7 @@ public void testOutgoingThrottlesAllocation() { logger.info("start one more node, initializing second non-primary"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(2)); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(1)); @@ -300,7 +301,7 @@ public void testOutgoingThrottlesAllocation() { logger.info("start one more node"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node4"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertEquals(clusterState.getRoutingNodes().getOutgoingRecoveries("node1"), 1); @@ -309,7 +310,9 @@ public void testOutgoingThrottlesAllocation() { clusterState, new AllocationCommands(new MoveAllocationCommand("test", 0, "node2", "node4")), true, - false + false, + false, + ActionListener.noop() ); assertEquals(commandsResult.explanations().explanations().size(), 1); assertEquals(commandsResult.explanations().explanations().get(0).decisions().type(), Decision.Type.THROTTLE); diff --git 
a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/TrackFailedAllocationNodesTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/TrackFailedAllocationNodesTests.java index e5706b38d820e..90dd67e3965d2 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/TrackFailedAllocationNodesTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/TrackFailedAllocationNodesTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.routing.allocation; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; @@ -46,7 +47,7 @@ public void testTrackFailedNodes() { .metadata(metadata) .routingTable(RoutingTable.builder().addAsNew(metadata.index("idx")).build()) .build(); - clusterState = allocationService.reroute(clusterState, "reroute"); + clusterState = allocationService.reroute(clusterState, "reroute", ActionListener.noop()); Set failedNodeIds = new HashSet<>(); // track the failed nodes if shard is not started @@ -65,7 +66,8 @@ public void testTrackFailedNodes() { // reroute with retryFailed=true should discard the failedNodes assertThat(clusterState.routingTable().index("idx").shard(0).shard(0).state(), equalTo(ShardRoutingState.UNASSIGNED)); - clusterState = allocationService.reroute(clusterState, new AllocationCommands(), false, true).clusterState(); + clusterState = allocationService.reroute(clusterState, new AllocationCommands(), false, true, false, ActionListener.noop()) + .clusterState(); assertThat(clusterState.routingTable().index("idx").shard(0).shard(0).unassignedInfo().getFailedNodeIds(), empty()); // do not track the failed nodes while shard is started diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/UpdateNumberOfReplicasTests.java 
b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/UpdateNumberOfReplicasTests.java index aef8d23eadb3b..fceeb228afc6e 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/UpdateNumberOfReplicasTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/UpdateNumberOfReplicasTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -60,7 +61,7 @@ public void testUpdateNumberOfReplicas() { .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logger.info("Start all the primary shards"); clusterState = startInitializingShardsAndReroute(strategy, clusterState); @@ -101,7 +102,7 @@ public void testUpdateNumberOfReplicas() { logger.info("Add another node and start the added replica"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, not(equalTo(clusterState))); clusterState = newState; @@ -159,7 +160,7 @@ public void testUpdateNumberOfReplicas() { ); logger.info("do a reroute, should remain the same"); - newState = strategy.reroute(clusterState, "reroute"); + newState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(newState, equalTo(clusterState)); } } diff --git 
a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/AllocationActionListenerTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/AllocationActionListenerTests.java new file mode 100644 index 0000000000000..1dc1a630d0b6b --- /dev/null +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/AllocationActionListenerTests.java @@ -0,0 +1,159 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.elasticsearch.action.ActionListener; +import org.elasticsearch.action.support.master.AcknowledgedResponse; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.concurrent.ThreadContext; +import org.elasticsearch.test.ESTestCase; + +import java.util.List; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; + +import static org.hamcrest.Matchers.containsInAnyOrder; +import static org.hamcrest.Matchers.equalTo; + +public class AllocationActionListenerTests extends ESTestCase { + + public void testShouldDelegateWhenBothComplete() { + var completed = new AtomicBoolean(false); + var listener = new AllocationActionListener( + ActionListener.wrap(ignore -> completed.set(true), exception -> { throw new AssertionError("Should not fail in test"); }), + createEmptyThreadContext() + ); + + listener.clusterStateUpdate().onResponse(AcknowledgedResponse.TRUE); + listener.reroute().onResponse(null); + + assertThat(completed.get(), equalTo(true)); + } + + public void testShouldNotDelegateWhenOnlyOneComplete() { + var completed = new 
AtomicBoolean(false); + var listener = new AllocationActionListener( + ActionListener.wrap(ignore -> completed.set(true), exception -> { throw new AssertionError("Should not fail in test"); }), + createEmptyThreadContext() + ); + + if (randomBoolean()) { + listener.clusterStateUpdate().onResponse(AcknowledgedResponse.TRUE); + } else { + listener.reroute().onResponse(null); + } + + assertThat(completed.get(), equalTo(false)); + } + + public void testShouldDelegateFailureImmediately() { + var completed = new AtomicBoolean(false); + var listener = new AllocationActionListener( + ActionListener.wrap(ignore -> { throw new AssertionError("Should not complete in test"); }, exception -> completed.set(true)), + createEmptyThreadContext() + ); + + if (randomBoolean()) { + listener.clusterStateUpdate().onFailure(new RuntimeException()); + } else { + listener.reroute().onFailure(new RuntimeException()); + } + + assertThat(completed.get(), equalTo(true)); + } + + public void testShouldExecuteWithCorrectContext() { + + var context = new ThreadContext(Settings.EMPTY); + + // should not be changed after the listener is created + context.putHeader("header", "root");// ensure this is visible in the listener + context.addResponseHeader("header", "1"); + + var header = new AtomicReference(); + var responseHeaders = new AtomicReference>(); + var listener = new AllocationActionListener<>(ActionListener.wrap(ignore -> { + header.set(context.getHeader("header")); + responseHeaders.set(context.getResponseHeaders().get("header")); + }, exception -> { throw new AssertionError("Should not fail in test"); }), context); + + // this header should be ignored as it is added after context is captured + context.addResponseHeader("header", "2"); + + for (var action : shuffledList(List.of(() -> { + // headers for clusterStateUpdate listener are captured + context.addResponseHeader("header", "3"); + var csl = listener.clusterStateUpdate(); + context.addResponseHeader("header", "4"); + 
csl.onResponse(AcknowledgedResponse.TRUE); + }, () -> { + // reroute is executed for multiple changes so its headers should be ignored + context.addResponseHeader("header", "5"); + var reroute = listener.reroute(); + context.addResponseHeader("header", "6"); + reroute.onResponse(null); + }))) { + try (var ignored = context.stashContext()) { + action.run(); + } + } + + assertThat(header.get(), equalTo("root")); + assertThat(responseHeaders.get(), containsInAnyOrder("1", "3", "4")); + } + + public void testShouldFailWithCorrectContext() { + + var context = new ThreadContext(Settings.EMPTY); + + // should not be changed after the listener is created + context.putHeader("header", "root");// ensure this is visible in the listener + context.addResponseHeader("header", "1"); + + var header = new AtomicReference(); + var responseHeaders = new AtomicReference>(); + var listener = new AllocationActionListener<>( + ActionListener.wrap(ignore -> { throw new AssertionError("Should not fail in test"); }, exception -> { + header.set(context.getHeader("header")); + responseHeaders.set(context.getResponseHeaders().get("header")); + + }), + context + ); + + // this header should be ignored as it is added after context is captured + context.addResponseHeader("header", "2"); + + if (randomBoolean()) { + try (var ignored = context.stashContext()) { + context.addResponseHeader("header", "3"); + var csl = listener.clusterStateUpdate(); + context.addResponseHeader("header", "4"); + csl.onFailure(new RuntimeException("cluster-state-update-failed")); + } + + assertThat(header.get(), equalTo("root")); + assertThat(responseHeaders.get(), containsInAnyOrder("1", "3", "4")); + } else { + try (var ignored = context.stashContext()) { + context.addResponseHeader("header", "5"); + var reroute = listener.reroute(); + context.addResponseHeader("header", "6"); + reroute.onFailure(new RuntimeException("reroute-failed")); + } + + assertThat(header.get(), equalTo("root")); + 
assertThat(responseHeaders.get(), containsInAnyOrder("1")); + } + } + + private static ThreadContext createEmptyThreadContext() { + return new ThreadContext(Settings.EMPTY); + } +} diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/AllocationActionMultiListenerTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/AllocationActionMultiListenerTests.java new file mode 100644 index 0000000000000..f28e79c424f02 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/AllocationActionMultiListenerTests.java @@ -0,0 +1,168 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.elasticsearch.action.ActionListener; +import org.elasticsearch.action.support.master.AcknowledgedResponse; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.concurrent.ThreadContext; +import org.elasticsearch.core.Tuple; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.threadpool.TestThreadPool; +import org.elasticsearch.threadpool.ThreadPool; + +import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; + +import static org.hamcrest.Matchers.equalTo; + +public class AllocationActionMultiListenerTests extends ESTestCase { + + public void testShouldDelegateWhenBothComplete() { + var listener = new 
AllocationActionMultiListener(createEmptyThreadContext()); + + var l1 = new AtomicInteger(); + var l2 = new AtomicInteger(); + listener.delay(ActionListener.wrap(l1::set, exception -> { throw new AssertionError("Should not fail in test"); })).onResponse(1); + listener.delay(ActionListener.wrap(l2::set, exception -> { throw new AssertionError("Should not fail in test"); })).onResponse(2); + if (randomBoolean()) { + listener.reroute().onResponse(null); + } else { + listener.noRerouteNeeded(); + } + + assertThat(l1.get(), equalTo(1)); + assertThat(l2.get(), equalTo(2)); + } + + public void testShouldNotDelegateWhenOnlyOneComplete() { + var listener = new AllocationActionMultiListener(createEmptyThreadContext()); + + var completed = new AtomicBoolean(false); + var delegate = listener.delay( + ActionListener.wrap(ignore -> completed.set(true), exception -> { throw new AssertionError("Should not fail in test"); }) + ); + + switch (randomInt(2)) { + case 0 -> delegate.onResponse(AcknowledgedResponse.TRUE); + case 1 -> listener.reroute().onResponse(null); + case 2 -> listener.noRerouteNeeded(); + } + + assertThat(completed.get(), equalTo(false)); + } + + public void testShouldDelegateFailureImmediately() { + var listener = new AllocationActionMultiListener(createEmptyThreadContext()); + + var completed = new AtomicBoolean(false); + listener.delay( + ActionListener.wrap(ignore -> { throw new AssertionError("Should not complete in test"); }, exception -> completed.set(true)) + ).onFailure(new RuntimeException()); + + assertThat(completed.get(), equalTo(true)); + } + + public void testConcurrency() throws InterruptedException { + + var listener = new AllocationActionMultiListener(createEmptyThreadContext()); + + var count = randomIntBetween(1, 100); + var completed = new CountDownLatch(count); + + var start = new CountDownLatch(3); + var threadPool = new TestThreadPool(getTestName()); + + threadPool.executor(ThreadPool.Names.CLUSTER_COORDINATION).submit(() -> { + 
start.countDown(); + awaitQuietly(start); + for (int i = 0; i < count; i++) { + listener.delay( + ActionListener.wrap( + ignore -> completed.countDown(), + exception -> { throw new AssertionError("Should not fail in test"); } + ) + ).onResponse(AcknowledgedResponse.TRUE); + } + }); + + threadPool.executor(ThreadPool.Names.GENERIC).submit(() -> { + start.countDown(); + awaitQuietly(start); + if (randomBoolean()) { + listener.reroute().onResponse(null); + } else { + listener.noRerouteNeeded(); + } + }); + start.countDown(); + + assertTrue("Expected to call all delayed listeners within timeout", completed.await(10, TimeUnit.SECONDS)); + terminate(threadPool); + } + + private static void awaitQuietly(CountDownLatch latch) { + try { + assertTrue("Latch did not complete within timeout", latch.await(5, TimeUnit.SECONDS)); + } catch (InterruptedException e) { + throw new AssertionError("Interrupted while waiting for test to start", e); + } + } + + public void testShouldExecuteWithCorrectContext() { + + var context = new ThreadContext(Settings.EMPTY); + var listener = new AllocationActionMultiListener(context); + + context.putHeader("header", "root"); + var r1 = new AtomicReference(); + var r2 = new AtomicReference(); + var l1 = listener.delay( + ActionListener.wrap( + response -> r1.set(context.getHeader("header")), + exception -> { throw new AssertionError("Should not fail in test"); } + ) + ); + var l2 = listener.delay( + ActionListener.wrap( + response -> r2.set(context.getHeader("header")), + exception -> { throw new AssertionError("Should not fail in test"); } + ) + ); + + executeInRandomOrder( + context, + List.of( + new Tuple<>("clusterStateUpdate1", () -> l1.onResponse(1)), + new Tuple<>("clusterStateUpdate2", () -> l2.onResponse(2)), + new Tuple<>("reroute", () -> listener.reroute().onResponse(null)) + ) + ); + + assertThat(r1.get(), equalTo("root")); + assertThat(r2.get(), equalTo("root")); + } + + private static void executeInRandomOrder(ThreadContext context, 
List> actions) { + for (var action : shuffledList(actions)) { + try (var ignored = context.stashContext()) { + context.putHeader("header", action.v1()); + action.v2().run(); + } + } + } + + private static ThreadContext createEmptyThreadContext() { + return new ThreadContext(Settings.EMPTY); + } +} diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java index 0c714595db88b..e17389831abce 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.routing.allocation.allocator; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.support.replication.ClusterStateCreationUtils; import org.elasticsearch.cluster.ClusterInfo; import org.elasticsearch.cluster.ClusterName; @@ -125,7 +126,7 @@ public void testRebalanceImprovesTheBalanceOfTheShards() { .build() ); - var reroutedState = allocationService.reroute(clusterState, "test"); + var reroutedState = allocationService.reroute(clusterState, "test", ActionListener.noop()); for (ShardRouting relocatingShard : RoutingNodesHelper.shardsWithState(reroutedState.getRoutingNodes(), RELOCATING)) { assertThat( diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/ClusterInfoSimulatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/ClusterInfoSimulatorTests.java new file mode 100644 index 0000000000000..5a7839ed42df5 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/ClusterInfoSimulatorTests.java @@ -0,0 +1,379 @@ +/* + * Copyright Elasticsearch B.V. 
and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.elasticsearch.Version; +import org.elasticsearch.cluster.ClusterInfo; +import org.elasticsearch.cluster.ClusterInfoSimulator; +import org.elasticsearch.cluster.ClusterName; +import org.elasticsearch.cluster.ClusterState; +import org.elasticsearch.cluster.DiskUsage; +import org.elasticsearch.cluster.metadata.IndexMetadata; +import org.elasticsearch.cluster.metadata.Metadata; +import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.cluster.node.DiscoveryNodeRole; +import org.elasticsearch.cluster.node.DiscoveryNodes; +import org.elasticsearch.cluster.routing.IndexRoutingTable; +import org.elasticsearch.cluster.routing.RoutingTable; +import org.elasticsearch.cluster.routing.ShardRouting; +import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; +import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders; +import org.elasticsearch.cluster.routing.allocation.decider.Decision; +import org.elasticsearch.cluster.routing.allocation.decider.DiskThresholdDecider; +import org.elasticsearch.common.UUIDs; +import org.elasticsearch.common.settings.ClusterSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.snapshots.SnapshotShardSizeInfo; +import org.elasticsearch.test.ESTestCase; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING; +import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED; +import static 
org.elasticsearch.cluster.routing.TestShardRouting.newShardRouting;
+import static org.hamcrest.Matchers.equalTo;
+
+public class ClusterInfoSimulatorTests extends ESTestCase {
+    // Each test builds a ClusterInfo, simulates one shard via ClusterInfoSimulator and compares against the expected ClusterInfo.
+    public void testInitializeNewPrimary() {
+        // The new primary is registered with size 0, so the simulated ClusterInfo is expected to equal the input.
+        var newPrimary = newShardRouting("index-1", 0, "node-0", true, INITIALIZING);
+
+        var simulator = new ClusterInfoSimulator(
+            new ClusterInfoTestBuilder() //
+                .withNode("node-0", new DiskUsageBuilder(1000, 1000))
+                .withNode("node-1", new DiskUsageBuilder(1000, 1000))
+                .withShard(newPrimary, 0)
+                .build()
+        );
+        simulator.simulate(newPrimary);
+
+        assertThat(
+            simulator.getClusterInfo(),
+            equalTo(
+                new ClusterInfoTestBuilder() //
+                    .withNode("node-0", new DiskUsageBuilder(1000, 1000))
+                    .withNode("node-1", new DiskUsageBuilder(1000, 1000))
+                    .withShard(newPrimary, 0)
+                    .build()
+            )
+        );
+    }
+
+    public void testInitializeNewReplica() {
+        // Expected: the replica takes the primary's size (100), so node-1 loses 100 free bytes and the replica size becomes 100.
+        var existingPrimary = newShardRouting("index-1", 0, "node-0", true, STARTED);
+        var newReplica = newShardRouting("index-1", 0, "node-1", false, INITIALIZING);
+
+        var simulator = new ClusterInfoSimulator(
+            new ClusterInfoTestBuilder() //
+                .withNode("node-0", new DiskUsageBuilder(1000, 900))
+                .withNode("node-1", new DiskUsageBuilder(1000, 1000))
+                .withShard(existingPrimary, 100)
+                .withShard(newReplica, 0)
+                .build()
+        );
+        simulator.simulate(newReplica);
+
+        assertThat(
+            simulator.getClusterInfo(),
+            equalTo(
+                new ClusterInfoTestBuilder() //
+                    .withNode("node-0", new DiskUsageBuilder(1000, 900))
+                    .withNode("node-1", new DiskUsageBuilder(1000, 900))
+                    .withShard(existingPrimary, 100)
+                    .withShard(newReplica, 100)
+                    .build()
+            )
+        );
+    }
+
+    public void testRelocateShard() {
+        // Expected: the shard's 100 bytes are freed on the source node and consumed on the target node.
+        var fromNodeId = "node-0";
+        var toNodeId = "node-1";
+
+        var shard = newShardRouting("index-1", 0, toNodeId, fromNodeId, true, INITIALIZING);
+
+        var simulator = new ClusterInfoSimulator(
+            new ClusterInfoTestBuilder() //
+                .withNode(fromNodeId, new DiskUsageBuilder(1000, 900))
+                .withNode(toNodeId, new DiskUsageBuilder(1000, 1000))
+                .withShard(shard, 100)
+                .build()
+        );
+        simulator.simulate(shard);
+
+        assertThat(
+            simulator.getClusterInfo(),
+            equalTo(
+                new ClusterInfoTestBuilder() //
+                    .withNode(fromNodeId, new DiskUsageBuilder(1000, 1000))
+                    .withNode(toNodeId, new DiskUsageBuilder(1000, 900))
+                    .withShard(shard, 100)
+                    .build()
+            )
+        );
+    }
+
+    public void testRelocateShardWithMultipleDataPath1() {
+        // Expected: only the most-available path (/data-2) changes: +100 free on the source node, -100 free on the target node.
+        var fromNodeId = "node-0";
+        var toNodeId = "node-1";
+
+        var shard = newShardRouting("index-1", 0, toNodeId, fromNodeId, true, INITIALIZING);
+
+        var simulator = new ClusterInfoSimulator(
+            new ClusterInfoTestBuilder() //
+                .withNode(fromNodeId, new DiskUsageBuilder("/data-1", 1000, 500), new DiskUsageBuilder("/data-2", 1000, 750))
+                .withNode(toNodeId, new DiskUsageBuilder("/data-1", 1000, 750), new DiskUsageBuilder("/data-2", 1000, 900))
+                .withShard(shard, 100)
+                .build()
+        );
+        simulator.simulate(shard);
+
+        assertThat(
+            simulator.getClusterInfo(),
+            equalTo(
+                new ClusterInfoTestBuilder() //
+                    .withNode(fromNodeId, new DiskUsageBuilder("/data-1", 1000, 500), new DiskUsageBuilder("/data-2", 1000, 850))
+                    .withNode(toNodeId, new DiskUsageBuilder("/data-1", 1000, 750), new DiskUsageBuilder("/data-2", 1000, 800))
+                    .withShard(shard, 100)
+                    .build()
+            )
+        );
+    }
+
+    public void testDiskUsageSimulationWithSingleDataPathAndDiskThresholdDecider() {
+        // Simulates index-2 (size 400) moving from node-1 to node-0, then feeds the simulated ClusterInfo to DiskThresholdDecider.
+        var discoveryNodesBuilder = DiscoveryNodes.builder()
+            .add(createDiscoveryNode("node-0", DiscoveryNodeRole.roles()))
+            .add(createDiscoveryNode("node-1", DiscoveryNodeRole.roles()))
+            .add(createDiscoveryNode("node-2", DiscoveryNodeRole.roles()));
+
+        var metadataBuilder = Metadata.builder();
+        var routingTableBuilder = RoutingTable.builder();
+
+        var shard1 = newShardRouting("index-1", 0, "node-0", null, true, STARTED);
+        addIndex(metadataBuilder, routingTableBuilder, shard1);
+
+        var shard2 = newShardRouting("index-2", 0, "node-0", "node-1", true, INITIALIZING);
+        addIndex(metadataBuilder, routingTableBuilder, shard2);
+
+        var shard3 = newShardRouting("index-3", 0, "node-1", null, true, STARTED);
+        addIndex(metadataBuilder, routingTableBuilder, shard3);
+
+        var clusterState = ClusterState.builder(ClusterName.DEFAULT)
+            .nodes(discoveryNodesBuilder)
+            .metadata(metadataBuilder)
+            .routingTable(routingTableBuilder)
+            .build();
+
+        var simulator = new ClusterInfoSimulator(
+            new ClusterInfoTestBuilder() //
+                .withNode("node-0", new DiskUsageBuilder("/data-1", 1000, 500))
+                .withNode("node-1", new DiskUsageBuilder("/data-1", 1000, 300))
+                .withShard(shard1, 500)
+                .withShard(shard2, 400)
+                .withShard(shard3, 300)
+                .build()
+        );
+
+        simulator.simulate(shard2);
+
+        assertThat(
+            simulator.getClusterInfo(),
+            equalTo(
+                new ClusterInfoTestBuilder() //
+                    .withNode("node-0", new DiskUsageBuilder("/data-1", 1000, 100))
+                    .withNode("node-1", new DiskUsageBuilder("/data-1", 1000, 700))
+                    .withShard(shard1, 500)
+                    .withShard(shard2, 400)
+                    .withShard(shard3, 300)
+                    .build()
+            )
+        );
+
+        var decider = new DiskThresholdDecider(
+            Settings.EMPTY,
+            new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS)
+        );
+        var allocation = new RoutingAllocation(
+            new AllocationDeciders(List.of(decider)),
+            clusterState,
+            simulator.getClusterInfo(),
+            SnapshotShardSizeInfo.EMPTY,
+            0L
+        );
+        var routingNodes = allocation.routingNodes();
+
+        assertThat(
+            "Should keep index-1 on node-0",
+            decider.canRemain(clusterState.metadata().index("index-1"), shard1, routingNodes.node("node-0"), allocation).type(),
+            equalTo(Decision.Type.YES)
+        );
+        assertThat(
+            "Should keep index-2 on node-0",
+            decider.canRemain(clusterState.metadata().index("index-2"), shard2, routingNodes.node("node-0"), allocation).type(),
+            equalTo(Decision.Type.YES)
+        );
+        assertThat(
+            "Should not allocate index-3 on node-0 (not enough space)",
+            decider.canAllocate(shard3, routingNodes.node("node-0"), allocation).type(),
+            equalTo(Decision.Type.NO)
+        );
+    }
+
+    public void testDiskUsageSimulationWithMultipleDataPathAndDiskThresholdDecider() {
+        // Same scenario with two data paths per node; only the most-available path (/data-2) is expected to change.
+        var discoveryNodesBuilder = DiscoveryNodes.builder()
+            .add(createDiscoveryNode("node-0", DiscoveryNodeRole.roles()))
+            .add(createDiscoveryNode("node-1", DiscoveryNodeRole.roles()))
+            .add(createDiscoveryNode("node-2", DiscoveryNodeRole.roles()));
+
+        var metadataBuilder = Metadata.builder();
+        var routingTableBuilder = RoutingTable.builder();
+
+        var shard1 = newShardRouting("index-1", 0, "node-0", null, true, STARTED);
+        addIndex(metadataBuilder, routingTableBuilder, shard1);
+
+        var shard2 = newShardRouting("index-2", 0, "node-0", "node-1", true, INITIALIZING);
+        addIndex(metadataBuilder, routingTableBuilder, shard2);
+
+        var shard3 = newShardRouting("index-3", 0, "node-1", null, true, STARTED);
+        addIndex(metadataBuilder, routingTableBuilder, shard3);
+
+        var clusterState = ClusterState.builder(ClusterName.DEFAULT)
+            .nodes(discoveryNodesBuilder)
+            .metadata(metadataBuilder)
+            .routingTable(routingTableBuilder)
+            .build();
+
+        var simulator = new ClusterInfoSimulator(
+            new ClusterInfoTestBuilder() //
+                .withNode("node-0", new DiskUsageBuilder("/data-1", 1000, 100), new DiskUsageBuilder("/data-2", 1000, 500))
+                .withNode("node-1", new DiskUsageBuilder("/data-1", 1000, 100), new DiskUsageBuilder("/data-2", 1000, 300))
+                .withShard(shard1, 500)
+                .withShard(shard2, 400)
+                .withShard(shard3, 300)
+                .build()
+        );
+
+        simulator.simulate(shard2);
+
+        assertThat(
+            simulator.getClusterInfo(),
+            equalTo(
+                new ClusterInfoTestBuilder() //
+                    .withNode("node-0", new DiskUsageBuilder("/data-1", 1000, 100), new DiskUsageBuilder("/data-2", 1000, 100))
+                    .withNode("node-1", new DiskUsageBuilder("/data-1", 1000, 100), new DiskUsageBuilder("/data-2", 1000, 700))
+                    .withShard(shard1, 500)
+                    .withShard(shard2, 400)
+                    .withShard(shard3, 300)
+                    .build()
+            )
+        );
+
+        var decider = new DiskThresholdDecider(
+            Settings.EMPTY,
+            new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS)
+        );
+        var allocation = new RoutingAllocation(
+            new AllocationDeciders(List.of(decider)),
+            clusterState,
+            simulator.getClusterInfo(),
+            SnapshotShardSizeInfo.EMPTY,
+            0L
+        );
+        var routingNodes = allocation.routingNodes();
+
+        assertThat(
+            "Should keep index-1 on node-0",
+            decider.canRemain(clusterState.metadata().index("index-1"), shard1, routingNodes.node("node-0"), allocation).type(),
+            equalTo(Decision.Type.YES)
+        );
+
+        assertThat(
+            "Should keep index-2 on node-0",
+            decider.canRemain(clusterState.metadata().index("index-2"), shard2, routingNodes.node("node-0"), allocation).type(),
+            equalTo(Decision.Type.YES)
+        );
+
+        assertThat(
+            "Should not allocate index-3 on node-0 (not enough space)",
+            decider.canAllocate(shard3, routingNodes.node("node-0"), allocation).type(),
+            equalTo(Decision.Type.NO)
+        );
+    }
+
+    private static DiscoveryNode createDiscoveryNode(String id, Set<DiscoveryNodeRole> roles) {
+        return new DiscoveryNode(
+            id, // used as the first two String arguments of the DiscoveryNode constructor
+            id,
+            UUIDs.randomBase64UUID(random()),
+            buildNewFakeTransportAddress(),
+            Map.of(),
+            roles,
+            Version.CURRENT
+        );
+    }
+
+    private static void addIndex(Metadata.Builder metadataBuilder, RoutingTable.Builder routingTableBuilder, ShardRouting shardRouting) {
+        var name = shardRouting.getIndexName();
+        var settings = Settings.builder() // one shard, no replicas: the index consists of exactly the given shard routing
+            .put("index.number_of_shards", 1)
+            .put("index.number_of_replicas", 0)
+            .put("index.version.created", Version.CURRENT)
+            .build();
+        metadataBuilder.put(IndexMetadata.builder(name).settings(settings));
+        routingTableBuilder.add(IndexRoutingTable.builder(metadataBuilder.get(name).getIndex()).addShard(shardRouting));
+    }
+
+    private static class ClusterInfoTestBuilder {
+        // Collects per-node disk usages (keyed by node name) and per-shard sizes, then builds a ClusterInfo from them.
+        private final Map<String, DiskUsage> leastAvailableSpaceUsage = new HashMap<>();
+        private final Map<String, DiskUsage> mostAvailableSpaceUsage = new HashMap<>();
+        private final Map<String, Long> shardSizes = new HashMap<>();
+
+        public ClusterInfoTestBuilder withNode(String name, DiskUsageBuilder diskUsageBuilder) {
+            leastAvailableSpaceUsage.put(name, diskUsageBuilder.toDiskUsage(name)); // single path: least == most available
+            mostAvailableSpaceUsage.put(name, diskUsageBuilder.toDiskUsage(name));
+            return this;
+        }
+
+        public ClusterInfoTestBuilder withNode(String name, DiskUsageBuilder leastAvailableSpace, DiskUsageBuilder mostAvailableSpace) {
+            leastAvailableSpaceUsage.put(name, leastAvailableSpace.toDiskUsage(name));
+            mostAvailableSpaceUsage.put(name, mostAvailableSpace.toDiskUsage(name));
+            return this;
+        }
+
+        public ClusterInfoTestBuilder withShard(ShardRouting shard, long size) {
+            shardSizes.put(ClusterInfo.shardIdentifierFromRouting(shard), size);
+            return this;
+        }
+
+        public ClusterInfo build() {
+            return new ClusterInfo(leastAvailableSpaceUsage, mostAvailableSpaceUsage, shardSizes, Map.of(), Map.of(), Map.of());
+        }
+    }
+
+    private record DiskUsageBuilder(String path, long total, long free) {
+        // Total/free bytes of one data path; the two-argument constructor defaults the path to "/data".
+        private DiskUsageBuilder(long total, long free) {
+            this("/data", total, free);
+        }
+
+        public DiskUsage toDiskUsage(String name) {
+            return new DiskUsage(name, name, name + path, total, free); // the path is prefixed with the node name
+        }
+    }
+}
diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/ContinuousComputationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/ContinuousComputationTests.java
new file mode 100644
index 0000000000000..e28248b01d56b
--- /dev/null
+++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/ContinuousComputationTests.java
@@ -0,0 +1,147 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.cluster.routing.allocation.allocator;
+
+import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.threadpool.TestThreadPool;
+import org.elasticsearch.threadpool.ThreadPool;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
+import java.util.Arrays;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.CyclicBarrier;
+import java.util.concurrent.Semaphore;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
+
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.sameInstance;
+import static org.junit.Assert.assertEquals;
+
+public class ContinuousComputationTests extends ESTestCase {
+    // Checks that ContinuousComputation never runs processInput concurrently and skips inputs superseded before they start.
+    private static ThreadPool threadPool;
+
+    @BeforeClass
+    public static void createThreadPool() {
+        threadPool = new TestThreadPool("test");
+    }
+
+    @AfterClass
+    public static void terminateThreadPool() {
+        try {
+            assertTrue(ThreadPool.terminate(threadPool, 10, TimeUnit.SECONDS));
+        } finally {
+            threadPool = null;
+        }
+    }
+
+    public void testConcurrency() throws Exception {
+        // Threads submit inputs concurrently: processInput must never overlap and the last result must be some thread's final input.
+        final var result = new AtomicReference<Integer>();
+        final var computation = new ContinuousComputation<Integer>(threadPool.generic()) {
+
+            public final Semaphore executePermit = new Semaphore(1); // one permit: tryAcquire fails if two calls overlap
+
+            @Override
+            protected void processInput(Integer input) {
+                assertTrue(executePermit.tryAcquire(1));
+                result.set(input);
+                executePermit.release();
+            }
+        };
+
+        final Thread[] threads = new Thread[between(1, 5)];
+        final int[] valuePerThread = new int[threads.length];
+        final CountDownLatch startLatch = new CountDownLatch(1);
+        for (int i = 0; i < threads.length; i++) {
+            final int threadIndex = i;
+            valuePerThread[threadIndex] = randomInt();
+            threads[threadIndex] = new Thread(() -> {
+                try {
+                    assertTrue(startLatch.await(10, TimeUnit.SECONDS));
+                } catch (Exception e) {
+                    throw new AssertionError(e);
+                }
+                for (int j = 1000; j >= 0; j--) {
+                    computation.onNewInput(valuePerThread[threadIndex] = valuePerThread[threadIndex] + j);
+                }
+            }, "submit-thread-" + threadIndex);
+            threads[threadIndex].start();
+        }
+
+        startLatch.countDown();
+
+        for (Thread thread : threads) {
+            thread.join();
+        }
+
+        assertBusy(() -> assertFalse(computation.isActive()));
+
+        assertTrue(Arrays.toString(valuePerThread) + " vs " + result.get(), Arrays.stream(valuePerThread).anyMatch(i -> i == result.get()));
+    }
+
+    public void testSkipsObsoleteValues() throws Exception {
+        final var barrier = new CyclicBarrier(2); // two parties: the test thread and the thread running processInput
+        final Runnable await = () -> {
+            try {
+                barrier.await(10, TimeUnit.SECONDS);
+            } catch (Exception e) {
+                throw new AssertionError(e);
+            }
+        };
+
+        final var initialInput = new Object();
+        final var becomesStaleInput = new Object();
+        final var skippedInput = new Object();
+        final var finalInput = new Object();
+
+        final var result = new AtomicReference<Object>();
+        final var computation = new ContinuousComputation<Object>(threadPool.generic()) {
+            @Override
+            protected void processInput(Object input) {
+                assertNotEquals(input, skippedInput);
+                await.run();
+                result.set(input);
+                await.run();
+                // becomesStaleInput should have become stale by now, but other inputs should remain fresh
+                assertEquals(isFresh(input), input != becomesStaleInput);
+                await.run();
+            }
+        };
+
+        computation.onNewInput(initialInput);
+        await.run();
+        assertTrue(computation.isActive());
+        await.run();
+        assertThat(result.get(), sameInstance(initialInput));
+        await.run();
+        assertBusy(() -> assertFalse(computation.isActive()));
+
+        computation.onNewInput(becomesStaleInput); // triggers a computation
+        await.run();
+        assertTrue(computation.isActive());
+
+        computation.onNewInput(skippedInput); // superseded by finalInput before becomesStaleInput finishes processing, so skipped
+        computation.onNewInput(finalInput); // triggers a computation once becomesStaleInput has been processed
+
+        await.run();
+        await.run();
+        assertThat(result.get(), equalTo(becomesStaleInput));
+        assertTrue(computation.isActive());
+
+        await.run();
+        assertTrue(computation.isActive());
+        await.run();
+        assertThat(result.get(), equalTo(finalInput));
+        await.run();
+        assertBusy(() -> assertFalse(computation.isActive()));
+    }
+}
diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputerTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputerTests.java
new file mode 100644
index 0000000000000..365d4c9732fd0
--- /dev/null
+++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputerTests.java
@@ -0,0 +1,876 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.elasticsearch.Version; +import org.elasticsearch.cluster.ClusterInfo; +import org.elasticsearch.cluster.ClusterModule; +import org.elasticsearch.cluster.ClusterName; +import org.elasticsearch.cluster.ClusterState; +import org.elasticsearch.cluster.DiskUsage; +import org.elasticsearch.cluster.metadata.IndexMetadata; +import org.elasticsearch.cluster.metadata.Metadata; +import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.cluster.node.DiscoveryNodeRole; +import org.elasticsearch.cluster.node.DiscoveryNodes; +import org.elasticsearch.cluster.routing.AllocationId; +import org.elasticsearch.cluster.routing.IndexRoutingTable; +import org.elasticsearch.cluster.routing.RecoverySource; +import org.elasticsearch.cluster.routing.RoutingChangesObserver; +import org.elasticsearch.cluster.routing.RoutingTable; +import org.elasticsearch.cluster.routing.ShardRouting; +import org.elasticsearch.cluster.routing.ShardRoutingState; +import org.elasticsearch.cluster.routing.UnassignedInfo; +import org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings; +import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; +import org.elasticsearch.cluster.routing.allocation.ShardAllocationDecision; +import org.elasticsearch.cluster.routing.allocation.command.MoveAllocationCommand; +import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders; +import org.elasticsearch.cluster.routing.allocation.decider.ThrottlingAllocationDecider; +import org.elasticsearch.common.Randomness; +import org.elasticsearch.common.UUIDs; +import org.elasticsearch.common.settings.ClusterSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.snapshots.SnapshotShardSizeInfo; +import org.elasticsearch.test.ESTestCase; + +import java.util.ArrayList; +import java.util.HashMap; +import 
java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Queue; +import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_INDEX_VERSION_CREATED; +import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS; +import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS; +import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING; +import static org.elasticsearch.cluster.routing.ShardRoutingState.RELOCATING; +import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED; +import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED; +import static org.elasticsearch.cluster.routing.TestShardRouting.newShardRouting; +import static org.hamcrest.Matchers.aMapWithSize; +import static org.hamcrest.Matchers.allOf; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.hasEntry; + +public class DesiredBalanceComputerTests extends ESTestCase { + + static final String TEST_INDEX = "test-index"; + + public void testComputeBalance() { + var desiredBalanceComputer = createDesiredBalanceComputer(); + var clusterState = createInitialClusterState(3); + var index = clusterState.metadata().index(TEST_INDEX).getIndex(); + + var desiredBalance = desiredBalanceComputer.compute(DesiredBalance.INITIAL, createInput(clusterState), queue(), input -> true); + + assertDesiredAssignments( + desiredBalance, + Map.of( + new ShardId(index, 0), + new ShardAssignment(Set.of("node-0", "node-1"), 2, 0, 0), + new ShardId(index, 1), + new ShardAssignment(Set.of("node-0", "node-1"), 2, 0, 0) + ) + ); + } + + public void testStopsComputingWhenStale() { + var desiredBalanceComputer = createDesiredBalanceComputer(); + var clusterState = createInitialClusterState(3); + var index = 
clusterState.metadata().index(TEST_INDEX).getIndex(); + + // if the isFresh flag is false then we only do one iteration, allocating the primaries but not the replicas + var desiredBalance0 = DesiredBalance.INITIAL; + var desiredBalance1 = desiredBalanceComputer.compute(desiredBalance0, createInput(clusterState), queue(), input -> false); + assertDesiredAssignments( + desiredBalance1, + Map.of( + new ShardId(index, 0), + new ShardAssignment(Set.of("node-0"), 2, 1, 1), + new ShardId(index, 1), + new ShardAssignment(Set.of("node-0"), 2, 1, 1) + ) + ); + + // the next iteration allocates the replicas whether stale or fresh + var desiredBalance2 = desiredBalanceComputer.compute(desiredBalance1, createInput(clusterState), queue(), input -> randomBoolean()); + assertDesiredAssignments( + desiredBalance2, + Map.of( + new ShardId(index, 0), + new ShardAssignment(Set.of("node-0", "node-1"), 2, 0, 0), + new ShardId(index, 1), + new ShardAssignment(Set.of("node-0", "node-1"), 2, 0, 0) + ) + ); + } + + public void testIgnoresOutOfScopePrimaries() { + var desiredBalanceComputer = createDesiredBalanceComputer(); + var clusterState = createInitialClusterState(3); + var index = clusterState.metadata().index(TEST_INDEX).getIndex(); + var primaryShard = clusterState.routingTable().index(TEST_INDEX).shard(0).primaryShard(); + + var desiredBalance = desiredBalanceComputer.compute( + DesiredBalance.INITIAL, + createInput(clusterState, primaryShard), + queue(), + input -> true + ); + + assertDesiredAssignments( + desiredBalance, + Map.of( + new ShardId(index, 0), + new ShardAssignment(Set.of(), 2, 2, 2), + new ShardId(index, 1), + new ShardAssignment(Set.of("node-0", "node-1"), 2, 0, 0) + ) + ); + } + + public void testIgnoresOutOfScopeReplicas() { + var desiredBalanceComputer = createDesiredBalanceComputer(); + var clusterState = createInitialClusterState(3); + + var index = clusterState.metadata().index(TEST_INDEX).getIndex(); + var replicaShard = 
clusterState.routingTable().index(TEST_INDEX).shard(0).replicaShards().get(0); + + var desiredBalance = desiredBalanceComputer.compute( + DesiredBalance.INITIAL, + createInput(clusterState, replicaShard), + queue(), + input -> true + ); + + assertDesiredAssignments( + desiredBalance, + Map.of( + new ShardId(index, 0), + new ShardAssignment(Set.of("node-0"), 2, 1, 1), + new ShardId(index, 1), + new ShardAssignment(Set.of("node-0", "node-1"), 2, 0, 0) + ) + ); + } + + public void testAssignShardsToTheirPreviousLocationIfAvailable() { + var desiredBalanceComputer = createDesiredBalanceComputer(); + var clusterState = createInitialClusterState(3); + var index = clusterState.metadata().index(TEST_INDEX).getIndex(); + + var changes = new RoutingChangesObserver.DelegatingRoutingChangesObserver(); + var routingNodes = clusterState.mutableRoutingNodes(); + for (final var iterator = routingNodes.unassigned().iterator(); iterator.hasNext();) { + final var shardRouting = iterator.next(); + if (shardRouting.shardId().id() == 0 && shardRouting.primary()) { + iterator.updateUnassigned( + new UnassignedInfo( + UnassignedInfo.Reason.NODE_LEFT, + null, + null, + 0, + 0, + 0, + false, + UnassignedInfo.AllocationStatus.NO_ATTEMPT, + Set.of(), + "node-2" + ), + RecoverySource.EmptyStoreRecoverySource.INSTANCE, + changes + ); + } + } + clusterState = ClusterState.builder(clusterState) + .routingTable(RoutingTable.of(clusterState.routingTable().version(), routingNodes)) + .build(); + + var ignored = randomBoolean() + ? 
new ShardRouting[0] + : new ShardRouting[] { clusterState.routingTable().index(TEST_INDEX).shard(0).primaryShard() }; + + var desiredBalance = desiredBalanceComputer.compute( + DesiredBalance.INITIAL, + createInput(clusterState, ignored), + queue(), + input -> true + ); + + assertDesiredAssignments( + desiredBalance, + Map.of( + new ShardId(index, 0), + new ShardAssignment(Set.of("node-2", "node-1"), 2, 0, 0), + new ShardId(index, 1), + new ShardAssignment(Set.of("node-0", "node-1"), 2, 0, 0) + ) + ); + } + + public void testRespectsAssignmentOfUnknownPrimaries() { + var desiredBalanceComputer = createDesiredBalanceComputer(); + var clusterState = createInitialClusterState(3); + var index = clusterState.metadata().index(TEST_INDEX).getIndex(); + + var changes = new RoutingChangesObserver.DelegatingRoutingChangesObserver(); + var routingNodes = clusterState.mutableRoutingNodes(); + for (final var iterator = routingNodes.unassigned().iterator(); iterator.hasNext();) { + final var shardRouting = iterator.next(); + if (shardRouting.shardId().id() == 0 && shardRouting.primary()) { + switch (between(1, 3)) { + case 1 -> iterator.initialize("node-2", null, 0L, changes); + case 2 -> routingNodes.startShard(logger, iterator.initialize("node-2", null, 0L, changes), changes, 0L); + case 3 -> routingNodes.relocateShard( + routingNodes.startShard(logger, iterator.initialize("node-1", null, 0L, changes), changes, 0L), + "node-2", + 0L, + changes + ); + } + break; + } + } + clusterState = ClusterState.builder(clusterState) + .routingTable(RoutingTable.of(clusterState.routingTable().version(), routingNodes)) + .build(); + + var desiredBalance = desiredBalanceComputer.compute(DesiredBalance.INITIAL, createInput(clusterState), queue(), input -> true); + + assertDesiredAssignments( + desiredBalance, + Map.of( + new ShardId(index, 0), + new ShardAssignment(Set.of("node-2", "node-1"), 2, 0, 0), + new ShardId(index, 1), + new ShardAssignment(Set.of("node-0", "node-1"), 2, 0, 0) + ) + ); 
+ } + + public void testRespectsAssignmentOfUnknownReplicas() { + var desiredBalanceComputer = createDesiredBalanceComputer(); + var clusterState = createInitialClusterState(3); + var index = clusterState.metadata().index(TEST_INDEX).getIndex(); + + var changes = new RoutingChangesObserver.DelegatingRoutingChangesObserver(); + var routingNodes = clusterState.mutableRoutingNodes(); + for (var iterator = routingNodes.unassigned().iterator(); iterator.hasNext();) { + var shardRouting = iterator.next(); + if (shardRouting.shardId().id() == 0 && shardRouting.primary()) { + routingNodes.startShard(logger, iterator.initialize("node-2", null, 0L, changes), changes, 0L); + break; + } + } + for (var iterator = routingNodes.unassigned().iterator(); iterator.hasNext();) { + var shardRouting = iterator.next(); + if (shardRouting.shardId().id() == 0) { + assert shardRouting.primary() == false; + switch (between(1, 3)) { + case 1 -> iterator.initialize("node-0", null, 0L, changes); + case 2 -> routingNodes.startShard(logger, iterator.initialize("node-0", null, 0L, changes), changes, 0L); + case 3 -> routingNodes.relocateShard( + routingNodes.startShard(logger, iterator.initialize("node-1", null, 0L, changes), changes, 0L), + "node-0", + 0L, + changes + ); + } + break; + } + } + clusterState = ClusterState.builder(clusterState) + .routingTable(RoutingTable.of(clusterState.routingTable().version(), routingNodes)) + .build(); + + var desiredBalance = desiredBalanceComputer.compute(DesiredBalance.INITIAL, createInput(clusterState), queue(), input -> true); + + assertDesiredAssignments( + desiredBalance, + Map.of( + new ShardId(index, 0), + new ShardAssignment(Set.of("node-2", "node-0"), 2, 0, 0), + new ShardId(index, 1), + new ShardAssignment(Set.of("node-0", "node-1"), 2, 0, 0) + ) + ); + } + + public void testSimulatesAchievingDesiredBalanceBeforeDelegating() { + + var allocateCalled = new AtomicBoolean(); + var desiredBalanceComputer = new DesiredBalanceComputer(new 
ShardsAllocator() { + @Override + public void allocate(RoutingAllocation allocation) { + assertTrue(allocateCalled.compareAndSet(false, true)); + // whatever the allocation in the current cluster state, the desired balance service should start by moving all the + // known shards to their desired locations before delegating to the inner allocator + for (var routingNode : allocation.routingNodes()) { + assertThat( + allocation.routingNodes().toString(), + routingNode.numberOfOwningShards(), + equalTo(routingNode.nodeId().equals("node-2") ? 0 : 2) + ); + for (var shardRouting : routingNode) { + assertTrue(shardRouting.toString(), shardRouting.started()); + } + } + } + + @Override + public ShardAllocationDecision decideShardAllocation(ShardRouting shard, RoutingAllocation allocation) { + throw new AssertionError("only used for allocation explain"); + } + }); + var clusterState = createInitialClusterState(3); + var index = clusterState.metadata().index(TEST_INDEX).getIndex(); + + // first, manually assign the shards to their expected locations to pre-populate the desired balance + var changes = new RoutingChangesObserver.DelegatingRoutingChangesObserver(); + var desiredRoutingNodes = clusterState.mutableRoutingNodes(); + for (var iterator = desiredRoutingNodes.unassigned().iterator(); iterator.hasNext();) { + var shardRouting = iterator.next(); + desiredRoutingNodes.startShard( + logger, + iterator.initialize(shardRouting.primary() ? 
"node-0" : "node-1", null, 0L, changes), + changes, + 0L + ); + } + clusterState = ClusterState.builder(clusterState) + .routingTable(RoutingTable.of(clusterState.routingTable().version(), desiredRoutingNodes)) + .build(); + + var desiredBalance1 = desiredBalanceComputer.compute(DesiredBalance.INITIAL, createInput(clusterState), queue(), input -> true); + assertDesiredAssignments( + desiredBalance1, + Map.of( + new ShardId(index, 0), + new ShardAssignment(Set.of("node-0", "node-1"), 2, 0, 0), + new ShardId(index, 1), + new ShardAssignment(Set.of("node-0", "node-1"), 2, 0, 0) + ) + ); + + // now create a cluster state with the routing table in a random state + var randomRoutingNodes = clusterState.mutableRoutingNodes(); + for (int shard = 0; shard < 2; shard++) { + var primaryRoutingState = randomFrom(ShardRoutingState.values()); + var replicaRoutingState = switch (primaryRoutingState) { + case UNASSIGNED, INITIALIZING -> UNASSIGNED; + case STARTED -> randomFrom(ShardRoutingState.values()); + case RELOCATING -> randomValueOtherThan(RELOCATING, () -> randomFrom(ShardRoutingState.values())); + }; + var nodes = new ArrayList<>(List.of("node-0", "node-1", "node-2")); + Randomness.shuffle(nodes); + + if (primaryRoutingState == UNASSIGNED) { + continue; + } + for (var iterator = randomRoutingNodes.unassigned().iterator(); iterator.hasNext();) { + var shardRouting = iterator.next(); + if (shardRouting.shardId().getId() == shard && shardRouting.primary()) { + switch (primaryRoutingState) { + case INITIALIZING -> iterator.initialize(nodes.remove(0), null, 0L, changes); + case STARTED -> randomRoutingNodes.startShard( + logger, + iterator.initialize(nodes.remove(0), null, 0L, changes), + changes, + 0L + ); + case RELOCATING -> randomRoutingNodes.relocateShard( + randomRoutingNodes.startShard(logger, iterator.initialize(nodes.remove(0), null, 0L, changes), changes, 0L), + nodes.remove(0), + 0L, + changes + ); + } + break; + } + } + + if (replicaRoutingState == UNASSIGNED) { + 
continue; + } + for (var iterator = randomRoutingNodes.unassigned().iterator(); iterator.hasNext();) { + var shardRouting = iterator.next(); + if (shardRouting.shardId().getId() == shard && shardRouting.primary() == false) { + switch (replicaRoutingState) { + case INITIALIZING -> iterator.initialize(nodes.remove(0), null, 0L, changes); + case STARTED -> randomRoutingNodes.startShard( + logger, + iterator.initialize(nodes.remove(0), null, 0L, changes), + changes, + 0L + ); + case RELOCATING -> randomRoutingNodes.relocateShard( + randomRoutingNodes.startShard(logger, iterator.initialize(nodes.remove(0), null, 0L, changes), changes, 0L), + nodes.remove(0), + 0L, + changes + ); + } + break; + } + } + } + clusterState = ClusterState.builder(clusterState) + .routingTable(RoutingTable.of(clusterState.routingTable().version(), randomRoutingNodes)) + .build(); + + allocateCalled.set(false); + + var desiredBalance2 = desiredBalanceComputer.compute(desiredBalance1, createInput(clusterState), queue(), input -> true); + assertDesiredAssignments( + desiredBalance2, + Map.of( + new ShardId(index, 0), + new ShardAssignment(Set.of("node-0", "node-1"), 2, 0, 0), + new ShardId(index, 1), + new ShardAssignment(Set.of("node-0", "node-1"), 2, 0, 0) + ) + ); + assertTrue(allocateCalled.get()); + } + + public void testNoDataNodes() { + var desiredBalanceComputer = createDesiredBalanceComputer(); + var clusterState = createInitialClusterState(0); + + var desiredBalance = desiredBalanceComputer.compute(DesiredBalance.INITIAL, createInput(clusterState), queue(), input -> true); + + assertDesiredAssignments(desiredBalance, Map.of()); + } + + public void testAppliesMoveCommands() { + var desiredBalanceComputer = createDesiredBalanceComputer(); + var clusterState = createInitialClusterState(3); + var index = clusterState.metadata().index(TEST_INDEX).getIndex(); + + var changes = new RoutingChangesObserver.DelegatingRoutingChangesObserver(); + var routingNodes = 
clusterState.mutableRoutingNodes(); + for (var iterator = routingNodes.unassigned().iterator(); iterator.hasNext();) { + var shardRouting = iterator.next(); + routingNodes.startShard( + logger, + iterator.initialize(shardRouting.primary() ? "node-0" : "node-1", null, 0L, changes), + changes, + 0L + ); + } + clusterState = ClusterState.builder(clusterState) + .routingTable(RoutingTable.of(clusterState.routingTable().version(), routingNodes)) + .build(); + + var desiredBalance = desiredBalanceComputer.compute( + DesiredBalance.INITIAL, + createInput(clusterState), + queue( + new MoveAllocationCommand(index.getName(), 0, "node-1", "node-2"), + new MoveAllocationCommand(index.getName(), 1, "node-1", "node-2") + ), + input -> true + ); + + assertDesiredAssignments( + desiredBalance, + Map.of( + new ShardId(index, 0), + new ShardAssignment(Set.of("node-0", "node-2"), 2, 0, 0), + new ShardId(index, 1), + new ShardAssignment(Set.of("node-0", "node-2"), 2, 0, 0) + ) + ); + } + + public void testDesiredBalanceShouldConvergeInABigCluster() { + var nodes = randomIntBetween(3, 7); + var nodeIds = new ArrayList(nodes); + var discoveryNodesBuilder = DiscoveryNodes.builder(); + for (int node = 0; node < nodes; node++) { + var nodeId = "node-" + node; + nodeIds.add(nodeId); + discoveryNodesBuilder.add(createDiscoveryNode(nodeId, DiscoveryNodeRole.roles())); + } + + var indices = scaledRandomIntBetween(1, 1000); + var totalShards = 0; + var metadataBuilder = Metadata.builder(); + var routingTableBuilder = RoutingTable.builder(); + for (int i = 0; i < indices; i++) { + var indexName = "index-" + i; + var shards = randomIntBetween(1, 10); + var replicas = randomIntBetween(1, nodes - 1); + totalShards += shards * (replicas + 1); + var inSyncIds = randomList(shards * (replicas + 1), shards * (replicas + 1), () -> UUIDs.randomBase64UUID(random())); + + var indexMetadataBuilder = IndexMetadata.builder(indexName) + .settings( + Settings.builder() + .put("index.number_of_shards", shards) + 
.put("index.number_of_replicas", replicas) + .put("index.version.created", Version.CURRENT) + .build() + ); + for (int shard = 0; shard < shards; shard++) { + indexMetadataBuilder.putInSyncAllocationIds( + shard, + Set.copyOf(inSyncIds.subList(shard * (replicas + 1), (shard + 1) * (replicas + 1))) + ); + } + metadataBuilder.put(indexMetadataBuilder); + + var indexId = metadataBuilder.get(indexName).getIndex(); + var indexRoutingTableBuilder = IndexRoutingTable.builder(indexId); + + for (int shard = 0; shard < shards; shard++) { + var remainingNodeIds = new ArrayList<>(nodeIds); + remainingNodeIds.add(null);// disconnected node + var shardId = new ShardId(indexId, shard); + var primaryNodeId = pickAndRemoveRandomValueFrom(remainingNodeIds); + indexRoutingTableBuilder.addShard( + newShardRouting( + shardId, + primaryNodeId, + null, + true, + primaryNodeId == null ? UNASSIGNED : STARTED, + AllocationId.newInitializing(inSyncIds.get(shard * (replicas + 1))) + ) + ); + for (int replica = 0; replica < replicas; replica++) { + var replicaNodeId = pickAndRemoveRandomValueFrom(remainingNodeIds); + indexRoutingTableBuilder.addShard( + newShardRouting( + shardId, + replicaNodeId, + null, + false, + replicaNodeId == null ? 
UNASSIGNED : STARTED, + AllocationId.newInitializing(inSyncIds.get(shard * (replicas + 1) + 1 + replica)) + ) + ); + } + } + routingTableBuilder.add(indexRoutingTableBuilder); + } + + var clusterState = ClusterState.builder(ClusterName.DEFAULT) + .nodes(discoveryNodesBuilder) + .metadata(metadataBuilder) + .routingTable(routingTableBuilder) + .build(); + + var iteration = new AtomicInteger(0); + + var input = new DesiredBalanceInput( + randomInt(), + routingAllocationWithDecidersOf(clusterState, ClusterInfo.EMPTY, Settings.EMPTY), + Set.of() + ); + var desiredBalance = new DesiredBalanceComputer(new BalancedShardsAllocator(Settings.EMPTY)).compute( + DesiredBalance.INITIAL, + input, + queue(), + ignored -> iteration.incrementAndGet() < 1000 + ); + + try { + assertThat( + "Balance should converge, but exited by the iteration limit", + desiredBalance.lastConvergedIndex(), + equalTo(input.index()) + ); + logger.info( + "Balance converged after [{}] iterations for [{}] nodes and [{}] total shards", + iteration.get(), + nodes, + totalShards + ); + } catch (AssertionError e) { + logger.error( + "Failed to converge desired balance for [{}] nodes and [{}] total shards:\n {}", + nodes, + totalShards, + clusterState.getRoutingNodes() + ); + throw e; + } + } + + private String pickAndRemoveRandomValueFrom(List values) { + var value = randomFrom(values); + values.remove(value); + return value; + } + + public void testComputeConsideringShardSizes() { + + var discoveryNodesBuilder = DiscoveryNodes.builder() + .add(createDiscoveryNode("node-0", DiscoveryNodeRole.roles())) + .add(createDiscoveryNode("node-1", DiscoveryNodeRole.roles())) + .add(createDiscoveryNode("node-2", DiscoveryNodeRole.roles())); + + var metadataBuilder = Metadata.builder(); + var routingTableBuilder = RoutingTable.builder(); + + ShardRouting index0PrimaryShard; + ShardRouting index0ReplicaShard; + { + var indexName = "index-0"; + + metadataBuilder.put( + IndexMetadata.builder(indexName) + .settings( + 
Settings.builder() + .put("index.number_of_shards", 1) + .put("index.number_of_replicas", 1) + .put("index.version.created", Version.CURRENT) + .put("index.routing.allocation.exclude._name", "node-2") + .build() + ) + ); + + var indexId = metadataBuilder.get(indexName).getIndex(); + var shardId = new ShardId(indexId, 0); + + index0PrimaryShard = newShardRouting(shardId, "node-1", null, true, STARTED); + index0ReplicaShard = switch (randomIntBetween(0, 6)) { + // shard is started on the desired node + case 0 -> newShardRouting(shardId, "node-0", null, false, STARTED); + // shard is initializing on the desired node + case 1 -> newShardRouting(shardId, "node-0", null, false, INITIALIZING); + // shard is initializing on the undesired node + case 2 -> newShardRouting(shardId, "node-2", null, false, INITIALIZING); + // shard started on undesired node, assumed to be relocated to the desired node in the future + case 3 -> newShardRouting(shardId, "node-2", null, false, STARTED); + // shard is already relocating to the desired node + case 4 -> newShardRouting(shardId, "node-2", "node-0", false, RELOCATING); + // shard is relocating to the undesired node + case 5 -> newShardRouting(shardId, "node-0", "node-2", false, RELOCATING); + // shard is unassigned + case 6 -> newShardRouting(shardId, null, null, false, UNASSIGNED); + default -> throw new IllegalStateException(); + }; + + routingTableBuilder.add(IndexRoutingTable.builder(indexId).addShard(index0PrimaryShard).addShard(index0ReplicaShard)); + } + + for (int i = 1; i < 10; i++) { + var indexName = "index-" + i; + + metadataBuilder.put( + IndexMetadata.builder(indexName) + .settings( + Settings.builder() + .put("index.number_of_shards", 1) + .put("index.number_of_replicas", 0) + .put("index.version.created", Version.CURRENT) + .put("index.routing.allocation.exclude._name", "node-2") + .build() + ) + ); + + var indexId = metadataBuilder.get(indexName).getIndex(); + var shardId = new ShardId(indexId, 0); + + 
routingTableBuilder.add( + IndexRoutingTable.builder(indexId).addShard(newShardRouting(shardId, i == 1 ? "node-0" : "node-1", null, true, STARTED)) + ); + } + + var clusterState = ClusterState.builder(ClusterName.DEFAULT) + .nodes(discoveryNodesBuilder) + .metadata(metadataBuilder) + .routingTable(routingTableBuilder) + .build(); + + var node0RemainingBytes = (index0ReplicaShard.started() || index0ReplicaShard.relocating()) + && Objects.equals(index0ReplicaShard.currentNodeId(), "node-0") ? 100 : 600; + var node0Usage = new DiskUsage("node-0", "node-0", "/data", 1000, node0RemainingBytes); + var node1Usage = new DiskUsage("node-1", "node-1", "/data", 1000, 100); + var node2Usage = new DiskUsage("node-2", "node-2", "/data", 1000, 1000); + + var clusterInfo = new ClusterInfo( + Map.of(node0Usage.nodeId(), node0Usage, node1Usage.nodeId(), node1Usage, node2Usage.getNodeId(), node2Usage), + Map.of(node0Usage.nodeId(), node0Usage, node1Usage.nodeId(), node1Usage, node2Usage.getNodeId(), node2Usage), + Map.ofEntries( + // node-0 & node-1 + indexSize(clusterState, "index-0", 500, true), + indexSize(clusterState, "index-0", 500, false), + // node-0 + indexSize(clusterState, "index-1", 400, true), + // node-1 + indexSize(clusterState, "index-2", 50, true), + indexSize(clusterState, "index-3", 50, true), + indexSize(clusterState, "index-4", 50, true), + indexSize(clusterState, "index-5", 50, true), + indexSize(clusterState, "index-6", 50, true), + indexSize(clusterState, "index-7", 50, true), + indexSize(clusterState, "index-8", 50, true), + indexSize(clusterState, "index-9", 50, true) + ), + Map.of(), + Map.of(), + Map.of() + ); + + var settings = Settings.builder() + // force as many iterations as possible to accumulate the diff + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), "1") + // have a small gap to keep allocating the shards + 
.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "97%") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "98%") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "99%") + .build(); + + var initial = new DesiredBalance( + 1, + Map.of( + findShardId(clusterState, "index-0"), + new ShardAssignment(Set.of("node-0", "node-1"), 2, 0, 0), + findShardId(clusterState, "index-1"), + new ShardAssignment(Set.of("node-0"), 1, 0, 0) + ) + ); + + var desiredBalance = new DesiredBalanceComputer(new BalancedShardsAllocator(settings)).compute( + initial, + new DesiredBalanceInput(randomInt(), routingAllocationWithDecidersOf(clusterState, clusterInfo, settings), Set.of()), + queue(), + input -> true + ); + + // accumulate, per node id, the shard sizes of everything the computed desired balance places on that node + var resultDiskUsage = new HashMap<String, Long>(); + for (var assignment : desiredBalance.assignments().entrySet()) { + for (String nodeId : assignment.getValue().nodeIds()) { + var size = Objects.requireNonNull(clusterInfo.getShardSize(assignment.getKey(), true)); + resultDiskUsage.compute(nodeId, (k, v) -> v == null ? 
size : v + size); + } + } + + assertThat(resultDiskUsage, allOf(aMapWithSize(2), hasEntry("node-0", 950L), hasEntry("node-1", 850L))); + } + + private static Map.Entry indexSize(ClusterState clusterState, String name, long size, boolean primary) { + return Map.entry(ClusterInfo.shardIdentifierFromRouting(findShardId(clusterState, name), primary), size); + } + + private static ShardId findShardId(ClusterState clusterState, String name) { + return clusterState.getRoutingTable().index(name).shard(0).shardId(); + } + + static ClusterState createInitialClusterState(int dataNodesCount) { + var discoveryNodes = DiscoveryNodes.builder().add(createDiscoveryNode("master", Set.of(DiscoveryNodeRole.MASTER_ROLE))); + for (int i = 0; i < dataNodesCount; i++) { + discoveryNodes.add(createDiscoveryNode("node-" + i, Set.of(DiscoveryNodeRole.DATA_ROLE))); + } + + var indexMetadata = IndexMetadata.builder(TEST_INDEX) + .settings( + Settings.builder() + .put(SETTING_NUMBER_OF_SHARDS, 2) + .put(SETTING_NUMBER_OF_REPLICAS, 1) + .put(SETTING_INDEX_VERSION_CREATED.getKey(), Version.CURRENT) + ) + .build(); + + return ClusterState.builder(ClusterName.DEFAULT) + .nodes(discoveryNodes.masterNodeId("master").localNodeId("master")) + .metadata(Metadata.builder().put(indexMetadata, true)) + .routingTable(RoutingTable.builder().addAsNew(indexMetadata)) + .build(); + } + + private static DiscoveryNode createDiscoveryNode(String id, Set roles) { + return new DiscoveryNode( + id, + id, + UUIDs.randomBase64UUID(random()), + buildNewFakeTransportAddress(), + Map.of(), + roles, + Version.CURRENT + ); + } + + /** + * @return a {@link DesiredBalanceComputer} which allocates unassigned primaries to node-0 and unassigned replicas to node-1 + */ + private static DesiredBalanceComputer createDesiredBalanceComputer() { + return new DesiredBalanceComputer(new ShardsAllocator() { + @Override + public void allocate(RoutingAllocation allocation) { + final var unassignedIterator = 
allocation.routingNodes().unassigned().iterator(); + while (unassignedIterator.hasNext()) { + final var shardRouting = unassignedIterator.next(); + if (shardRouting.primary()) { + unassignedIterator.initialize("node-0", null, 0L, allocation.changes()); + } else if (isCorrespondingPrimaryStarted(shardRouting, allocation)) { + unassignedIterator.initialize("node-1", null, 0L, allocation.changes()); + } else { + unassignedIterator.removeAndIgnore(UnassignedInfo.AllocationStatus.NO_ATTEMPT, allocation.changes()); + } + } + } + + private static boolean isCorrespondingPrimaryStarted(ShardRouting shardRouting, RoutingAllocation allocation) { + return allocation.routingNodes().assignedShards(shardRouting.shardId()).stream().anyMatch(r -> r.primary() && r.started()); + } + + @Override + public ShardAllocationDecision decideShardAllocation(ShardRouting shard, RoutingAllocation allocation) { + throw new AssertionError("only used for allocation explain"); + } + }); + } + + private static void assertDesiredAssignments(DesiredBalance desiredBalance, Map expected) { + assertThat(desiredBalance.assignments(), equalTo(expected)); + } + + private static DesiredBalanceInput createInput(ClusterState clusterState, ShardRouting... 
ignored) { + return new DesiredBalanceInput(randomInt(), routingAllocationOf(clusterState), Set.of(ignored)); + } + + private static RoutingAllocation routingAllocationOf(ClusterState clusterState) { + return new RoutingAllocation(new AllocationDeciders(List.of()), clusterState, ClusterInfo.EMPTY, SnapshotShardSizeInfo.EMPTY, 0L); + } + + private static RoutingAllocation routingAllocationWithDecidersOf( + ClusterState clusterState, + ClusterInfo clusterInfo, + Settings settings + ) { + return new RoutingAllocation( + new AllocationDeciders( + ClusterModule.createAllocationDeciders( + settings, + new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS), + List.of() + ) + ), + clusterState, + clusterInfo, + SnapshotShardSizeInfo.EMPTY, + 0L + ); + } + + /** + * @return a mutable queue containing exactly one batch, made up of the given commands (the batch is empty if none are given) + */ + private static Queue<List<MoveAllocationCommand>> queue(MoveAllocationCommand... commands) { + return new LinkedList<>(List.of(List.of(commands))); + } +} diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java new file mode 100644 index 0000000000000..72204fb823263 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java @@ -0,0 +1,1166 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. 
+ */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; +import org.elasticsearch.cluster.ClusterInfo; +import org.elasticsearch.cluster.ClusterInfoService; +import org.elasticsearch.cluster.ClusterName; +import org.elasticsearch.cluster.ClusterState; +import org.elasticsearch.cluster.metadata.IndexMetadata; +import org.elasticsearch.cluster.metadata.Metadata; +import org.elasticsearch.cluster.metadata.NodesShutdownMetadata; +import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata; +import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.cluster.node.DiscoveryNodeRole; +import org.elasticsearch.cluster.node.DiscoveryNodes; +import org.elasticsearch.cluster.routing.IndexShardRoutingTable; +import org.elasticsearch.cluster.routing.RecoverySource; +import org.elasticsearch.cluster.routing.RoutingChangesObserver; +import org.elasticsearch.cluster.routing.RoutingNode; +import org.elasticsearch.cluster.routing.RoutingTable; +import org.elasticsearch.cluster.routing.ShardRouting; +import org.elasticsearch.cluster.routing.ShardRoutingState; +import org.elasticsearch.cluster.routing.UnassignedInfo; +import org.elasticsearch.cluster.routing.allocation.AllocateUnassignedDecision; +import org.elasticsearch.cluster.routing.allocation.AllocationService; +import org.elasticsearch.cluster.routing.allocation.ExistingShardsAllocator; +import org.elasticsearch.cluster.routing.allocation.FailedShard; +import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; +import org.elasticsearch.cluster.routing.allocation.ShardAllocationDecision; +import org.elasticsearch.cluster.routing.allocation.decider.AllocationDecider; +import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders; +import org.elasticsearch.cluster.routing.allocation.decider.Decision; +import 
org.elasticsearch.cluster.routing.allocation.decider.FilterAllocationDecider; +import org.elasticsearch.cluster.routing.allocation.decider.NodeReplacementAllocationDecider; +import org.elasticsearch.cluster.routing.allocation.decider.NodeShutdownAllocationDecider; +import org.elasticsearch.cluster.routing.allocation.decider.ReplicaAfterPrimaryActiveAllocationDecider; +import org.elasticsearch.cluster.routing.allocation.decider.SameShardAllocationDecider; +import org.elasticsearch.cluster.routing.allocation.decider.ThrottlingAllocationDecider; +import org.elasticsearch.common.TriFunction; +import org.elasticsearch.common.UUIDs; +import org.elasticsearch.common.collect.ImmutableOpenMap; +import org.elasticsearch.common.settings.ClusterSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.gateway.GatewayAllocator; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.repositories.IndexId; +import org.elasticsearch.snapshots.InternalSnapshotsInfoService; +import org.elasticsearch.snapshots.Snapshot; +import org.elasticsearch.snapshots.SnapshotId; +import org.elasticsearch.snapshots.SnapshotShardSizeInfo; +import org.elasticsearch.snapshots.SnapshotsInfoService; +import org.elasticsearch.test.ESTestCase; +import org.junit.BeforeClass; + +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.BiPredicate; +import java.util.function.Consumer; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.StreamSupport; + +import static org.elasticsearch.cluster.ClusterInfo.shardIdentifierFromRouting; +import static org.elasticsearch.cluster.ESAllocationTestCase.startInitializingShardsAndReroute; +import static 
org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_INDEX_VERSION_CREATED; +import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS; +import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS; +import static org.elasticsearch.cluster.routing.allocation.decider.ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING; +import static org.elasticsearch.cluster.routing.allocation.decider.ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_OUTGOING_RECOVERIES_SETTING; +import static org.elasticsearch.cluster.routing.allocation.decider.ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.oneOf; + +public class DesiredBalanceReconcilerTests extends ESTestCase { + + public void testNoChangesOnEmptyDesiredBalance() { + final var clusterState = DesiredBalanceComputerTests.createInitialClusterState(3); + final var routingAllocation = new RoutingAllocation( + new AllocationDeciders(List.of()), + clusterState.mutableRoutingNodes(), + clusterState, + ClusterInfo.EMPTY, + SnapshotShardSizeInfo.EMPTY, + 0L + ); + + reconcile(routingAllocation, new DesiredBalance(1, Map.of())); + assertFalse(routingAllocation.routingNodesChanged()); + } + + public void testFailsNewPrimariesIfNoDataNodes() { + final var clusterState = ClusterState.builder(DesiredBalanceComputerTests.createInitialClusterState(3)) + .nodes( + DiscoveryNodes.builder() + .add( + new DiscoveryNode( + "master", + buildNewFakeTransportAddress(), + Map.of(), + Set.of(DiscoveryNodeRole.MASTER_ROLE), + Version.CURRENT + ) + ) + .localNodeId("master") + .masterNodeId("master") + .build() + ) + .build(); + + final var routingNodes = clusterState.mutableRoutingNodes(); + final var unassigned = routingNodes.unassigned().iterator(); 
+ while (unassigned.hasNext()) { + final var shardRouting = unassigned.next(); + if (shardRouting.primary() && shardRouting.shardId().id() == 1) { + final var unassignedInfo = shardRouting.unassignedInfo(); + assertThat(unassignedInfo.getLastAllocationStatus(), equalTo(UnassignedInfo.AllocationStatus.NO_ATTEMPT)); + unassigned.updateUnassigned( + new UnassignedInfo( + unassignedInfo.getReason(), + unassignedInfo.getMessage(), + unassignedInfo.getFailure(), + unassignedInfo.getNumFailedAllocations(), + unassignedInfo.getUnassignedTimeInNanos(), + unassignedInfo.getUnassignedTimeInMillis(), + unassignedInfo.isDelayed(), + UnassignedInfo.AllocationStatus.DECIDERS_THROTTLED, + unassignedInfo.getFailedNodeIds(), + unassignedInfo.getLastAllocatedNodeId() + ), + shardRouting.recoverySource(), + new RoutingChangesObserver.DelegatingRoutingChangesObserver() + ); + } + } + + final var routingAllocation = new RoutingAllocation( + new AllocationDeciders(List.of()), + routingNodes, + clusterState, + ClusterInfo.EMPTY, + SnapshotShardSizeInfo.EMPTY, + 0L + ); + + for (ShardRouting shardRouting : routingAllocation.routingNodes().unassigned()) { + assertTrue(shardRouting.toString(), shardRouting.unassigned()); + assertThat( + shardRouting.unassignedInfo().getLastAllocationStatus(), + equalTo( + shardRouting.primary() && shardRouting.shardId().id() == 1 + ? UnassignedInfo.AllocationStatus.DECIDERS_THROTTLED + : UnassignedInfo.AllocationStatus.NO_ATTEMPT + ) + ); + } + + reconcile( + routingAllocation, + new DesiredBalance( + 1, + randomBoolean() + ? 
Map.of() + : Map.of( + new ShardId(clusterState.metadata().index(DesiredBalanceComputerTests.TEST_INDEX).getIndex(), 0), + new ShardAssignment(Set.of("node-0"), 1, 0, 0) + ) + ) + ); + assertTrue(routingAllocation.routingNodesChanged()); + + for (ShardRouting shardRouting : routingAllocation.routingNodes().unassigned()) { + assertTrue(shardRouting.toString(), shardRouting.unassigned()); + assertThat( + shardRouting.unassignedInfo().getLastAllocationStatus(), + equalTo( + // we only update primaries, and only if currently NO_ATTEMPT + shardRouting.primary() + ? shardRouting.shardId().id() == 1 + ? UnassignedInfo.AllocationStatus.DECIDERS_THROTTLED + : UnassignedInfo.AllocationStatus.DECIDERS_NO + : UnassignedInfo.AllocationStatus.NO_ATTEMPT + ) + ); + } + } + + public void testUnassignedPrimariesBeforeUnassignedReplicas() { + // regardless of priority, we attempt to allocate all unassigned primaries before considering any unassigned replicas + + final var discoveryNodes = discoveryNodes(2); + final var metadata = Metadata.builder(); + final var routingTable = RoutingTable.builder(); + + final var indexMetadata0 = randomPriorityIndex("index-0", 1, 1); + metadata.put(indexMetadata0, true); + routingTable.addAsNew(indexMetadata0); + + final var indexMetadata1 = randomPriorityIndex("index-1", 1, 1); + metadata.put(indexMetadata1, true); + routingTable.addAsNew(indexMetadata1); + + final var clusterState = ClusterState.builder(ClusterName.DEFAULT) + .nodes(discoveryNodes) + .metadata(metadata) + .routingTable(routingTable) + .build(); + + final var settings = throttleSettings(); + final var clusterSettings = new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + + final var desiredBalance = desiredBalance(clusterState, (shardId, nodeId) -> true); + // (index name, node id) filter consulted by the custom decider below; kept in a reference so the test can relax it later on + final var allocationFilter = new AtomicReference<BiPredicate<String, String>>( + (indexName, nodeId) -> indexName.equals("index-0") && nodeId.equals("node-0") + ); + + final var allocationService = createTestAllocationService( + 
routingAllocation -> reconcile(routingAllocation, desiredBalance), + new SameShardAllocationDecider(settings, clusterSettings), + new ReplicaAfterPrimaryActiveAllocationDecider(), + new ThrottlingAllocationDecider(settings, clusterSettings), + new AllocationDecider() { + @Override + public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { + return allocationFilter.get().test(shardRouting.getIndexName(), node.nodeId()) ? Decision.YES : Decision.NO; + } + } + ); + + // first start the primary of index-0 (no other shards may be allocated due to allocation filter) + final var stateWithStartedPrimary = startInitializingShardsAndReroute( + allocationService, + startInitializingShardsAndReroute(allocationService, clusterState) + ); + { + final var index0RoutingTable = stateWithStartedPrimary.routingTable().shardRoutingTable("index-0", 0); + assertTrue(index0RoutingTable.primaryShard().started()); + assertTrue(index0RoutingTable.replicaShards().stream().allMatch(ShardRouting::unassigned)); + final var index1RoutingTable = stateWithStartedPrimary.routingTable().shardRoutingTable("index-1", 0); + assertTrue(index1RoutingTable.primaryShard().unassigned()); + assertTrue(index1RoutingTable.replicaShards().stream().allMatch(ShardRouting::unassigned)); + } + + // now relax the filter so that the replica of index-0 and the primary of index-1 can both be assigned to node-1, but the throttle + // forces us to choose one of them to go first which must be the primary + allocationFilter.set((indexName, nodeId) -> indexName.equals("index-0") || nodeId.equals("node-1")); + final var stateWithInitializingSecondPrimary = startInitializingShardsAndReroute(allocationService, stateWithStartedPrimary); + { + final var index0RoutingTable = stateWithInitializingSecondPrimary.routingTable().shardRoutingTable("index-0", 0); + assertTrue(index0RoutingTable.primaryShard().started()); + 
assertTrue(index0RoutingTable.replicaShards().stream().allMatch(ShardRouting::unassigned)); + final var index1RoutingTable = stateWithInitializingSecondPrimary.routingTable().shardRoutingTable("index-1", 0); + assertTrue(index1RoutingTable.primaryShard().initializing()); + assertTrue(index1RoutingTable.replicaShards().stream().allMatch(ShardRouting::unassigned)); + } + + final var stateWithStartedPrimariesAndInitializingReplica = startInitializingShardsAndReroute( + allocationService, + stateWithInitializingSecondPrimary + ); + { + final var index0RoutingTable = stateWithStartedPrimariesAndInitializingReplica.routingTable().shardRoutingTable("index-0", 0); + assertTrue(index0RoutingTable.primaryShard().started()); + assertTrue(index0RoutingTable.replicaShards().stream().allMatch(ShardRouting::initializing)); + final var index1RoutingTable = stateWithStartedPrimariesAndInitializingReplica.routingTable().shardRoutingTable("index-1", 0); + assertTrue(index1RoutingTable.primaryShard().started()); + assertTrue(index1RoutingTable.replicaShards().stream().allMatch(ShardRouting::unassigned)); + } + } + + public void testUnassignedShardsInterleaving() { + // regardless of priority, we give each shard an opportunity to allocate one of its copies before we give any shard an opportunity + // to allocate a further copy + + final var discoveryNodes = discoveryNodes(4); + final var metadata = Metadata.builder(); + final var routingTable = RoutingTable.builder(); + + var shardsRemaining = 4; + var indexNum = 0; + while (shardsRemaining > 0) { + final var shardCount = between(1, shardsRemaining); + shardsRemaining -= shardCount; + final var indexMetadata = randomPriorityIndex("index-" + indexNum++, shardCount, 3); + metadata.put(indexMetadata, true); + routingTable.addAsNew(indexMetadata); + } + + final var clusterState = ClusterState.builder(ClusterName.DEFAULT) + .nodes(discoveryNodes) + .metadata(metadata) + .routingTable(routingTable) + .build(); + + final var settings = 
throttleSettings(); + final var clusterSettings = new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + + final var desiredBalance = desiredBalance(clusterState, (shardId, nodeId) -> true); + + final var allocationService = createTestAllocationService( + routingAllocation -> reconcile(routingAllocation, desiredBalance), + new SameShardAllocationDecider(settings, clusterSettings), + new ReplicaAfterPrimaryActiveAllocationDecider(), + new ThrottlingAllocationDecider(settings, clusterSettings) + ); + + // first round: every primary is initializing and all three replicas of each shard are still unassigned + final var stateWithInitializingPrimaries = startInitializingShardsAndReroute(allocationService, clusterState); + for (final var indexRoutingTable : stateWithInitializingPrimaries.routingTable()) { + for (int i = 0; i < indexRoutingTable.size(); i++) { + final var indexShardRoutingTable = indexRoutingTable.shard(i); + assertTrue(indexShardRoutingTable.primaryShard().initializing()); + assertThat(indexShardRoutingTable.replicaShards().stream().filter(ShardRouting::unassigned).count(), equalTo(3L)); + } + } + + // second round: primaries have started and exactly one replica per shard is initializing + final var stateWithInitializingReplicas1 = startInitializingShardsAndReroute(allocationService, stateWithInitializingPrimaries); + for (final var indexRoutingTable : stateWithInitializingReplicas1.routingTable()) { + for (int i = 0; i < indexRoutingTable.size(); i++) { + final var indexShardRoutingTable = indexRoutingTable.shard(i); + assertTrue(indexShardRoutingTable.primaryShard().started()); + assertThat(indexShardRoutingTable.replicaShards().stream().filter(ShardRouting::unassigned).count(), equalTo(2L)); + assertThat(indexShardRoutingTable.replicaShards().stream().filter(ShardRouting::initializing).count(), equalTo(1L)); + } + } + + final var stateWithInitializingReplicas2 = startInitializingShardsAndReroute(allocationService, stateWithInitializingReplicas1); + for (final var indexRoutingTable : 
stateWithInitializingReplicas2.routingTable()) { + for (int i = 0; i < indexRoutingTable.size(); i++) { + final var indexShardRoutingTable = indexRoutingTable.shard(i); + assertTrue(indexShardRoutingTable.primaryShard().started()); + assertThat(indexShardRoutingTable.replicaShards().stream().filter(ShardRouting::unassigned).count(), equalTo(1L)); + assertThat(indexShardRoutingTable.replicaShards().stream().filter(ShardRouting::initializing).count(), equalTo(1L)); + assertThat(indexShardRoutingTable.replicaShards().stream().filter(ShardRouting::started).count(), equalTo(1L)); + } + } + + final var stateWithInitializingReplicas3 = startInitializingShardsAndReroute(allocationService, stateWithInitializingReplicas2); + for (final var indexRoutingTable : stateWithInitializingReplicas3.routingTable()) { + for (int i = 0; i < indexRoutingTable.size(); i++) { + final var indexShardRoutingTable = indexRoutingTable.shard(i); + assertTrue(indexShardRoutingTable.primaryShard().started()); + assertThat(indexShardRoutingTable.replicaShards().stream().filter(ShardRouting::initializing).count(), equalTo(1L)); + assertThat(indexShardRoutingTable.replicaShards().stream().filter(ShardRouting::started).count(), equalTo(2L)); + } + } + + final var finalState = startInitializingShardsAndReroute(allocationService, stateWithInitializingReplicas3); + for (final var indexRoutingTable : finalState.routingTable()) { + for (int i = 0; i < indexRoutingTable.size(); i++) { + final var indexShardRoutingTable = indexRoutingTable.shard(i); + assertTrue(indexShardRoutingTable.allShardsStarted()); + } + } + } + + public void testUnassignedShardsPriority() { + final var discoveryNodes = discoveryNodes(2); + final var metadata = Metadata.builder(); + final var routingTable = RoutingTable.builder(); + + final var indexMetadata0 = randomPriorityIndex("index-0", 2, 1); + final var indexMetadata1 = randomPriorityIndex("index-1", 2, 1); + + metadata.put(indexMetadata0, true); + metadata.put(indexMetadata1, 
true); + routingTable.addAsNew(indexMetadata0); + routingTable.addAsNew(indexMetadata1); + + // order the two indices by (system flag, priority, creation date, name); the assertions further down expect the greater one + // to have its shards allocated first + final var comparisonResult = Comparator.<IndexMetadata>comparingInt(indexMetadata -> indexMetadata.isSystem() ? 1 : 0) + .thenComparingInt(IndexMetadata::priority) + .thenComparingLong(IndexMetadata::getCreationDate) + .thenComparing(indexMetadata -> indexMetadata.getIndex().getName()) + .compare(indexMetadata0, indexMetadata1); + assert comparisonResult != 0; + final var higherIndex = comparisonResult > 0 ? indexMetadata0 : indexMetadata1; + final var lowerIndex = comparisonResult > 0 ? indexMetadata1 : indexMetadata0; + + final var clusterState = ClusterState.builder(ClusterName.DEFAULT) + .nodes(discoveryNodes) + .metadata(metadata) + .routingTable(routingTable) + .build(); + + final var settings = throttleSettings(); + final var clusterSettings = new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + + final var desiredBalance = desiredBalance(clusterState, (shardId, nodeId) -> true); + final var assignReplicas = new AtomicBoolean(false); + + final var allocationService = createTestAllocationService( + routingAllocation -> reconcile(routingAllocation, desiredBalance), + new SameShardAllocationDecider(settings, clusterSettings), + new ReplicaAfterPrimaryActiveAllocationDecider(), + new ThrottlingAllocationDecider(settings, clusterSettings), + new AllocationDecider() { + @Override + public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { + return (shardRouting.primary() && node.nodeId().equals("node-0")) || assignReplicas.get() ? 
Decision.YES : Decision.NO; + } + } + ); + + // looks up the primary copy of shard number [shardId] of the given index in the given cluster state + final TriFunction<ClusterState, IndexMetadata, Integer, ShardRouting> primaryGetter = (state, indexMetadata, shardId) -> state + .routingTable() + .shardRoutingTable(indexMetadata.getIndex().getName(), shardId) + .primaryShard(); + + final var state1 = startInitializingShardsAndReroute(allocationService, clusterState); + assertTrue(primaryGetter.apply(state1, higherIndex, 0).initializing()); + assertTrue(primaryGetter.apply(state1, higherIndex, 1).unassigned()); + assertTrue(primaryGetter.apply(state1, lowerIndex, 0).unassigned()); + assertTrue(primaryGetter.apply(state1, lowerIndex, 1).unassigned()); + + final var state2 = startInitializingShardsAndReroute(allocationService, state1); + assertTrue(primaryGetter.apply(state2, higherIndex, 0).started()); + assertTrue(primaryGetter.apply(state2, higherIndex, 1).initializing()); + assertTrue(primaryGetter.apply(state2, lowerIndex, 0).unassigned()); + assertTrue(primaryGetter.apply(state2, lowerIndex, 1).unassigned()); + + final var state3 = startInitializingShardsAndReroute(allocationService, state2); + assertTrue(primaryGetter.apply(state3, higherIndex, 0).started()); + assertTrue(primaryGetter.apply(state3, higherIndex, 1).started()); + assertTrue(primaryGetter.apply(state3, lowerIndex, 0).initializing()); + assertTrue(primaryGetter.apply(state3, lowerIndex, 1).unassigned()); + + final var state4 = startInitializingShardsAndReroute(allocationService, state3); + assertTrue(primaryGetter.apply(state4, higherIndex, 0).started()); + assertTrue(primaryGetter.apply(state4, higherIndex, 1).started()); + assertTrue(primaryGetter.apply(state4, lowerIndex, 0).started()); + assertTrue(primaryGetter.apply(state4, lowerIndex, 1).initializing()); + + final var state5 = startInitializingShardsAndReroute(allocationService, state4); + assertTrue(primaryGetter.apply(state5, higherIndex, 0).started()); + assertTrue(primaryGetter.apply(state5, higherIndex, 1).started()); + assertTrue(primaryGetter.apply(state5, lowerIndex, 0).started()); + 
assertTrue(primaryGetter.apply(state5, lowerIndex, 1).started()); + + final TriFunction replicaGetter = (state, indexMetadata, shardId) -> state + .routingTable() + .shardRoutingTable(indexMetadata.getIndex().getName(), shardId) + .replicaShards() + .get(0); + + assignReplicas.set(true); + + final var state6 = startInitializingShardsAndReroute(allocationService, state5); + assertTrue(replicaGetter.apply(state6, higherIndex, 0).initializing()); + assertTrue(replicaGetter.apply(state6, higherIndex, 1).unassigned()); + assertTrue(replicaGetter.apply(state6, lowerIndex, 0).unassigned()); + assertTrue(replicaGetter.apply(state6, lowerIndex, 1).unassigned()); + + final var state7 = startInitializingShardsAndReroute(allocationService, state6); + assertTrue(replicaGetter.apply(state7, higherIndex, 0).started()); + assertTrue(replicaGetter.apply(state7, higherIndex, 1).initializing()); + assertTrue(replicaGetter.apply(state7, lowerIndex, 0).unassigned()); + assertTrue(replicaGetter.apply(state7, lowerIndex, 1).unassigned()); + + final var state8 = startInitializingShardsAndReroute(allocationService, state7); + assertTrue(replicaGetter.apply(state8, higherIndex, 0).started()); + assertTrue(replicaGetter.apply(state8, higherIndex, 1).started()); + assertTrue(replicaGetter.apply(state8, lowerIndex, 0).initializing()); + assertTrue(replicaGetter.apply(state8, lowerIndex, 1).unassigned()); + + final var state9 = startInitializingShardsAndReroute(allocationService, state8); + assertTrue(replicaGetter.apply(state9, higherIndex, 0).started()); + assertTrue(replicaGetter.apply(state9, higherIndex, 1).started()); + assertTrue(replicaGetter.apply(state9, lowerIndex, 0).started()); + assertTrue(replicaGetter.apply(state9, lowerIndex, 1).initializing()); + + final var state10 = startInitializingShardsAndReroute(allocationService, state9); + assertTrue(replicaGetter.apply(state10, higherIndex, 0).started()); + assertTrue(replicaGetter.apply(state10, higherIndex, 1).started()); + 
assertTrue(replicaGetter.apply(state10, lowerIndex, 0).started()); + assertTrue(replicaGetter.apply(state10, lowerIndex, 1).started()); + } + + public void testUnassignedRespectsDesiredBalance() { + final var discoveryNodes = discoveryNodes(5); + final var metadata = Metadata.builder(); + final var routingTable = RoutingTable.builder(); + + for (var i = 0; i < 5; i++) { + final var indexMetadata = randomPriorityIndex("index-" + i, between(1, 5), between(0, 4)); + metadata.put(indexMetadata, true); + routingTable.addAsNew(indexMetadata); + } + + final var clusterState = ClusterState.builder(ClusterName.DEFAULT) + .nodes(discoveryNodes) + .metadata(metadata) + .routingTable(routingTable) + .build(); + + final var settings = Settings.EMPTY; + final var clusterSettings = new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + + final var desiredBalance = desiredBalance(clusterState, (shardId, nodeId) -> true); + final var allocationService = createTestAllocationService( + routingAllocation -> reconcile(routingAllocation, desiredBalance), + new SameShardAllocationDecider(settings, clusterSettings), + new ReplicaAfterPrimaryActiveAllocationDecider() + ); + + ClusterState reroutedState = clusterState; + var changed = true; + while (changed) { + final var nextState = startInitializingShardsAndReroute(allocationService, reroutedState); + changed = nextState != reroutedState; + reroutedState = nextState; + } + + boolean anyAssigned = false; + for (final var indexRoutingTable : reroutedState.routingTable()) { + for (int shardId = 0; shardId < indexRoutingTable.size(); shardId++) { + final var indexShardRoutingTable = indexRoutingTable.shard(shardId); + final var nodeIds = new HashSet(); + for (int copy = 0; copy < indexShardRoutingTable.size(); copy++) { + final var shardRouting = indexShardRoutingTable.shard(copy); + if (shardRouting.started()) { + anyAssigned = true; + nodeIds.add(shardRouting.currentNodeId()); + } else { + 
assertTrue(shardRouting.unassigned()); + } + } + assertTrue(desiredBalance.getAssignment(indexShardRoutingTable.shardId()).nodeIds().containsAll(nodeIds)); + } + } + + assertNotEquals(anyAssigned, desiredBalance.assignments().values().stream().map(ShardAssignment::nodeIds).allMatch(Set::isEmpty)); + } + + public void testUnassignedAllocationPredictsDiskUsage() { + final var discoveryNodes = discoveryNodes(1); + final var metadata = Metadata.builder(); + final var routingTable = RoutingTable.builder(); + + final var existingIndexMetadata = randomPriorityIndex("index-existing", 1, 0); + metadata.put(existingIndexMetadata, true); + routingTable.addAsRecovery(existingIndexMetadata); + + final var restoredIndexMetadata = randomPriorityIndex("index-restored", 1, 0); + metadata.put(restoredIndexMetadata, true); + final var recoverySource = new RecoverySource.SnapshotRecoverySource( + UUIDs.randomBase64UUID(random()), + new Snapshot("repo", new SnapshotId("snap", UUIDs.randomBase64UUID(random()))), + Version.CURRENT, + new IndexId("index", UUIDs.randomBase64UUID(random())) + ); + routingTable.addAsRestore(restoredIndexMetadata, recoverySource); + + final var clusterState = ClusterState.builder(ClusterName.DEFAULT) + .nodes(discoveryNodes) + .metadata(metadata) + .routingTable(routingTable) + .build(); + + final var settings = Settings.EMPTY; + final var clusterSettings = new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + + final var existingShardSize = randomNonNegativeLong(); + final var shardSizesBuilder = ImmutableOpenMap.builder(); + shardSizesBuilder.put( + shardIdentifierFromRouting(clusterState.routingTable().shardRoutingTable("index-existing", 0).primaryShard()), + existingShardSize + ); + final var clusterInfo = new ClusterInfo( + ImmutableOpenMap.of(), + ImmutableOpenMap.of(), + shardSizesBuilder.build(), + ImmutableOpenMap.of(), + ImmutableOpenMap.of(), + ImmutableOpenMap.of() + ); + + final var restoredShardSize = 
randomNonNegativeLong(); + final var snapshotSizesBuilder = ImmutableOpenMap.builder(); + snapshotSizesBuilder.put( + new InternalSnapshotsInfoService.SnapshotShard( + recoverySource.snapshot(), + recoverySource.index(), + new ShardId(restoredIndexMetadata.getIndex(), 0) + ), + restoredShardSize + ); + final var snapshotShardSizeInfo = new SnapshotShardSizeInfo(snapshotSizesBuilder.build()); + + final var desiredBalance = desiredBalance(clusterState, (shardId, nodeId) -> true); + final var allocationService = createTestAllocationService( + routingAllocation -> reconcile(routingAllocation, desiredBalance), + () -> clusterInfo, + () -> snapshotShardSizeInfo, + new SameShardAllocationDecider(settings, clusterSettings), + new ReplicaAfterPrimaryActiveAllocationDecider() + ); + + final var reroutedState = allocationService.reroute(clusterState, "test", ActionListener.noop()); + + final var existingShard = reroutedState.routingTable().shardRoutingTable("index-existing", 0).primaryShard(); + assertTrue(existingShard.initializing()); + assertThat(existingShard.getExpectedShardSize(), equalTo(existingShardSize)); + + final var restoredShard = reroutedState.routingTable().shardRoutingTable("index-restored", 0).primaryShard(); + assertTrue(restoredShard.initializing()); + assertThat(restoredShard.getExpectedShardSize(), equalTo(restoredShardSize)); + } + + public void testUnassignedSkipsEquivalentReplicas() { + final var discoveryNodes = discoveryNodes(2); + final var metadata = Metadata.builder(); + final var routingTable = RoutingTable.builder(); + + final var indexMetadata = randomPriorityIndex("index-0", 1, between(0, 5)); + metadata.put(indexMetadata, true); + routingTable.addAsNew(indexMetadata); + + final var clusterState = ClusterState.builder(ClusterName.DEFAULT) + .nodes(discoveryNodes) + .metadata(metadata) + .routingTable(routingTable) + .build(); + + final var settings = Settings.EMPTY; + final var clusterSettings = new ClusterSettings(settings, 
ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + + final var triedReplica = new AtomicBoolean(); + final var replicaDecision = randomFrom(Decision.THROTTLE, Decision.NO); + final var desiredBalance = desiredBalance(clusterState, (shardId, nodeId) -> true); + final var allocationService = createTestAllocationService( + routingAllocation -> reconcile(routingAllocation, desiredBalance), + new SameShardAllocationDecider(settings, clusterSettings), + new ReplicaAfterPrimaryActiveAllocationDecider(), + new AllocationDecider() { + @Override + public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { + if (shardRouting.primary()) { + return Decision.YES; + } else { + // there are two replicas but they're equivalent so we should only call canAllocate once. + assert triedReplica.compareAndSet(false, true); + return replicaDecision; + } + } + } + ); + + var reroutedState = clusterState; + boolean changed; + do { + triedReplica.set(false); + final var newState = startInitializingShardsAndReroute(allocationService, reroutedState); + changed = newState != reroutedState; + reroutedState = newState; + } while (changed); + + assertTrue( + reroutedState.routingTable() + .shardRoutingTable("index-0", 0) + .replicaShards() + .stream() + .allMatch( + shardRouting -> shardRouting.unassignedInfo().getLastAllocationStatus() == UnassignedInfo.AllocationStatus.NO_ATTEMPT + ) + ); + } + + public void testUnassignedSetsAllocationStatusOnUnassignedShards() { + final var discoveryNodes = discoveryNodes(2); + final var metadata = Metadata.builder(); + final var routingTable = RoutingTable.builder(); + + final var indexMetadata = randomPriorityIndex("index-0", 1, between(0, 5)); + metadata.put(indexMetadata, true); + routingTable.addAsNew(indexMetadata); + + final var clusterState = ClusterState.builder(ClusterName.DEFAULT) + .nodes(discoveryNodes) + .metadata(metadata) + .routingTable(routingTable) + .build(); + + final var settings = Settings.EMPTY; + 
final var clusterSettings = new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + + final var assignPrimary = new AtomicBoolean(false); + final var nonYesDecision = randomFrom(Decision.THROTTLE, Decision.NO); + final var desiredBalance = desiredBalance(clusterState, (shardId, nodeId) -> true); + final var allocationService = createTestAllocationService( + routingAllocation -> reconcile(routingAllocation, desiredBalance), + new SameShardAllocationDecider(settings, clusterSettings), + new ReplicaAfterPrimaryActiveAllocationDecider(), + new AllocationDecider() { + @Override + public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { + if (shardRouting.primary()) { + return assignPrimary.get() ? Decision.YES : nonYesDecision; + } else { + return nonYesDecision; + } + } + } + ); + + final var redState = startInitializingShardsAndReroute(allocationService, clusterState); + assertEquals( + nonYesDecision == Decision.NO + ? UnassignedInfo.AllocationStatus.DECIDERS_NO + : UnassignedInfo.AllocationStatus.DECIDERS_THROTTLED, + redState.routingTable().shardRoutingTable("index-0", 0).primaryShard().unassignedInfo().getLastAllocationStatus() + ); + + assignPrimary.set(true); + final var yellowState = startInitializingShardsAndReroute( + allocationService, + startInitializingShardsAndReroute(allocationService, redState) + ); + for (final var shardRouting : yellowState.routingTable().shardRoutingTable("index-0", 0).replicaShards()) { + assertEquals(UnassignedInfo.AllocationStatus.NO_ATTEMPT, shardRouting.unassignedInfo().getLastAllocationStatus()); + } + } + + public void testUnassignedPrimariesThrottlingAndFallback() { + // we fall back to trying all nodes if an unassigned primary cannot be assigned to a desired node, but only if the desired nodes + // aren't just throttled + + final var discoveryNodes = discoveryNodes(2); + final var metadata = Metadata.builder(); + final var routingTable = 
RoutingTable.builder(); + + final var indexMetadata0 = randomPriorityIndex("index-0", 2, 0); + metadata.put(indexMetadata0, true); + routingTable.addAsNew(indexMetadata0); + + final var clusterState = ClusterState.builder(ClusterName.DEFAULT) + .nodes(discoveryNodes) + .metadata(metadata) + .routingTable(routingTable) + .build(); + + final var settings = throttleSettings(); + final var clusterSettings = new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + + final var desiredBalance = desiredBalance(clusterState, (shardId, nodeId) -> nodeId.equals("node-0")); + final var allocationFilter = new AtomicReference>(); + + final var allocationService = createTestAllocationService( + routingAllocation -> reconcile(routingAllocation, desiredBalance), + new SameShardAllocationDecider(settings, clusterSettings), + new ReplicaAfterPrimaryActiveAllocationDecider(), + new ThrottlingAllocationDecider(settings, clusterSettings), + new AllocationDecider() { + @Override + public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { + return allocationFilter.get().test(shardRouting.getId(), node.nodeId()) ? 
Decision.YES : Decision.NO; + } + } + ); + + final var unused = ActionListener.noop(); + + // first assign the primary of [index-0][0] (no other shards may be allocated due to allocation filter) + allocationFilter.set((shardId, nodeId) -> shardId == 0); + final var stateWithOneInitializingPrimary = allocationService.reroute(clusterState, "test", unused); + { + final var shard0RoutingTable = stateWithOneInitializingPrimary.routingTable().shardRoutingTable("index-0", 0); + assertTrue(shard0RoutingTable.primaryShard().initializing()); + assertThat(shard0RoutingTable.primaryShard().currentNodeId(), equalTo("node-0")); + final var shard1RoutingTable = stateWithOneInitializingPrimary.routingTable().shardRoutingTable("index-0", 1); + assertTrue(shard1RoutingTable.primaryShard().unassigned()); + } + + // now relax the allocation filter and ensure that [index-0][1] still isn't assigned due to throttling on the desired node + allocationFilter.set((shardId, nodeId) -> true); + final var stateStillWithOneInitializingPrimary = allocationService.reroute(stateWithOneInitializingPrimary, "test", unused); + { + final var shard0RoutingTable = stateStillWithOneInitializingPrimary.routingTable().shardRoutingTable("index-0", 0); + assertTrue(shard0RoutingTable.primaryShard().initializing()); + assertThat(shard0RoutingTable.primaryShard().currentNodeId(), equalTo("node-0")); + final var shard1RoutingTable = stateStillWithOneInitializingPrimary.routingTable().shardRoutingTable("index-0", 1); + assertTrue(shard1RoutingTable.primaryShard().unassigned()); + } + + // now forbid [index-0][1] from its desired node and see that it falls back to the undesired node + allocationFilter.set((shardId, nodeId) -> nodeId.equals("node-1")); + final var stateWithBothInitializingPrimaries = allocationService.reroute(stateStillWithOneInitializingPrimary, "test", unused); + { + final var shard0RoutingTable = stateWithBothInitializingPrimaries.routingTable().shardRoutingTable("index-0", 0); + 
assertTrue(shard0RoutingTable.primaryShard().initializing()); + assertThat(shard0RoutingTable.primaryShard().currentNodeId(), equalTo("node-0")); + final var shard1RoutingTable = stateWithBothInitializingPrimaries.routingTable().shardRoutingTable("index-0", 1); + assertTrue(shard1RoutingTable.primaryShard().initializing()); + assertThat(shard1RoutingTable.primaryShard().currentNodeId(), equalTo("node-1")); + } + } + + public void testMoveShards() { + final var discoveryNodes = discoveryNodes(4); + final var metadata = Metadata.builder(); + final var routingTable = RoutingTable.builder(); + + final var indexMetadata = randomPriorityIndex("index-0", 3, 1); + metadata.put(indexMetadata, true); + routingTable.addAsNew(indexMetadata); + + var clusterState = ClusterState.builder(ClusterName.DEFAULT) + .nodes(discoveryNodes) + .metadata(metadata) + .routingTable(routingTable) + .build(); + + final var settings = Settings.builder() + .put(throttleSettings()) + .putList( + FilterAllocationDecider.CLUSTER_ROUTING_INCLUDE_GROUP_SETTING.getConcreteSettingForNamespace("_id").getKey(), + "node-0", + "node-1" + ) + .build(); + final var clusterSettings = new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + + final var canAllocateRef = new AtomicReference<>(Decision.YES); + + final var desiredBalance = new AtomicReference<>(desiredBalance(clusterState, (shardId, nodeId) -> true)); + final var allocationService = createTestAllocationService( + routingAllocation -> reconcile(routingAllocation, desiredBalance.get()), + new SameShardAllocationDecider(settings, clusterSettings), + new ReplicaAfterPrimaryActiveAllocationDecider(), + new ThrottlingAllocationDecider(settings, clusterSettings), + new FilterAllocationDecider(settings, clusterSettings), + new NodeShutdownAllocationDecider(), + new NodeReplacementAllocationDecider(), + new AllocationDecider() { + @Override + public Decision canRebalance(RoutingAllocation allocation) { + return Decision.NO; + } + + 
@Override + public Decision canAllocate(ShardRouting shardRouting, RoutingAllocation allocation) { + return canAllocateRef.get(); + } + } + ); + + boolean changed; + do { + final var newState = startInitializingShardsAndReroute(allocationService, clusterState); + changed = newState != clusterState; + clusterState = newState; + } while (changed); + + for (final var shardRouting : clusterState.routingTable().allShards()) { + assertTrue(shardRouting.started()); + assertThat(shardRouting.currentNodeId(), oneOf("node-0", "node-1")); + } + + clusterSettings.applySettings( + Settings.builder() + .putList( + FilterAllocationDecider.CLUSTER_ROUTING_INCLUDE_GROUP_SETTING.getConcreteSettingForNamespace("_id").getKey(), + "node-2", + "node-3" + ) + .build() + ); + + assertSame(clusterState, allocationService.reroute(clusterState, "test", ActionListener.noop())); // all still on desired nodes, no + // movement needed + + desiredBalance.set(desiredBalance(clusterState, (shardId, nodeId) -> nodeId.equals("node-2") || nodeId.equals("node-3"))); + + // The next reroute starts moving shards to node-2 and node-3, but interleaves the decisions between node-0 and node-1 for fairness. + // There's an inbound throttle of 1 but no outbound throttle, so without the interleaving one node would relocate 2 shards. 
+ final var reroutedState = allocationService.reroute(clusterState, "test", ActionListener.noop()); + assertThat(reroutedState.getRoutingNodes().node("node-0").shardsWithState(ShardRoutingState.RELOCATING), hasSize(1)); + assertThat(reroutedState.getRoutingNodes().node("node-1").shardsWithState(ShardRoutingState.RELOCATING), hasSize(1)); + + // Ensuring that we check the shortcut two-param canAllocate() method up front + canAllocateRef.set(Decision.NO); + assertSame(clusterState, allocationService.reroute(clusterState, "test", ActionListener.noop())); + canAllocateRef.set(Decision.YES); + + // Restore filter to default + clusterSettings.applySettings( + Settings.builder() + .putList( + FilterAllocationDecider.CLUSTER_ROUTING_INCLUDE_GROUP_SETTING.getConcreteSettingForNamespace("_id").getKey(), + "node-0", + "node-1" + ) + .build() + ); + + // Mark node-0 as shutting down, to be replaced by node-2, so that a shard can be force-moved to node-2 even though the allocation + // filter forbids this + final var shuttingDownState = allocationService.reroute( + clusterState.copyAndUpdateMetadata( + tmpMetadata -> tmpMetadata.putCustom( + NodesShutdownMetadata.TYPE, + new NodesShutdownMetadata( + Map.of( + "node-0", + SingleNodeShutdownMetadata.builder() + .setNodeId("node-0") + .setType(SingleNodeShutdownMetadata.Type.REPLACE) + .setTargetNodeName("node-2") + .setStartedAtMillis(System.currentTimeMillis()) + .setReason("test") + .build() + ) + ) + ) + ), + "test", + ActionListener.noop() + ); + assertThat(shuttingDownState.getRoutingNodes().node("node-2").shardsWithState(ShardRoutingState.INITIALIZING), hasSize(1)); + } + + public void testRebalance() { + final var discoveryNodes = discoveryNodes(4); + final var metadata = Metadata.builder(); + final var routingTable = RoutingTable.builder(); + + final var indexMetadata = randomPriorityIndex("index-0", 3, 1); + metadata.put(indexMetadata, true); + routingTable.addAsNew(indexMetadata); + + var clusterState = 
ClusterState.builder(ClusterName.DEFAULT) + .nodes(discoveryNodes) + .metadata(metadata) + .routingTable(routingTable) + .build(); + + final var settings = throttleSettings(); + final var clusterSettings = new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + + final var canAllocateShardRef = new AtomicReference<>(Decision.YES); + final var canRebalanceGlobalRef = new AtomicReference<>(Decision.YES); + final var canRebalanceShardRef = new AtomicReference<>(Decision.YES); + + final var desiredBalance = new AtomicReference<>( + desiredBalance(clusterState, (shardId, nodeId) -> nodeId.equals("node-0") || nodeId.equals("node-1")) + ); + final var allocationService = createTestAllocationService( + routingAllocation -> reconcile(routingAllocation, desiredBalance.get()), + new SameShardAllocationDecider(settings, clusterSettings), + new ReplicaAfterPrimaryActiveAllocationDecider(), + new ThrottlingAllocationDecider(settings, clusterSettings), + new AllocationDecider() { + @Override + public Decision canRebalance(RoutingAllocation allocation) { + return canRebalanceGlobalRef.get(); + } + + @Override + public Decision canRebalance(ShardRouting shardRouting, RoutingAllocation allocation) { + return canRebalanceShardRef.get(); + } + + @Override + public Decision canAllocate(ShardRouting shardRouting, RoutingAllocation allocation) { + return canAllocateShardRef.get(); + } + } + ); + + boolean changed; + do { + final var newState = startInitializingShardsAndReroute(allocationService, clusterState); + changed = newState != clusterState; + clusterState = newState; + } while (changed); + + for (final var shardRouting : clusterState.routingTable().allShards()) { + assertTrue(shardRouting.started()); + assertThat(shardRouting.currentNodeId(), oneOf("node-0", "node-1")); + } + + assertSame(clusterState, allocationService.reroute(clusterState, "test", ActionListener.noop())); // all still on desired nodes, no + // movement needed + + 
desiredBalance.set(desiredBalance(clusterState, (shardId, nodeId) -> nodeId.equals("node-2") || nodeId.equals("node-3"))); + + canRebalanceGlobalRef.set(Decision.NO); + assertSame(clusterState, allocationService.reroute(clusterState, "test", ActionListener.noop())); // rebalancing forbidden on all + // shards, no movement + canRebalanceGlobalRef.set(Decision.YES); + + canRebalanceShardRef.set(Decision.NO); + assertSame(clusterState, allocationService.reroute(clusterState, "test", ActionListener.noop())); // rebalancing forbidden on + // specific shards, no movement + canRebalanceShardRef.set(Decision.YES); + + canAllocateShardRef.set(Decision.NO); + assertSame(clusterState, allocationService.reroute(clusterState, "test", ActionListener.noop())); // allocation not possible, no + // movement + canAllocateShardRef.set(Decision.YES); + + // The next reroute starts moving shards to node-2 and node-3, but interleaves the decisions between node-0 and node-1 for fairness. + // There's an inbound throttle of 1 but no outbound throttle, so without the interleaving one node would relocate 2 shards. + final var reroutedState = allocationService.reroute(clusterState, "test", ActionListener.noop()); + assertThat(reroutedState.getRoutingNodes().node("node-0").shardsWithState(ShardRoutingState.RELOCATING), hasSize(1)); + assertThat(reroutedState.getRoutingNodes().node("node-1").shardsWithState(ShardRoutingState.RELOCATING), hasSize(1)); + } + + private static void reconcile(RoutingAllocation routingAllocation, DesiredBalance desiredBalance) { + new DesiredBalanceReconciler(desiredBalance, routingAllocation, new NodeAllocationOrdering()).run(); + } + + private static AllocationService createTestAllocationService( + Consumer allocationConsumer, + AllocationDecider... 
allocationDeciders + ) { + return createTestAllocationService( + allocationConsumer, + () -> ClusterInfo.EMPTY, + () -> SnapshotShardSizeInfo.EMPTY, + allocationDeciders + ); + } + + private static AllocationService createTestAllocationService( + Consumer allocationConsumer, + ClusterInfoService clusterInfoService, + SnapshotsInfoService snapshotsInfoService, + AllocationDecider... allocationDeciders + ) { + final var allocationService = new AllocationService(new AllocationDeciders(List.of(allocationDeciders)), new ShardsAllocator() { + @Override + public void allocate(RoutingAllocation allocation) { + allocationConsumer.accept(allocation); + } + + @Override + public ShardAllocationDecision decideShardAllocation(ShardRouting shard, RoutingAllocation allocation) { + throw new AssertionError("should not be called"); + } + }, clusterInfoService, snapshotsInfoService); + allocationService.setExistingShardsAllocators(Map.of(GatewayAllocator.ALLOCATOR_NAME, new NoOpExistingShardsAllocator())); + return allocationService; + } + + private static class NoOpExistingShardsAllocator implements ExistingShardsAllocator { + @Override + public void beforeAllocation(RoutingAllocation allocation) {} + + @Override + public void afterPrimariesBeforeReplicas(RoutingAllocation allocation) {} + + @Override + public void allocateUnassigned( + ShardRouting shardRouting, + RoutingAllocation allocation, + UnassignedAllocationHandler unassignedAllocationHandler + ) {} + + @Override + public AllocateUnassignedDecision explainUnassignedShardAllocation( + ShardRouting unassignedShard, + RoutingAllocation routingAllocation + ) { + throw new AssertionError("should not be called"); + } + + @Override + public void cleanCaches() {} + + @Override + public void applyStartedShards(List startedShards, RoutingAllocation allocation) {} + + @Override + public void applyFailedShards(List failedShards, RoutingAllocation allocation) {} + + @Override + public int getNumberOfInFlightFetches() { + return 0; + } + 
} + + private static DesiredBalance desiredBalance(ClusterState clusterState, BiPredicate isDesiredPredicate) { + return new DesiredBalance( + 1, + StreamSupport.stream(clusterState.routingTable().spliterator(), false) + .flatMap(indexRoutingTable -> IntStream.range(0, indexRoutingTable.size()).mapToObj(indexRoutingTable::shard)) + .collect( + Collectors.toMap( + IndexShardRoutingTable::shardId, + indexShardRoutingTable -> clusterState.nodes() + .stream() + .map(DiscoveryNode::getId) + .filter(nodeId -> isDesiredPredicate.test(indexShardRoutingTable.shardId(), nodeId)) + .collect(Collectors.collectingAndThen(Collectors.toSet(), set -> new ShardAssignment(set, set.size(), 0, 0))) + ) + ) + ); + } + + private static DiscoveryNodes discoveryNodes(int nodeCount) { + final var discoveryNodes = DiscoveryNodes.builder(); + for (var i = 0; i < nodeCount; i++) { + final var transportAddress = buildNewFakeTransportAddress(); + final var discoveryNode = new DiscoveryNode( + "node-" + i, + "node-" + i, + UUIDs.randomBase64UUID(random()), + transportAddress.address().getHostString(), + transportAddress.getAddress(), + transportAddress, + Map.of(), + Set.of(DiscoveryNodeRole.MASTER_ROLE, DiscoveryNodeRole.DATA_ROLE), + Version.CURRENT + ); + discoveryNodes.add(discoveryNode); + } + discoveryNodes.masterNodeId("node-0").localNodeId("node-0"); + return discoveryNodes.build(); + } + + @BeforeClass + public static void populateCreationDates() { + creationDates = randomArray(5, 5, Long[]::new, ESTestCase::randomNonNegativeLong); + } + + // use relatively small set of creation dates so that they will occasionally be equal + private static Long[] creationDates; + + private static IndexMetadata randomPriorityIndex(String name, int numberOfShards, int numberOfReplicas) { + return IndexMetadata.builder(name) + .settings( + Settings.builder() + .put(SETTING_NUMBER_OF_SHARDS, numberOfShards) + .put(SETTING_NUMBER_OF_REPLICAS, numberOfReplicas) + .put(SETTING_INDEX_VERSION_CREATED.getKey(), 
Version.CURRENT) + .put(IndexMetadata.INDEX_PRIORITY_SETTING.getKey(), between(1, 5)) + .put(IndexMetadata.SETTING_CREATION_DATE, randomFrom(creationDates)) + ) + .system(randomBoolean()) + .build(); + } + + private static Settings throttleSettings() { + return Settings.builder() + .put(CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 1) + .put(CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING.getKey(), 1) + .put(CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_OUTGOING_RECOVERIES_SETTING.getKey(), 1000) + .build(); + } +} diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceShardsAllocatorTests.java new file mode 100644 index 0000000000000..a4ed26dc7b74b --- /dev/null +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceShardsAllocatorTests.java @@ -0,0 +1,466 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. 
+ */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.apache.lucene.util.SetOnce; +import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; +import org.elasticsearch.cluster.ClusterInfo; +import org.elasticsearch.cluster.ClusterName; +import org.elasticsearch.cluster.ClusterState; +import org.elasticsearch.cluster.ClusterStateUpdateTask; +import org.elasticsearch.cluster.block.ClusterBlocks; +import org.elasticsearch.cluster.metadata.IndexMetadata; +import org.elasticsearch.cluster.metadata.Metadata; +import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.cluster.node.DiscoveryNodeRole; +import org.elasticsearch.cluster.node.DiscoveryNodes; +import org.elasticsearch.cluster.routing.RoutingTable; +import org.elasticsearch.cluster.routing.ShardRouting; +import org.elasticsearch.cluster.routing.UnassignedInfo; +import org.elasticsearch.cluster.routing.allocation.AllocationService; +import org.elasticsearch.cluster.routing.allocation.ExistingShardsAllocator; +import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; +import org.elasticsearch.cluster.routing.allocation.ShardAllocationDecision; +import org.elasticsearch.cluster.routing.allocation.command.MoveAllocationCommand; +import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders; +import org.elasticsearch.cluster.service.ClusterApplierService; +import org.elasticsearch.cluster.service.ClusterService; +import org.elasticsearch.cluster.service.FakeThreadPoolMasterService; +import org.elasticsearch.common.UUIDs; +import org.elasticsearch.common.settings.ClusterSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.concurrent.DeterministicTaskQueue; +import org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor; +import org.elasticsearch.gateway.GatewayAllocator; +import org.elasticsearch.snapshots.SnapshotShardSizeInfo; +import 
org.elasticsearch.test.ClusterServiceUtils; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.threadpool.TestThreadPool; + +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.BiConsumer; +import java.util.function.Consumer; +import java.util.function.Predicate; + +import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_INDEX_VERSION_CREATED; +import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS; +import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS; +import static org.hamcrest.Matchers.equalTo; + +public class DesiredBalanceShardsAllocatorTests extends ESTestCase { + + public void testGatewayAllocatorPreemptsAllocation() { + testAllocate( + (allocation, unassignedAllocationHandler) -> unassignedAllocationHandler.initialize( + allocation.nodes().getLocalNodeId(), + null, + 0L, + allocation.changes() + ), + routingTable -> assertTrue(routingTable.index("test-index").shard(0).primaryShard().assignedToNode()) + ); + } + + public void testGatewayAllocatorStillFetching() { + testAllocate( + (allocation, unassignedAllocationHandler) -> unassignedAllocationHandler.removeAndIgnore( + UnassignedInfo.AllocationStatus.FETCHING_SHARD_DATA, + allocation.changes() + ), + routingTable -> { + var shardRouting = routingTable.shardRoutingTable("test-index", 0).primaryShard(); + assertFalse(shardRouting.assignedToNode()); + assertThat( + shardRouting.unassignedInfo().getLastAllocationStatus(), + equalTo(UnassignedInfo.AllocationStatus.FETCHING_SHARD_DATA) + ); + } + ); + } + + public void testGatewayAllocatorDoesNothing() { + testAllocate((allocation, unassignedAllocationHandler) -> {}, routingTable -> { + var shardRouting = 
routingTable.shardRoutingTable("test-index", 0).primaryShard(); + assertTrue(shardRouting.assignedToNode());// assigned by a followup reconciliation + assertThat(shardRouting.unassignedInfo().getLastAllocationStatus(), equalTo(UnassignedInfo.AllocationStatus.NO_ATTEMPT)); + }); + } + + public void testAllocate( + BiConsumer<RoutingAllocation, ExistingShardsAllocator.UnassignedAllocationHandler> allocateUnassigned, + Consumer<RoutingTable> verifier + ) { + var deterministicTaskQueue = new DeterministicTaskQueue(); + var threadPool = deterministicTaskQueue.getThreadPool(); + + var localNode = createDiscoveryNode("node-1"); + var initialState = ClusterState.builder(new ClusterName(ClusterServiceUtils.class.getSimpleName())) + .nodes(DiscoveryNodes.builder().add(localNode).localNodeId(localNode.getId()).masterNodeId(localNode.getId())) + .blocks(ClusterBlocks.EMPTY_CLUSTER_BLOCK) + .build(); + + var clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + var clusterService = new ClusterService( + Settings.EMPTY, + clusterSettings, + new FakeThreadPoolMasterService("node-1", "test", threadPool, deterministicTaskQueue::scheduleNow), + new ClusterApplierService("node-1", Settings.EMPTY, clusterSettings, threadPool) { + @Override + protected PrioritizedEsThreadPoolExecutor createThreadPoolExecutor() { + return deterministicTaskQueue.getPrioritizedEsThreadPoolExecutor(); + } + } + ); + clusterService.getClusterApplierService().setInitialState(initialState); + clusterService.setNodeConnectionsService(ClusterServiceUtils.createNoOpNodeConnectionsService()); + clusterService.getMasterService() + .setClusterStatePublisher(ClusterServiceUtils.createClusterStatePublisher(clusterService.getClusterApplierService())); + clusterService.getMasterService().setClusterStateSupplier(clusterService.getClusterApplierService()::state); + clusterService.start(); + + var allocationServiceRef = new SetOnce<AllocationService>(); + var reconcileAction = new DesiredBalanceShardsAllocator.DesiredBalanceReconcilerAction() { + @Override + public ClusterState 
apply(ClusterState clusterState, Consumer<RoutingAllocation> routingAllocationAction) { + return allocationServiceRef.get().executeWithRoutingAllocation(clusterState, "reconcile", routingAllocationAction); + } + }; + + var allocationService = createAllocationService( + new DesiredBalanceShardsAllocator(createShardsAllocator(), threadPool, clusterService, reconcileAction), + createGatewayAllocator(allocateUnassigned) + ); + allocationServiceRef.set(allocationService); + + var listenerCalled = new AtomicBoolean(false); + clusterService.submitUnbatchedStateUpdateTask("test", new ClusterStateUpdateTask() { + @Override + public ClusterState execute(ClusterState currentState) { + var indexMetadata = createIndex("test-index"); + var newState = ClusterState.builder(currentState) + .metadata(Metadata.builder(currentState.metadata()).put(indexMetadata, true)) + .routingTable(RoutingTable.builder(currentState.routingTable()).addAsNew(indexMetadata)) + .build(); + return allocationService.reroute( + newState, + "test", + ActionListener.wrap( + response -> listenerCalled.set(true), + exception -> { throw new AssertionError("should not happen in test", exception); } + ) + ); + } + + @Override + public void onFailure(Exception e) { + throw new AssertionError(e); + } + }); + deterministicTaskQueue.runAllTasks(); + + try { + assertTrue(listenerCalled.get()); + verifier.accept(clusterService.state().routingTable()); + } finally { + clusterService.close(); + } + } + + public void testCallListenersOnlyAfterProducingFreshInput() throws InterruptedException { + + var reconciliations = new AtomicInteger(0); + var listenersCalled = new CountDownLatch(2); + var clusterStateUpdatesExecuted = new CountDownLatch(2); + + var discoveryNode = createDiscoveryNode("node-0"); + var initialState = ClusterState.builder(ClusterName.DEFAULT) + .nodes(DiscoveryNodes.builder().add(discoveryNode).localNodeId(discoveryNode.getId()).masterNodeId(discoveryNode.getId())) + .build(); + + var threadPool = new 
TestThreadPool(getTestName()); + var clusterService = ClusterServiceUtils.createClusterService(initialState, threadPool); + var allocationServiceRef = new SetOnce<AllocationService>(); + var reconcileAction = new DesiredBalanceShardsAllocator.DesiredBalanceReconcilerAction() { + @Override + public ClusterState apply(ClusterState clusterState, Consumer<RoutingAllocation> routingAllocationAction) { + reconciliations.incrementAndGet(); + return allocationServiceRef.get().executeWithRoutingAllocation(clusterState, "reconcile", routingAllocationAction); + } + }; + + var gatewayAllocator = createGatewayAllocator(); + var shardsAllocator = createShardsAllocator(); + var desiredBalanceShardsAllocator = new DesiredBalanceShardsAllocator( + shardsAllocator, + threadPool, + clusterService, + new DesiredBalanceComputer(shardsAllocator) { + @Override + public DesiredBalance compute( + DesiredBalance previousDesiredBalance, + DesiredBalanceInput desiredBalanceInput, + Queue<List<MoveAllocationCommand>> pendingDesiredBalanceMoves, + Predicate<DesiredBalanceInput> isFresh + ) { + try { + // simulate slow balance computation + assertTrue(clusterStateUpdatesExecuted.await(5, TimeUnit.SECONDS)); + } catch (InterruptedException e) { + throw new AssertionError(e); + } + return super.compute(previousDesiredBalance, desiredBalanceInput, pendingDesiredBalanceMoves, isFresh); + } + }, + reconcileAction + ); + var allocationService = createAllocationService(desiredBalanceShardsAllocator, gatewayAllocator); + allocationServiceRef.set(allocationService); + + class CreateIndexTask extends ClusterStateUpdateTask { + private final String indexName; + + private CreateIndexTask(String indexName) { + this.indexName = indexName; + } + + @Override + public ClusterState execute(ClusterState currentState) throws Exception { + var indexMetadata = createIndex(indexName); + var newState = ClusterState.builder(currentState) + .metadata(Metadata.builder(currentState.metadata()).put(indexMetadata, true)) + .routingTable(RoutingTable.builder(currentState.routingTable()).addAsNew(indexMetadata)) 
+ .build(); + return allocationService.reroute(newState, "test", ActionListener.wrap(response -> { + assertThat( + "All shards should be initializing by the time listener is called", + clusterService.state().getRoutingTable().index(indexName).primaryShardsUnassigned(), + equalTo(0) + ); + assertThat(reconciliations.get(), equalTo(1)); + listenersCalled.countDown(); + }, exception -> { throw new AssertionError("Should not happen in test", exception); })); + } + + @Override + public void clusterStateProcessed(ClusterState initialState, ClusterState newState) { + clusterStateUpdatesExecuted.countDown(); + } + + @Override + public void onFailure(Exception e) { + throw new AssertionError(e); + } + } + + clusterService.submitUnbatchedStateUpdateTask("test", new CreateIndexTask("index-1")); + clusterService.submitUnbatchedStateUpdateTask("test", new CreateIndexTask("index-2")); + + try { + assertTrue(listenersCalled.await(10, TimeUnit.SECONDS)); + assertThat("Expected single reconciliation after both state updates", reconciliations.get(), equalTo(1)); + } finally { + clusterService.close(); + terminate(threadPool); + } + } + + public void testFailListenersOnNoLongerMasterException() throws InterruptedException { + + var listenersCalled = new CountDownLatch(1); + var newMasterElected = new CountDownLatch(1); + var clusterStateUpdatesExecuted = new CountDownLatch(1); + + var node1 = createDiscoveryNode("node-1"); + var node2 = createDiscoveryNode("node-2"); + var initial = ClusterState.builder(ClusterName.DEFAULT) + .nodes(DiscoveryNodes.builder().add(node1).add(node2).localNodeId(node1.getId()).masterNodeId(node1.getId())) + .build(); + + var threadPool = new TestThreadPool(getTestName()); + var clusterService = ClusterServiceUtils.createClusterService(initial, threadPool); + var allocationServiceRef = new SetOnce<AllocationService>(); + var reconcileAction = new DesiredBalanceShardsAllocator.DesiredBalanceReconcilerAction() { + @Override + public ClusterState apply(ClusterState 
clusterState, Consumer<RoutingAllocation> routingAllocationAction) { + return allocationServiceRef.get().executeWithRoutingAllocation(clusterState, "reconcile", routingAllocationAction); + } + }; + + var gatewayAllocator = createGatewayAllocator(); + var shardsAllocator = createShardsAllocator(); + var desiredBalanceShardsAllocator = new DesiredBalanceShardsAllocator( + shardsAllocator, + threadPool, + clusterService, + new DesiredBalanceComputer(shardsAllocator) { + @Override + public DesiredBalance compute( + DesiredBalance previousDesiredBalance, + DesiredBalanceInput desiredBalanceInput, + Queue<List<MoveAllocationCommand>> pendingDesiredBalanceMoves, + Predicate<DesiredBalanceInput> isFresh + ) { + try { + // fake slow balance computation + assertTrue(newMasterElected.await(5, TimeUnit.SECONDS)); + } catch (InterruptedException e) { + throw new AssertionError(e); + } + return super.compute(previousDesiredBalance, desiredBalanceInput, pendingDesiredBalanceMoves, isFresh); + } + }, + reconcileAction + ); + + var allocationService = createAllocationService(desiredBalanceShardsAllocator, gatewayAllocator); + allocationServiceRef.set(allocationService); + + clusterService.submitUnbatchedStateUpdateTask("test", new ClusterStateUpdateTask() { + @Override + public ClusterState execute(ClusterState currentState) { + var indexMetadata = createIndex("index-1"); + var newState = ClusterState.builder(currentState) + .metadata(Metadata.builder(currentState.metadata()).put(indexMetadata, true)) + .routingTable(RoutingTable.builder(currentState.routingTable()).addAsNew(indexMetadata)) + .build(); + return allocationService.reroute( + newState, + "test", + ActionListener.wrap( + response -> { throw new AssertionError("Should not happen in test"); }, + exception -> listenersCalled.countDown() + ) + ); + } + + @Override + public void clusterStateProcessed(ClusterState initialState, ClusterState newState) { + clusterStateUpdatesExecuted.countDown(); + } + + @Override + public void onFailure(Exception e) { + throw new AssertionError(e); + } + }); 
+ + // await + assertTrue(clusterStateUpdatesExecuted.await(5, TimeUnit.SECONDS)); + + var noLongerMaster = ClusterState.builder(clusterService.state()) + .nodes(DiscoveryNodes.builder().add(node1).add(node2).localNodeId(node1.getId()).masterNodeId(node2.getId())) + .build(); + ClusterServiceUtils.setState(clusterService, noLongerMaster); + + newMasterElected.countDown(); + + try { + assertTrue(listenersCalled.await(10, TimeUnit.SECONDS)); + } finally { + clusterService.close(); + terminate(threadPool); + } + } + + private static DiscoveryNode createDiscoveryNode(String nodeId) { + var transportAddress = buildNewFakeTransportAddress(); + return new DiscoveryNode( + nodeId, + nodeId, + UUIDs.randomBase64UUID(random()), + transportAddress.address().getHostString(), + transportAddress.getAddress(), + transportAddress, + Map.of(), + Set.of(DiscoveryNodeRole.MASTER_ROLE, DiscoveryNodeRole.DATA_ROLE), + Version.CURRENT + ); + } + + private static IndexMetadata createIndex(String name) { + return IndexMetadata.builder(name) + .settings( + Settings.builder() + .put(SETTING_NUMBER_OF_SHARDS, 1) + .put(SETTING_NUMBER_OF_REPLICAS, 0) + .put(SETTING_INDEX_VERSION_CREATED.getKey(), Version.CURRENT) + ) + .build(); + } + + private static AllocationService createAllocationService( + DesiredBalanceShardsAllocator desiredBalanceShardsAllocator, + GatewayAllocator gatewayAllocator + ) { + return new AllocationService( + new AllocationDeciders(List.of()), + gatewayAllocator, + desiredBalanceShardsAllocator, + () -> ClusterInfo.EMPTY, + () -> SnapshotShardSizeInfo.EMPTY + ); + } + + private static GatewayAllocator createGatewayAllocator() { + return createGatewayAllocator(DesiredBalanceShardsAllocatorTests::initialize); + } + + private static void initialize(RoutingAllocation allocation, ExistingShardsAllocator.UnassignedAllocationHandler handler) { + handler.initialize(allocation.nodes().getLocalNodeId(), null, 0L, allocation.changes()); + } + + private static GatewayAllocator 
createGatewayAllocator( + BiConsumer<RoutingAllocation, ExistingShardsAllocator.UnassignedAllocationHandler> allocateUnassigned + ) { + return new GatewayAllocator() { + + @Override + public void beforeAllocation(RoutingAllocation allocation) {} + + @Override + public void allocateUnassigned( + ShardRouting shardRouting, + RoutingAllocation allocation, + UnassignedAllocationHandler unassignedAllocationHandler + ) { + allocateUnassigned.accept(allocation, unassignedAllocationHandler); + } + + @Override + public void afterPrimariesBeforeReplicas(RoutingAllocation allocation) {} + }; + } + + private static ShardsAllocator createShardsAllocator() { + return new ShardsAllocator() { + @Override + public void allocate(RoutingAllocation allocation) { + var dataNodeId = allocation.nodes().getDataNodes().values().iterator().next().getId(); + var unassignedIterator = allocation.routingNodes().unassigned().iterator(); + while (unassignedIterator.hasNext()) { + unassignedIterator.next(); + unassignedIterator.initialize(dataNodeId, null, 0L, allocation.changes()); + } + } + + @Override + public ShardAllocationDecision decideShardAllocation(ShardRouting shard, RoutingAllocation allocation) { + throw new AssertionError("only used for allocation explain"); + } + }; + } +} diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceStatsTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceStatsTests.java new file mode 100644 index 0000000000000..721dec54dedf0 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceStatsTests.java @@ -0,0 +1,76 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. 
+ */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.elasticsearch.common.Strings; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.test.AbstractWireSerializingTestCase; + +import java.io.IOException; +import java.util.Locale; + +import static org.hamcrest.Matchers.equalTo; + +public class DesiredBalanceStatsTests extends AbstractWireSerializingTestCase<DesiredBalanceStats> { + + @Override + protected Writeable.Reader<DesiredBalanceStats> instanceReader() { + return DesiredBalanceStats::readFrom; + } + + @Override + protected DesiredBalanceStats createTestInstance() { + return new DesiredBalanceStats( + randomNonNegativeLong(), + randomBoolean(), + randomNonNegativeLong(), + randomNonNegativeLong(), + randomNonNegativeLong(), + randomNonNegativeLong(), + randomNonNegativeLong(), + randomNonNegativeLong() + ); + } + + @Override + protected DesiredBalanceStats mutateInstance(DesiredBalanceStats instance) throws IOException { + return createTestInstance(); + } + + public void testToXContent() { + var instance = createTestInstance(); + assertThat( + Strings.toString(instance, true, false), + equalTo( + String.format( + Locale.ROOT, + """ + { + "computation_active" : %b, + "computation_submitted" : %d, + "computation_executed" : %d, + "computation_converged" : %d, + "computation_iterations" : %d, + "computation_converged_index" : %d, + "computation_time_in_millis" : %d, + "reconciliation_time_in_millis" : %d + }""", + instance.computationActive(), + instance.computationSubmitted(), + instance.computationExecuted(), + instance.computationConverged(), + instance.computationIterations(), + instance.lastConvergedIndex(), + instance.cumulativeComputationTime(), + instance.cumulativeReconciliationTime() + ) + ) + ); + } +} diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/NodeAllocationOrderingTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/NodeAllocationOrderingTests.java 
new file mode 100644 index 0000000000000..4b60aeb408efa --- /dev/null +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/NodeAllocationOrderingTests.java @@ -0,0 +1,47 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.elasticsearch.test.ESTestCase; + +import java.util.Set; + +import static org.hamcrest.Matchers.contains; +import static org.hamcrest.Matchers.containsInAnyOrder; +import static org.hamcrest.Matchers.equalTo; + +public class NodeAllocationOrderingTests extends ESTestCase { + + public void testSortNodeIds() { + var order = new NodeAllocationOrdering(); + order.recordAllocation("node-1"); + order.recordAllocation("node-2"); + + var nodeIds = order.sort(Set.of("node-1", "node-2", "node-3", "node-4", "node-5")); + + assertThat(nodeIds.get(4), equalTo("node-2"));// as this node received the most recent allocation + assertThat(nodeIds.get(3), equalTo("node-1")); + assertThat(nodeIds.subList(0, 3), containsInAnyOrder("node-3", "node-4", "node-5"));// no recent allocations, any could be used + } + + public void testRetainOnlyAliveNodes() { + var order = new NodeAllocationOrdering(); + order.recordAllocation("node-1"); + order.recordAllocation("node-2"); + order.recordAllocation("node-3"); + + order.retainNodes(Set.of("node-1", "node-2"));// simulate node-3 leaving the cluster + + var nodeIds = order.sort(Set.of("node-1", "node-2", "node-3"));// node-3 is back + + // node-3 should be pushed to the beginning of the list as its allocation history was cleaned when it left. 
+ // now we assume no recent allocations on it + assertThat(nodeIds, contains("node-3", "node-1", "node-2")); + } +} diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/PendingListenersQueueTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/PendingListenersQueueTests.java new file mode 100644 index 0000000000000..977e42a1f2c62 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/PendingListenersQueueTests.java @@ -0,0 +1,73 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.elasticsearch.action.ActionListener; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.threadpool.TestThreadPool; + +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; + +import static org.hamcrest.Matchers.equalTo; + +public class PendingListenersQueueTests extends ESTestCase { + + public void testShouldExecuteOnlyCompleted() throws InterruptedException { + var threadPool = new TestThreadPool(getTestName()); + var queue = new PendingListenersQueue(threadPool); + var executed = new CountDownLatch(2); + + queue.add(1, ActionListener.wrap(executed::countDown)); + queue.add(2, ActionListener.wrap(executed::countDown)); + queue.add(3, ActionListener.wrap(() -> fail("Should not complete in test"))); + queue.complete(2); + + try { + assertThat(executed.await(1, TimeUnit.SECONDS), equalTo(true)); + } finally { + terminate(threadPool); + } + } + + public void testShouldAdvanceOnly() throws InterruptedException { + var threadPool = new 
TestThreadPool(getTestName()); + var queue = new PendingListenersQueue(threadPool); + var executed = new CountDownLatch(2); + + queue.add(1, ActionListener.wrap(executed::countDown)); + queue.add(2, ActionListener.wrap(executed::countDown)); + queue.add(3, ActionListener.wrap(() -> fail("Should not complete in test"))); + queue.complete(2); + queue.complete(1); + + try { + assertThat(executed.await(1, TimeUnit.SECONDS), equalTo(true)); + assertThat(queue.getCompletedIndex(), equalTo(2L)); + } finally { + terminate(threadPool); + } + } + + public void testShouldExecuteAllAsNonMaster() throws InterruptedException { + var threadPool = new TestThreadPool(getTestName()); + var queue = new PendingListenersQueue(threadPool); + var executed = new CountDownLatch(2); + + queue.add(1, ActionListener.wrap(ignored -> fail("Should not complete in test"), exception -> executed.countDown())); + queue.add(2, ActionListener.wrap(ignored -> fail("Should not complete in test"), exception -> executed.countDown())); + queue.completeAllAsNotMaster(); + + try { + assertThat(executed.await(1, TimeUnit.SECONDS), equalTo(true)); + } finally { + terminate(threadPool); + } + } +} diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/AllocationDecidersTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/AllocationDecidersTests.java index 0772e0fb905e5..533a499ce33ee 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/AllocationDecidersTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/AllocationDecidersTests.java @@ -28,6 +28,8 @@ import java.util.Collection; import java.util.List; +import java.util.Optional; +import java.util.Set; import static org.hamcrest.Matchers.equalTo; @@ -269,6 +271,58 @@ private Decision decision(RoutingAllocation allocation) { assertEquals(expectedDebugDecision, allocationDeciders.canForceAllocatePrimary(shardRouting, 
routingNode, allocation)); } + public void testGetForcedInitialShardAllocation() { + var deciders = new AllocationDeciders( + shuffledList( + List.of( + new AnyNodeInitialShardAllocationDecider(), + new AnyNodeInitialShardAllocationDecider(), + new AnyNodeInitialShardAllocationDecider() + ) + ) + ); + + assertThat( + deciders.getForcedInitialShardAllocationToNodes(createShardRouting(), createRoutingAllocation(deciders)), + equalTo(Optional.empty()) + ); + } + + public void testGetForcedInitialShardAllocationToFixedNode() { + var deciders = new AllocationDeciders( + shuffledList( + List.of( + new AnyNodeInitialShardAllocationDecider(), + new FixedNodesInitialShardAllocationDecider(Set.of("node-1", "node-2")), + new AnyNodeInitialShardAllocationDecider() + ) + ) + ); + + assertThat( + deciders.getForcedInitialShardAllocationToNodes(createShardRouting(), createRoutingAllocation(deciders)), + equalTo(Optional.of(Set.of("node-1", "node-2"))) + ); + } + + public void testGetForcedInitialShardAllocationToFixedNodeFromMultipleDeciders() { + var deciders = new AllocationDeciders( + shuffledList( + List.of( + new AnyNodeInitialShardAllocationDecider(), + new FixedNodesInitialShardAllocationDecider(Set.of("node-1", "node-2")), + new FixedNodesInitialShardAllocationDecider(Set.of("node-2", "node-3")), + new AnyNodeInitialShardAllocationDecider() + ) + ) + ); + + assertThat( + deciders.getForcedInitialShardAllocationToNodes(createShardRouting(), createRoutingAllocation(deciders)), + equalTo(Optional.of(Set.of("node-2"))) + ); + } + private static ShardRouting createShardRouting(Index index) { return ShardRouting.newUnassigned( new ShardId(index, 0), @@ -277,4 +331,29 @@ private static ShardRouting createShardRouting(Index index) { new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, "_message") ); } + + private static ShardRouting createShardRouting() { + return createShardRouting(new Index("test", "testUUID")); + } + + private static RoutingAllocation 
createRoutingAllocation(AllocationDeciders deciders) { + return new RoutingAllocation(deciders, ClusterState.builder(new ClusterName("test")).build(), null, null, 0L); + } + + private static final class AnyNodeInitialShardAllocationDecider extends AllocationDecider { + + } + + private static final class FixedNodesInitialShardAllocationDecider extends AllocationDecider { + private final Set<String> initialNodeIds; + + private FixedNodesInitialShardAllocationDecider(Set<String> initialNodeIds) { + this.initialNodeIds = initialNodeIds; + } + + @Override + public Optional<Set<String>> getForcedInitialShardAllocationToNodes(ShardRouting shardRouting, RoutingAllocation allocation) { + return Optional.of(initialNodeIds); + } + } } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderTests.java index 480c5d23dde5b..d063e4975f605 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.routing.allocation.decider; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterInfo; import org.elasticsearch.cluster.ClusterInfoService; import org.elasticsearch.cluster.ClusterName; @@ -154,7 +155,7 @@ private void doTestDiskThreshold(boolean testMaxHeadroom) { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logShardStates(clusterState); // Primary shard should be initializing, replica should not @@ -179,7 +180,7 @@ private void 
doTestDiskThreshold(boolean testMaxHeadroom) { logger.info("--> adding node3"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logShardStates(clusterState); // Assert that the replica is initialized now that node3 is available with enough space @@ -228,7 +229,7 @@ private void doTestDiskThreshold(boolean testMaxHeadroom) { EmptySnapshotsInfoService.INSTANCE ); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logShardStates(clusterState); // Shards remain started @@ -271,7 +272,7 @@ private void doTestDiskThreshold(boolean testMaxHeadroom) { EmptySnapshotsInfoService.INSTANCE ); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logShardStates(clusterState); // Shards remain started @@ -284,7 +285,7 @@ private void doTestDiskThreshold(boolean testMaxHeadroom) { logger.info("--> adding node4"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node4"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logShardStates(clusterState); // Shards remain started @@ -364,7 +365,7 @@ public void testDiskThresholdWithAbsoluteSizes() { .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logShardStates(clusterState); // Primary should initialize, even though both nodes are over the limit initialize @@ -397,7 +398,7 @@ public void 
testDiskThresholdWithAbsoluteSizes() { EmptySnapshotsInfoService.INSTANCE ); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logShardStates(clusterState); // Now the replica should be able to initialize @@ -420,7 +421,7 @@ public void testDiskThresholdWithAbsoluteSizes() { logger.info("--> adding node3"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node3"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logShardStates(clusterState); // Assert that the replica is initialized now that node3 is available with enough space @@ -461,7 +462,7 @@ public void testDiskThresholdWithAbsoluteSizes() { EmptySnapshotsInfoService.INSTANCE ); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logShardStates(clusterState); // Shards remain started @@ -494,7 +495,7 @@ public void testDiskThresholdWithAbsoluteSizes() { EmptySnapshotsInfoService.INSTANCE ); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logShardStates(clusterState); // Shards remain started @@ -507,7 +508,7 @@ public void testDiskThresholdWithAbsoluteSizes() { logger.info("--> adding node4"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node4"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logShardStates(clusterState); // Shards remain started @@ -530,7 +531,7 @@ public void testDiskThresholdWithAbsoluteSizes() { logger.info("--> adding node5"); clusterState = 
ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).add(newNode("node5"))).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logShardStates(clusterState); // Shards remain started on node3 and node4 @@ -626,7 +627,7 @@ private void doTestDiskThresholdWithShardSizes(boolean testMaxHeadroom) { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - routingTable = strategy.reroute(clusterState, "reroute").routingTable(); + routingTable = strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); logger.info("--> start the shards (primaries)"); routingTable = startInitializingShardsAndReroute(strategy, clusterState).routingTable(); @@ -705,7 +706,7 @@ public void testUnknownDiskUsage() { // automatically ignore single-node clusters ) .build(); - routingTable = strategy.reroute(clusterState, "reroute").routingTable(); + routingTable = strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); // Shard can be allocated to node1, even though it only has 25% free, @@ -820,7 +821,7 @@ private void doTestShardRelocationsTakenIntoAccount(boolean testMaxHeadroom) { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logShardStates(clusterState); // shards should be initializing @@ -840,7 +841,7 @@ private void doTestShardRelocationsTakenIntoAccount(boolean testMaxHeadroom) { AllocationCommand moveAllocationCommand = new 
MoveAllocationCommand("test", 0, "node2", "node3"); AllocationCommands cmds = new AllocationCommands(moveAllocationCommand); - clusterState = strategy.reroute(clusterState, cmds, false, false).clusterState(); + clusterState = strategy.reroute(clusterState, cmds, false, false, false, ActionListener.noop()).clusterState(); logShardStates(clusterState); } @@ -870,8 +871,10 @@ private void doTestShardRelocationsTakenIntoAccount(boolean testMaxHeadroom) { final ClusterState clusterStateThatRejectsCommands = clusterState; assertThat( - expectThrows(IllegalArgumentException.class, () -> strategy.reroute(clusterStateThatRejectsCommands, cmds, false, false)) - .getMessage(), + expectThrows( + IllegalArgumentException.class, + () -> strategy.reroute(clusterStateThatRejectsCommands, cmds, false, false, false, ActionListener.noop()) + ).getMessage(), containsString( testMaxHeadroom ? "the node is above the low watermark cluster setting " @@ -885,8 +888,10 @@ private void doTestShardRelocationsTakenIntoAccount(boolean testMaxHeadroom) { clusterInfoReference.set(overfullClusterInfo); assertThat( - expectThrows(IllegalArgumentException.class, () -> strategy.reroute(clusterStateThatRejectsCommands, cmds, false, false)) - .getMessage(), + expectThrows( + IllegalArgumentException.class, + () -> strategy.reroute(clusterStateThatRejectsCommands, cmds, false, false, false, ActionListener.noop()) + ).getMessage(), containsString("the node has fewer free bytes remaining than the total size of all incoming shards") ); @@ -898,12 +903,13 @@ private void doTestShardRelocationsTakenIntoAccount(boolean testMaxHeadroom) { AllocationCommands cmds = new AllocationCommands(moveAllocationCommand); clusterState = startInitializingShardsAndReroute(strategy, clusterState); - clusterState = strategy.reroute(clusterState, cmds, false, false).clusterState(); + clusterState = strategy.reroute(clusterState, cmds, false, false, false, ActionListener.noop()).clusterState(); logShardStates(clusterState); 
clusterInfoReference.set(overfullClusterInfo); - strategy.reroute(clusterState, "foo"); // ensure reroute doesn't fail even though there is negative free space + strategy.reroute(clusterState, "foo", ActionListener.noop()); // ensure reroute doesn't fail even though there is negative free + // space } { @@ -1140,7 +1146,7 @@ private void doTestCanRemainWithShardRelocatingAway(boolean testMaxHeadroom) { ); // Ensure that the reroute call doesn't alter the routing table, since the first primary is relocating away // and therefore we will have sufficient disk space on node1. - ClusterState result = strategy.reroute(clusterState, "reroute"); + ClusterState result = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(result, equalTo(clusterState)); assertThat(result.routingTable().index("test").shard(0).primaryShard().state(), equalTo(STARTED)); assertThat(result.routingTable().index("test").shard(0).primaryShard().currentNodeId(), equalTo("node1")); @@ -1242,7 +1248,7 @@ private void doTestWatermarksEnabledForSingleDataNode(boolean testMaxHeadroom) { cis, EmptySnapshotsInfoService.INSTANCE ); - ClusterState result = strategy.reroute(clusterState, "reroute"); + ClusterState result = strategy.reroute(clusterState, "reroute", ActionListener.noop()); ShardRouting shardRouting = result.routingTable().index("test").shard(0).primaryShard(); assertThat(shardRouting.state(), equalTo(UNASSIGNED)); @@ -1425,7 +1431,7 @@ private void doTestDiskThresholdWithSnapshotShardSizes(boolean testMaxHeadroom) ); // reroute triggers snapshot shard size fetching - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); logShardStates(clusterState); // shard cannot be assigned yet as the snapshot shard size is unknown diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderUnitTests.java 
b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderUnitTests.java index a44271a3f06df..8150c3e055dd6 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderUnitTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderUnitTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.routing.allocation.decider; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterInfo; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; @@ -716,7 +717,7 @@ public void testSizeShrinkIndex() { AllocationService allocationService = createAllocationService(); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode("node1"))).build(); - clusterState = allocationService.reroute(clusterState, "foo"); + clusterState = allocationService.reroute(clusterState, "foo", ActionListener.noop()); clusterState = startShardsAndReroute( allocationService, @@ -795,7 +796,7 @@ public void testSizeShrinkIndex() { .routingTable(RoutingTable.builder(clusterState.routingTable()).remove("test").build()) .build(); - allocationService.reroute(clusterState, "foo"); + allocationService.reroute(clusterState, "foo", ActionListener.noop()); RoutingAllocation allocationWithMissingSourceIndex = new RoutingAllocation(null, clusterStateWithMissingSourceIndex, info, null, 0); assertEquals(42L, getExpectedShardSize(target, 42L, allocationWithMissingSourceIndex)); assertEquals(42L, getExpectedShardSize(target2, 42L, allocationWithMissingSourceIndex)); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/EnableAllocationShortCircuitTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/EnableAllocationShortCircuitTests.java index ee7e448d8e81e..1bddb47445114 100644 
--- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/EnableAllocationShortCircuitTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/EnableAllocationShortCircuitTests.java @@ -8,6 +8,7 @@ package org.elasticsearch.cluster.routing.allocation.decider; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterModule; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; @@ -94,7 +95,7 @@ public void testRebalancingAttemptedIfPermitted() { ), plugin ); - allocationService.reroute(clusterState, "reroute").routingTable(); + allocationService.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); assertThat(plugin.rebalanceAttempts, greaterThan(0)); } @@ -106,7 +107,7 @@ public void testRebalancingSkippedIfDisabled() { Settings.builder().put(CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), EnableAllocationDecider.Allocation.NONE.name()), plugin ); - allocationService.reroute(clusterState, "reroute").routingTable(); + allocationService.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); assertThat(plugin.rebalanceAttempts, equalTo(0)); } @@ -133,7 +134,7 @@ public void testRebalancingSkippedIfDisabledIncludingOnSpecificIndices() { Settings.builder().put(CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), EnableAllocationDecider.Rebalance.NONE.name()), plugin ); - allocationService.reroute(clusterState, "reroute").routingTable(); + allocationService.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); assertThat(plugin.rebalanceAttempts, equalTo(0)); } @@ -167,7 +168,7 @@ public void testRebalancingAttemptedIfDisabledButOverridenOnSpecificIndices() { Settings.builder().put(CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), EnableAllocationDecider.Rebalance.NONE.name()), plugin ); - allocationService.reroute(clusterState, "reroute").routingTable(); + 
allocationService.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); assertThat(plugin.rebalanceAttempts, greaterThan(0)); } @@ -190,7 +191,7 @@ public void testAllocationSkippedIfDisabled() { .nodes(DiscoveryNodes.builder().add(newNode("node1"))) .build(); - allocationService.reroute(clusterState, "reroute").routingTable(); + allocationService.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); assertThat(plugin.canAllocateAttempts, equalTo(0)); } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/EnableAllocationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/EnableAllocationTests.java index 0c82274c6f6d8..38f8777af7e9f 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/EnableAllocationTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/EnableAllocationTests.java @@ -13,6 +13,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -63,7 +64,7 @@ public void testClusterEnableNone() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - routingTable = strategy.reroute(clusterState, "reroute").routingTable(); + routingTable = strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(0)); @@ -90,7 +91,7 @@ public void testClusterEnableOnlyPrimaries() { clusterState = ClusterState.builder(clusterState) 
.nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - routingTable = strategy.reroute(clusterState, "reroute").routingTable(); + routingTable = strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(1)); @@ -127,7 +128,7 @@ public void testIndexEnableNone() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(1)); logger.info("--> start the shards (primaries)"); clusterState = startInitializingShardsAndReroute(strategy, clusterState); @@ -186,7 +187,7 @@ public void testEnableClusterBalance() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(4)); logger.info("--> start the shards (primaries)"); clusterState = startInitializingShardsAndReroute(strategy, clusterState); @@ -201,7 +202,7 @@ public void testEnableClusterBalance() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")).add(newNode("node3"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), 
equalTo(8)); assertThat(shardsWithState(clusterState.getRoutingNodes(), RELOCATING).size(), equalTo(0)); @@ -234,7 +235,7 @@ public void testEnableClusterBalance() { } clusterSettings.applySettings(clusterState.metadata().settings()); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat( "expected 6 shards to be started 2 to relocate useClusterSettings: " + useClusterSetting, shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), @@ -304,7 +305,7 @@ public void testEnableClusterBalanceNoReplicas() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), INITIALIZING).size(), equalTo(6)); logger.info("--> start the shards (primaries)"); clusterState = startInitializingShardsAndReroute(strategy, clusterState); @@ -315,7 +316,7 @@ public void testEnableClusterBalanceNoReplicas() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")).add(newNode("node3"))) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(6)); assertThat(shardsWithState(clusterState.getRoutingNodes(), RELOCATING).size(), equalTo(0)); metadata = clusterState.metadata(); @@ -355,7 +356,7 @@ public void testEnableClusterBalanceNoReplicas() { .build(); } clusterSettings.applySettings(clusterState.metadata().settings()); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); 
assertThat( "expected 4 primaries to be started and 2 to relocate useClusterSettings: " + useClusterSetting, shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/FilterAllocationDeciderTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/FilterAllocationDeciderTests.java index 489f07b63981d..832a488a30a1c 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/FilterAllocationDeciderTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/FilterAllocationDeciderTests.java @@ -8,6 +8,8 @@ package org.elasticsearch.cluster.routing.allocation.decider; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; +import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.EmptyClusterInfoService; @@ -18,6 +20,7 @@ import org.elasticsearch.cluster.routing.RecoverySource; import org.elasticsearch.cluster.routing.RoutingTable; import org.elasticsearch.cluster.routing.ShardRouting; +import org.elasticsearch.cluster.routing.UnassignedInfo; import org.elasticsearch.cluster.routing.allocation.AllocationService; import org.elasticsearch.cluster.routing.allocation.FailedShard; import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; @@ -26,18 +29,22 @@ import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.IndexScopedSettings; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.snapshots.EmptySnapshotsInfoService; import org.elasticsearch.test.gateway.TestGatewayAllocator; import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.Optional; +import java.util.Set; import static 
org.elasticsearch.cluster.metadata.IndexMetadata.INDEX_RESIZE_SOURCE_NAME; import static org.elasticsearch.cluster.metadata.IndexMetadata.INDEX_RESIZE_SOURCE_UUID; import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING; import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED; import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED; +import static org.hamcrest.Matchers.equalTo; public class FilterAllocationDeciderTests extends ESAllocationTestCase { @@ -103,7 +110,7 @@ public void testFilterInitialRecovery() { assertEquals("initial allocation of the index is only allowed on nodes [_id:\"node2\"]", decision.getExplanation()); } - state = service.reroute(state, "try allocate again"); + state = service.reroute(state, "try allocate again", ActionListener.noop()); routingTable = state.routingTable(); assertEquals(routingTable.index("idx").shard(0).primaryShard().state(), INITIALIZING); assertEquals(routingTable.index("idx").shard(0).primaryShard().currentNodeId(), "node2"); @@ -134,7 +141,11 @@ public void testFilterInitialRecovery() { ); // now bring back node1 and see it's assigned - state = service.reroute(ClusterState.builder(state).nodes(DiscoveryNodes.builder(state.nodes()).add(node1)).build(), "test"); + state = service.reroute( + ClusterState.builder(state).nodes(DiscoveryNodes.builder(state.nodes()).add(node1)).build(), + "test", + ActionListener.noop() + ); routingTable = state.routingTable(); assertEquals(routingTable.index("idx").shard(0).primaryShard().state(), INITIALIZING); assertEquals(routingTable.index("idx").shard(0).primaryShard().currentNodeId(), "node1"); @@ -194,7 +205,7 @@ private ClusterState createInitialClusterState(AllocationService service, Settin clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); - return service.reroute(clusterState, "reroute"); + return service.reroute(clusterState, "reroute", 
ActionListener.noop()); } public void testInvalidIPFilter() { @@ -279,4 +290,41 @@ public void testSettingsAcceptArrayOfValues() { "test ip validation" ); } + + public void testGetForcedInitialShardAllocationToNodes() { + var index = IndexMetadata.builder("index") + .settings( + Settings.builder() + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0) + .put("index.routing.allocation.initial_recovery._id", "node-1") + .put(IndexMetadata.SETTING_INDEX_UUID, "uuid") + .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT) + ) + .build(); + var clusterState = ClusterState.builder(new ClusterName("test-cluster")) + .nodes(DiscoveryNodes.builder().add(newNode("node-1")).add(newNode("node-2"))) + .metadata(Metadata.builder().put(index, false)) + .build(); + + var clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + var decider = new FilterAllocationDecider(Settings.EMPTY, clusterSettings); + var allocation = new RoutingAllocation(new AllocationDeciders(List.of(decider)), clusterState, null, null, 0); + + var localRecoveryShard = ShardRouting.newUnassigned( + new ShardId(index.getIndex(), 0), + true, + RecoverySource.LocalShardsRecoverySource.INSTANCE, + new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, "index created") + ); + assertThat(decider.getForcedInitialShardAllocationToNodes(localRecoveryShard, allocation), equalTo(Optional.of(Set.of("node-1")))); + + var newShard = ShardRouting.newUnassigned( + new ShardId(index.getIndex(), 0), + true, + RecoverySource.EmptyStoreRecoverySource.INSTANCE, + new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, "index created") + ); + assertThat(decider.getForcedInitialShardAllocationToNodes(newShard, allocation), equalTo(Optional.empty())); + } } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/NodeReplacementAllocationDeciderTests.java 
b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/NodeReplacementAllocationDeciderTests.java index 9aa0956c7dde3..4f5e6839d08bc 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/NodeReplacementAllocationDeciderTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/NodeReplacementAllocationDeciderTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.routing.allocation.decider; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.EmptyClusterInfoService; @@ -101,7 +102,11 @@ public void testNoReplacements() { } public void testCanForceAllocate() { - ClusterState state = prepareState(service.reroute(ClusterState.EMPTY_STATE, "initial state"), NODE_A.getId(), NODE_B.getName()); + ClusterState state = prepareState( + service.reroute(ClusterState.EMPTY_STATE, "initial state", ActionListener.noop()), + NODE_A.getId(), + NODE_B.getName() + ); RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, state, null, null, 0); RoutingNode routingNode = RoutingNodesHelper.routingNode(NODE_A.getId(), NODE_A, shard); allocation.debugDecision(true); @@ -144,7 +149,11 @@ public void testCanForceAllocate() { } public void testCannotRemainOnReplacedNode() { - ClusterState state = prepareState(service.reroute(ClusterState.EMPTY_STATE, "initial state"), NODE_A.getId(), NODE_B.getName()); + ClusterState state = prepareState( + service.reroute(ClusterState.EMPTY_STATE, "initial state", ActionListener.noop()), + NODE_A.getId(), + NODE_B.getName() + ); RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, state, null, null, 0); RoutingNode routingNode = RoutingNodesHelper.routingNode(NODE_A.getId(), NODE_A, shard); allocation.debugDecision(true); @@ -170,7 +179,11 @@ public void 
testCannotRemainOnReplacedNode() { } public void testCanAllocateToNeitherSourceNorTarget() { - ClusterState state = prepareState(service.reroute(ClusterState.EMPTY_STATE, "initial state"), NODE_A.getId(), NODE_B.getName()); + ClusterState state = prepareState( + service.reroute(ClusterState.EMPTY_STATE, "initial state", ActionListener.noop()), + NODE_A.getId(), + NODE_B.getName() + ); RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, state, null, null, 0); RoutingNode routingNode = RoutingNodesHelper.routingNode(NODE_A.getId(), NODE_A, shard); allocation.debugDecision(true); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/NodeShutdownAllocationDeciderTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/NodeShutdownAllocationDeciderTests.java index 3f6ce547f1adc..1cf9803caecfd 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/NodeShutdownAllocationDeciderTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/NodeShutdownAllocationDeciderTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.routing.allocation.decider; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.EmptyClusterInfoService; @@ -79,7 +80,7 @@ public class NodeShutdownAllocationDeciderTests extends ESAllocationTestCase { public void testCanAllocateShardsToRestartingNode() { ClusterState state = prepareState( - service.reroute(ClusterState.EMPTY_STATE, "initial state"), + service.reroute(ClusterState.EMPTY_STATE, "initial state", ActionListener.noop()), SingleNodeShutdownMetadata.Type.RESTART ); RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, state, null, null, 0); @@ -96,7 +97,7 @@ public void testCanAllocateShardsToRestartingNode() { 
public void testCannotAllocateShardsToRemovingNode() { ClusterState state = prepareState( - service.reroute(ClusterState.EMPTY_STATE, "initial state"), + service.reroute(ClusterState.EMPTY_STATE, "initial state", ActionListener.noop()), randomFrom(SingleNodeShutdownMetadata.Type.REMOVE, SingleNodeShutdownMetadata.Type.REPLACE) ); RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, state, null, null, 0); @@ -110,7 +111,7 @@ public void testCannotAllocateShardsToRemovingNode() { public void testShardsCanRemainOnRestartingNode() { ClusterState state = prepareState( - service.reroute(ClusterState.EMPTY_STATE, "initial state"), + service.reroute(ClusterState.EMPTY_STATE, "initial state", ActionListener.noop()), SingleNodeShutdownMetadata.Type.RESTART ); RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, state, null, null, 0); @@ -127,7 +128,7 @@ public void testShardsCanRemainOnRestartingNode() { public void testShardsCannotRemainOnRemovingNode() { ClusterState state = prepareState( - service.reroute(ClusterState.EMPTY_STATE, "initial state"), + service.reroute(ClusterState.EMPTY_STATE, "initial state", ActionListener.noop()), randomFrom(SingleNodeShutdownMetadata.Type.REMOVE, SingleNodeShutdownMetadata.Type.REPLACE) ); RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, state, null, null, 0); @@ -141,7 +142,7 @@ public void testShardsCannotRemainOnRemovingNode() { public void testCanAutoExpandToRestartingNode() { ClusterState state = prepareState( - service.reroute(ClusterState.EMPTY_STATE, "initial state"), + service.reroute(ClusterState.EMPTY_STATE, "initial state", ActionListener.noop()), SingleNodeShutdownMetadata.Type.RESTART ); RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, state, null, null, 0); @@ -156,7 +157,7 @@ public void testCanAutoExpandToRestartingNode() { } public void testCanAutoExpandToNodeIfNoNodesShuttingDown() { - ClusterState state = 
service.reroute(ClusterState.EMPTY_STATE, "initial state"); + ClusterState state = service.reroute(ClusterState.EMPTY_STATE, "initial state", ActionListener.noop()); RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, state, null, null, 0); allocation.debugDecision(true); @@ -168,7 +169,7 @@ public void testCanAutoExpandToNodeIfNoNodesShuttingDown() { public void testCanAutoExpandToNodeThatIsNotShuttingDown() { ClusterState state = prepareState( - service.reroute(ClusterState.EMPTY_STATE, "initial state"), + service.reroute(ClusterState.EMPTY_STATE, "initial state", ActionListener.noop()), randomFrom(SingleNodeShutdownMetadata.Type.REMOVE, SingleNodeShutdownMetadata.Type.REPLACE), "other-node-id" ); @@ -183,7 +184,7 @@ public void testCanAutoExpandToNodeThatIsNotShuttingDown() { public void testCannotAutoExpandToRemovingNode() { ClusterState state = prepareState( - service.reroute(ClusterState.EMPTY_STATE, "initial state"), + service.reroute(ClusterState.EMPTY_STATE, "initial state", ActionListener.noop()), randomFrom(SingleNodeShutdownMetadata.Type.REMOVE, SingleNodeShutdownMetadata.Type.REPLACE) ); RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, state, null, null, 0); diff --git a/server/src/test/java/org/elasticsearch/cluster/serialization/ClusterSerializationTests.java b/server/src/test/java/org/elasticsearch/cluster/serialization/ClusterSerializationTests.java index f57627649706d..2b676bc4d3e3b 100644 --- a/server/src/test/java/org/elasticsearch/cluster/serialization/ClusterSerializationTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/serialization/ClusterSerializationTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.serialization; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.AbstractNamedDiffable; import org.elasticsearch.cluster.ClusterModule; import org.elasticsearch.cluster.ClusterName; @@ -71,7 +72,9 @@ public void 
testClusterStateSerialization() throws Exception { .build(); AllocationService strategy = createAllocationService(); - clusterState = ClusterState.builder(clusterState).routingTable(strategy.reroute(clusterState, "reroute").routingTable()).build(); + clusterState = ClusterState.builder(clusterState) + .routingTable(strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable()) + .build(); ClusterState serializedClusterState = ClusterState.Builder.fromBytes( ClusterState.Builder.toBytes(clusterState), @@ -100,7 +103,7 @@ public void testRoutingTableSerialization() throws Exception { .build(); AllocationService strategy = createAllocationService(); - RoutingTable source = strategy.reroute(clusterState, "reroute").routingTable(); + RoutingTable source = strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable(); BytesStreamOutput outStream = new BytesStreamOutput(); source.writeTo(outStream); diff --git a/server/src/test/java/org/elasticsearch/cluster/serialization/ClusterStateToStringTests.java b/server/src/test/java/org/elasticsearch/cluster/serialization/ClusterStateToStringTests.java index c8731546a0331..b50c227e3c615 100644 --- a/server/src/test/java/org/elasticsearch/cluster/serialization/ClusterStateToStringTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/serialization/ClusterStateToStringTests.java @@ -8,6 +8,7 @@ package org.elasticsearch.cluster.serialization; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; @@ -53,7 +54,9 @@ public void testClusterStateSerialization() throws Exception { .build(); AllocationService strategy = createAllocationService(); - clusterState = ClusterState.builder(clusterState).routingTable(strategy.reroute(clusterState, "reroute").routingTable()).build(); + clusterState = 
ClusterState.builder(clusterState) + .routingTable(strategy.reroute(clusterState, "reroute", ActionListener.noop()).routingTable()) + .build(); String clusterStateString = Strings.toString(clusterState); assertNotNull(clusterStateString); diff --git a/server/src/test/java/org/elasticsearch/cluster/structure/RoutingIteratorTests.java b/server/src/test/java/org/elasticsearch/cluster/structure/RoutingIteratorTests.java index 053c7f9fad3db..144b71ef52028 100644 --- a/server/src/test/java/org/elasticsearch/cluster/structure/RoutingIteratorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/structure/RoutingIteratorTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.structure; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; @@ -230,7 +231,9 @@ public void testNodeSelectorRouting() { .localNodeId("node1") ) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); assertThat( @@ -333,7 +336,7 @@ public void testShardsAndPreferNodeRouting() { clusterState = ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")).localNodeId("node1")) .build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = strategy.reroute(clusterState, "reroute", ActionListener.noop()); clusterState = startInitializingShardsAndReroute(strategy, clusterState); clusterState = startInitializingShardsAndReroute(strategy, clusterState); diff --git a/server/src/test/java/org/elasticsearch/indices/cluster/ClusterStateChanges.java b/server/src/test/java/org/elasticsearch/indices/cluster/ClusterStateChanges.java index 36f8f0fea27ad..2954cdcbbdcda 100644 --- 
a/server/src/test/java/org/elasticsearch/indices/cluster/ClusterStateChanges.java +++ b/server/src/test/java/org/elasticsearch/indices/cluster/ClusterStateChanges.java @@ -284,7 +284,8 @@ public IndexMetadata verifyIndexMetadata(IndexMetadata indexMetadata, Version mi allocationService, IndexScopedSettings.DEFAULT_SCOPED_SETTINGS, indicesService, - shardLimitValidator + shardLimitValidator, + threadPool ); MetadataCreateIndexService createIndexService = new MetadataCreateIndexService( SETTINGS, @@ -375,7 +376,7 @@ public ClusterState closeIndices(ClusterState state, CloseIndexRequest request) blockedIndices, blockedIndices.keySet().stream().collect(toMap(Function.identity(), CloseIndexResponse.IndexResult::new)) ); - return allocationService.reroute(newState, "indices closed"); + return allocationService.reroute(newState, "indices closed", ActionListener.noop()); } public ClusterState openIndices(ClusterState state, OpenIndexRequest request) { diff --git a/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java b/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java index 1fdb9cb91e5c1..3515728b69a2f 100644 --- a/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java +++ b/server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java @@ -1936,7 +1936,8 @@ protected void assertSnapshotOrGenericThread() { shardLimitValidator, EmptySystemIndices.INSTANCE, indicesService, - mock(FileSettingsService.class) + mock(FileSettingsService.class), + threadPool ); actions.put( PutMappingAction.INSTANCE, diff --git a/test/framework/src/main/java/org/elasticsearch/cluster/ESAllocationTestCase.java b/test/framework/src/main/java/org/elasticsearch/cluster/ESAllocationTestCase.java index b8d7e911016da..87d68f43dee1c 100644 --- a/test/framework/src/main/java/org/elasticsearch/cluster/ESAllocationTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/cluster/ESAllocationTestCase.java @@ 
-9,6 +9,7 @@ package org.elasticsearch.cluster; import org.elasticsearch.Version; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.node.DiscoveryNodeRole; import org.elasticsearch.cluster.routing.RecoverySource; @@ -20,6 +21,8 @@ import org.elasticsearch.cluster.routing.allocation.FailedShard; import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator; +import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalance; +import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceShardsAllocator; import org.elasticsearch.cluster.routing.allocation.allocator.ShardsAllocator; import org.elasticsearch.cluster.routing.allocation.decider.AllocationDecider; import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders; @@ -27,6 +30,7 @@ import org.elasticsearch.cluster.routing.allocation.decider.SameShardAllocationDecider; import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.concurrent.DeterministicTaskQueue; import org.elasticsearch.gateway.GatewayAllocator; import org.elasticsearch.snapshots.SnapshotShardSizeInfo; import org.elasticsearch.snapshots.SnapshotsInfoService; @@ -42,6 +46,8 @@ import java.util.Set; import static java.util.Collections.emptyMap; +import static org.elasticsearch.cluster.ClusterModule.BALANCED_ALLOCATOR; +import static org.elasticsearch.cluster.ClusterModule.DESIRED_BALANCE_ALLOCATOR; import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING; public abstract class ESAllocationTestCase extends ESTestCase { @@ -75,12 +81,45 @@ public static MockAllocationService createAllocationService(Settings settings, C return new MockAllocationService( randomAllocationDeciders(settings, clusterSettings, random), new 
TestGatewayAllocator(), - new BalancedShardsAllocator(settings), + createShardsAllocator(settings), EmptyClusterInfoService.INSTANCE, SNAPSHOT_INFO_SERVICE_WITH_NO_SHARD_SIZES ); } + private static ShardsAllocator createShardsAllocator(Settings settings) { + return switch (randomFrom(BALANCED_ALLOCATOR, DESIRED_BALANCE_ALLOCATOR)) { + case BALANCED_ALLOCATOR -> new BalancedShardsAllocator(settings); + case DESIRED_BALANCE_ALLOCATOR -> createDesiredBalanceShardsAllocator(settings); + default -> throw new AssertionError("Unknown allocator"); + }; + } + + private static DesiredBalanceShardsAllocator createDesiredBalanceShardsAllocator(Settings settings) { + var queue = new DeterministicTaskQueue(); + return new DesiredBalanceShardsAllocator(new BalancedShardsAllocator(settings), queue.getThreadPool(), null, null) { + private RoutingAllocation lastAllocation; + + @Override + public void allocate(RoutingAllocation allocation, ActionListener listener) { + lastAllocation = allocation; + super.allocate(allocation, listener); + queue.runAllTasks(); + } + + @Override + protected void reconcile(DesiredBalance desiredBalance, RoutingAllocation allocation) { + // do nothing as balance is not computed yet (during allocate) + } + + @Override + protected void submitReconcileTask(DesiredBalance desiredBalance) { + // reconcile synchronously rather than in cluster state update task + super.reconcile(desiredBalance, lastAllocation); + } + }; + } + public static MockAllocationService createAllocationService(Settings settings, ClusterInfoService clusterInfoService) { return new MockAllocationService( randomAllocationDeciders(settings, EMPTY_CLUSTER_SETTINGS, random()), @@ -257,7 +296,11 @@ public static ClusterState startShardsAndReroute( ClusterState clusterState, List initializingShards ) { - return allocationService.reroute(allocationService.applyStartedShards(clusterState, initializingShards), "reroute after starting"); + return allocationService.reroute( + 
allocationService.applyStartedShards(clusterState, initializingShards), + "reroute after starting", + ActionListener.noop() + ); } public static class TestAllocateDecision extends AllocationDecider { diff --git a/test/framework/src/main/java/org/elasticsearch/cluster/metadata/DataStreamTestHelper.java b/test/framework/src/main/java/org/elasticsearch/cluster/metadata/DataStreamTestHelper.java index ca35581ec25b7..b8a58ba411340 100644 --- a/test/framework/src/main/java/org/elasticsearch/cluster/metadata/DataStreamTestHelper.java +++ b/test/framework/src/main/java/org/elasticsearch/cluster/metadata/DataStreamTestHelper.java @@ -449,7 +449,7 @@ public static MetadataRolloverService getMetadataRolloverService( Environment env = mock(Environment.class); when(env.sharedDataFile()).thenReturn(null); AllocationService allocationService = mock(AllocationService.class); - when(allocationService.reroute(any(ClusterState.class), any(String.class))).then(i -> i.getArguments()[0]); + when(allocationService.reroute(any(ClusterState.class), any(String.class), any())).then(i -> i.getArguments()[0]); MappingLookup mappingLookup = null; if (dataStream != null) { RootObjectMapper.Builder root = new RootObjectMapper.Builder("_doc", ObjectMapper.Defaults.SUBOBJECTS); diff --git a/x-pack/plugin/rollup/src/main/java/org/elasticsearch/xpack/downsample/TransportRollupAction.java b/x-pack/plugin/rollup/src/main/java/org/elasticsearch/xpack/downsample/TransportRollupAction.java index e7f5b284bc632..a793cddc8e421 100644 --- a/x-pack/plugin/rollup/src/main/java/org/elasticsearch/xpack/downsample/TransportRollupAction.java +++ b/x-pack/plugin/rollup/src/main/java/org/elasticsearch/xpack/downsample/TransportRollupAction.java @@ -35,6 +35,7 @@ import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver; import org.elasticsearch.cluster.metadata.Metadata; import org.elasticsearch.cluster.metadata.MetadataCreateIndexService; +import 
org.elasticsearch.cluster.routing.allocation.allocator.AllocationActionListener; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.Priority; import org.elasticsearch.common.bytes.BytesReference; @@ -640,6 +641,7 @@ private void createRollupIndex( rollupIndexName, rollupIndexName ).settings(builder.build()).mappings(mapping); + var delegate = new AllocationActionListener<>(listener, threadPool.getThreadContext()); clusterService.submitStateUpdateTask("create-rollup-index [" + rollupIndexName + "]", new RollupClusterStateUpdateTask(listener) { @Override public ClusterState execute(ClusterState currentState) throws Exception { @@ -648,7 +650,8 @@ public ClusterState execute(ClusterState currentState) throws Exception { createIndexClusterStateUpdateRequest, true, // Copy index metadata from source index to rollup index - (builder, rollupIndexMetadata) -> builder.put(copyIndexMetadata(sourceIndexMetadata, rollupIndexMetadata)) + (builder, rollupIndexMetadata) -> builder.put(copyIndexMetadata(sourceIndexMetadata, rollupIndexMetadata)), + delegate.reroute() ); } }, ClusterStateTaskConfig.build(Priority.URGENT, request.masterNodeTimeout()), STATE_UPDATE_TASK_EXECUTOR); diff --git a/x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/xpack/searchablesnapshots/allocation/decider/SearchableSnapshotEnableAllocationDecider.java b/x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/xpack/searchablesnapshots/allocation/decider/SearchableSnapshotEnableAllocationDecider.java index c3f40cb86c90d..52be65bb783c5 100644 --- a/x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/xpack/searchablesnapshots/allocation/decider/SearchableSnapshotEnableAllocationDecider.java +++ b/x-pack/plugin/searchable-snapshots/src/main/java/org/elasticsearch/xpack/searchablesnapshots/allocation/decider/SearchableSnapshotEnableAllocationDecider.java @@ -69,6 +69,10 @@ public Decision canAllocate(ShardRouting shardRouting, 
RoutingNode node, Routing @Override public Decision canAllocate(ShardRouting shardRouting, RoutingAllocation allocation) { + if (allocation.isSimulating()) { + return allocation.decision(Decision.YES, NAME, "allocation is always enabled when simulating"); + } + final IndexMetadata indexMetadata = allocation.metadata().getIndexSafe(shardRouting.index()); if (indexMetadata.isSearchableSnapshot()) { EnableAllocationDecider.Allocation enableAllocationCopy = this.enableAllocation; From 734a9b7b663382df24eb844ae6a91d21bd82bb28 Mon Sep 17 00:00:00 2001 From: David Turner Date: Mon, 7 Nov 2022 10:10:11 +0000 Subject: [PATCH 02/15] Update docs/changelog/91343.yaml --- docs/changelog/91343.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/91343.yaml diff --git a/docs/changelog/91343.yaml b/docs/changelog/91343.yaml new file mode 100644 index 0000000000000..27a59ec487d00 --- /dev/null +++ b/docs/changelog/91343.yaml @@ -0,0 +1,5 @@ +pr: 91343 +summary: Introduce desired-balance allocator +area: Allocation +type: enhancement +issues: [] From 4e4a7b6aa14d5126fc562de6a1d25869aa53acbc Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 8 Nov 2022 09:34:42 +0000 Subject: [PATCH 03/15] Re-enable fixed tests --- .../elasticsearch/cluster/coordination/RareClusterStateIT.java | 1 - .../java/org/elasticsearch/index/store/CorruptedFileIT.java | 1 - 2 files changed, 2 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/coordination/RareClusterStateIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/coordination/RareClusterStateIT.java index a8d12d52b50a4..856bebba1f518 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/coordination/RareClusterStateIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/coordination/RareClusterStateIT.java @@ -178,7 +178,6 @@ public void onFailure(Exception e) { return future; } - @AwaitsFix(bugUrl = 
"https://github.com/elastic/elasticsearch/issues/88647") public void testDeleteCreateInOneBulk() throws Exception { internalCluster().startMasterOnlyNode(); String dataNode = internalCluster().startDataOnlyNode(); diff --git a/server/src/internalClusterTest/java/org/elasticsearch/index/store/CorruptedFileIT.java b/server/src/internalClusterTest/java/org/elasticsearch/index/store/CorruptedFileIT.java index ec986d1d1f6ea..d9a21acbd4175 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/index/store/CorruptedFileIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/index/store/CorruptedFileIT.java @@ -616,7 +616,6 @@ public void testCorruptFileThenSnapshotAndRestore() throws InterruptedException, * nodes, so that replica won't be sneaky and allocated on a node that doesn't have a corrupted * replica. */ - @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/86429") public void testReplicaCorruption() throws Exception { int numDocs = scaledRandomIntBetween(100, 1000); internalCluster().ensureAtLeastNumDataNodes(2); From 1b67865d429268724a32b5995ba3dee809e0a20e Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 8 Nov 2022 09:35:16 +0000 Subject: [PATCH 04/15] New default! 
--- .../main/java/org/elasticsearch/cluster/ClusterModule.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/ClusterModule.java b/server/src/main/java/org/elasticsearch/cluster/ClusterModule.java index d25e3d04bd29b..d7d432c46239f 100644 --- a/server/src/main/java/org/elasticsearch/cluster/ClusterModule.java +++ b/server/src/main/java/org/elasticsearch/cluster/ClusterModule.java @@ -95,8 +95,8 @@ */ public class ClusterModule extends AbstractModule { - public static final String BALANCED_ALLOCATOR = "balanced"; // default - public static final String DESIRED_BALANCE_ALLOCATOR = "desired_balance"; + public static final String BALANCED_ALLOCATOR = "balanced"; + public static final String DESIRED_BALANCE_ALLOCATOR = "desired_balance"; // default public static final Setting SHARDS_ALLOCATOR_TYPE_SETTING = new Setting<>( "cluster.routing.allocation.type", DESIRED_BALANCE_ALLOCATOR, From 2abe6ff69ecc37e573510c43c41aeb7c67f13a02 Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 8 Nov 2022 09:35:45 +0000 Subject: [PATCH 05/15] Option -> Optional --- .../cluster/routing/allocation/decider/AllocationDecider.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AllocationDecider.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AllocationDecider.java index b083ca1617bc4..16620ae974a3a 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AllocationDecider.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AllocationDecider.java @@ -144,7 +144,7 @@ public Decision canAllocateReplicaWhenThereIsRetentionLease(ShardRouting shardRo } /** - * Returns a {@code empty()} if shard could be initially allocated anywhere or {@code Option.of(Set.of(nodeIds))} if shard could be + * Returns a {@code empty()} if shard could be 
initially allocated anywhere or {@code Optional.of(Set.of(nodeIds))} if shard could be * initially allocated only on subset of a nodes. * * This might be required for splitting or shrinking index as resulting shards have to be on the same node as a source shard. From 0037ee40a0238ec3fc8639abd9deb39a291b93b8 Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 8 Nov 2022 09:39:27 +0000 Subject: [PATCH 06/15] Javadoc for DesiredBalanceInput#index --- .../routing/allocation/allocator/DesiredBalanceInput.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceInput.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceInput.java index 9b4cb4c14cd46..a71e52924e45c 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceInput.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceInput.java @@ -20,6 +20,9 @@ /** * The input to the desired balance computation. * + * @param index Each {@link DesiredBalanceInput} comes from a call to {@code reroute()} by a cluster state update, so they + * arrive in sequence and newer inputs should supersede older ones. The {@link #index} of the input is its position + * in this sequence. 
* @param routingAllocation a copy of (the immutable parts of) the context for the allocation decision process * @param ignoredShards a list of the shards for which earlier allocators have claimed responsibility */ From 7846feac1416a1dbfe507df12fb5ce10d482df55 Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 8 Nov 2022 09:52:28 +0000 Subject: [PATCH 07/15] Rename simulate() to simulateShardStarted() --- .../elasticsearch/cluster/ClusterInfoSimulator.java | 2 +- .../allocation/allocator/DesiredBalanceComputer.java | 10 +++++----- .../allocator/ClusterInfoSimulatorTests.java | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/ClusterInfoSimulator.java b/server/src/main/java/org/elasticsearch/cluster/ClusterInfoSimulator.java index 748ba9459831e..0485e06f66636 100644 --- a/server/src/main/java/org/elasticsearch/cluster/ClusterInfoSimulator.java +++ b/server/src/main/java/org/elasticsearch/cluster/ClusterInfoSimulator.java @@ -39,7 +39,7 @@ public ClusterInfoSimulator(ClusterInfo clusterInfo) { * This assumes the worst case (all shards are placed on a single most used disk) and prevents node overflow. * Balance is later recalculated with a refreshed cluster info containing actual shards placement. 
*/ - public void simulate(ShardRouting shard) { + public void simulateShardStarted(ShardRouting shard) { assert shard.initializing(); var size = getEstimatedShardSize(shard); diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java index 0b291d1ed304b..2c9aade9b1de8 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java @@ -72,7 +72,7 @@ public DesiredBalance compute( for (final var routingNode : routingNodes) { for (final var shardRouting : routingNode) { if (shardRouting.initializing()) { - clusterInfoSimulator.simulate(shardRouting); + clusterInfoSimulator.simulateShardStarted(shardRouting); routingNodes.startShard(logger, shardRouting, changes, 0L); } } @@ -140,7 +140,7 @@ public DesiredBalance compute( assert shardRouting.started(); if (targetNodesIterator.hasNext()) { ShardRouting shardToRelocate = routingNodes.relocateShard(shardRouting, targetNodesIterator.next(), 0L, changes).v2(); - clusterInfoSimulator.simulate(shardToRelocate); + clusterInfoSimulator.simulateShardStarted(shardToRelocate); routingNodes.startShard(logger, shardToRelocate, changes, 0L); } else { break; @@ -166,7 +166,7 @@ public DesiredBalance compute( if (nodeIds != null && nodeIds.isEmpty() == false) { final String nodeId = nodeIds.removeFirst(); ShardRouting shardToInitialized = unassignedPrimaryIterator.initialize(nodeId, null, 0L, changes); - clusterInfoSimulator.simulate(shardToInitialized); + clusterInfoSimulator.simulateShardStarted(shardToInitialized); routingNodes.startShard(logger, shardToInitialized, changes, 0L); } } @@ -180,7 +180,7 @@ public DesiredBalance compute( if (nodeIds != null && nodeIds.isEmpty() == false) { final String nodeId = 
nodeIds.removeFirst(); ShardRouting shardToInitialize = unassignedReplicaIterator.initialize(nodeId, null, 0L, changes); - clusterInfoSimulator.simulate(shardToInitialize); + clusterInfoSimulator.simulateShardStarted(shardToInitialize); routingNodes.startShard(logger, shardToInitialize, changes, 0L); } } @@ -230,7 +230,7 @@ public DesiredBalance compute( for (final var shardRouting : routingNode) { if (shardRouting.initializing()) { hasChanges = true; - clusterInfoSimulator.simulate(shardRouting); + clusterInfoSimulator.simulateShardStarted(shardRouting); routingNodes.startShard(logger, shardRouting, changes, 0L); logger.trace("starting shard {}", shardRouting); } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/ClusterInfoSimulatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/ClusterInfoSimulatorTests.java index 5a7839ed42df5..6e74c510159e8 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/ClusterInfoSimulatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/ClusterInfoSimulatorTests.java @@ -55,7 +55,7 @@ public void testInitializeNewPrimary() { .withShard(newPrimary, 0) .build() ); - simulator.simulate(newPrimary); + simulator.simulateShardStarted(newPrimary); assertThat( simulator.getClusterInfo(), @@ -82,7 +82,7 @@ public void testInitializeNewReplica() { .withShard(newReplica, 0) .build() ); - simulator.simulate(newReplica); + simulator.simulateShardStarted(newReplica); assertThat( simulator.getClusterInfo(), @@ -111,7 +111,7 @@ public void testRelocateShard() { .withShard(shard, 100) .build() ); - simulator.simulate(shard); + simulator.simulateShardStarted(shard); assertThat( simulator.getClusterInfo(), @@ -139,7 +139,7 @@ public void testRelocateShardWithMultipleDataPath1() { .withShard(shard, 100) .build() ); - simulator.simulate(shard); + simulator.simulateShardStarted(shard); assertThat( 
simulator.getClusterInfo(), @@ -188,7 +188,7 @@ public void testDiskUsageSimulationWithSingleDataPathAndDiskThresholdDecider() { .build() ); - simulator.simulate(shard2); + simulator.simulateShardStarted(shard2); assertThat( simulator.getClusterInfo(), @@ -268,7 +268,7 @@ public void testDiskUsageSimulationWithMultipleDataPathAndDiskThresholdDecider() .build() ); - simulator.simulate(shard2); + simulator.simulateShardStarted(shard2); assertThat( simulator.getClusterInfo(), From 6924aa96e8f4205853c7639e709ef31433a56133 Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 8 Nov 2022 09:52:43 +0000 Subject: [PATCH 08/15] Comment --- .../routing/allocation/allocator/DesiredBalanceComputer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java index 2c9aade9b1de8..016685af38870 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java @@ -134,8 +134,8 @@ public DesiredBalance compute( final var targetNodesIterator = targetNodes.iterator(); // Here existing shards are moved to desired locations before initializing unassigned shards because we prefer not to leave - // immovable shards allocated to undesirable locations (e.g. a node that is shutting down). In contrast, reconciliation prefers - // to initialize the unassigned shards first. + // immovable shards allocated to undesirable locations (e.g. a node that is shutting down or an allocation filter which was + // only recently applied). In contrast, reconciliation prefers to initialize the unassigned shards first. 
for (final var shardRouting : shardsToRelocate) { assert shardRouting.started(); if (targetNodesIterator.hasNext()) { From 01beed033b2d0ca790c6952d9784091cb8e5f02e Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 8 Nov 2022 09:53:22 +0000 Subject: [PATCH 09/15] Rename variable --- .../allocation/allocator/DesiredBalanceComputer.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java index 016685af38870..563b78f1b0229 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java @@ -164,10 +164,10 @@ public DesiredBalance compute( if (shardRouting.primary()) { final var nodeIds = unassignedShardsToInitialize.get(shardRouting); if (nodeIds != null && nodeIds.isEmpty() == false) { - final String nodeId = nodeIds.removeFirst(); - ShardRouting shardToInitialized = unassignedPrimaryIterator.initialize(nodeId, null, 0L, changes); - clusterInfoSimulator.simulateShardStarted(shardToInitialized); - routingNodes.startShard(logger, shardToInitialized, changes, 0L); + final var nodeId = nodeIds.removeFirst(); + final var shardToInitialize = unassignedPrimaryIterator.initialize(nodeId, null, 0L, changes); + clusterInfoSimulator.simulateShardStarted(shardToInitialize); + routingNodes.startShard(logger, shardToInitialize, changes, 0L); } } } From 528e3c0d2973cb4430ac9329b9c9e24df3e7dd82 Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 8 Nov 2022 09:55:05 +0000 Subject: [PATCH 10/15] Occasional INFO message about excessive iterations --- .../allocator/DesiredBalanceComputer.java | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git 
a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java index 563b78f1b0229..dae29cce4f4c7 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java @@ -8,6 +8,7 @@ package org.elasticsearch.cluster.routing.allocation.allocator; +import org.apache.logging.log4j.Level; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.cluster.ClusterInfoSimulator; @@ -17,6 +18,7 @@ import org.elasticsearch.cluster.routing.UnassignedInfo; import org.elasticsearch.cluster.routing.allocation.command.MoveAllocationCommand; import org.elasticsearch.common.metrics.MeanMetric; +import org.elasticsearch.core.Strings; import org.elasticsearch.index.shard.ShardId; import java.util.ArrayList; @@ -239,18 +241,28 @@ public DesiredBalance compute( i++; if (hasChanges == false) { - logger.debug("Desired balance computation converged after {} iterations", i); + logger.debug("Desired balance computation for [{}] converged after [{}] iterations", desiredBalanceInput.index(), i); break; } if (isFresh.test(desiredBalanceInput) == false) { // we run at least one iteration, but if another reroute happened meanwhile // then publish the interim state and restart the calculation - logger.debug("Newer cluster state received, publishing incomplete desired balance and restarting computation"); + logger.debug(""" + Newer cluster state received after [{}] iterations, publishing incomplete desired balance for [{}] and restarting \ + computation + """, i, desiredBalanceInput.index()); break; } if (i % 100 == 0) { // TODO this warning should be time based, iteration count should be proportional to the number of shards - 
logger.debug("Desired balance computation is still not converged after {} iterations", i); + logger.log( + i % 1000000 == 0 ? Level.INFO : Level.DEBUG, + Strings.format( + "Desired balance computation for [%d] is still not converged after [%d] iterations", + desiredBalanceInput.index(), + i + ) + ); } } iterations.inc(i); From 69ded3c56391e42c4e4321f6ba9c68a919b451c6 Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 8 Nov 2022 10:00:04 +0000 Subject: [PATCH 11/15] Rename ShardAssignment#of --- .../routing/allocation/allocator/DesiredBalanceComputer.java | 2 +- .../cluster/routing/allocation/allocator/ShardAssignment.java | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java index dae29cce4f4c7..285ed9f222256 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java @@ -269,7 +269,7 @@ public DesiredBalance compute( final var assignments = new HashMap(); for (var shardAndAssignments : routingNodes.getAssignedShards().entrySet()) { - assignments.put(shardAndAssignments.getKey(), ShardAssignment.of(shardAndAssignments.getValue())); + assignments.put(shardAndAssignments.getKey(), ShardAssignment.ofAssignedShards(shardAndAssignments.getValue())); } for (var ignored : routingNodes.unassigned().ignored()) { diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ShardAssignment.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ShardAssignment.java index e11b02b4e4759..ad899400de805 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ShardAssignment.java +++ 
b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ShardAssignment.java @@ -28,7 +28,8 @@ public boolean isIgnored(boolean primary) { return primary ? total == ignored : ignored > 0; } - public static ShardAssignment of(List routings) { + public static ShardAssignment ofAssignedShards(List routings) { + assert routings.stream().allMatch(ShardRouting::started) : routings; var nodeIds = routings.stream().map(ShardRouting::currentNodeId).collect(toCollection(LinkedHashSet::new)); return new ShardAssignment(unmodifiableSet(nodeIds), routings.size(), 0, 0); } From 2034ac3cf34ecc9c28be3cf07881901f39c0c460 Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 8 Nov 2022 10:11:05 +0000 Subject: [PATCH 12/15] Rename arg to PendingListenersQueue#complete() (and inline advance()) --- .../allocator/PendingListenersQueue.java | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/PendingListenersQueue.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/PendingListenersQueue.java index 309edc8c77094..dd9d57c0ca943 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/PendingListenersQueue.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/PendingListenersQueue.java @@ -39,8 +39,12 @@ public void add(long index, ActionListener listener) { } } - public void complete(long index) { - advance(index); + public void complete(long convergedIndex) { + synchronized (pendingListeners) { + if (convergedIndex > completedIndex) { + completedIndex = convergedIndex; + } + } executeListeners(completedIndex, true); } @@ -66,14 +70,6 @@ private void executeListeners(long convergedIndex, boolean isMaster) { } } - private void advance(long index) { - synchronized (pendingListeners) { - if (index > completedIndex) { - completedIndex = index; - } - } - } - private 
Collection> pollListeners(long maxIndex) { var listeners = new ArrayList>(); PendingListener listener; From e34225bbf7bc9126d0c7ac50b54bbbbf1c5e05ca Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 8 Nov 2022 10:15:31 +0000 Subject: [PATCH 13/15] Make constant final --- .../cluster/routing/allocation/allocator/DesiredBalance.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalance.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalance.java index 7bfaddc49fbb3..53827f787624c 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalance.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalance.java @@ -20,6 +20,8 @@ */ public record DesiredBalance(long lastConvergedIndex, Map assignments) { + public static final DesiredBalance INITIAL = new DesiredBalance(-1, Map.of()); + public ShardAssignment getAssignment(ShardId shardId) { return assignments.get(shardId); } @@ -28,5 +30,4 @@ public static boolean hasChanges(DesiredBalance a, DesiredBalance b) { return Objects.equals(a.assignments, b.assignments) == false; } - public static DesiredBalance INITIAL = new DesiredBalance(-1, Map.of()); } From c5994780f15b578b4176a43bf64b4e85591edc08 Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 8 Nov 2022 10:30:34 +0000 Subject: [PATCH 14/15] Javadoc on NodeAllocationOrdering --- .../routing/allocation/allocator/NodeAllocationOrdering.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/NodeAllocationOrdering.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/NodeAllocationOrdering.java index 60ad1d3480372..104f7f7491cf5 100644 --- 
a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/NodeAllocationOrdering.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/NodeAllocationOrdering.java @@ -17,6 +17,10 @@ import java.util.Set; import java.util.concurrent.atomic.AtomicLong; +/** + * Tracks the order in which nodes are used for allocation so that we can allocate shards to nodes in a round-robin fashion (all else being + * equal). + */ public class NodeAllocationOrdering { private final AtomicLong order = new AtomicLong(0); From 0e0684f123afb72dd86959bb0dbb00c701ae28e6 Mon Sep 17 00:00:00 2001 From: David Turner Date: Tue, 8 Nov 2022 11:55:52 +0000 Subject: [PATCH 15/15] Use TreeMap rather than broken TreeSet --- .../allocator/DesiredBalanceComputer.java | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java index 285ed9f222256..ca628a4d4aacc 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java @@ -22,7 +22,6 @@ import org.elasticsearch.index.shard.ShardId; import java.util.ArrayList; -import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; @@ -30,6 +29,7 @@ import java.util.Map; import java.util.Queue; import java.util.Set; +import java.util.TreeMap; import java.util.TreeSet; import java.util.function.Predicate; @@ -112,12 +112,14 @@ public DesiredBalance compute( final var shardId = entry.getKey(); final var routings = entry.getValue(); - // treesets so that we are consistent about the order of future relocations - final var shardsToRelocate = new 
TreeSet<>(Comparator.comparing(ShardRouting::currentNodeId)); + // treemap (keyed by node ID) so that we are consistent about the order of future relocations + final var shardsToRelocate = new TreeMap(); final var assignment = previousDesiredBalance.getAssignment(shardId); + // treeset (ordered by node ID) so that we are consistent about the order of future relocations final var targetNodes = assignment != null ? new TreeSet<>(assignment.nodeIds()) : new TreeSet(); targetNodes.retainAll(knownNodeIds); + // preserving last known shard location as a starting point to avoid unnecessary relocations for (ShardRouting shardRouting : routings.unassigned()) { var lastAllocatedNodeId = shardRouting.unassignedInfo().getLastAllocatedNodeId(); @@ -129,7 +131,8 @@ public DesiredBalance compute( for (final var shardRouting : routings.assigned()) { assert shardRouting.started(); if (targetNodes.remove(shardRouting.currentNodeId()) == false) { - shardsToRelocate.add(shardRouting); + final var previousShard = shardsToRelocate.put(shardRouting.currentNodeId(), shardRouting); + assert previousShard == null : "duplicate shards to relocate: " + shardRouting + " vs " + previousShard; } } @@ -138,7 +141,7 @@ public DesiredBalance compute( // Here existing shards are moved to desired locations before initializing unassigned shards because we prefer not to leave // immovable shards allocated to undesirable locations (e.g. a node that is shutting down or an allocation filter which was // only recently applied). In contrast, reconciliation prefers to initialize the unassigned shards first. - for (final var shardRouting : shardsToRelocate) { + for (final var shardRouting : shardsToRelocate.values()) { assert shardRouting.started(); if (targetNodesIterator.hasNext()) { ShardRouting shardToRelocate = routingNodes.relocateShard(shardRouting, targetNodesIterator.next(), 0L, changes).v2();