From 4f601e97b1b4fdec888260fd4ebe4681d91fe4f6 Mon Sep 17 00:00:00 2001 From: David Turner Date: Mon, 22 Feb 2021 11:34:39 +0000 Subject: [PATCH 1/2] Skip zone/host awareness with auto-expand replicas Today if an index is set to `auto_expand_replicas: N-all` then we will try and create a shard copy on every node that matches the applicable allocation filters. This conflits with shard allocation awareness and the same-host allocation decider if there is an uneven distribution of nodes across zones or hosts, since these deciders prevent shard copies from being allocated unevenly and may therefore leave some unassigned shards. The point of these two deciders is to improve resilience given a limited number of shard copies but there is no need for this behaviour when the number of shard copies is not limited, so this commit supresses them in that case. Closes #54151 Closes #2869 --- docs/reference/index-modules.asciidoc | 23 +++---- .../cluster/shards_allocation.asciidoc | 2 +- .../cluster/metadata/AutoExpandReplicas.java | 4 ++ .../decider/AwarenessAllocationDecider.java | 19 ++++-- .../decider/SameShardAllocationDecider.java | 9 +++ .../metadata/AutoExpandReplicasTests.java | 3 + .../allocation/AwarenessAllocationTests.java | 41 ++++++++++++- .../allocation/SameShardRoutingTests.java | 60 ++++++++++++++++++- 8 files changed, 142 insertions(+), 19 deletions(-) diff --git a/docs/reference/index-modules.asciidoc b/docs/reference/index-modules.asciidoc index 8bbb122e61e52..21a3e573fcff0 100644 --- a/docs/reference/index-modules.asciidoc +++ b/docs/reference/index-modules.asciidoc @@ -139,16 +139,19 @@ specific index module: The number of replicas each primary shard has. Defaults to 1. `index.auto_expand_replicas`:: - - Auto-expand the number of replicas based on the number of data nodes in the cluster. - Set to a dash delimited lower and upper bound (e.g. `0-5`) or use `all` - for the upper bound (e.g. `0-all`). Defaults to `false` (i.e. disabled). - Note that the auto-expanded number of replicas only takes - <> rules into account, but ignores - any other allocation rules such as <> - and <>, and this can lead to the - cluster health becoming `YELLOW` if the applicable rules prevent all the replicas - from being allocated. +Auto-expand the number of replicas based on the number of data nodes in the +cluster. Set to a dash delimited lower and upper bound (e.g. `0-5`) or use `all` +for the upper bound (e.g. `0-all`). Defaults to `false` (i.e. disabled). Note +that the auto-expanded number of replicas only takes +<> rules into account, but +ignores other allocation rules such as <>, and this can lead to the cluster health becoming `YELLOW` if the +applicable rules prevent all the replicas from being allocated. ++ +If the upper bound is `all` then <> and +<> +are ignored for this index. `index.search.idle.after`:: How long a shard can not receive a search or get request until it's considered diff --git a/docs/reference/modules/cluster/shards_allocation.asciidoc b/docs/reference/modules/cluster/shards_allocation.asciidoc index e26e732e3455e..34313c2dc6eea 100644 --- a/docs/reference/modules/cluster/shards_allocation.asciidoc +++ b/docs/reference/modules/cluster/shards_allocation.asciidoc @@ -45,7 +45,7 @@ one of the active allocation ids in the cluster state. These should be fast so more initial primary recoveries can happen in parallel on the same node. Defaults to `4`. - +[[cluster-routing-allocation-same-shard-host]] `cluster.routing.allocation.same_shard.host`:: (<>) Allows to perform a check to prevent allocation of multiple instances of diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/AutoExpandReplicas.java b/server/src/main/java/org/elasticsearch/cluster/metadata/AutoExpandReplicas.java index 181281a9f5cb9..5689235d78c29 100644 --- a/server/src/main/java/org/elasticsearch/cluster/metadata/AutoExpandReplicas.java +++ b/server/src/main/java/org/elasticsearch/cluster/metadata/AutoExpandReplicas.java @@ -91,6 +91,10 @@ int getMaxReplicas(int numDataNodes) { return Math.min(maxReplicas, numDataNodes-1); } + public boolean expandToAllNodes() { + return maxReplicas == Integer.MAX_VALUE; + } + private OptionalInt getDesiredNumberOfReplicas(IndexMetadata indexMetadata, RoutingAllocation allocation) { if (enabled) { int numMatchingDataNodes = 0; diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AwarenessAllocationDecider.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AwarenessAllocationDecider.java index fdc43e98d0cae..886436139fc9f 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AwarenessAllocationDecider.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/AwarenessAllocationDecider.java @@ -8,11 +8,6 @@ package org.elasticsearch.cluster.routing.allocation.decider; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.function.Function; - import com.carrotsearch.hppc.ObjectIntHashMap; import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.cluster.routing.RoutingNode; @@ -24,7 +19,13 @@ import org.elasticsearch.common.settings.Setting.Property; import org.elasticsearch.common.settings.Settings; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.function.Function; + import static java.util.Collections.emptyList; +import static org.elasticsearch.cluster.metadata.IndexMetadata.INDEX_AUTO_EXPAND_REPLICAS_SETTING; /** * This {@link AllocationDecider} controls shard allocation based on @@ -118,6 +119,9 @@ public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAl "allocation awareness is not enabled, set cluster setting [" + CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.getKey() + "] to enable it"); + private static final Decision YES_AUTO_EXPAND_ALL = Decision.single(Decision.Type.YES, NAME, + "allocation awareness is ignored, this index is set to auto-expand to all nodes"); + private static final Decision YES_ALL_MET = Decision.single(Decision.Type.YES, NAME, "node meets all awareness attribute requirements"); @@ -128,6 +132,11 @@ private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, Rout final boolean debug = allocation.debugDecision(); IndexMetadata indexMetadata = allocation.metadata().getIndexSafe(shardRouting.index()); + + if (INDEX_AUTO_EXPAND_REPLICAS_SETTING.get(indexMetadata.getSettings()).expandToAllNodes()) { + return YES_AUTO_EXPAND_ALL; + } + int shardCount = indexMetadata.getNumberOfReplicas() + 1; // 1 for primary for (String awarenessAttribute : awarenessAttributes) { // the node the shard exists on must be associated with an awareness attribute diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/SameShardAllocationDecider.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/SameShardAllocationDecider.java index 82254ae4cc96a..1f3f995deaf0d 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/SameShardAllocationDecider.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/SameShardAllocationDecider.java @@ -17,6 +17,8 @@ import org.elasticsearch.common.settings.Setting.Property; import org.elasticsearch.common.settings.Settings; +import static org.elasticsearch.cluster.metadata.IndexMetadata.INDEX_AUTO_EXPAND_REPLICAS_SETTING; + /** * An allocation decider that prevents multiple instances of the same shard to * be allocated on the same {@code node}. @@ -58,6 +60,9 @@ private void setSameHost(boolean sameHost) { private static final Decision YES_NONE_HOLD_COPY = Decision.single(Decision.Type.YES, NAME, "none of the nodes on this host hold a copy of this shard"); + private static final Decision YES_AUTO_EXPAND_ALL = Decision.single(Decision.Type.YES, NAME, + "same-host allocation is ignored, this index is set to auto-expand to all nodes"); + @Override public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { Iterable assignedShards = allocation.routingNodes().assignedShards(shardRouting.shardId()); @@ -66,6 +71,10 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing // if its already a NO decision looking at the node, or we aren't configured to look at the host, return the decision return decision; } + if (INDEX_AUTO_EXPAND_REPLICAS_SETTING.get( + allocation.metadata().getIndexSafe(shardRouting.index()).getSettings()).expandToAllNodes()) { + return YES_AUTO_EXPAND_ALL; + } if (node.node() != null) { for (RoutingNode checkNode : allocation.routingNodes()) { if (checkNode.node() == null) { diff --git a/server/src/test/java/org/elasticsearch/cluster/metadata/AutoExpandReplicasTests.java b/server/src/test/java/org/elasticsearch/cluster/metadata/AutoExpandReplicasTests.java index ef2011431e469..f5fc9764ff8fa 100644 --- a/server/src/test/java/org/elasticsearch/cluster/metadata/AutoExpandReplicasTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/metadata/AutoExpandReplicasTests.java @@ -50,16 +50,19 @@ public void testParseSettings() { assertEquals(0, autoExpandReplicas.getMinReplicas()); assertEquals(5, autoExpandReplicas.getMaxReplicas(8)); assertEquals(2, autoExpandReplicas.getMaxReplicas(3)); + assertFalse(autoExpandReplicas.expandToAllNodes()); autoExpandReplicas = AutoExpandReplicas.SETTING.get(Settings.builder().put("index.auto_expand_replicas", "0-all").build()); assertEquals(0, autoExpandReplicas.getMinReplicas()); assertEquals(5, autoExpandReplicas.getMaxReplicas(6)); assertEquals(2, autoExpandReplicas.getMaxReplicas(3)); + assertTrue(autoExpandReplicas.expandToAllNodes()); autoExpandReplicas = AutoExpandReplicas.SETTING.get(Settings.builder().put("index.auto_expand_replicas", "1-all").build()); assertEquals(1, autoExpandReplicas.getMinReplicas()); assertEquals(5, autoExpandReplicas.getMaxReplicas(6)); assertEquals(2, autoExpandReplicas.getMaxReplicas(3)); + assertTrue(autoExpandReplicas.expandToAllNodes()); } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AwarenessAllocationTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AwarenessAllocationTests.java index d8677aa211c4d..7a7702c4d7671 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AwarenessAllocationTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/AwarenessAllocationTests.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.Version; +import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata; @@ -34,6 +35,7 @@ import static org.elasticsearch.cluster.routing.ShardRoutingState.RELOCATING; import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED; import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED; +import static org.hamcrest.Matchers.empty; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.sameInstance; @@ -770,8 +772,13 @@ public void testUnassignedShardsWithUnbalancedZones() { logger.info("Building initial routing table for 'testUnassignedShardsWithUnbalancedZones'"); + final Settings.Builder indexSettings = settings(Version.CURRENT); + if (randomBoolean()) { + indexSettings.put(IndexMetadata.SETTING_AUTO_EXPAND_REPLICAS, "0-4"); + } + Metadata metadata = Metadata.builder() - .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(4)) + .put(IndexMetadata.builder("test").settings(indexSettings).numberOfShards(1).numberOfReplicas(4)) .build(); RoutingTable initialRoutingTable = RoutingTable.builder() @@ -865,4 +872,36 @@ public void testMultipleAwarenessAttributes() { assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(2)); assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(0)); } + + public void testDisabledByAutoExpandReplicas() { + final Settings settings = Settings.builder() + .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.getKey(), "zone") + .build(); + + final AllocationService strategy = createAllocationService(settings); + + final Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 99) + .put(IndexMetadata.SETTING_AUTO_EXPAND_REPLICAS, "0-all"))) + .build(); + + final ClusterState clusterState = applyStartedShardsUntilNoChange( + ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.get(Settings.EMPTY)) + .metadata(metadata) + .routingTable(RoutingTable.builder() + .addAsNew(metadata.index("test")) + .build()) + .nodes(DiscoveryNodes.builder() + .add(newNode("A-0", singletonMap("zone", "a"))) + .add(newNode("A-1", singletonMap("zone", "a"))) + .add(newNode("A-2", singletonMap("zone", "a"))) + .add(newNode("A-3", singletonMap("zone", "a"))) + .add(newNode("A-4", singletonMap("zone", "a"))) + .add(newNode("B-0", singletonMap("zone", "b"))) + ).build(), strategy); + + assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED), empty()); + } } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SameShardRoutingTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SameShardRoutingTests.java index f269bb1ff8776..94f8f36a213e7 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SameShardRoutingTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SameShardRoutingTests.java @@ -37,8 +37,11 @@ import java.util.Collections; import static java.util.Collections.emptyMap; +import static org.elasticsearch.cluster.ClusterName.CLUSTER_NAME_SETTING; import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING; +import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED; import static org.elasticsearch.cluster.routing.allocation.RoutingNodesUtils.numberOfShardsOfType; +import static org.hamcrest.Matchers.empty; import static org.hamcrest.Matchers.equalTo; public class SameShardRoutingTests extends ESAllocationTestCase { @@ -48,14 +51,19 @@ public void testSameHost() { AllocationService strategy = createAllocationService( Settings.builder().put(SameShardAllocationDecider.CLUSTER_ROUTING_ALLOCATION_SAME_HOST_SETTING.getKey(), true).build()); + final Settings.Builder indexSettings = settings(Version.CURRENT); + if (randomBoolean()) { + indexSettings.put(IndexMetadata.SETTING_AUTO_EXPAND_REPLICAS, "0-1"); + } + Metadata metadata = Metadata.builder() - .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(2).numberOfReplicas(1)) + .put(IndexMetadata.builder("test").settings(indexSettings).numberOfShards(2).numberOfReplicas(1)) .build(); RoutingTable routingTable = RoutingTable.builder() .addAsNew(metadata.index("test")) .build(); - ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)).metadata(metadata) + ClusterState clusterState = ClusterState.builder(CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)).metadata(metadata) .routingTable(routingTable).build(); logger.info("--> adding two nodes with the same host"); @@ -88,6 +96,54 @@ public void testSameHost() { } } + public void testSameHostCheckDisabledByAutoExpandReplicas() { + final AllocationService strategy = createAllocationService( + Settings.builder().put(SameShardAllocationDecider.CLUSTER_ROUTING_ALLOCATION_SAME_HOST_SETTING.getKey(), true).build()); + + final Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 99) + .put(IndexMetadata.SETTING_AUTO_EXPAND_REPLICAS, "0-all"))) + .build(); + + final DiscoveryNode node1 = new DiscoveryNode( + "node1", + "node1", + "node1", + "test1", + "test1", + buildNewFakeTransportAddress(), + emptyMap(), + MASTER_DATA_ROLES, + Version.CURRENT); + + + final DiscoveryNode node2 = new DiscoveryNode( + "node2", + "node2", + "node2", + "test1", + "test1", + buildNewFakeTransportAddress(), + emptyMap(), + MASTER_DATA_ROLES, + Version.CURRENT); + + final ClusterState clusterState = applyStartedShardsUntilNoChange(ClusterState + .builder(CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)) + .metadata(metadata) + .routingTable(RoutingTable.builder() + .addAsNew(metadata.index("test")) + .build()) + .nodes( + DiscoveryNodes.builder() + .add(node1) + .add(node2)).build(), strategy); + + assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED), empty()); + } + public void testForceAllocatePrimaryOnSameNodeNotAllowed() { SameShardAllocationDecider decider = new SameShardAllocationDecider( Settings.EMPTY, new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS)); From 2de1316bc9496804c93b959cefd844dc4ef80838 Mon Sep 17 00:00:00 2001 From: David Turner Date: Mon, 22 Feb 2021 12:19:22 +0000 Subject: [PATCH 2/2] Precommit --- .../cluster/routing/allocation/SameShardRoutingTests.java | 1 - 1 file changed, 1 deletion(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SameShardRoutingTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SameShardRoutingTests.java index 94f8f36a213e7..7ef4b8f23a3c7 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SameShardRoutingTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/SameShardRoutingTests.java @@ -13,7 +13,6 @@ import org.elasticsearch.Version; import org.elasticsearch.action.support.replication.ClusterStateCreationUtils; import org.elasticsearch.cluster.ClusterInfo; -import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.metadata.IndexMetadata;