Skip to content

Commit 3ee004c

Browse files
authored
Add dedicated step for checking shrink allocation status (#35161)
This adds a new step for checking whether an index is allocated correctly based on the rules added prior to running the shrink step. It also fixes a bug where for shrink we are not allowed to have the shards relocating for the shrink step. This also allows us to simplify AllocationRoutedStep and provide better feedback in the step info for why either the allocation or the shrink checks have failed. Resolves #34938
1 parent 842809e commit 3ee004c

File tree

7 files changed

+522
-46
lines changed

7 files changed

+522
-46
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/indexlifecycle/AllocateAction.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ public List<Step> toSteps(Client client, String phase, StepKey nextStepKey) {
145145
exclude.forEach((key, value) -> newSettings.put(IndexMetaData.INDEX_ROUTING_EXCLUDE_GROUP_SETTING.getKey() + key, value));
146146
require.forEach((key, value) -> newSettings.put(IndexMetaData.INDEX_ROUTING_REQUIRE_GROUP_SETTING.getKey() + key, value));
147147
UpdateSettingsStep allocateStep = new UpdateSettingsStep(allocateKey, allocationRoutedKey, client, newSettings.build());
148-
AllocationRoutedStep routedCheckStep = new AllocationRoutedStep(allocationRoutedKey, nextStepKey, true);
148+
AllocationRoutedStep routedCheckStep = new AllocationRoutedStep(allocationRoutedKey, nextStepKey);
149149
return Arrays.asList(allocateStep, routedCheckStep);
150150
}
151151

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/indexlifecycle/AllocationRoutedStep.java

Lines changed: 9 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -40,15 +40,8 @@ public class AllocationRoutedStep extends ClusterStateWaitStep {
4040
private static final AllocationDeciders ALLOCATION_DECIDERS = new AllocationDeciders(Collections.singletonList(
4141
new FilterAllocationDecider(Settings.EMPTY, new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS))));
4242

43-
private boolean waitOnAllShardCopies;
44-
45-
AllocationRoutedStep(StepKey key, StepKey nextStepKey, boolean waitOnAllShardCopies) {
43+
AllocationRoutedStep(StepKey key, StepKey nextStepKey) {
4644
super(key, nextStepKey);
47-
this.waitOnAllShardCopies = waitOnAllShardCopies;
48-
}
49-
50-
public boolean getWaitOnAllShardCopies() {
51-
return waitOnAllShardCopies;
5245
}
5346

5447
@Override
@@ -68,42 +61,35 @@ public Result isConditionMet(Index index, ClusterState clusterState) {
6861
// if the allocation has happened
6962
RoutingAllocation allocation = new RoutingAllocation(ALLOCATION_DECIDERS, clusterState.getRoutingNodes(), clusterState, null,
7063
System.nanoTime());
64+
7165
int allocationPendingAllShards = 0;
7266

7367
ImmutableOpenIntMap<IndexShardRoutingTable> allShards = clusterState.getRoutingTable().index(index).getShards();
7468
for (ObjectCursor<IndexShardRoutingTable> shardRoutingTable : allShards.values()) {
75-
int allocationPendingThisShard = 0;
76-
int shardCopiesThisShard = shardRoutingTable.value.size();
7769
for (ShardRouting shardRouting : shardRoutingTable.value.shards()) {
7870
String currentNodeId = shardRouting.currentNodeId();
7971
boolean canRemainOnCurrentNode = ALLOCATION_DECIDERS
8072
.canRemain(shardRouting, clusterState.getRoutingNodes().node(currentNodeId), allocation)
8173
.type() == Decision.Type.YES;
8274
if (canRemainOnCurrentNode == false) {
83-
allocationPendingThisShard++;
75+
allocationPendingAllShards++;
8476
}
8577
}
86-
87-
if (waitOnAllShardCopies) {
88-
allocationPendingAllShards += allocationPendingThisShard;
89-
} else if (shardCopiesThisShard - allocationPendingThisShard == 0) {
90-
allocationPendingAllShards++;
91-
}
9278
}
79+
9380
if (allocationPendingAllShards > 0) {
94-
logger.debug(
95-
"[{}] lifecycle action for index [{}] waiting for [{}] shards " + "to be allocated to nodes matching the given filters",
96-
getKey().getAction(), index, allocationPendingAllShards);
81+
logger.debug("{} lifecycle action [{}] waiting for [{}] shards to be allocated to nodes matching the given filters",
82+
index, getKey().getAction(), allocationPendingAllShards);
9783
return new Result(false, new Info(idxMeta.getNumberOfReplicas(), allocationPendingAllShards, true));
9884
} else {
99-
logger.debug("[{}] lifecycle action for index [{}] complete", getKey().getAction(), index);
85+
logger.debug("{} lifecycle action for [{}] complete", index, getKey().getAction());
10086
return new Result(true, null);
10187
}
10288
}
10389

10490
@Override
10591
public int hashCode() {
106-
return Objects.hash(super.hashCode(), waitOnAllShardCopies);
92+
return 611;
10793
}
10894

10995
@Override
@@ -114,9 +100,7 @@ public boolean equals(Object obj) {
114100
if (getClass() != obj.getClass()) {
115101
return false;
116102
}
117-
AllocationRoutedStep other = (AllocationRoutedStep) obj;
118-
return super.equals(obj) &&
119-
Objects.equals(waitOnAllShardCopies, other.waitOnAllShardCopies);
103+
return super.equals(obj);
120104
}
121105

122106
public static final class Info implements ToXContentObject {
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
7+
package org.elasticsearch.xpack.core.indexlifecycle;
8+
9+
import org.apache.logging.log4j.LogManager;
10+
import org.apache.logging.log4j.Logger;
11+
import org.elasticsearch.action.support.ActiveShardCount;
12+
import org.elasticsearch.cluster.ClusterState;
13+
import org.elasticsearch.cluster.metadata.IndexMetaData;
14+
import org.elasticsearch.cluster.routing.IndexRoutingTable;
15+
import org.elasticsearch.cluster.routing.ShardRouting;
16+
import org.elasticsearch.cluster.routing.ShardRoutingState;
17+
import org.elasticsearch.common.ParseField;
18+
import org.elasticsearch.common.Strings;
19+
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
20+
import org.elasticsearch.common.xcontent.ToXContentObject;
21+
import org.elasticsearch.common.xcontent.XContentBuilder;
22+
import org.elasticsearch.index.Index;
23+
24+
import java.io.IOException;
25+
import java.util.Locale;
26+
import java.util.Objects;
27+
28+
/**
29+
* This step is used prior to running a shrink step in order to ensure that the index being shrunk
30+
* has a copy of each shard allocated on one particular node (the node used by the require
31+
* parameter) and that the shards are not relocating.
32+
*/
33+
public class CheckShrinkReadyStep extends ClusterStateWaitStep {
34+
public static final String NAME = "check-shrink-allocation";
35+
36+
private static final Logger logger = LogManager.getLogger(CheckShrinkReadyStep.class);
37+
38+
CheckShrinkReadyStep(StepKey key, StepKey nextStepKey) {
39+
super(key, nextStepKey);
40+
}
41+
42+
@Override
43+
public Result isConditionMet(Index index, ClusterState clusterState) {
44+
IndexMetaData idxMeta = clusterState.metaData().index(index);
45+
46+
if (idxMeta == null) {
47+
// Index must have been since deleted, ignore it
48+
logger.debug("[{}] lifecycle action for index [{}] executed but index no longer exists",
49+
getKey().getAction(), index.getName());
50+
return new Result(false, null);
51+
}
52+
53+
// How many shards the node should have
54+
int expectedShardCount = idxMeta.getNumberOfShards();
55+
56+
if (ActiveShardCount.ALL.enoughShardsActive(clusterState, index.getName()) == false) {
57+
logger.debug("[{}] shrink action for [{}] cannot make progress because not all shards are active",
58+
getKey().getAction(), index.getName());
59+
return new Result(false, new CheckShrinkReadyStep.Info("", expectedShardCount, -1));
60+
}
61+
62+
// The id of the node the shards should be on
63+
final String idShardsShouldBeOn = idxMeta.getSettings().get(IndexMetaData.INDEX_ROUTING_REQUIRE_GROUP_PREFIX + "._id");
64+
if (idShardsShouldBeOn == null) {
65+
throw new IllegalStateException("Cannot check shrink allocation as there are no allocation rules by _id");
66+
}
67+
68+
final IndexRoutingTable routingTable = clusterState.getRoutingTable().index(index);
69+
int foundShards = 0;
70+
for (ShardRouting shard : routingTable.shardsWithState(ShardRoutingState.STARTED)) {
71+
final String currentNodeId = shard.currentNodeId();
72+
if (idShardsShouldBeOn.equals(currentNodeId) && shard.relocating() == false) {
73+
foundShards++;
74+
}
75+
}
76+
77+
logger.trace("{} checking for shrink readiness on [{}], found {} shards and need {}",
78+
index, idShardsShouldBeOn, foundShards, expectedShardCount);
79+
80+
if (foundShards == expectedShardCount) {
81+
logger.trace("{} successfully found {} allocated shards for shrink readiness on node [{}] ({})",
82+
index, expectedShardCount, idShardsShouldBeOn, getKey().getAction());
83+
return new Result(true, null);
84+
} else {
85+
logger.trace("{} failed to find {} allocated shards (found {}) on node [{}] for shrink readiness ({})",
86+
index, expectedShardCount, foundShards, idShardsShouldBeOn, getKey().getAction());
87+
return new Result(false, new CheckShrinkReadyStep.Info(idShardsShouldBeOn, expectedShardCount,
88+
expectedShardCount - foundShards));
89+
}
90+
}
91+
92+
@Override
93+
public int hashCode() {
94+
return 612;
95+
}
96+
97+
@Override
98+
public boolean equals(Object obj) {
99+
if (obj == null) {
100+
return false;
101+
}
102+
if (getClass() != obj.getClass()) {
103+
return false;
104+
}
105+
return super.equals(obj);
106+
}
107+
108+
public static final class Info implements ToXContentObject {
109+
110+
private final String nodeId;
111+
private final long actualReplicas;
112+
private final long numberShardsLeftToAllocate;
113+
private final String message;
114+
115+
static final ParseField NODE_ID = new ParseField("node_id");
116+
static final ParseField EXPECTED_SHARDS = new ParseField("expected_shards");
117+
static final ParseField SHARDS_TO_ALLOCATE = new ParseField("shards_left_to_allocate");
118+
static final ParseField MESSAGE = new ParseField("message");
119+
static final ConstructingObjectParser<CheckShrinkReadyStep.Info, Void> PARSER = new ConstructingObjectParser<>(
120+
"check_shrink_ready_step_info", a -> new CheckShrinkReadyStep.Info((String) a[0], (long) a[1], (long) a[2]));
121+
static {
122+
PARSER.declareString(ConstructingObjectParser.constructorArg(), NODE_ID);
123+
PARSER.declareLong(ConstructingObjectParser.constructorArg(), EXPECTED_SHARDS);
124+
PARSER.declareLong(ConstructingObjectParser.constructorArg(), SHARDS_TO_ALLOCATE);
125+
PARSER.declareString((i, s) -> {}, MESSAGE);
126+
}
127+
128+
public Info(String nodeId, long expectedShards, long numberShardsLeftToAllocate) {
129+
this.nodeId = nodeId;
130+
this.actualReplicas = expectedShards;
131+
this.numberShardsLeftToAllocate = numberShardsLeftToAllocate;
132+
if (numberShardsLeftToAllocate < 0) {
133+
this.message = "Waiting for all shards to become active";
134+
} else {
135+
this.message = String.format(Locale.ROOT, "Waiting for node [%s] to contain [%d] shards, found [%d], remaining [%d]",
136+
nodeId, expectedShards, expectedShards - numberShardsLeftToAllocate, numberShardsLeftToAllocate);
137+
}
138+
}
139+
140+
@Override
141+
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
142+
builder.startObject();
143+
builder.field(MESSAGE.getPreferredName(), message);
144+
builder.field(NODE_ID.getPreferredName(), nodeId);
145+
builder.field(SHARDS_TO_ALLOCATE.getPreferredName(), numberShardsLeftToAllocate);
146+
builder.field(EXPECTED_SHARDS.getPreferredName(), actualReplicas);
147+
builder.endObject();
148+
return builder;
149+
}
150+
151+
@Override
152+
public int hashCode() {
153+
return Objects.hash(nodeId, actualReplicas, numberShardsLeftToAllocate);
154+
}
155+
156+
@Override
157+
public boolean equals(Object obj) {
158+
if (obj == null) {
159+
return false;
160+
}
161+
if (getClass() != obj.getClass()) {
162+
return false;
163+
}
164+
CheckShrinkReadyStep.Info other = (CheckShrinkReadyStep.Info) obj;
165+
return Objects.equals(actualReplicas, other.actualReplicas) &&
166+
Objects.equals(numberShardsLeftToAllocate, other.numberShardsLeftToAllocate) &&
167+
Objects.equals(nodeId, other.nodeId);
168+
}
169+
170+
@Override
171+
public String toString() {
172+
return Strings.toString(this);
173+
}
174+
}
175+
}

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/indexlifecycle/ShrinkAction.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ public List<Step> toSteps(Client client, String phase, Step.StepKey nextStepKey)
8787

8888
StepKey readOnlyKey = new StepKey(phase, NAME, ReadOnlyAction.NAME);
8989
StepKey setSingleNodeKey = new StepKey(phase, NAME, SetSingleNodeAllocateStep.NAME);
90-
StepKey allocationRoutedKey = new StepKey(phase, NAME, AllocationRoutedStep.NAME);
90+
StepKey allocationRoutedKey = new StepKey(phase, NAME, CheckShrinkReadyStep.NAME);
9191
StepKey shrinkKey = new StepKey(phase, NAME, ShrinkStep.NAME);
9292
StepKey enoughShardsKey = new StepKey(phase, NAME, ShrunkShardsAllocatedStep.NAME);
9393
StepKey copyMetadataKey = new StepKey(phase, NAME, CopyExecutionStateStep.NAME);
@@ -96,27 +96,27 @@ public List<Step> toSteps(Client client, String phase, Step.StepKey nextStepKey)
9696

9797
UpdateSettingsStep readOnlyStep = new UpdateSettingsStep(readOnlyKey, setSingleNodeKey, client, readOnlySettings);
9898
SetSingleNodeAllocateStep setSingleNodeStep = new SetSingleNodeAllocateStep(setSingleNodeKey, allocationRoutedKey, client);
99-
AllocationRoutedStep allocationStep = new AllocationRoutedStep(allocationRoutedKey, shrinkKey, false);
99+
CheckShrinkReadyStep checkShrinkReadyStep = new CheckShrinkReadyStep(allocationRoutedKey, shrinkKey);
100100
ShrinkStep shrink = new ShrinkStep(shrinkKey, enoughShardsKey, client, numberOfShards, SHRUNKEN_INDEX_PREFIX);
101101
ShrunkShardsAllocatedStep allocated = new ShrunkShardsAllocatedStep(enoughShardsKey, copyMetadataKey, SHRUNKEN_INDEX_PREFIX);
102102
CopyExecutionStateStep copyMetadata = new CopyExecutionStateStep(copyMetadataKey, aliasKey, SHRUNKEN_INDEX_PREFIX);
103103
ShrinkSetAliasStep aliasSwapAndDelete = new ShrinkSetAliasStep(aliasKey, isShrunkIndexKey, client, SHRUNKEN_INDEX_PREFIX);
104104
ShrunkenIndexCheckStep waitOnShrinkTakeover = new ShrunkenIndexCheckStep(isShrunkIndexKey, nextStepKey, SHRUNKEN_INDEX_PREFIX);
105-
return Arrays.asList(readOnlyStep, setSingleNodeStep, allocationStep, shrink, allocated, copyMetadata,
105+
return Arrays.asList(readOnlyStep, setSingleNodeStep, checkShrinkReadyStep, shrink, allocated, copyMetadata,
106106
aliasSwapAndDelete, waitOnShrinkTakeover);
107107
}
108108

109109
@Override
110110
public List<StepKey> toStepKeys(String phase) {
111111
StepKey readOnlyKey = new StepKey(phase, NAME, ReadOnlyAction.NAME);
112112
StepKey setSingleNodeKey = new StepKey(phase, NAME, SetSingleNodeAllocateStep.NAME);
113-
StepKey allocationRoutedKey = new StepKey(phase, NAME, AllocationRoutedStep.NAME);
113+
StepKey checkShrinkReadyKey = new StepKey(phase, NAME, CheckShrinkReadyStep.NAME);
114114
StepKey shrinkKey = new StepKey(phase, NAME, ShrinkStep.NAME);
115115
StepKey enoughShardsKey = new StepKey(phase, NAME, ShrunkShardsAllocatedStep.NAME);
116116
StepKey copyMetadataKey = new StepKey(phase, NAME, CopyExecutionStateStep.NAME);
117117
StepKey aliasKey = new StepKey(phase, NAME, ShrinkSetAliasStep.NAME);
118118
StepKey isShrunkIndexKey = new StepKey(phase, NAME, ShrunkenIndexCheckStep.NAME);
119-
return Arrays.asList(readOnlyKey, setSingleNodeKey, allocationRoutedKey, shrinkKey, enoughShardsKey,
119+
return Arrays.asList(readOnlyKey, setSingleNodeKey, checkShrinkReadyKey, shrinkKey, enoughShardsKey,
120120
copyMetadataKey, aliasKey, isShrunkIndexKey);
121121
}
122122

x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/indexlifecycle/AllocationRoutedStepTests.java

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,37 +36,32 @@ public class AllocationRoutedStepTests extends AbstractStepTestCase<AllocationRo
3636
public AllocationRoutedStep createRandomInstance() {
3737
StepKey stepKey = randomStepKey();
3838
StepKey nextStepKey = randomStepKey();
39-
boolean waitOnAllShardCopies = randomBoolean();
4039

41-
return new AllocationRoutedStep(stepKey, nextStepKey, waitOnAllShardCopies);
40+
return new AllocationRoutedStep(stepKey, nextStepKey);
4241
}
4342

4443
@Override
4544
public AllocationRoutedStep mutateInstance(AllocationRoutedStep instance) {
4645
StepKey key = instance.getKey();
4746
StepKey nextKey = instance.getNextStepKey();
48-
boolean waitOnAllShardCopies = instance.getWaitOnAllShardCopies();
4947

50-
switch (between(0, 2)) {
48+
switch (between(0, 1)) {
5149
case 0:
5250
key = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
5351
break;
5452
case 1:
5553
nextKey = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5));
5654
break;
57-
case 2:
58-
waitOnAllShardCopies = waitOnAllShardCopies == false;
59-
break;
6055
default:
6156
throw new AssertionError("Illegal randomisation branch");
6257
}
6358

64-
return new AllocationRoutedStep(key, nextKey, waitOnAllShardCopies);
59+
return new AllocationRoutedStep(key, nextKey);
6560
}
6661

6762
@Override
6863
public AllocationRoutedStep copyInstance(AllocationRoutedStep instance) {
69-
return new AllocationRoutedStep(instance.getKey(), instance.getNextStepKey(), instance.getWaitOnAllShardCopies());
64+
return new AllocationRoutedStep(instance.getKey(), instance.getNextStepKey());
7065
}
7166

7267
public void testConditionMet() {
@@ -132,9 +127,9 @@ public void testConditionMetOnlyOneCopyAllocated() {
132127
.addShard(TestShardRouting.newShardRouting(new ShardId(index, 0), "node2", primaryOnNode1 == false,
133128
ShardRoutingState.STARTED));
134129

135-
AllocationRoutedStep step = new AllocationRoutedStep(randomStepKey(), randomStepKey(), false);
130+
AllocationRoutedStep step = new AllocationRoutedStep(randomStepKey(), randomStepKey());
136131
assertAllocateStatus(index, 1, 0, step, existingSettings, node1Settings, node2Settings, indexRoutingTable,
137-
new ClusterStateWaitStep.Result(true, null));
132+
new ClusterStateWaitStep.Result(false, new AllocationRoutedStep.Info(0, 1, true)));
138133
}
139134

140135
public void testExecuteAllocateNotComplete() throws Exception {
@@ -202,7 +197,7 @@ public void testExecuteAllocateNotCompleteOnlyOneCopyAllocated() throws Exceptio
202197
.addShard(TestShardRouting.newShardRouting(new ShardId(index, 0), "node2", primaryOnNode1 == false,
203198
ShardRoutingState.STARTED));
204199

205-
AllocationRoutedStep step = new AllocationRoutedStep(randomStepKey(), randomStepKey(), true);
200+
AllocationRoutedStep step = new AllocationRoutedStep(randomStepKey(), randomStepKey());
206201
assertAllocateStatus(index, 2, 0, step, existingSettings, node1Settings, node2Settings, indexRoutingTable,
207202
new ClusterStateWaitStep.Result(false, new AllocationRoutedStep.Info(0, 1, true)));
208203
}

0 commit comments

Comments
 (0)