Skip to content

Commit 14a93f6

Browse files
authored
Improve Node Shutdown Observability (#78727)
This PR does two things: 1) Logs a message when the Get Shutdown Status API detects there's a shard causing a node shutdown to stall. The log message isn't new, just bumped from `trace` to `info`. 2) Adds Allocation Explain output to the shard migration status portion of the node shutdown response if it's causing a shard to stall. Example of the new Get Shutdown Status response: ```json { "nodes": [ { "node_id": "GszZYZ2bSZiLi5dBinWf8g", "type": "REMOVE", "reason": "idk my bff jill", "shutdown_startedmillis": 1633475221483, "status": "STALLED", "shard_migration": { "status": "STALLED", "shard_migrations_remaining": 1, "explanation": "shard [0] [primary] of index [my-index] cannot move", "node_allocation_decisions": { "can_remain_on_current_node": "no", "can_remain_decisions": [ { "decider": "node_shutdown", "decision": "NO", "explanation": "node [GszZYZ2bSZiLi5dBinWf8g] is preparing to be removed from the cluster" }, { "decider": "filter", "decision": "NO", "explanation": "node does not match index setting [index.routing.allocation.require] filters [include._id:\"GszZYZ2bSZiLi5dBinWf8g\"]" } ], "can_move_to_other_node": "no", "move_explanation": "cannot move shard to another node, even though it is not allowed to remain on its current node", "node_allocation_decisions": [ { "node_id": "T6AnBS1mT2iXlD-McXSteg", "node_name": "node-0", "transport_address": "127.0.0.1:9301", "node_attributes": { "ml.machine_memory": "34359738368", "ml.max_open_jobs": "512", "xpack.installed": "true", "ml.max_jvm_size": "17179869184" }, "node_decision": "no", "weight_ranking": 1, "deciders": [ { "decider": "filter", "decision": "NO", "explanation": "node does not match index setting [index.routing.allocation.require] filters [include._id:\"GszZYZ2bSZiLi5dBinWf8g\"]" } ] }, { "node_id": "81mJIQ_XRrG2zK1ojhjLMg", "node_name": "node-2", "transport_address": "127.0.0.1:9300", "node_attributes": { "ml.machine_memory": "34359738368", "ml.max_open_jobs": "512", "xpack.installed": "true", 
"ml.max_jvm_size": "17179869184" }, "node_decision": "no", "weight_ranking": 2, "deciders": [ { "decider": "filter", "decision": "NO", "explanation": "node does not match index setting [index.routing.allocation.require] filters [include._id:\"GszZYZ2bSZiLi5dBinWf8g\"]" } ] } ] } }, "persistent_tasks": { "status": "COMPLETE" }, "plugins": { "status": "COMPLETE" } } ] } ```
1 parent 070828d commit 14a93f6

File tree

5 files changed

+79
-28
lines changed

5 files changed

+79
-28
lines changed

server/src/main/java/org/elasticsearch/cluster/metadata/ShutdownShardMigrationStatus.java

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
package org.elasticsearch.cluster.metadata;
1010

11+
import org.elasticsearch.Version;
12+
import org.elasticsearch.cluster.routing.allocation.ShardAllocationDecision;
1113
import org.elasticsearch.common.Strings;
1214
import org.elasticsearch.common.io.stream.StreamInput;
1315
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -20,25 +22,42 @@
2022
import java.util.Objects;
2123

2224
public class ShutdownShardMigrationStatus implements Writeable, ToXContentObject {
25+
private static final Version ALLOCATION_DECISION_ADDED_VERSION = Version.V_7_16_0;
2326

2427
private final SingleNodeShutdownMetadata.Status status;
2528
private final long shardsRemaining;
2629
@Nullable private final String explanation;
30+
@Nullable private final ShardAllocationDecision allocationDecision;
2731

2832
public ShutdownShardMigrationStatus(SingleNodeShutdownMetadata.Status status, long shardsRemaining) {
29-
this(status, shardsRemaining, null);
33+
this(status, shardsRemaining, null, null);
3034
}
3135

3236
public ShutdownShardMigrationStatus(SingleNodeShutdownMetadata.Status status, long shardsRemaining, @Nullable String explanation) {
37+
this(status, shardsRemaining, explanation, null);
38+
}
39+
40+
public ShutdownShardMigrationStatus(
41+
SingleNodeShutdownMetadata.Status status,
42+
long shardsRemaining,
43+
@Nullable String explanation,
44+
@Nullable ShardAllocationDecision allocationDecision
45+
) {
3346
this.status = Objects.requireNonNull(status, "status must not be null");
3447
this.shardsRemaining = shardsRemaining;
3548
this.explanation = explanation;
49+
this.allocationDecision = allocationDecision;
3650
}
3751

3852
public ShutdownShardMigrationStatus(StreamInput in) throws IOException {
3953
this.status = in.readEnum(SingleNodeShutdownMetadata.Status.class);
4054
this.shardsRemaining = in.readLong();
4155
this.explanation = in.readOptionalString();
56+
if (in.getVersion().onOrAfter(ALLOCATION_DECISION_ADDED_VERSION)) {
57+
this.allocationDecision = in.readOptionalWriteable(ShardAllocationDecision::new);
58+
} else {
59+
this.allocationDecision = null;
60+
}
4261
}
4362

4463
public long getShardsRemaining() {
@@ -61,6 +80,13 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
6180
if (Objects.nonNull(explanation)) {
6281
builder.field("explanation", explanation);
6382
}
83+
if (Objects.nonNull(allocationDecision)) {
84+
builder.startObject("node_allocation_decision");
85+
{
86+
allocationDecision.toXContent(builder, params);
87+
}
88+
builder.endObject();
89+
}
6490
builder.endObject();
6591
return builder;
6692
}
@@ -70,19 +96,25 @@ public void writeTo(StreamOutput out) throws IOException {
7096
out.writeEnum(status);
7197
out.writeLong(shardsRemaining);
7298
out.writeOptionalString(explanation);
99+
if (out.getVersion().onOrAfter(ALLOCATION_DECISION_ADDED_VERSION)) {
100+
out.writeOptionalWriteable(allocationDecision);
101+
}
73102
}
74103

75104
@Override
76105
public boolean equals(Object o) {
77106
if (this == o) return true;
78107
if ((o instanceof ShutdownShardMigrationStatus) == false) return false;
79108
ShutdownShardMigrationStatus that = (ShutdownShardMigrationStatus) o;
80-
return shardsRemaining == that.shardsRemaining && status == that.status && Objects.equals(explanation, that.explanation);
109+
return shardsRemaining == that.shardsRemaining
110+
&& status == that.status
111+
&& Objects.equals(explanation, that.explanation)
112+
&& Objects.equals(allocationDecision, that.allocationDecision);
81113
}
82114

83115
@Override
84116
public int hashCode() {
85-
return Objects.hash(status, shardsRemaining, explanation);
117+
return Objects.hash(status, shardsRemaining, explanation, allocationDecision);
86118
}
87119

88120
@Override

server/src/main/java/org/elasticsearch/cluster/metadata/SingleNodeShutdownMetadata.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,28 @@ public int hashCode() {
256256
);
257257
}
258258

259+
@Override public String toString() {
260+
final StringBuilder stringBuilder = new StringBuilder();
261+
stringBuilder
262+
.append("{")
263+
.append("nodeId=[")
264+
.append(nodeId)
265+
.append(']')
266+
.append(", type=[")
267+
.append(type)
268+
.append("], reason=[")
269+
.append(reason)
270+
.append(']');
271+
if (allocationDelay != null) {
272+
stringBuilder
273+
.append(", allocationDelay=[")
274+
.append(allocationDelay)
275+
.append("]");
276+
}
277+
stringBuilder.append("}");
278+
return stringBuilder.toString();
279+
}
280+
259281
public static Builder builder() {
260282
return new Builder();
261283
}

x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportDeleteShutdownNodeAction.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ protected void masterOperation(
6969
public ClusterState execute(ClusterState currentState) throws Exception {
7070
NodesShutdownMetadata currentShutdownMetadata = currentState.metadata().custom(NodesShutdownMetadata.TYPE);
7171

72+
logger.info("removing shutdown record for node [{}]", request.getNodeId());
73+
7274
return ClusterState.builder(currentState)
7375
.metadata(
7476
Metadata.builder(currentState.metadata())

x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -260,17 +260,15 @@ static ShutdownShardMigrationStatus shardMigrationStatus(
260260
return hasShardCopyOnOtherNode == false;
261261
})
262262
.peek(pair -> {
263-
if (logger.isTraceEnabled()) { // don't serialize the decision unless we have to
264-
logger.trace(
265-
"node [{}] shutdown of type [{}] stalled: found shard [{}][{}] from index [{}] with negative decision: [{}]",
266-
nodeId,
267-
shutdownType,
268-
pair.v1().getId(),
269-
pair.v1().primary() ? "primary" : "replica",
270-
pair.v1().shardId().getIndexName(),
271-
Strings.toString(pair.v2())
272-
);
273-
}
263+
logger.debug(
264+
"node [{}] shutdown of type [{}] stalled: found shard [{}][{}] from index [{}] with negative decision: [{}]",
265+
nodeId,
266+
shutdownType,
267+
pair.v1().getId(),
268+
pair.v1().primary() ? "primary" : "replica",
269+
pair.v1().shardId().getIndexName(),
270+
Strings.toString(pair.v2())
271+
);
274272
})
275273
.findFirst();
276274

@@ -285,6 +283,7 @@ static ShutdownShardMigrationStatus shardMigrationStatus(
285283
} else if (unmovableShard.isPresent()) {
286284
// We found a shard that can't be moved, so shard relocation is stalled. Blame the unmovable shard.
287285
ShardRouting shardRouting = unmovableShard.get().v1();
286+
ShardAllocationDecision decision = unmovableShard.get().v2();
288287

289288
return new ShutdownShardMigrationStatus(
290289
SingleNodeShutdownMetadata.Status.STALLED,
@@ -294,7 +293,8 @@ static ShutdownShardMigrationStatus shardMigrationStatus(
294293
shardRouting.shardId().getId(),
295294
shardRouting.primary() ? "primary" : "replica",
296295
shardRouting.index().getName()
297-
).getFormattedMessage()
296+
).getFormattedMessage(),
297+
decision
298298
);
299299
} else {
300300
return new ShutdownShardMigrationStatus(SingleNodeShutdownMetadata.Status.IN_PROGRESS, totalRemainingShards);

x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportPutShutdownNodeAction.java

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -64,19 +64,6 @@ public ClusterState execute(ClusterState currentState) {
6464
currentShutdownMetadata = new NodesShutdownMetadata(new HashMap<>());
6565
}
6666

67-
// Verify that there's not already a shutdown metadata for this node
68-
SingleNodeShutdownMetadata existingRecord = currentShutdownMetadata.getAllNodeMetadataMap().get(request.getNodeId());
69-
if (existingRecord != null) {
70-
logger.info(
71-
"updating existing shutdown record for node [{}] of type [{}] with reason [{}] with new type [{}] and reason [{}]",
72-
existingRecord.getNodeId(),
73-
existingRecord.getType(),
74-
existingRecord.getReason(),
75-
request.getType(),
76-
request.getReason()
77-
);
78-
}
79-
8067
final boolean nodeSeen = currentState.getNodes().nodeExists(request.getNodeId());
8168

8269
SingleNodeShutdownMetadata newNodeMetadata = SingleNodeShutdownMetadata.builder()
@@ -89,6 +76,14 @@ public ClusterState execute(ClusterState currentState) {
8976
.setTargetNodeName(request.getTargetNodeName())
9077
.build();
9178

79+
// Verify that there's not already a shutdown metadata for this node
80+
SingleNodeShutdownMetadata existingRecord = currentShutdownMetadata.getAllNodeMetadataMap().get(request.getNodeId());
81+
if (existingRecord != null) {
82+
logger.info("updating existing shutdown record {} with new record {}", existingRecord, newNodeMetadata);
83+
} else {
84+
logger.info("creating shutdown record {}", newNodeMetadata);
85+
}
86+
9287
return ClusterState.builder(currentState)
9388
.metadata(
9489
Metadata.builder(currentState.metadata())

0 commit comments

Comments (0)