Delay shard reassignment from nodes which are known to be restarting #75606
Changes from all commits
org/elasticsearch/cluster/routing/RoutingNodes.java

```diff
@@ -9,18 +9,20 @@
 package org.elasticsearch.cluster.routing;
 
 import com.carrotsearch.hppc.cursors.ObjectCursor;
 
 import org.apache.logging.log4j.Logger;
 import org.apache.lucene.util.CollectionUtil;
 import org.elasticsearch.Assertions;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.cluster.metadata.Metadata;
+import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.routing.UnassignedInfo.AllocationStatus;
 import org.elasticsearch.cluster.routing.allocation.ExistingShardsAllocator;
 import org.elasticsearch.cluster.service.MasterService;
-import org.elasticsearch.core.Nullable;
+import org.elasticsearch.common.Randomness;
+import org.elasticsearch.core.Nullable;
 import org.elasticsearch.core.Tuple;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.shard.ShardId;
```
```diff
@@ -65,6 +67,8 @@ public class RoutingNodes implements Iterable<RoutingNode> {
 
     private final Map<ShardId, List<ShardRouting>> assignedShards = new HashMap<>();
 
+    private final Map<String, SingleNodeShutdownMetadata> nodeShutdowns;
+
     private final boolean readOnly;
 
     private int inactivePrimaryCount = 0;
```
```diff
@@ -83,6 +87,7 @@ public RoutingNodes(ClusterState clusterState) {
     public RoutingNodes(ClusterState clusterState, boolean readOnly) {
         this.readOnly = readOnly;
         final RoutingTable routingTable = clusterState.routingTable();
+        nodeShutdowns = clusterState.metadata().nodeShutdowns();
 
         Map<String, LinkedHashMap<ShardId, ShardRouting>> nodesToShards = new HashMap<>();
         // fill in the nodeToShards with the "live" nodes
```
```diff
@@ -533,9 +538,17 @@ assert getByAllocationId(failedShard.shardId(), failedShard.allocationId().getId
                     // re-resolve replica as earlier iteration could have changed source/target of replica relocation
                     ShardRouting replicaShard = getByAllocationId(routing.shardId(), routing.allocationId().getId());
                     assert replicaShard != null : "failed to re-resolve " + routing + " when failing replicas";
-                    UnassignedInfo primaryFailedUnassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.PRIMARY_FAILED,
-                        "primary failed while replica initializing", null, 0, unassignedInfo.getUnassignedTimeInNanos(),
-                        unassignedInfo.getUnassignedTimeInMillis(), false, AllocationStatus.NO_ATTEMPT, Collections.emptySet());
+                    UnassignedInfo primaryFailedUnassignedInfo = new UnassignedInfo(
+                        UnassignedInfo.Reason.PRIMARY_FAILED,
+                        "primary failed while replica initializing",
+                        null,
+                        0,
+                        unassignedInfo.getUnassignedTimeInNanos(),
+                        unassignedInfo.getUnassignedTimeInMillis(),
+                        false,
+                        AllocationStatus.NO_ATTEMPT,
+                        Collections.emptySet(),
+                        routing.currentNodeId());
                     failShard(logger, replicaShard, primaryFailedUnassignedInfo, indexMetadata, routingChangesObserver);
                 }
             }
```

**Contributor:** It is a bit annoying that we have to calculate …

**Contributor (author):** This no longer needs to be calculated here, as the reason is …
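The new trailing constructor argument, `routing.currentNodeId()`, records the node the replica was on when it became unassigned. Read back via `UnassignedInfo#getLastAllocatedNodeId()` (visible in the next hunk) and combined with the shutdown records above, this is what enables the delay this PR is about. A hedged sketch, reusing the hypothetical `ShutdownLookups` helper from earlier; `shouldDelayReassignment` is an illustrative name, not the PR's actual API:

```java
// Sketch only, not the PR's actual decision logic: delay reassignment when
// the shard's last node has a RESTART shutdown record and is expected back.
static boolean shouldDelayReassignment(UnassignedInfo info,
                                       Map<String, SingleNodeShutdownMetadata> nodeShutdowns) {
    String lastNodeId = info.getLastAllocatedNodeId(); // the id recorded above
    return lastNodeId != null && ShutdownLookups.isNodeRestarting(nodeShutdowns, lastNodeId);
}
```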
```diff
@@ -858,10 +871,17 @@ public void ignoreShard(ShardRouting shard, AllocationStatus allocationStatus, R
         UnassignedInfo currInfo = shard.unassignedInfo();
         assert currInfo != null;
         if (allocationStatus.equals(currInfo.getLastAllocationStatus()) == false) {
-            UnassignedInfo newInfo = new UnassignedInfo(currInfo.getReason(), currInfo.getMessage(), currInfo.getFailure(),
-                currInfo.getNumFailedAllocations(), currInfo.getUnassignedTimeInNanos(),
-                currInfo.getUnassignedTimeInMillis(), currInfo.isDelayed(),
-                allocationStatus, currInfo.getFailedNodeIds());
+            UnassignedInfo newInfo = new UnassignedInfo(
+                currInfo.getReason(),
+                currInfo.getMessage(),
+                currInfo.getFailure(),
+                currInfo.getNumFailedAllocations(),
+                currInfo.getUnassignedTimeInNanos(),
+                currInfo.getUnassignedTimeInMillis(),
+                currInfo.isDelayed(),
+                allocationStatus,
+                currInfo.getFailedNodeIds(),
+                currInfo.getLastAllocatedNodeId());
             ShardRouting updatedShard = shard.updateUnassigned(newInfo, shard.recoverySource());
             changes.unassignedInfoUpdated(shard, newInfo);
             shard = updatedShard;
```
**Contributor** (on the `nodeShutdowns` map): This is not really part of this review, but I wonder whether we could not risk seeing multiple shutdown indications for the same node, for instance both a RESTART and a REMOVE or REPLACE? I think of ECK in particular here, but it might also be relevant in cloud.

**Contributor (author):** No, there are a couple of things that prevent this: in `TransportPutShutdownNodeAction`, when we get a `PUT` for a node that already has a record, the record is updated rather than added to, and the map of `SingleNodeShutdownMetadata` (the `Map` in the line you're commenting on) is keyed by node UUID, so it should be impossible to have multiple records for the same key/node id. Since the node id is duplicated in the `SingleNodeShutdownMetadata` as well, it's conceivable that in the case of a bug we could end up with a mismatch between the id used for keying and the id used in the object, but I don't think that's likely.

**Contributor:** Yeah, sorry if this was unclear: what I meant was whether it could be a reasonable use case to have both a RESTART and one of the two others at the same time. Not that there is anything wrong in this PR or anything; I mainly wanted to bring this to your attention for possible discussion (and maybe you already discussed it and discarded the use case?).
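To make the keying point above concrete, a tiny sketch in which plain strings stand in for the real `SingleNodeShutdownMetadata` values:

```java
import java.util.HashMap;
import java.util.Map;

// The shutdown records live in a map keyed by node UUID, so a second PUT
// for the same node replaces the earlier record instead of accumulating.
Map<String, String> shutdownsByNodeId = new HashMap<>();
shutdownsByNodeId.put("node-uuid-1", "RESTART");
shutdownsByNodeId.put("node-uuid-1", "REMOVE"); // overwrites the RESTART record
assert shutdownsByNodeId.size() == 1;           // at most one record per node
```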