Skip to content

Commit 63fe3c6

Browse files
Fix PrimaryAllocationIT Race Condition (#37355)
* Fix PrimaryAllocationIT Race Condition * Forcing a stale primary allocation on a green index was tripping the assertion that was removed * Added a test that this case still errors out correctly * Made the ability to wipe stopped datanode's data public on the internal test cluster and used it to ensure correct behaviour on the fixed test * Previously it simply passed because the test finished before the index went green and would NPE when the index was green at the time of the shard store status request, that would then come up empty * Closes #37345
1 parent 359222c commit 63fe3c6

File tree

3 files changed

+29
-5
lines changed

3 files changed

+29
-5
lines changed

server/src/main/java/org/elasticsearch/action/admin/cluster/reroute/TransportClusterRerouteAction.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,11 @@ private void verifyThenSubmitUpdate(ClusterRerouteRequest request, ActionListene
113113
for (Map.Entry<String, List<AbstractAllocateAllocationCommand>> entry : stalePrimaryAllocations.entrySet()) {
114114
final String index = entry.getKey();
115115
final ImmutableOpenIntMap<List<IndicesShardStoresResponse.StoreStatus>> indexStatus = status.get(index);
116-
assert indexStatus != null;
116+
if (indexStatus == null) {
117+
// The index in the stale primary allocation request was green and hence filtered out by the store status
118+
// request. We ignore it here since the relevant exception will be thrown by the reroute action later on.
119+
continue;
120+
}
117121
for (AbstractAllocateAllocationCommand command : entry.getValue()) {
118122
final List<IndicesShardStoresResponse.StoreStatus> shardStatus =
119123
indexStatus.get(command.shardId());

server/src/test/java/org/elasticsearch/cluster/routing/PrimaryAllocationIT.java

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,6 @@ public void testForceStaleReplicaToBePromotedToPrimary() throws Exception {
265265
assertThat(newHistoryUUIds, hasSize(1));
266266
}
267267

268-
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/37345")
269268
public void testForceStaleReplicaToBePromotedToPrimaryOnWrongNode() throws Exception {
270269
String master = internalCluster().startMasterOnlyNode(Settings.EMPTY);
271270
internalCluster().startDataOnlyNodes(2);
@@ -275,7 +274,10 @@ public void testForceStaleReplicaToBePromotedToPrimaryOnWrongNode() throws Excep
275274
.put("index.number_of_replicas", 1)).get());
276275
ensureGreen();
277276
createStaleReplicaScenario(master);
278-
internalCluster().startDataOnlyNodes(2);
277+
// Ensure the stopped primary's data is deleted so that it doesn't get picked up by the next datanode we start
278+
internalCluster().wipePendingDataDirectories();
279+
internalCluster().startDataOnlyNodes(1);
280+
ensureStableCluster(3, master);
279281
final int shardId = 0;
280282
final List<String> nodeNames = new ArrayList<>(Arrays.asList(internalCluster().getNodeNames()));
281283
nodeNames.remove(master);
@@ -292,6 +294,25 @@ public void testForceStaleReplicaToBePromotedToPrimaryOnWrongNode() throws Excep
292294
equalTo("No data for shard [" + shardId + "] of index [" + idxName + "] found on node [" + nodeWithoutData + ']'));
293295
}
294296

297+
public void testForceStaleReplicaToBePromotedForGreenIndex() {
298+
internalCluster().startMasterOnlyNode(Settings.EMPTY);
299+
final List<String> dataNodes = internalCluster().startDataOnlyNodes(2);
300+
final String idxName = "test";
301+
assertAcked(client().admin().indices().prepareCreate(idxName)
302+
.setSettings(Settings.builder().put("index.number_of_shards", 1)
303+
.put("index.number_of_replicas", 1)).get());
304+
ensureGreen();
305+
final String nodeWithoutData = randomFrom(dataNodes);
306+
final int shardId = 0;
307+
IllegalArgumentException iae = expectThrows(
308+
IllegalArgumentException.class,
309+
() -> client().admin().cluster().prepareReroute()
310+
.add(new AllocateStalePrimaryAllocationCommand(idxName, shardId, nodeWithoutData, true)).get());
311+
assertThat(
312+
iae.getMessage(),
313+
equalTo("[allocate_stale_primary] primary [" + idxName+ "][" + shardId + "] is already assigned"));
314+
}
315+
295316
public void testForceStaleReplicaToBePromotedForMissingIndex() {
296317
internalCluster().startMasterOnlyNode(Settings.EMPTY);
297318
final String dataNode = internalCluster().startDataOnlyNode();

test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1398,8 +1398,7 @@ private void randomlyResetClients() {
13981398
}
13991399
}
14001400

1401-
private void wipePendingDataDirectories() {
1402-
assert Thread.holdsLock(this);
1401+
public synchronized void wipePendingDataDirectories() {
14031402
if (!dataDirToClean.isEmpty()) {
14041403
try {
14051404
for (Path path : dataDirToClean) {

0 commit comments

Comments
 (0)