Skip to content

Commit 3effe26

Browse files
author
David Roberts
committed
[ML] Fix problem with lost shards in distributed failure test (#43153)
We were stopping a node in the cluster at a time when the replica shards of the .ml-state index might not have been created. This change moves the wait for green status to a point where the .ml-state index exists. Fixes #40546. Fixes #41742. Forward port of #43111.
1 parent a8bf181 commit 3effe26

File tree

1 file changed

+9
-8
lines changed

1 file changed

+9
-8
lines changed

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/BasicDistributedJobsIT.java

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import org.elasticsearch.search.aggregations.bucket.histogram.HistogramAggregationBuilder;
2424
import org.elasticsearch.search.aggregations.metrics.MaxAggregationBuilder;
2525
import org.elasticsearch.test.InternalTestCluster;
26-
import org.elasticsearch.test.junit.annotations.TestLogging;
2726
import org.elasticsearch.xpack.core.ml.MlTasks;
2827
import org.elasticsearch.xpack.core.ml.action.CloseJobAction;
2928
import org.elasticsearch.xpack.core.ml.action.GetDatafeedsStatsAction;
@@ -64,19 +63,19 @@ public void testFailOverBasics() throws Exception {
6463
Job.Builder job = createJob("fail-over-basics-job", new ByteSizeValue(2, ByteSizeUnit.MB));
6564
PutJobAction.Request putJobRequest = new PutJobAction.Request(job);
6665
client().execute(PutJobAction.INSTANCE, putJobRequest).actionGet();
67-
ensureGreen();
66+
ensureYellow(); // at least the primary shards of the indices a job uses should be started
6867
OpenJobAction.Request openJobRequest = new OpenJobAction.Request(job.getId());
6968
client().execute(OpenJobAction.INSTANCE, openJobRequest).actionGet();
7069
awaitJobOpenedAndAssigned(job.getId(), null);
7170

71+
ensureGreen(); // replicas must be assigned, otherwise we could lose a whole index
7272
internalCluster().stopRandomDataNode();
7373
ensureStableCluster(3);
74-
ensureGreen();
7574
awaitJobOpenedAndAssigned(job.getId(), null);
7675

76+
ensureGreen(); // replicas must be assigned, otherwise we could lose a whole index
7777
internalCluster().stopRandomDataNode();
7878
ensureStableCluster(2);
79-
ensureGreen();
8079
awaitJobOpenedAndAssigned(job.getId(), null);
8180
}
8281

@@ -106,7 +105,7 @@ public void testFailOverBasics_withDataFeeder() throws Exception {
106105
PutDatafeedAction.Request putDatafeedRequest = new PutDatafeedAction.Request(config);
107106
client().execute(PutDatafeedAction.INSTANCE, putDatafeedRequest).actionGet();
108107

109-
ensureGreen();
108+
ensureYellow(); // at least the primary shards of the indices a job uses should be started
110109
OpenJobAction.Request openJobRequest = new OpenJobAction.Request(job.getId());
111110
client().execute(OpenJobAction.INSTANCE, openJobRequest).actionGet();
112111
awaitJobOpenedAndAssigned(job.getId(), null);
@@ -120,9 +119,9 @@ public void testFailOverBasics_withDataFeeder() throws Exception {
120119
assertEquals(DatafeedState.STARTED, statsResponse.getResponse().results().get(0).getDatafeedState());
121120
});
122121

122+
ensureGreen(); // replicas must be assigned, otherwise we could lose a whole index
123123
internalCluster().stopRandomDataNode();
124124
ensureStableCluster(3);
125-
ensureGreen();
126125
awaitJobOpenedAndAssigned(job.getId(), null);
127126
assertBusy(() -> {
128127
GetDatafeedsStatsAction.Response statsResponse =
@@ -131,9 +130,9 @@ public void testFailOverBasics_withDataFeeder() throws Exception {
131130
assertEquals(DatafeedState.STARTED, statsResponse.getResponse().results().get(0).getDatafeedState());
132131
});
133132

133+
ensureGreen(); // replicas must be assigned, otherwise we could lose a whole index
134134
internalCluster().stopRandomDataNode();
135135
ensureStableCluster(2);
136-
ensureGreen();
137136
awaitJobOpenedAndAssigned(job.getId(), null);
138137
assertBusy(() -> {
139138
GetDatafeedsStatsAction.Response statsResponse =
@@ -171,6 +170,7 @@ public void testJobAutoClose() throws Exception {
171170
PutDatafeedAction.Request putDatafeedRequest = new PutDatafeedAction.Request(config);
172171
client().execute(PutDatafeedAction.INSTANCE, putDatafeedRequest).actionGet();
173172

173+
ensureYellow(); // at least the primary shards of the indices a job uses should be started
174174
client().execute(OpenJobAction.INSTANCE, new OpenJobAction.Request(job.getId())).get();
175175

176176
StartDatafeedAction.Request startDatafeedRequest = new StartDatafeedAction.Request(config.getId(), 0L);
@@ -183,7 +183,6 @@ public void testJobAutoClose() throws Exception {
183183
});
184184
}
185185

186-
@TestLogging("org.elasticsearch.xpack.persistent:TRACE,org.elasticsearch.cluster.service:DEBUG,org.elasticsearch.xpack.ml.action:DEBUG")
187186
public void testDedicatedMlNode() throws Exception {
188187
internalCluster().ensureAtMostNumDataNodes(0);
189188
// start 2 non-ML nodes that will never get a job allocated (but the ML APIs are accessible from these nodes)
@@ -203,6 +202,7 @@ public void testDedicatedMlNode() throws Exception {
203202
PutJobAction.Request putJobRequest = new PutJobAction.Request(job);
204203
client().execute(PutJobAction.INSTANCE, putJobRequest).actionGet();
205204

205+
ensureYellow(); // at least the primary shards of the indices a job uses should be started
206206
OpenJobAction.Request openJobRequest = new OpenJobAction.Request(job.getId());
207207
client().execute(OpenJobAction.INSTANCE, openJobRequest).actionGet();
208208
assertBusy(() -> {
@@ -277,6 +277,7 @@ public void testMaxConcurrentJobAllocations() throws Exception {
277277
}
278278
});
279279

280+
ensureYellow(); // at least the primary shards of the indices a job uses should be started
280281
int numJobs = numMlNodes * 10;
281282
for (int i = 0; i < numJobs; i++) {
282283
Job.Builder job = createJob(Integer.toString(i), new ByteSizeValue(2, ByteSizeUnit.MB));

0 commit comments

Comments
 (0)