-
Notifications
You must be signed in to change notification settings - Fork 25.6k
[ML][Data frame] fixing failure state transitions and race condition #45627
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8570d71
860e901
2499fe9
5cb95ec
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,9 +37,13 @@ public class DataFrameTaskFailedStateIT extends DataFrameRestTestCase { | |
| @Before | ||
| public void setClusterSettings() throws IOException { | ||
| // Make sure we never retry on failure to speed up the test | ||
| // Set logging level to trace | ||
| // see: https://github.com/elastic/elasticsearch/issues/45562 | ||
| Request addFailureRetrySetting = new Request("PUT", "/_cluster/settings"); | ||
| addFailureRetrySetting.setJsonEntity( | ||
| "{\"persistent\": {\"xpack.data_frame.num_transform_failure_retries\": \"" + 0 + "\"}}"); | ||
| "{\"transient\": {\"xpack.data_frame.num_transform_failure_retries\": \"" + 0 + "\"," + | ||
| "\"logger.org.elasticsearch.action.bulk\": \"info\"," + // reduces bulk failure spam | ||
| "\"logger.org.elasticsearch.xpack.dataframe\": \"trace\"}}"); | ||
| client().performRequest(addFailureRetrySetting); | ||
| } | ||
|
|
||
|
|
@@ -84,7 +88,6 @@ public void testForceStopFailedTransform() throws Exception { | |
| assertThat(XContentMapValues.extractValue("reason", fullState), is(nullValue())); | ||
| } | ||
|
|
||
| @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/45609") | ||
| public void testForceStartFailedTransform() throws Exception { | ||
| String transformId = "test-force-start-failed-transform"; | ||
| createReviewsIndex(REVIEWS_INDEX_NAME, 10); | ||
|
|
@@ -100,13 +103,16 @@ public void testForceStartFailedTransform() throws Exception { | |
| // Verify we have failed for the expected reason | ||
| assertThat(XContentMapValues.extractValue("reason", fullState), equalTo(failureReason)); | ||
|
|
||
| final String expectedFailure = "Unable to start data frame transform [test-force-start-failed-transform] " + | ||
| "as it is in a failed state with failure: [" + failureReason + | ||
| "]. Use force start to restart data frame transform once error is resolved."; | ||
| // Verify that we cannot start the transform when the task is in a failed state | ||
| ResponseException ex = expectThrows(ResponseException.class, () -> startDataframeTransform(transformId, false)); | ||
| assertThat(ex.getResponse().getStatusLine().getStatusCode(), equalTo(RestStatus.CONFLICT.getStatus())); | ||
| assertThat(XContentMapValues.extractValue("error.reason", entityAsMap(ex.getResponse())), | ||
| equalTo("Unable to start data frame transform [test-force-start-failed-transform] as it is in a failed state with failure: [" + | ||
| failureReason + | ||
| "]. Use force start to restart data frame transform once error is resolved.")); | ||
| assertBusy(() -> { | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This |
||
| ResponseException ex = expectThrows(ResponseException.class, () -> startDataframeTransform(transformId, false)); | ||
| assertThat(ex.getResponse().getStatusLine().getStatusCode(), equalTo(RestStatus.CONFLICT.getStatus())); | ||
| assertThat(XContentMapValues.extractValue("error.reason", entityAsMap(ex.getResponse())), | ||
| equalTo(expectedFailure)); | ||
| }, 60, TimeUnit.SECONDS); | ||
|
|
||
| // Correct the failure by deleting the destination index | ||
| deleteIndex(dataFrameIndex); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -59,7 +59,8 @@ protected void doExecute(Task task, StartDataFrameTransformTaskAction.Request re | |
| protected void taskOperation(StartDataFrameTransformTaskAction.Request request, DataFrameTransformTask transformTask, | ||
| ActionListener<StartDataFrameTransformTaskAction.Response> listener) { | ||
| if (transformTask.getTransformId().equals(request.getId())) { | ||
| transformTask.start(null, listener); | ||
| //TODO fix bug as .start where it was failed could result in a null current checkpoint? | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I noticed this potential bug while working through this, I will do investigation in another PR. |
||
| transformTask.start(null, request.isForce(), listener); | ||
| } else { | ||
| listener.onFailure(new RuntimeException("ID of data frame transform task [" + transformTask.getTransformId() | ||
| + "] does not match request's ID [" + request.getId() + "]")); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the same behavior as previously we only did
forcechecks against the stored cluster state.