More robust timeout for repo analysis #101184
Changelog entry:

```diff
@@ -0,0 +1,6 @@
+pr: 101184
+summary: More robust timeout for repo analysis
+area: Snapshot/Restore
+type: bug
+issues:
+ - 101182
```
Repository analysis action:

```diff
@@ -9,6 +9,7 @@
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
+import org.elasticsearch.ElasticsearchTimeoutException;
 import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.TransportVersions;
 import org.elasticsearch.Version;
@@ -22,6 +23,7 @@
 import org.elasticsearch.action.support.ActionFilters;
 import org.elasticsearch.action.support.HandledTransportAction;
 import org.elasticsearch.action.support.RefCountingRunnable;
+import org.elasticsearch.action.support.SubscribableListener;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
```
```diff
@@ -364,6 +366,7 @@ public static class AsyncAction {
         private final DiscoveryNodes discoveryNodes;
         private final LongSupplier currentTimeMillisSupplier;
         private final ActionListener<Response> listener;
+        private final SubscribableListener<Void> cancellationListener;
         private final long timeoutTimeMillis;

         // choose the blob path nondeterministically to avoid clashes, assuming that the actual path doesn't matter for reproduction
```
```diff
@@ -394,15 +397,24 @@ public AsyncAction(
             this.discoveryNodes = discoveryNodes;
             this.currentTimeMillisSupplier = currentTimeMillisSupplier;
             this.timeoutTimeMillis = currentTimeMillisSupplier.getAsLong() + request.getTimeout().millis();
-            this.listener = listener;
+
+            this.cancellationListener = new SubscribableListener<>();
+            this.listener = ActionListener.runBefore(listener, () -> cancellationListener.onResponse(null));

             responses = new ArrayList<>(request.blobCount);
         }

-        private void fail(Exception e) {
+        private boolean setFirstFailure(Exception e) {
             if (failure.compareAndSet(null, e)) {
                 transportService.getTaskManager().cancelTaskAndDescendants(task, "task failed", false, ActionListener.noop());
+                return true;
+            } else {
+                return false;
+            }
+        }
+
+        private void fail(Exception e) {
+            if (setFirstFailure(e) == false) {
                 if (innerFailures.tryAcquire()) {
                     final Throwable cause = ExceptionsHelper.unwrapCause(e);
                     if (cause instanceof TaskCancelledException || cause instanceof ReceiveTimeoutTransportException) {
```
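For readers less familiar with the idiom, the `setFirstFailure()` / `fail()` pair above follows a first-failure-wins pattern. A standalone sketch with plain JDK types and hypothetical helper names (not the actual Elasticsearch code):

```java
import java.util.concurrent.atomic.AtomicReference;

class FirstFailureSketch {
    // analogue of AsyncAction's failure field: holds the single authoritative failure
    private final AtomicReference<Exception> failure = new AtomicReference<>();

    /** Returns true only for the caller that records the very first failure. */
    boolean setFirstFailure(Exception e) {
        if (failure.compareAndSet(null, e)) {
            cancelOutstandingWork(); // in the PR: cancelTaskAndDescendants(...)
            return true;
        } else {
            return false; // another failure already won the race
        }
    }

    void fail(Exception e) {
        if (setFirstFailure(e) == false) {
            // not the first failure: keep it only as a secondary failure for reporting
            recordSecondaryFailure(e);
        }
    }

    private void cancelOutstandingWork() { /* hypothetical stand-in for task cancellation */ }

    private void recordSecondaryFailure(Exception e) { /* hypothetical; bounded by a semaphore in the PR */ }
}
```

Splitting the CAS out of `fail()` lets other callers (the cancellation check and the new timeout listener below) record the authoritative failure directly, without also being treated as a routine per-blob failure.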
```diff
@@ -424,24 +436,34 @@ private boolean isRunning() {
             }

             if (task.isCancelled()) {
-                failure.compareAndSet(null, new RepositoryVerificationException(request.repositoryName, "verification cancelled"));
+                setFirstFailure(new RepositoryVerificationException(request.repositoryName, "verification cancelled"));
                 // if this CAS failed then we're failing for some other reason, nbd; also if the task is cancelled then its descendants are
                 // also cancelled, so no further action is needed either way.
                 return false;
             }

-            if (timeoutTimeMillis < currentTimeMillisSupplier.getAsLong()) {
-                if (failure.compareAndSet(
-                    null,
-                    new RepositoryVerificationException(request.repositoryName, "analysis timed out after [" + request.getTimeout() + "]")
-                )) {
-                    transportService.getTaskManager().cancelTaskAndDescendants(task, "timed out", false, ActionListener.noop());
-                }
-                // if this CAS failed then we're already failing for some other reason, nbd
-                return false;
-            }
-
             return true;
         }

+        private class CheckForCancelListener implements ActionListener<Void> {
+            @Override
+            public void onResponse(Void unused) {
+                // task complete, nothing to do
+            }
+
+            @Override
+            public void onFailure(Exception e) {
+                assert e instanceof ElasticsearchTimeoutException : e;
+                if (isRunning()) {
+                    // if this CAS fails then we're already failing for some other reason, nbd
+                    setFirstFailure(
+                        new RepositoryVerificationException(
+                            request.repositoryName,
+                            "analysis timed out after [" + request.getTimeout() + "]"
+                        )
+                    );
+                }
+            }
+        }
```
```diff
@@ -450,6 +472,9 @@ public void run() {
             logger.info("running analysis of repository [{}] using path [{}]", request.getRepositoryName(), blobPath);

+            cancellationListener.addTimeout(request.getTimeout(), repository.threadPool(), EsExecutors.DIRECT_EXECUTOR_SERVICE);
+            cancellationListener.addListener(new CheckForCancelListener());
+
             final Random random = new Random(request.getSeed());
             final List<DiscoveryNode> nodes = getSnapshotNodes(discoveryNodes);
```

Review thread on the `cancellationListener.addTimeout(...)` line:

Member: If the cluster has many nodes and the repo analysis is configured to have high concurrency, would it be expensive to cancel the tasks on the scheduler thread?

Contributor (author): Eh perhaps, but I wouldn't expect it to be a problem because (a) I don't see folks changing the concurrency very much and (b) even at 1000 nodes I don't think it'd be a huge deal, the cancel messages are tiny. We cancel things for other reasons on low-latency threads, e.g.
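For orientation, the overall shape of the change can be sketched with plain JDK types — hypothetical names, no Elasticsearch classes: one shared one-shot signal carries the timeout for the whole analysis, and only its expiry fails and cancels the outstanding work, instead of every outgoing sub-request carrying its own shrinking timeout.

```java
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class SharedTimeoutSketch {
    public static void main(String[] args) throws Exception {
        ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();

        // analogue of cancellationListener: completed at most once, by whichever side gets there first
        CompletableFuture<Void> cancellation = new CompletableFuture<>();

        // analogue of cancellationListener.addTimeout(...): one timer for the whole analysis
        scheduler.schedule(() -> {
            cancellation.completeExceptionally(new RuntimeException("analysis timed out"));
        }, 200, TimeUnit.MILLISECONDS);

        // analogue of CheckForCancelListener: the timeout path records the failure and cancels everything
        cancellation.whenComplete((ignored, e) -> {
            if (e != null) {
                System.out.println("timed out -> record failure, cancel task and descendants");
            }
        });

        // analogue of ActionListener.runBefore(listener, () -> cancellationListener.onResponse(null)):
        // normal completion disarms the timeout path before the caller sees the result
        cancellation.complete(null); // comment this line out to see the timeout branch fire

        scheduler.shutdown();
        scheduler.awaitTermination(1, TimeUnit.SECONDS);
    }
}
```

With this wiring the analysis either completes or is failed and cancelled once by the shared timeout, rather than relying on per-request transport timeouts and the remaining-time check that the diff removes from `isRunning()`.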
```diff
@@ -536,7 +561,7 @@ private void runBlobAnalysis(Releasable ref, final BlobAnalyzeAction.Request req
                 BlobAnalyzeAction.NAME,
                 request,
                 task,
-                TransportRequestOptions.timeout(TimeValue.timeValueMillis(timeoutTimeMillis - currentTimeMillisSupplier.getAsLong())),
+                TransportRequestOptions.EMPTY,
                 new ActionListenerResponseHandler<>(ActionListener.releaseAfter(new ActionListener<>() {
                     @Override
                     public void onResponse(BlobAnalyzeAction.Response response) {
```
Review comment:

I am trying to think through the different scenarios in which `cancellationListener` can already be completed before we invoke `onResponse(null)` here. I think they are all OK. I am writing them down to be explicit, and maybe you can double-check them as well.

- `cancellationListener` has already timed out and this runs right before we are going to call `listener.onFailure`. This is fine because `SubscribableListener` accepts only the first completion and silently ignores all future results.
- `listener.onResponse` runs for a success while `cancellationListener` times out concurrently. The timeout will set the failure and try to cancel the tasks. This is fine because we don't check the failure object any more, and cancelling completed or non-existent tasks seems to be a no-op.
- The timeout fires after `listener` is completed. This is fine since the timeout will come after `cancellationListener.onResponse(null)`, and completing a `SubscribableListener` more than once is ignored.

Reply: 👍 sounds about right, yes.
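To make the property these scenarios rely on concrete, here is a small illustrative sketch (names are made up; it assumes the Elasticsearch server library on the classpath) of a `SubscribableListener` delivering only its first completion to subscribers, per the discussion above:

```java
import org.elasticsearch.ElasticsearchTimeoutException;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.support.SubscribableListener;

public class OneShotCompletionSketch {
    public static void main(String[] args) {
        SubscribableListener<Void> cancellationListener = new SubscribableListener<>();

        // analogue of CheckForCancelListener: subscribe to whichever completion arrives first
        cancellationListener.addListener(ActionListener.wrap(
            ignored -> System.out.println("completed normally (analysis finished first)"),
            e -> System.out.println("completed exceptionally (timeout won): " + e)
        ));

        // normal completion arrives first, as via ActionListener.runBefore(listener, () -> ...onResponse(null))
        cancellationListener.onResponse(null);

        // a late timeout completion is ignored; the subscriber above has already fired exactly once
        cancellationListener.onFailure(new ElasticsearchTimeoutException("timed out"));
    }
}
```

This one-shot behaviour is why the normal-completion path and the timeout path can race freely: whichever completes the listener first wins, and the loser is a no-op.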