Skip to content

Commit 66c6582

Browse files
committed
Retry failed replication due to transient errors (elastic#55633)
Currently, a failed replication action will fail an entire replica. This includes cases where replication fails due to potentially short-lived transient issues such as network disruptions or circuit breaking errors. This commit implements retries using the retryable action.
1 parent cd3663e commit 66c6582

File tree

18 files changed

+678
-89
lines changed

18 files changed

+678
-89
lines changed

server/src/main/java/org/elasticsearch/action/support/RetryableAction.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ public RetryableAction(Logger logger, ThreadPool threadPool, TimeValue initialDe
6464
if (initialDelayMillis < 1) {
6565
throw new IllegalArgumentException("Initial delay was less than 1 millisecond: " + initialDelay);
6666
}
67-
this.timeoutMillis = Math.max(timeoutValue.getMillis(), 1);
67+
this.timeoutMillis = timeoutValue.getMillis();
6868
this.startMillis = threadPool.relativeTimeInMillis();
6969
this.finalListener = listener;
7070
this.executor = executor;
@@ -82,6 +82,7 @@ public void cancel(Exception e) {
8282
if (localRetryTask != null) {
8383
localRetryTask.cancel();
8484
}
85+
onFinished();
8586
finalListener.onFailure(e);
8687
}
8788
}
@@ -112,6 +113,9 @@ public void onRejection(Exception e) {
112113

113114
public abstract boolean shouldRetry(Exception e);
114115

116+
/**
 * Completion hook. In the code visible here it is invoked immediately before the final
 * listener is notified: on a successful response, on a non-retried failure, and on
 * cancellation. The default implementation does nothing; subclasses may override it to
 * release per-action bookkeeping.
 */
public void onFinished() {
117+
}
118+
115119
private class RetryingListener implements ActionListener<Response> {
116120

117121
private static final int MAX_EXCEPTIONS = 4;
@@ -127,6 +131,7 @@ private RetryingListener(long delayMillisBound, ArrayDeque<Exception> caughtExce
127131
@Override
128132
public void onResponse(Response response) {
129133
if (isDone.compareAndSet(false, true)) {
134+
onFinished();
130135
finalListener.onResponse(response);
131136
}
132137
}
@@ -140,6 +145,7 @@ public void onFailure(Exception e) {
140145
TimeValue.timeValueMillis(elapsedMillis)), e);
141146
addException(e);
142147
if (isDone.compareAndSet(false, true)) {
148+
onFinished();
143149
finalListener.onFailure(buildFinalException());
144150
}
145151
} else {
@@ -158,6 +164,7 @@ public void onFailure(Exception e) {
158164
} else {
159165
addException(e);
160166
if (isDone.compareAndSet(false,true)) {
167+
onFinished();
161168
finalListener.onFailure(buildFinalException());
162169
}
163170
}
server/src/main/java/org/elasticsearch/action/support/replication/PendingReplicationActions.java

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.action.support.replication;
21+
22+
import org.elasticsearch.action.support.RetryableAction;
23+
import org.elasticsearch.common.lease.Releasable;
24+
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
25+
import org.elasticsearch.index.shard.IndexShardClosedException;
26+
import org.elasticsearch.index.shard.ReplicationGroup;
27+
import org.elasticsearch.index.shard.ShardId;
28+
import org.elasticsearch.threadpool.ThreadPool;
29+
30+
import java.util.ArrayList;
31+
import java.util.Collection;
32+
import java.util.Map;
33+
import java.util.Set;
34+
import java.util.function.Consumer;
35+
36+
/**
 * Registry of in-flight {@link RetryableAction}s, grouped by the allocation id of the
 * replica they target.
 *
 * Actions are cancelled with an {@link IndexShardClosedException} when their allocation id
 * is no longer part of the tracked allocation ids supplied via {@link #accept}, or when
 * this registry is closed via {@link #close()}.
 */
public class PendingReplicationActions implements Consumer<ReplicationGroup>, Releasable {
37+
38+
// allocation id -> actions currently registered for that allocation id
private final Map<String, Set<RetryableAction<?>>> onGoingReplicationActions = ConcurrentCollections.newConcurrentMap();
39+
// used only to build the IndexShardClosedException passed to cancelled actions
private final ShardId shardId;
40+
// supplies the executor on which cancellations are run
private final ThreadPool threadPool;
41+
// version of the last replication group applied by accept(); -1 until the first one is applied
private volatile long replicationGroupVersion = -1;
42+
43+
/**
 * @param shardId    shard id reported in the exception handed to cancelled actions
 * @param threadPool thread pool whose GENERIC executor runs the cancellations
 */
public PendingReplicationActions(ShardId shardId, ThreadPool threadPool) {
44+
this.shardId = shardId;
45+
this.threadPool = threadPool;
46+
}
47+
48+
/**
 * Registers an action for the given allocation id, or cancels it right away if that
 * allocation id has no entry in the registry.
 *
 * @param allocationId      allocation id of the targeted replica
 * @param replicationAction the action to track
 */
public void addPendingAction(String allocationId, RetryableAction<?> replicationAction) {
49+
Set<RetryableAction<?>> ongoingActionsOnNode = onGoingReplicationActions.get(allocationId);
50+
if (ongoingActionsOnNode != null) {
51+
ongoingActionsOnNode.add(replicationAction);
52+
// Re-check after adding: if the entry was removed in the meantime, the set we added to
// may already have been handed off for cancellation, so cancel this action ourselves.
// NOTE(review): this checks only that the key is present, not that it still maps to the
// same set instance - confirm an allocation id can never be removed and re-added.
if (onGoingReplicationActions.containsKey(allocationId) == false) {
53+
replicationAction.cancel(new IndexShardClosedException(shardId,
54+
"Replica unavailable - replica could have left ReplicationGroup or IndexShard might have closed"));
55+
}
56+
} else {
57+
// No entry for this allocation id: nothing to register against, cancel immediately.
replicationAction.cancel(new IndexShardClosedException(shardId,
58+
"Replica unavailable - replica could have left ReplicationGroup or IndexShard might have closed"));
59+
}
60+
}
61+
62+
/**
 * Removes an action from the set registered for the given allocation id. Does nothing
 * if the allocation id has no entry.
 *
 * @param allocationId allocation id the action was registered under
 * @param action       the action to stop tracking
 */
public void removeReplicationAction(String allocationId, RetryableAction<?> action) {
63+
Set<RetryableAction<?>> ongoingActionsOnNode = onGoingReplicationActions.get(allocationId);
64+
if (ongoingActionsOnNode != null) {
65+
ongoingActionsOnNode.remove(action);
66+
}
67+
}
68+
69+
/**
 * Applies a replication group if its version is newer than the last one applied.
 * The version is checked once without holding the monitor and again while holding it,
 * so a group is only applied if it is still newer at the time the lock is held.
 *
 * @param replicationGroup the replication group to apply
 */
@Override
70+
public void accept(ReplicationGroup replicationGroup) {
71+
if (isNewerVersion(replicationGroup)) {
72+
synchronized (this) {
73+
if (isNewerVersion(replicationGroup)) {
74+
acceptNewTrackedAllocationIds(replicationGroup.getTrackedAllocationIds());
75+
replicationGroupVersion = replicationGroup.getVersion();
76+
}
77+
}
78+
}
79+
}
80+
81+
/**
 * @return true if the given group's version is ahead of the last applied version
 */
private boolean isNewerVersion(ReplicationGroup replicationGroup) {
82+
// Relative comparison to mitigate long overflow
83+
return replicationGroup.getVersion() - replicationGroupVersion > 0;
84+
}
85+
86+
/**
 * Ensures every tracked allocation id has an entry, removes the entries of allocation
 * ids that are no longer tracked, and cancels the actions those removed entries held.
 *
 * @param trackedAllocationIds the allocation ids that should remain registered
 */
// Visible for testing
87+
synchronized void acceptNewTrackedAllocationIds(Set<String> trackedAllocationIds) {
88+
for (String targetAllocationId : trackedAllocationIds) {
89+
onGoingReplicationActions.putIfAbsent(targetAllocationId, ConcurrentCollections.newConcurrentSet());
90+
}
91+
ArrayList<Set<RetryableAction<?>>> toCancel = new ArrayList<>();
92+
for (String allocationId : onGoingReplicationActions.keySet()) {
93+
if (trackedAllocationIds.contains(allocationId) == false) {
94+
toCancel.add(onGoingReplicationActions.remove(allocationId));
95+
}
96+
}
97+
98+
cancelActions(toCancel, "Replica left ReplicationGroup");
99+
}
100+
101+
/**
 * Empties the registry and cancels every action that was registered at that point.
 */
@Override
102+
public synchronized void close() {
103+
// Copy the sets before clearing so the cancellation task still sees them.
ArrayList<Set<RetryableAction<?>>> toCancel = new ArrayList<>(onGoingReplicationActions.values());
104+
onGoingReplicationActions.clear();
105+
106+
cancelActions(toCancel, "Primary closed.");
107+
}
108+
109+
/**
 * Submits a task to the GENERIC executor that cancels every action in the given sets,
 * each with a new {@link IndexShardClosedException} carrying the given message.
 *
 * @param toCancel sets of actions to cancel
 * @param message  message for the exception passed to each cancelled action
 */
private void cancelActions(ArrayList<Set<RetryableAction<?>>> toCancel, String message) {
110+
threadPool.executor(ThreadPool.Names.GENERIC).execute(() -> toCancel.stream()
111+
.flatMap(Collection::stream)
112+
.forEach(action -> action.cancel(new IndexShardClosedException(shardId, message))));
113+
}
114+
}

server/src/main/java/org/elasticsearch/action/support/replication/ReplicationOperation.java

Lines changed: 83 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,23 @@
2727
import org.elasticsearch.action.ActionListener;
2828
import org.elasticsearch.action.UnavailableShardsException;
2929
import org.elasticsearch.action.support.ActiveShardCount;
30+
import org.elasticsearch.action.support.RetryableAction;
3031
import org.elasticsearch.action.support.TransportActions;
3132
import org.elasticsearch.cluster.action.shard.ShardStateAction;
3233
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
3334
import org.elasticsearch.cluster.routing.ShardRouting;
3435
import org.elasticsearch.common.Nullable;
36+
import org.elasticsearch.common.breaker.CircuitBreakingException;
3537
import org.elasticsearch.common.io.stream.StreamInput;
38+
import org.elasticsearch.common.unit.TimeValue;
39+
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
3640
import org.elasticsearch.index.seqno.SequenceNumbers;
3741
import org.elasticsearch.index.shard.ReplicationGroup;
3842
import org.elasticsearch.index.shard.ShardId;
3943
import org.elasticsearch.node.NodeClosedException;
4044
import org.elasticsearch.rest.RestStatus;
45+
import org.elasticsearch.threadpool.ThreadPool;
46+
import org.elasticsearch.transport.ConnectTransportException;
4147

4248
import java.io.IOException;
4349
import java.util.ArrayList;
@@ -54,6 +60,7 @@ public class ReplicationOperation<
5460
PrimaryResultT extends ReplicationOperation.PrimaryResult<ReplicaRequest>
5561
> {
5662
private final Logger logger;
63+
private final ThreadPool threadPool;
5764
private final Request request;
5865
private final String opType;
5966
private final AtomicInteger totalShards = new AtomicInteger();
@@ -72,6 +79,8 @@ public class ReplicationOperation<
7279
private final Primary<Request, ReplicaRequest, PrimaryResultT> primary;
7380
private final Replicas<ReplicaRequest> replicasProxy;
7481
private final AtomicBoolean finished = new AtomicBoolean();
82+
private final TimeValue initialRetryBackoffBound;
83+
private final TimeValue retryTimeout;
7584
private final long primaryTerm;
7685

7786
// exposed for tests
@@ -84,14 +93,18 @@ public class ReplicationOperation<
8493
public ReplicationOperation(Request request, Primary<Request, ReplicaRequest, PrimaryResultT> primary,
8594
ActionListener<PrimaryResultT> listener,
8695
Replicas<ReplicaRequest> replicas,
87-
Logger logger, String opType, long primaryTerm) {
96+
Logger logger, ThreadPool threadPool, String opType, long primaryTerm, TimeValue initialRetryBackoffBound,
97+
TimeValue retryTimeout) {
8898
this.replicasProxy = replicas;
8999
this.primary = primary;
90100
this.resultListener = listener;
91101
this.logger = logger;
102+
this.threadPool = threadPool;
92103
this.request = request;
93104
this.opType = opType;
94105
this.primaryTerm = primaryTerm;
106+
this.initialRetryBackoffBound = initialRetryBackoffBound;
107+
this.retryTimeout = retryTimeout;
95108
}
96109

97110
public void execute() throws Exception {
@@ -130,8 +143,9 @@ private void handlePrimaryResult(final PrimaryResultT primaryResult) {
130143
final long maxSeqNoOfUpdatesOrDeletes = primary.maxSeqNoOfUpdatesOrDeletes();
131144
assert maxSeqNoOfUpdatesOrDeletes != SequenceNumbers.UNASSIGNED_SEQ_NO : "seqno_of_updates still uninitialized";
132145
final ReplicationGroup replicationGroup = primary.getReplicationGroup();
146+
final PendingReplicationActions pendingReplicationActions = primary.getPendingReplicationActions();
133147
markUnavailableShardsAsStale(replicaRequest, replicationGroup);
134-
performOnReplicas(replicaRequest, globalCheckpoint, maxSeqNoOfUpdatesOrDeletes, replicationGroup);
148+
performOnReplicas(replicaRequest, globalCheckpoint, maxSeqNoOfUpdatesOrDeletes, replicationGroup, pendingReplicationActions);
135149
}
136150
primaryResult.runPostReplicationActions(new ActionListener<Void>() {
137151

@@ -165,7 +179,8 @@ private void markUnavailableShardsAsStale(ReplicaRequest replicaRequest, Replica
165179
}
166180

167181
private void performOnReplicas(final ReplicaRequest replicaRequest, final long globalCheckpoint,
168-
final long maxSeqNoOfUpdatesOrDeletes, final ReplicationGroup replicationGroup) {
182+
final long maxSeqNoOfUpdatesOrDeletes, final ReplicationGroup replicationGroup,
183+
final PendingReplicationActions pendingReplicationActions) {
169184
// for total stats, add number of unassigned shards and
170185
// number of initializing shards that are not ready yet to receive operations (recovery has not opened engine yet on the target)
171186
totalShards.addAndGet(replicationGroup.getSkippedShards().size());
@@ -174,52 +189,78 @@ private void performOnReplicas(final ReplicaRequest replicaRequest, final long g
174189

175190
for (final ShardRouting shard : replicationGroup.getReplicationTargets()) {
176191
if (shard.isSameAllocation(primaryRouting) == false) {
177-
performOnReplica(shard, replicaRequest, globalCheckpoint, maxSeqNoOfUpdatesOrDeletes);
192+
performOnReplica(shard, replicaRequest, globalCheckpoint, maxSeqNoOfUpdatesOrDeletes, pendingReplicationActions);
178193
}
179194
}
180195
}
181196

182197
private void performOnReplica(final ShardRouting shard, final ReplicaRequest replicaRequest,
183-
final long globalCheckpoint, final long maxSeqNoOfUpdatesOrDeletes) {
198+
final long globalCheckpoint, final long maxSeqNoOfUpdatesOrDeletes,
199+
final PendingReplicationActions pendingReplicationActions) {
184200
if (logger.isTraceEnabled()) {
185201
logger.trace("[{}] sending op [{}] to replica {} for request [{}]", shard.shardId(), opType, shard, replicaRequest);
186202
}
187-
188203
totalShards.incrementAndGet();
189204
pendingActions.incrementAndGet();
190-
replicasProxy.performOn(shard, replicaRequest, primaryTerm, globalCheckpoint, maxSeqNoOfUpdatesOrDeletes,
191-
new ActionListener<ReplicaResponse>() {
192-
@Override
193-
public void onResponse(ReplicaResponse response) {
194-
successfulShards.incrementAndGet();
195-
try {
196-
updateCheckPoints(shard, response::localCheckpoint, response::globalCheckpoint);
197-
} finally {
198-
decPendingAndFinishIfNeeded();
199-
}
205+
final ActionListener<ReplicaResponse> replicationListener = new ActionListener<ReplicaResponse>() {
206+
@Override
207+
public void onResponse(ReplicaResponse response) {
208+
successfulShards.incrementAndGet();
209+
try {
210+
updateCheckPoints(shard, response::localCheckpoint, response::globalCheckpoint);
211+
} finally {
212+
decPendingAndFinishIfNeeded();
200213
}
214+
}
201215

202-
@Override
203-
public void onFailure(Exception replicaException) {
204-
logger.trace(() -> new ParameterizedMessage(
205-
"[{}] failure while performing [{}] on replica {}, request [{}]",
206-
shard.shardId(), opType, shard, replicaRequest), replicaException);
207-
// Only report "critical" exceptions - TODO: Reach out to the master node to get the latest shard state then report.
208-
if (TransportActions.isShardNotAvailableException(replicaException) == false) {
209-
RestStatus restStatus = ExceptionsHelper.status(replicaException);
210-
shardReplicaFailures.add(new ReplicationResponse.ShardInfo.Failure(
211-
shard.shardId(), shard.currentNodeId(), replicaException, restStatus, false));
212-
}
213-
String message = String.format(Locale.ROOT, "failed to perform %s on replica %s", opType, shard);
214-
replicasProxy.failShardIfNeeded(shard, primaryTerm, message, replicaException,
215-
ActionListener.wrap(r -> decPendingAndFinishIfNeeded(), ReplicationOperation.this::onNoLongerPrimary));
216+
@Override
217+
public void onFailure(Exception replicaException) {
218+
logger.trace(() -> new ParameterizedMessage(
219+
"[{}] failure while performing [{}] on replica {}, request [{}]",
220+
shard.shardId(), opType, shard, replicaRequest), replicaException);
221+
// Only report "critical" exceptions - TODO: Reach out to the master node to get the latest shard state then report.
222+
if (TransportActions.isShardNotAvailableException(replicaException) == false) {
223+
RestStatus restStatus = ExceptionsHelper.status(replicaException);
224+
shardReplicaFailures.add(new ReplicationResponse.ShardInfo.Failure(
225+
shard.shardId(), shard.currentNodeId(), replicaException, restStatus, false));
216226
}
227+
String message = String.format(Locale.ROOT, "failed to perform %s on replica %s", opType, shard);
228+
replicasProxy.failShardIfNeeded(shard, primaryTerm, message, replicaException,
229+
ActionListener.wrap(r -> decPendingAndFinishIfNeeded(), ReplicationOperation.this::onNoLongerPrimary));
230+
}
217231

218-
@Override
219-
public String toString() {
220-
return "[" + replicaRequest + "][" + shard + "]";
221-
}
222-
});
232+
@Override
233+
public String toString() {
234+
return "[" + replicaRequest + "][" + shard + "]";
235+
}
236+
};
237+
238+
final String allocationId = shard.allocationId().getId();
239+
final RetryableAction<ReplicaResponse> replicationAction = new RetryableAction<ReplicaResponse>(logger, threadPool,
240+
initialRetryBackoffBound, retryTimeout, replicationListener) {
241+
242+
@Override
243+
public void tryAction(ActionListener<ReplicaResponse> listener) {
244+
replicasProxy.performOn(shard, replicaRequest, primaryTerm, globalCheckpoint, maxSeqNoOfUpdatesOrDeletes, listener);
245+
}
246+
247+
@Override
248+
public void onFinished() {
249+
super.onFinished();
250+
pendingReplicationActions.removeReplicationAction(allocationId, this);
251+
}
252+
253+
@Override
254+
public boolean shouldRetry(Exception e) {
255+
final Throwable cause = ExceptionsHelper.unwrapCause(e);
256+
return cause instanceof CircuitBreakingException ||
257+
cause instanceof EsRejectedExecutionException ||
258+
cause instanceof ConnectTransportException;
259+
}
260+
};
261+
262+
pendingReplicationActions.addPendingAction(allocationId, replicationAction);
263+
replicationAction.run();
223264
}
224265

225266
private void updateCheckPoints(ShardRouting shard, LongSupplier localCheckpointSupplier, LongSupplier globalCheckpointSupplier) {
@@ -396,6 +437,13 @@ public interface Primary<
396437
* @return the replication group
397438
*/
398439
ReplicationGroup getReplicationGroup();
440+
441+
/**
442+
* Returns the pending replication actions on the primary shard
443+
*
444+
* @return the pending replication actions
445+
*/
446+
PendingReplicationActions getPendingReplicationActions();
399447
}
400448

401449
/**

0 commit comments

Comments
 (0)