
Commit 15aa376

Reduce recovery time with compress or secure transport (#36981)
Today file chunks are sent sequentially, one by one, during peer recovery. This is a reasonable choice since the implementation is straightforward and recovery is network-bound most of the time. However, if the connection is encrypted, we might not be able to saturate the network pipe because encrypting/decrypting is CPU-bound rather than network-bound. With this commit, a source node can send multiple file chunks (default: 2) without waiting for acknowledgments from the target. Below are the benchmark results for PMC and NYC_taxis.

PMC (20.2 GB):

| Transport | Baseline | chunks=1 | chunks=2 | chunks=3 | chunks=4 |
| --------- | -------- | -------- | -------- | -------- | -------- |
| Plain     | 184s     | 137s     | 106s     | 105s     | 106s     |
| TLS       | 346s     | 294s     | 176s     | 153s     | 117s     |
| Compress  | 1556s    | 1407s    | 1193s    | 1183s    | 1211s    |

NYC_Taxis (38.6 GB):

| Transport | Baseline | chunks=1 | chunks=2 | chunks=3 | chunks=4 |
| --------- | -------- | -------- | -------- | -------- | -------- |
| Plain     | 321s     | 249s     | 191s     | *        | *        |
| TLS       | 618s     | 539s     | 323s     | 290s     | 213s     |
| Compress  | 2622s    | 2421s    | 2018s    | 2029s    | n/a      |

Relates #33844
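To illustrate the pipelining idea only (this is not the Elasticsearch code; all names here are hypothetical), the sketch below reads a file chunk by chunk and hands each chunk to an asynchronous sender while keeping at most `maxConcurrentChunks` requests unacknowledged. A `Semaphore` plays the role that the sequence-number tracker plays in the actual `sendFiles` implementation shown further down.

```java
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.concurrent.Semaphore;
import java.util.function.BiConsumer;

/** Simplified model of pipelined chunk sending with a bounded number of in-flight requests. */
class PipelinedChunkSender {
    private final int chunkSize;
    private final int maxConcurrentChunks; // e.g. 2, mirroring indices.recovery.max_concurrent_file_chunks

    PipelinedChunkSender(int chunkSize, int maxConcurrentChunks) {
        this.chunkSize = chunkSize;
        this.maxConcurrentChunks = maxConcurrentChunks;
    }

    /**
     * Reads the stream chunk by chunk and hands each chunk to an asynchronous sender.
     * The semaphore blocks the read loop once maxConcurrentChunks chunks are unacknowledged,
     * so the source keeps the pipe full without flooding the target.
     */
    void send(InputStream in, BiConsumer<byte[], Runnable> asyncSend) throws IOException, InterruptedException {
        final Semaphore inFlight = new Semaphore(maxConcurrentChunks);
        final byte[] buffer = new byte[chunkSize];
        int read;
        while ((read = in.read(buffer)) != -1) {
            inFlight.acquire();                         // wait if too many chunks are unacknowledged
            final byte[] chunk = Arrays.copyOf(buffer, read);
            asyncSend.accept(chunk, inFlight::release); // the ack callback frees one slot
        }
        inFlight.acquire(maxConcurrentChunks);          // wait for all outstanding chunks to be acknowledged
    }
}
```

With plain transport a single outstanding chunk already keeps the link busy; with TLS or compression the per-chunk CPU work on both ends leaves the link idle between chunks, which is the gap the extra in-flight requests fill.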
1 parent: 5c68338

14 files changed, +559 additions, -144 deletions

docs/reference/modules/indices/recovery.asciidoc

Lines changed: 9 additions & 0 deletions
@@ -20,5 +20,14 @@ peer recoveries:
 consume an excess of bandwidth (or other resources) which could destabilize
 the cluster. Defaults to `40mb`.
 
+`indices.recovery.max_concurrent_file_chunks`::
+Controls the number of file chunk requests that can be sent in parallel per recovery.
+As multiple recoveries are already running in parallel (controlled by
+cluster.routing.allocation.node_concurrent_recoveries), increasing this expert-level
+setting might only help in situations where peer recovery of a single shard is not
+reaching the total inbound and outbound peer recovery traffic as configured by
+indices.recovery.max_bytes_per_sec, but is CPU-bound instead, typically when using
+transport-level security or compression. Defaults to `2`.
+
 This setting can be dynamically updated on a live cluster with the
 <<cluster-update-settings,cluster-update-settings>> API.
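Since the setting is dynamic, it can be changed on a live cluster like any other cluster setting. A minimal sketch, assuming the high-level REST client of this era (client construction and error handling omitted; an equivalent `PUT _cluster/settings` request works just as well):

```java
import org.elasticsearch.action.admin.cluster.settings.ClusterUpdateSettingsRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.settings.Settings;

import java.io.IOException;

public class RecoveryTuning {
    /**
     * Raise the number of concurrent file chunks per recovery from the default of 2 to 4,
     * e.g. when recoveries over TLS or compressed transport are CPU-bound rather than network-bound.
     */
    public static void raiseConcurrentFileChunks(RestHighLevelClient client) throws IOException {
        ClusterUpdateSettingsRequest request = new ClusterUpdateSettingsRequest();
        request.transientSettings(Settings.builder()
            .put("indices.recovery.max_concurrent_file_chunks", 4)
            .build());
        client.cluster().putSettings(request, RequestOptions.DEFAULT);
    }
}
```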

server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java

Lines changed: 1 addition & 0 deletions
@@ -214,6 +214,7 @@ public void apply(Settings value, Settings current, Settings previous) {
         RecoverySettings.INDICES_RECOVERY_ACTIVITY_TIMEOUT_SETTING,
         RecoverySettings.INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING,
         RecoverySettings.INDICES_RECOVERY_INTERNAL_LONG_ACTION_TIMEOUT_SETTING,
+        RecoverySettings.INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING,
         ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING,
         ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING,
         ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_OUTGOING_RECOVERIES_SETTING,

server/src/main/java/org/elasticsearch/indices/recovery/PeerRecoverySourceService.java

Lines changed: 2 additions & 1 deletion
@@ -176,7 +176,8 @@ private RecoverySourceHandler createRecoverySourceHandler(StartRecoveryRequest r
         final RemoteRecoveryTargetHandler recoveryTarget =
             new RemoteRecoveryTargetHandler(request.recoveryId(), request.shardId(), transportService,
                 request.targetNode(), recoverySettings, throttleTime -> shard.recoveryStats().addThrottleTime(throttleTime));
-        handler = new RecoverySourceHandler(shard, recoveryTarget, request, recoverySettings.getChunkSize().bytesAsInt());
+        handler = new RecoverySourceHandler(shard, recoveryTarget, request,
+            Math.toIntExact(recoverySettings.getChunkSize().getBytes()), recoverySettings.getMaxConcurrentFileChunks());
         return handler;
     }
 }

server/src/main/java/org/elasticsearch/indices/recovery/PeerRecoveryTargetService.java

Lines changed: 8 additions & 7 deletions
@@ -29,6 +29,8 @@
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ElasticsearchTimeoutException;
 import org.elasticsearch.ExceptionsHelper;
+import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.action.support.HandledTransportAction;
 import org.elasticsearch.action.support.PlainActionFuture;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.ClusterStateObserver;
@@ -602,8 +604,7 @@ class FileChunkTransportRequestHandler implements TransportRequestHandler<Recove
 
         @Override
         public void messageReceived(final RecoveryFileChunkRequest request, TransportChannel channel, Task task) throws Exception {
-            try (RecoveryRef recoveryRef = onGoingRecoveries.getRecoverySafe(request.recoveryId(), request.shardId()
-            )) {
+            try (RecoveryRef recoveryRef = onGoingRecoveries.getRecoverySafe(request.recoveryId(), request.shardId())) {
                 final RecoveryTarget recoveryTarget = recoveryRef.target();
                 final RecoveryState.Index indexState = recoveryTarget.state().getIndex();
                 if (request.sourceThrottleTimeInNanos() != RecoveryState.Index.UNKNOWN) {
@@ -621,12 +622,12 @@ public void messageReceived(final RecoveryFileChunkRequest request, TransportCha
                         recoveryTarget.indexShard().recoveryStats().addThrottleTime(throttleTimeInNanos);
                     }
                 }
-
-                recoveryTarget.writeFileChunk(request.metadata(), request.position(), request.content(),
-                    request.lastChunk(), request.totalTranslogOps()
-                );
+                final ActionListener<TransportResponse> listener =
+                    new HandledTransportAction.ChannelActionListener<>(channel, Actions.FILE_CHUNK, request);
+                recoveryTarget.writeFileChunk(request.metadata(), request.position(), request.content(), request.lastChunk(),
+                    request.totalTranslogOps(),
+                    ActionListener.wrap(nullVal -> listener.onResponse(TransportResponse.Empty.INSTANCE), listener::onFailure));
             }
-            channel.sendResponse(TransportResponse.Empty.INSTANCE);
         }
     }
 
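The shape of this change on the target side — replacing a blocking write followed by an explicit response with a completion listener that sends the response — can be modeled in a few lines of plain Java. The names below (`Channel`, `writeChunkAsync`, `handleChunk`) are hypothetical stand-ins for the Elasticsearch types, shown only to illustrate the pattern:

```java
import java.util.function.Consumer;

class AsyncHandlerSketch {
    /** Minimal stand-in for a transport channel that can deliver exactly one reply. */
    interface Channel {
        void sendResponse(Object response);
        void sendFailure(Exception e);
    }

    /** Minimal stand-in for an asynchronous chunk write that reports completion via callbacks. */
    static void writeChunkAsync(byte[] chunk, Runnable onSuccess, Consumer<Exception> onFailure) {
        try {
            // ... persist the chunk somewhere ...
            onSuccess.run();
        } catch (Exception e) {
            onFailure.accept(e);
        }
    }

    /**
     * Before: write the chunk, then unconditionally reply on the channel.
     * After (shown here): reply only from the completion callbacks, so the acknowledgment
     * reaches the source when the write has actually finished, which is what lets the source
     * keep several chunk requests in flight safely.
     */
    static void handleChunk(byte[] chunk, Channel channel) {
        writeChunkAsync(chunk,
            () -> channel.sendResponse("ACK"),
            channel::sendFailure);
    }
}
```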

server/src/main/java/org/elasticsearch/indices/recovery/RecoverySettings.java

Lines changed: 17 additions & 0 deletions
@@ -39,6 +39,12 @@ public class RecoverySettings {
         Setting.byteSizeSetting("indices.recovery.max_bytes_per_sec", new ByteSizeValue(40, ByteSizeUnit.MB),
             Property.Dynamic, Property.NodeScope);
 
+    /**
+     * Controls the maximum number of file chunk requests that can be sent concurrently from the source node to the target node.
+     */
+    public static final Setting<Integer> INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING =
+        Setting.intSetting("indices.recovery.max_concurrent_file_chunks", 2, 1, 5, Property.Dynamic, Property.NodeScope);
+
     /**
      * how long to wait before retrying after issues cause by cluster state syncing between nodes
      * i.e., local node is not yet known on remote node, remote shard not yet started etc.
@@ -78,6 +84,7 @@ public class RecoverySettings {
     public static final ByteSizeValue DEFAULT_CHUNK_SIZE = new ByteSizeValue(512, ByteSizeUnit.KB);
 
     private volatile ByteSizeValue maxBytesPerSec;
+    private volatile int maxConcurrentFileChunks;
     private volatile SimpleRateLimiter rateLimiter;
     private volatile TimeValue retryDelayStateSync;
     private volatile TimeValue retryDelayNetwork;
@@ -89,6 +96,7 @@ public class RecoverySettings {
 
     public RecoverySettings(Settings settings, ClusterSettings clusterSettings) {
         this.retryDelayStateSync = INDICES_RECOVERY_RETRY_DELAY_STATE_SYNC_SETTING.get(settings);
+        this.maxConcurrentFileChunks = INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING.get(settings);
         // doesn't have to be fast as nodes are reconnected every 10s by default (see InternalClusterService.ReconnectToNodes)
         // and we want to give the master time to remove a faulty node
         this.retryDelayNetwork = INDICES_RECOVERY_RETRY_DELAY_NETWORK_SETTING.get(settings);
@@ -108,6 +116,7 @@ public RecoverySettings(Settings settings, ClusterSettings clusterSettings) {
         logger.debug("using max_bytes_per_sec[{}]", maxBytesPerSec);
 
         clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING, this::setMaxBytesPerSec);
+        clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING, this::setMaxConcurrentFileChunks);
         clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_RETRY_DELAY_STATE_SYNC_SETTING, this::setRetryDelayStateSync);
         clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_RETRY_DELAY_NETWORK_SETTING, this::setRetryDelayNetwork);
         clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING, this::setInternalActionTimeout);
@@ -180,4 +189,12 @@ private void setMaxBytesPerSec(ByteSizeValue maxBytesPerSec) {
             rateLimiter = new SimpleRateLimiter(maxBytesPerSec.getMbFrac());
         }
     }
+
+    public int getMaxConcurrentFileChunks() {
+        return maxConcurrentFileChunks;
+    }
+
+    private void setMaxConcurrentFileChunks(int maxConcurrentFileChunks) {
+        this.maxConcurrentFileChunks = maxConcurrentFileChunks;
+    }
 }
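Because the setting is declared with a minimum of 1 and a maximum of 5, values outside that range are rejected when the setting is parsed. A small illustration of how such a bounded `Setting` behaves, written against the API used in the diff above (exact exception messages may differ):

```java
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Setting.Property;
import org.elasticsearch.common.settings.Settings;

public class MaxConcurrentFileChunksExample {
    private static final Setting<Integer> MAX_CONCURRENT_FILE_CHUNKS =
        Setting.intSetting("indices.recovery.max_concurrent_file_chunks", 2, 1, 5, Property.Dynamic, Property.NodeScope);

    public static void main(String[] args) {
        // Unset: falls back to the default of 2.
        System.out.println(MAX_CONCURRENT_FILE_CHUNKS.get(Settings.EMPTY)); // 2

        // Explicitly configured within the allowed [1, 5] range.
        Settings configured = Settings.builder().put("indices.recovery.max_concurrent_file_chunks", 4).build();
        System.out.println(MAX_CONCURRENT_FILE_CHUNKS.get(configured)); // 4

        // Out of range: the bounded intSetting rejects the value.
        Settings tooHigh = Settings.builder().put("indices.recovery.max_concurrent_file_chunks", 8).build();
        try {
            MAX_CONCURRENT_FILE_CHUNKS.get(tooHigh);
        } catch (IllegalArgumentException e) {
            System.out.println("rejected: " + e.getMessage());
        }
    }
}
```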

server/src/main/java/org/elasticsearch/indices/recovery/RecoverySourceHandler.java

Lines changed: 71 additions & 72 deletions
@@ -36,6 +36,7 @@
 import org.elasticsearch.cluster.routing.ShardRouting;
 import org.elasticsearch.common.StopWatch;
 import org.elasticsearch.common.bytes.BytesArray;
+import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.common.lease.Releasable;
 import org.elasticsearch.common.logging.Loggers;
 import org.elasticsearch.common.lucene.store.InputStreamIndexInput;
@@ -44,7 +45,6 @@
 import org.elasticsearch.common.util.CancellableThreads;
 import org.elasticsearch.common.util.concurrent.FutureUtils;
 import org.elasticsearch.core.internal.io.IOUtils;
-import org.elasticsearch.core.internal.io.Streams;
 import org.elasticsearch.index.engine.Engine;
 import org.elasticsearch.index.engine.RecoveryEngineException;
 import org.elasticsearch.index.seqno.LocalCheckpointTracker;
@@ -59,10 +59,9 @@
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.transport.RemoteTransportException;
 
-import java.io.BufferedOutputStream;
 import java.io.Closeable;
 import java.io.IOException;
-import java.io.OutputStream;
+import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
@@ -71,10 +70,12 @@
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.concurrent.atomic.AtomicLong;
-import java.util.function.Function;
+import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.Supplier;
 import java.util.stream.StreamSupport;
 
+import static org.elasticsearch.index.seqno.SequenceNumbers.NO_OPS_PERFORMED;
+
 /**
  * RecoverySourceHandler handles the three phases of shard recovery, which is
  * everything relating to copying the segment files as well as sending translog
@@ -96,17 +97,19 @@ public class RecoverySourceHandler {
     private final StartRecoveryRequest request;
     private final int chunkSizeInBytes;
     private final RecoveryTargetHandler recoveryTarget;
+    private final int maxConcurrentFileChunks;
     private final CancellableThreads cancellableThreads = new CancellableThreads();
 
-    public RecoverySourceHandler(final IndexShard shard, RecoveryTargetHandler recoveryTarget,
-                                 final StartRecoveryRequest request,
-                                 final int fileChunkSizeInBytes) {
+    public RecoverySourceHandler(final IndexShard shard, RecoveryTargetHandler recoveryTarget, final StartRecoveryRequest request,
+                                 final int fileChunkSizeInBytes, final int maxConcurrentFileChunks) {
         this.shard = shard;
        this.recoveryTarget = recoveryTarget;
         this.request = request;
         this.shardId = this.request.shardId().id();
         this.logger = Loggers.getLogger(getClass(), request.shardId(), "recover to " + request.targetNode().getName());
         this.chunkSizeInBytes = fileChunkSizeInBytes;
+        // if the target is on an old version, it won't be able to handle out-of-order file chunks.
+        this.maxConcurrentFileChunks = request.targetNode().getVersion().onOrAfter(Version.V_7_0_0) ? maxConcurrentFileChunks : 1;
     }
 
     public StartRecoveryRequest getRequest() {
@@ -407,10 +410,7 @@ public SendFileResult phase1(final IndexCommit snapshot, final Supplier<Integer>
             phase1ExistingFileNames.size(), new ByteSizeValue(existingTotalSize));
         cancellableThreads.execute(() -> recoveryTarget.receiveFileInfo(
             phase1FileNames, phase1FileSizes, phase1ExistingFileNames, phase1ExistingFileSizes, translogOps.get()));
-        // How many bytes we've copied since we last called RateLimiter.pause
-        final Function<StoreFileMetaData, OutputStream> outputStreamFactories =
-            md -> new BufferedOutputStream(new RecoveryOutputStream(md, translogOps), chunkSizeInBytes);
-        sendFiles(store, phase1Files.toArray(new StoreFileMetaData[phase1Files.size()]), outputStreamFactories);
+        sendFiles(store, phase1Files.toArray(new StoreFileMetaData[0]), translogOps);
         // Send the CLEAN_FILES request, which takes all of the files that
         // were transferred and renames them from their temporary file
         // names to the actual file names. It also writes checksums for
@@ -649,73 +649,72 @@ public String toString() {
             '}';
     }
 
-
-    final class RecoveryOutputStream extends OutputStream {
-        private final StoreFileMetaData md;
-        private final Supplier<Integer> translogOps;
-        private long position = 0;
-
-        RecoveryOutputStream(StoreFileMetaData md, Supplier<Integer> translogOps) {
-            this.md = md;
-            this.translogOps = translogOps;
-        }
-
-        @Override
-        public void write(int b) throws IOException {
-            throw new UnsupportedOperationException("we can't send single bytes over the wire");
+    void sendFiles(Store store, StoreFileMetaData[] files, Supplier<Integer> translogOps) throws Exception {
+        ArrayUtil.timSort(files, Comparator.comparingLong(StoreFileMetaData::length)); // send smallest first
+        final LocalCheckpointTracker requestSeqIdTracker = new LocalCheckpointTracker(NO_OPS_PERFORMED, NO_OPS_PERFORMED);
+        final AtomicReference<Tuple<StoreFileMetaData, Exception>> error = new AtomicReference<>();
+        final byte[] buffer = new byte[chunkSizeInBytes];
+        for (final StoreFileMetaData md : files) {
+            if (error.get() != null) {
+                break;
+            }
+            try (IndexInput indexInput = store.directory().openInput(md.name(), IOContext.READONCE);
+                 InputStream in = new InputStreamIndexInput(indexInput, md.length())) {
+                long position = 0;
+                int bytesRead;
+                while ((bytesRead = in.read(buffer, 0, buffer.length)) != -1) {
+                    final BytesArray content = new BytesArray(buffer, 0, bytesRead);
+                    final boolean lastChunk = position + content.length() == md.length();
+                    final long requestSeqId = requestSeqIdTracker.generateSeqNo();
+                    cancellableThreads.execute(() -> requestSeqIdTracker.waitForOpsToComplete(requestSeqId - maxConcurrentFileChunks));
+                    cancellableThreads.checkForCancel();
+                    if (error.get() != null) {
+                        break;
+                    }
+                    final long requestFilePosition = position;
+                    cancellableThreads.executeIO(() ->
+                        recoveryTarget.writeFileChunk(md, requestFilePosition, content, lastChunk, translogOps.get(),
+                            ActionListener.wrap(
+                                r -> requestSeqIdTracker.markSeqNoAsCompleted(requestSeqId),
+                                e -> {
+                                    error.compareAndSet(null, Tuple.tuple(md, e));
+                                    requestSeqIdTracker.markSeqNoAsCompleted(requestSeqId);
+                                }
+                            )));
+                    position += content.length();
+                }
+            } catch (Exception e) {
+                error.compareAndSet(null, Tuple.tuple(md, e));
+                break;
+            }
         }
-
-        @Override
-        public void write(byte[] b, int offset, int length) throws IOException {
-            sendNextChunk(position, new BytesArray(b, offset, length), md.length() == position + length);
-            position += length;
-            assert md.length() >= position : "length: " + md.length() + " but positions was: " + position;
+        // When we terminate exceptionally, we don't wait for the outstanding requests as we don't use their results anyway.
+        // This allows us to end quickly and eliminate the complexity of handling requestSeqIds in case of error.
+        if (error.get() == null) {
+            cancellableThreads.execute(() -> requestSeqIdTracker.waitForOpsToComplete(requestSeqIdTracker.getMaxSeqNo()));
         }
-
-        private void sendNextChunk(long position, BytesArray content, boolean lastChunk) throws IOException {
-            // Actually send the file chunk to the target node, waiting for it to complete
-            cancellableThreads.executeIO(() ->
-                recoveryTarget.writeFileChunk(md, position, content, lastChunk, translogOps.get())
-            );
-            if (shard.state() == IndexShardState.CLOSED) { // check if the shard got closed on us
-                throw new IndexShardClosedException(request.shardId());
-            }
+        if (error.get() != null) {
+            handleErrorOnSendFiles(store, error.get().v1(), error.get().v2());
         }
     }
 
-    void sendFiles(Store store, StoreFileMetaData[] files, Function<StoreFileMetaData, OutputStream> outputStreamFactory) throws Exception {
-        store.incRef();
-        try {
-            ArrayUtil.timSort(files, Comparator.comparingLong(StoreFileMetaData::length)); // send smallest first
-            for (int i = 0; i < files.length; i++) {
-                final StoreFileMetaData md = files[i];
-                try (IndexInput indexInput = store.directory().openInput(md.name(), IOContext.READONCE)) {
-                    // it's fine that we are only having the indexInput in the try/with block. The copy methods handles
-                    // exceptions during close correctly and doesn't hide the original exception.
-                    Streams.copy(new InputStreamIndexInput(indexInput, md.length()), outputStreamFactory.apply(md));
-                } catch (Exception e) {
-                    final IOException corruptIndexException;
-                    if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(e)) != null) {
-                        if (store.checkIntegrityNoException(md) == false) { // we are corrupted on the primary -- fail!
-                            logger.warn("{} Corrupted file detected {} checksum mismatch", shardId, md);
-                            failEngine(corruptIndexException);
-                            throw corruptIndexException;
-                        } else { // corruption has happened on the way to replica
-                            RemoteTransportException exception = new RemoteTransportException("File corruption occurred on recovery but " +
-                                "checksums are ok", null);
-                            exception.addSuppressed(e);
-                            logger.warn(() -> new ParameterizedMessage(
-                                "{} Remote file corruption on node {}, recovering {}. local checksum OK",
-                                shardId, request.targetNode(), md), corruptIndexException);
-                            throw exception;
-                        }
-                    } else {
-                        throw e;
-                    }
-                }
+    private void handleErrorOnSendFiles(Store store, StoreFileMetaData md, Exception e) throws Exception {
+        final IOException corruptIndexException;
+        if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(e)) != null) {
+            if (store.checkIntegrityNoException(md) == false) { // we are corrupted on the primary -- fail!
+                logger.warn("{} Corrupted file detected {} checksum mismatch", shardId, md);
+                failEngine(corruptIndexException);
+                throw corruptIndexException;
+            } else { // corruption has happened on the way to replica
+                RemoteTransportException exception = new RemoteTransportException(
+                    "File corruption occurred on recovery but checksums are ok", null);
+                exception.addSuppressed(e);
+                logger.warn(() -> new ParameterizedMessage("{} Remote file corruption on node {}, recovering {}. local checksum OK",
+                    shardId, request.targetNode(), md), corruptIndexException);
+                throw exception;
             }
-        } finally {
-            store.decRef();
+        } else {
+            throw e;
         }
     }
 
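As a closing note on the new `sendFiles` above: the `LocalCheckpointTracker` is used purely as a flow-control window. Each chunk request takes a sequence number, and before sending chunk `n` the loop waits until chunk `n - maxConcurrentFileChunks` has been acknowledged, so at most `maxConcurrentFileChunks` requests are ever in flight. A stripped-down, self-contained model of that window (hypothetical names, no Elasticsearch dependencies) might look like this:

```java
import java.util.concurrent.atomic.AtomicLong;

/** Minimal model of the sequence-number window used by sendFiles to bound in-flight chunk requests. */
class ChunkWindowTracker {
    private final AtomicLong nextSeqNo = new AtomicLong(0);
    private long completedUpTo = -1; // highest seqNo N such that all seqNos <= N have completed
    private final boolean[] completed = new boolean[1 << 16]; // demo-sized bookkeeping, not production code

    long generateSeqNo() {
        return nextSeqNo.getAndIncrement();
    }

    synchronized void markCompleted(long seqNo) {
        completed[(int) seqNo] = true;
        // advance the contiguous "checkpoint", much like LocalCheckpointTracker does
        while (completedUpTo + 1 < completed.length && completed[(int) (completedUpTo + 1)]) {
            completedUpTo++;
        }
        notifyAll();
    }

    /** Blocks until every seqNo up to and including the argument has completed; negative values return immediately. */
    synchronized void waitForCompleted(long seqNo) throws InterruptedException {
        while (completedUpTo < seqNo) {
            wait();
        }
    }
}
```

Worked through with `maxConcurrentFileChunks = 2`: the chunk with seqNo 5 waits for `waitForCompleted(3)`, i.e. it may only be sent once chunks 0 through 3 have been acknowledged, leaving chunks 4 and 5 as the two requests in flight.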