Create peer-recovery retention leases #43190
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,6 +24,7 @@ | |
| import org.elasticsearch.Version; | ||
| import org.elasticsearch.action.ActionListener; | ||
| import org.elasticsearch.action.support.replication.ReplicationResponse; | ||
| import org.elasticsearch.cluster.metadata.IndexMetaData; | ||
| import org.elasticsearch.cluster.routing.AllocationId; | ||
| import org.elasticsearch.cluster.routing.IndexShardRoutingTable; | ||
| import org.elasticsearch.cluster.routing.ShardRouting; | ||
|
|
@@ -217,10 +218,22 @@ public synchronized Tuple<Boolean, RetentionLeases> getRetentionLeases(final boo | |
| // the primary calculates the non-expired retention leases and syncs them to replicas | ||
| final long currentTimeMillis = currentTimeMillisSupplier.getAsLong(); | ||
| final long retentionLeaseMillis = indexSettings.getRetentionLeaseMillis(); | ||
| final Set<String> leaseIdsForCurrentPeers | ||
| = routingTable.assignedShards().stream().map(ReplicationTracker::getPeerRecoveryRetentionLeaseId).collect(Collectors.toSet()); | ||
| final Map<Boolean, List<RetentionLease>> partitionByExpiration = retentionLeases | ||
| .leases() | ||
| .stream() | ||
| .collect(Collectors.groupingBy(lease -> currentTimeMillis - lease.timestamp() > retentionLeaseMillis)); | ||
| .collect(Collectors.groupingBy(lease -> { | ||
| if (lease.source().equals(PEER_RECOVERY_RETENTION_LEASE_SOURCE)) { | ||
|
Member
Can we make this check a method of RetentionLease?
Contributor
Author
As in #43190 (comment) I don't think …
||
| if (leaseIdsForCurrentPeers.contains(lease.id())) { | ||
| return false; | ||
| } | ||
| if (routingTable.allShardsStarted()) { | ||
| return true; | ||
| } | ||
| } | ||
| return currentTimeMillis - lease.timestamp() > retentionLeaseMillis; | ||
| })); | ||
| final Collection<RetentionLease> expiredLeases = partitionByExpiration.get(true); | ||
| if (expiredLeases == null) { | ||
| // early out as no retention leases have expired | ||
|
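The classifier inside the groupingBy above is easier to follow when pulled out into a standalone predicate. The following sketch restates the same decision logic for readability only; it is not code from this PR, and the method name isExpired is hypothetical:

```java
// Hypothetical restatement of the groupingBy classifier above (true == expired).
// A peer-recovery lease held by a currently assigned copy never expires; once every
// shard copy in the routing table has started, a peer-recovery lease whose holder is
// no longer assigned is treated as expired immediately; all other leases fall back
// to the usual timestamp-based expiry check.
private boolean isExpired(RetentionLease lease, long currentTimeMillis, long retentionLeaseMillis,
                          Set<String> leaseIdsForCurrentPeers, IndexShardRoutingTable routingTable) {
    if (lease.source().equals(PEER_RECOVERY_RETENTION_LEASE_SOURCE)) {
        if (leaseIdsForCurrentPeers.contains(lease.id())) {
            return false; // the holding copy is still part of the replication group
        }
        if (routingTable.allShardsStarted()) {
            return true; // the holder is gone and no recoveries are pending
        }
    }
    return currentTimeMillis - lease.timestamp() > retentionLeaseMillis;
}
```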
|
@@ -242,7 +255,7 @@ public synchronized Tuple<Boolean, RetentionLeases> getRetentionLeases(final boo | |
| * @param source the source of the retention lease | ||
| * @param listener the callback when the retention lease is successfully added and synced to replicas | ||
| * @return the new retention lease | ||
| * @throws IllegalArgumentException if the specified retention lease already exists | ||
| * @throws RetentionLeaseAlreadyExistsException if the specified retention lease already exists | ||
| */ | ||
| public RetentionLease addRetentionLease( | ||
| final String id, | ||
|
|
@@ -253,30 +266,46 @@ public RetentionLease addRetentionLease( | |
| final RetentionLease retentionLease; | ||
| final RetentionLeases currentRetentionLeases; | ||
| synchronized (this) { | ||
| assert primaryMode; | ||
| if (retentionLeases.contains(id)) { | ||
| throw new RetentionLeaseAlreadyExistsException(id); | ||
| } | ||
| retentionLease = new RetentionLease(id, retainingSequenceNumber, currentTimeMillisSupplier.getAsLong(), source); | ||
| logger.debug("adding new retention lease [{}] to current retention leases [{}]", retentionLease, retentionLeases); | ||
| retentionLeases = new RetentionLeases( | ||
| operationPrimaryTerm, | ||
| retentionLeases.version() + 1, | ||
| Stream.concat(retentionLeases.leases().stream(), Stream.of(retentionLease)).collect(Collectors.toList())); | ||
| retentionLease = innerAddRetentionLease(id, retainingSequenceNumber, source); | ||
| currentRetentionLeases = retentionLeases; | ||
| } | ||
| onSyncRetentionLeases.accept(currentRetentionLeases, listener); | ||
| return retentionLease; | ||
| } | ||
|
|
||
| /** | ||
| * Adds a new retention lease, but does not synchronise it with the rest of the replication group. | ||
| * | ||
| * @param id the identifier of the retention lease | ||
| * @param retainingSequenceNumber the retaining sequence number | ||
| * @param source the source of the retention lease | ||
| * @return the new retention lease | ||
| * @throws RetentionLeaseAlreadyExistsException if the specified retention lease already exists | ||
| */ | ||
| private RetentionLease innerAddRetentionLease(String id, long retainingSequenceNumber, String source) { | ||
| assert Thread.holdsLock(this); | ||
| assert primaryMode : id + "/" + retainingSequenceNumber + "/" + source; | ||
| if (retentionLeases.contains(id)) { | ||
| throw new RetentionLeaseAlreadyExistsException(id); | ||
| } | ||
| final RetentionLease retentionLease | ||
| = new RetentionLease(id, retainingSequenceNumber, currentTimeMillisSupplier.getAsLong(), source); | ||
| logger.debug("adding new retention lease [{}] to current retention leases [{}]", retentionLease, retentionLeases); | ||
| retentionLeases = new RetentionLeases( | ||
| operationPrimaryTerm, | ||
| retentionLeases.version() + 1, | ||
| Stream.concat(retentionLeases.leases().stream(), Stream.of(retentionLease)).collect(Collectors.toList())); | ||
| return retentionLease; | ||
| } | ||
|
|
||
| /** | ||
| * Renews an existing retention lease. | ||
| * | ||
| * @param id the identifier of the retention lease | ||
| * @param retainingSequenceNumber the retaining sequence number | ||
| * @param source the source of the retention lease | ||
| * @return the renewed retention lease | ||
| * @throws IllegalArgumentException if the specified retention lease does not exist | ||
| * @throws RetentionLeaseNotFoundException if the specified retention lease does not exist | ||
| */ | ||
| public synchronized RetentionLease renewRetentionLease(final String id, final long retainingSequenceNumber, final String source) { | ||
| assert primaryMode; | ||
|
|
@@ -390,6 +419,51 @@ public boolean assertRetentionLeasesPersisted(final Path path) throws IOExceptio | |
| return true; | ||
| } | ||
|
|
||
|
|
||
| /** | ||
| * Retention leases for peer recovery have source {@link ReplicationTracker#PEER_RECOVERY_RETENTION_LEASE_SOURCE}, a lease ID | ||
| * containing the persistent node ID calculated by {@link ReplicationTracker#getPeerRecoveryRetentionLeaseId}, and retain operations | ||
| * with sequence numbers strictly greater than the given global checkpoint. | ||
| */ | ||
| public void addPeerRecoveryRetentionLease(String nodeId, long globalCheckpoint, ActionListener<ReplicationResponse> listener) { | ||
|
Member
Can we remove this method and prepare these parameters in IndexShard instead?
Contributor
Author
We could but I think it's appropriate to do this here given that you need to do this when working with the …
||
| addRetentionLease(getPeerRecoveryRetentionLeaseId(nodeId), globalCheckpoint + 1, PEER_RECOVERY_RETENTION_LEASE_SOURCE, listener); | ||
| } | ||
|
|
||
| /** | ||
| * Source for peer recovery retention leases; see {@link ReplicationTracker#addPeerRecoveryRetentionLease}. | ||
| */ | ||
| public static final String PEER_RECOVERY_RETENTION_LEASE_SOURCE = "peer recovery"; | ||
|
Member
How about moving this constant and two related static methods to RetentionLease class instead?
Contributor
Author
I don't think …
||
|
|
||
| /** | ||
| * Id for a peer recovery retention lease for the given node. See {@link ReplicationTracker#addPeerRecoveryRetentionLease}. | ||
| */ | ||
| static String getPeerRecoveryRetentionLeaseId(String nodeId) { | ||
| return "peer_recovery/" + nodeId; | ||
| } | ||
|
|
||
| /** | ||
| * Id for a peer recovery retention lease for the given {@link ShardRouting}. | ||
| * See {@link ReplicationTracker#addPeerRecoveryRetentionLease}. | ||
| */ | ||
| public static String getPeerRecoveryRetentionLeaseId(ShardRouting shardRouting) { | ||
| return getPeerRecoveryRetentionLeaseId(shardRouting.currentNodeId()); | ||
| } | ||
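As a small usage illustration of the id scheme above (a sketch, not part of this diff):

```java
// Sketch only: the String overload is package-private, so this would live in the same
// package. The ShardRouting overload simply delegates to shardRouting.currentNodeId().
String leaseId = ReplicationTracker.getPeerRecoveryRetentionLeaseId("node-1");
assert "peer_recovery/node-1".equals(leaseId);
// A lease carrying this id is expected to have source
// PEER_RECOVERY_RETENTION_LEASE_SOURCE, i.e. "peer recovery".
```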
|
|
||
| /** | ||
| * Advance the peer-recovery retention lease for all tracked shard copies, for use in tests until advancing these leases is done | ||
| * properly. TODO remove this. | ||
| */ | ||
| public synchronized void advancePeerRecoveryRetentionLeasesToGlobalCheckpoints() { | ||
|
Contributor
why are we not automatically advancing the leases when the global checkpoints advance? Is it because it breaks some tests right now?
Contributor
Author
Mainly because I think this change is already large enough without this feature too, and we haven't settled for definite on whether these leases should be GCP-based. Advancing the leases is needed in the tests in very few places, but I haven't tried advancing them more eagerly.
||
| assert primaryMode; | ||
| for (ShardRouting shardRouting : routingTable) { | ||
| if (shardRouting.assignedToNode()) { | ||
| final CheckpointState checkpointState = checkpoints.get(shardRouting.allocationId().getId()); | ||
| renewRetentionLease(getPeerRecoveryRetentionLeaseId(shardRouting), checkpointState.globalCheckpoint + 1, | ||
| PEER_RECOVERY_RETENTION_LEASE_SOURCE); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| public static class CheckpointState implements Writeable { | ||
|
|
||
| /** | ||
|
|
@@ -616,6 +690,23 @@ private boolean invariant() { | |
| assert checkpoints.get(aId) != null : "aId [" + aId + "] is pending in sync but isn't tracked"; | ||
| } | ||
|
|
||
| if (primaryMode | ||
| && indexSettings.isSoftDeleteEnabled() | ||
| && indexSettings.getIndexMetaData().getState() == IndexMetaData.State.OPEN | ||
| && indexSettings.getIndexVersionCreated().onOrAfter(Version.V_8_0_0)) { | ||
| // all tracked shard copies have a corresponding peer-recovery retention lease | ||
| for (final ShardRouting shardRouting : routingTable.assignedShards()) { | ||
DaveCTurner marked this conversation as resolved.
|
||
| if (checkpoints.get(shardRouting.allocationId().getId()).tracked) { | ||
| assert retentionLeases.contains(getPeerRecoveryRetentionLeaseId(shardRouting)) | ||
| : "no retention lease for tracked shard [" + shardRouting + "] in " + retentionLeases; | ||
| assert PEER_RECOVERY_RETENTION_LEASE_SOURCE.equals( | ||
| retentionLeases.get(getPeerRecoveryRetentionLeaseId(shardRouting)).source()) | ||
| : "incorrect source [" + retentionLeases.get(getPeerRecoveryRetentionLeaseId(shardRouting)).source() | ||
| + "] for [" + shardRouting + "] in " + retentionLeases; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
|
|
@@ -669,6 +760,7 @@ public ReplicationTracker( | |
| this.pendingInSync = new HashSet<>(); | ||
| this.routingTable = null; | ||
| this.replicationGroup = null; | ||
| assert Version.V_EMPTY.equals(indexSettings.getIndexVersionCreated()) == false; | ||
|
Contributor
is this to catch issues where tests have not been properly set up?
Contributor
Author
Yes, if this is unset then the crucial assertions are skipped, which is Very Bad™.
||
| assert invariant(); | ||
| } | ||
|
|
||
|
|
@@ -772,6 +864,31 @@ public synchronized void activatePrimaryMode(final long localCheckpoint) { | |
| primaryMode = true; | ||
| updateLocalCheckpoint(shardAllocationId, checkpoints.get(shardAllocationId), localCheckpoint); | ||
| updateGlobalCheckpointOnPrimary(); | ||
|
|
||
| if (indexSettings.isSoftDeleteEnabled()) { | ||
| final ShardRouting primaryShard = routingTable.primaryShard(); | ||
| final String leaseId = getPeerRecoveryRetentionLeaseId(primaryShard); | ||
| if (retentionLeases.get(leaseId) == null) { | ||
| /* | ||
| * We might have got here here via a rolling upgrade from an older version that doesn't create peer recovery retention | ||
| * leases for every shard copy, but in this case we do not expect any leases to exist. | ||
|
Contributor
this might also be a recovery from store?
Contributor
Author
This comment is explaining the following: in a primary relocation the new primary, being a tracked replica, already has a lease.
Member
Did you mean there will be another change here? Why don't we do it now ;). The relocating target should not have a lease if the old primary was on an old version.
Contributor
Author
This change is already a substantial +665/-185, and I think it's unwise to bring BWC into scope at this time. Note that this PR is against a feature branch, not …
||
| */ | ||
| if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_8_0_0)) { | ||
| // We are starting up the whole replication group from scratch: if we were not (i.e. this is a replica promotion) then | ||
| // this copy must already be in-sync and active and therefore holds a retention lease for itself. | ||
| assert routingTable.activeShards().equals(Collections.singletonList(primaryShard)) : routingTable.activeShards(); | ||
| assert primaryShard.allocationId().getId().equals(shardAllocationId) | ||
| : routingTable.activeShards() + " vs " + shardAllocationId; | ||
| assert replicationGroup.getReplicationTargets().equals(Collections.singletonList(primaryShard)); | ||
|
|
||
| // Safe to call innerAddRetentionLease() without a subsequent sync since there are no other members of this replication | ||
|
Contributor
we don't need a sync, but why not do one anyway? This will persist the leases locally on disk.
Contributor
Author
Doing a sync on the cluster applier thread isn't possible as things stand because of the reroute phase; it also would mean waiting for the sync to return, which is something we try and avoid on the applier thread. We could explicitly persist the leases when calling …
||
| // group. | ||
| innerAddRetentionLease(leaseId, Math.max(0L, checkpoints.get(shardAllocationId).globalCheckpoint + 1), | ||
| PEER_RECOVERY_RETENTION_LEASE_SOURCE); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| assert invariant(); | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,6 +32,8 @@ | |
| import org.elasticsearch.ExceptionsHelper; | ||
| import org.elasticsearch.action.ActionListener; | ||
| import org.elasticsearch.action.StepListener; | ||
| import org.elasticsearch.action.support.replication.ReplicationResponse; | ||
| import org.elasticsearch.cluster.metadata.IndexMetaData; | ||
| import org.elasticsearch.cluster.routing.IndexShardRoutingTable; | ||
| import org.elasticsearch.cluster.routing.ShardRouting; | ||
| import org.elasticsearch.common.CheckedSupplier; | ||
|
|
@@ -49,6 +51,7 @@ | |
| import org.elasticsearch.index.engine.Engine; | ||
| import org.elasticsearch.index.engine.RecoveryEngineException; | ||
| import org.elasticsearch.index.seqno.LocalCheckpointTracker; | ||
| import org.elasticsearch.index.seqno.RetentionLeaseAlreadyExistsException; | ||
| import org.elasticsearch.index.seqno.RetentionLeases; | ||
| import org.elasticsearch.index.seqno.SequenceNumbers; | ||
| import org.elasticsearch.index.shard.IndexShard; | ||
|
|
@@ -188,10 +191,30 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) { | |
| } | ||
| assert startingSeqNo >= 0 : "startingSeqNo must be non negative. got: " + startingSeqNo; | ||
|
|
||
| final StepListener<ReplicationResponse> establishRetentionLeaseStep = new StepListener<>(); | ||
| if (shard.indexSettings().isSoftDeleteEnabled() | ||
| && shard.indexSettings().getIndexMetaData().getState() != IndexMetaData.State.CLOSE) { | ||
| runUnderPrimaryPermit(() -> { | ||
| try { | ||
| // conservative estimate of the GCP for creating the lease. TODO use the actual GCP once it's appropriate to do so | ||
| final long globalCheckpoint = startingSeqNo - 1; | ||
| // blindly create the lease. TODO integrate this with the recovery process | ||
|
Contributor
I'm not sure what you mean by "blindly" here and what integration you're referring to.
Contributor
Author
With this change, retention leases have no impact on the recovery process, nor do we make any attempt to add a lease for only the history we've any hope of retaining. E.g. with a file-based recovery we add a lease for all history. In due course the recovery process will be made more dependent on leases.
||
| shard.addPeerRecoveryRetentionLease(request.targetNode().getId(), globalCheckpoint, establishRetentionLeaseStep); | ||
| } catch (RetentionLeaseAlreadyExistsException e) { | ||
| logger.debug("peer-recovery retention lease already exists", e); | ||
| establishRetentionLeaseStep.onResponse(null); | ||
| } | ||
| }, shardId + " establishing retention lease for [" + request.targetAllocationId() + "]", shard, cancellableThreads, logger); | ||
| } else { | ||
| establishRetentionLeaseStep.onResponse(null); | ||
| } | ||
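To make the "conservative estimate" above concrete: addPeerRecoveryRetentionLease (shown earlier in this diff) creates a lease retaining operations with sequence numbers strictly greater than the global checkpoint it is given, so passing startingSeqNo - 1 retains everything from startingSeqNo upwards. A sketch of the arithmetic with illustrative values only:

```java
// Illustrative values only; not code from this PR.
long startingSeqNo = 42L;                            // first operation the target still needs
long globalCheckpointEstimate = startingSeqNo - 1L;  // conservative GCP handed to the lease
long retainingSeqNo = globalCheckpointEstimate + 1L; // addRetentionLease is called with gcp + 1
assert retainingSeqNo == startingSeqNo;              // every op with seq# >= 42 is retained
```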
|
|
||
| final StepListener<TimeValue> prepareEngineStep = new StepListener<>(); | ||
| // For a sequence based recovery, the target can keep its local translog | ||
| prepareTargetForTranslog(isSequenceNumberBasedRecovery == false, | ||
| shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo), prepareEngineStep); | ||
| establishRetentionLeaseStep.whenComplete(r -> { | ||
| // For a sequence based recovery, the target can keep its local translog | ||
| prepareTargetForTranslog(isSequenceNumberBasedRecovery == false, | ||
| shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo), prepareEngineStep); | ||
| }, onFailure); | ||
| final StepListener<SendSnapshotResult> sendSnapshotStep = new StepListener<>(); | ||
| prepareEngineStep.whenComplete(prepareEngineTime -> { | ||
| /* | ||
|
|
||
Can we instead assert the absence of CCR leases?

Not as robustly as I'd like, no. We could say there are no leases with source "ccr", but that's a lot weaker than saying the only remaining leases are PRRLs, similarly to how we previously asserted that there were no leases at all.

Can we use toMapExcludingPeerRecoveryRetentionLeases here?

Not very easily. Here we are on the other side of the high-level REST API, and this doesn't include indices stats so we don't have access to a RetentionLeases object. It would be quite some work to build one.

Ah, I did not realize that it's a REST test.