Create peer-recovery retention leases #43190
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,6 +24,7 @@ | |
| import org.elasticsearch.Version; | ||
| import org.elasticsearch.action.ActionListener; | ||
| import org.elasticsearch.action.support.replication.ReplicationResponse; | ||
| import org.elasticsearch.cluster.metadata.IndexMetaData; | ||
| import org.elasticsearch.cluster.routing.AllocationId; | ||
| import org.elasticsearch.cluster.routing.IndexShardRoutingTable; | ||
| import org.elasticsearch.cluster.routing.ShardRouting; | ||
|
|
@@ -217,10 +218,22 @@ public synchronized Tuple<Boolean, RetentionLeases> getRetentionLeases(final boo | |
| // the primary calculates the non-expired retention leases and syncs them to replicas | ||
| final long currentTimeMillis = currentTimeMillisSupplier.getAsLong(); | ||
| final long retentionLeaseMillis = indexSettings.getRetentionLeaseMillis(); | ||
| final Set<String> leaseIdsForCurrentPeers | ||
| = routingTable.assignedShards().stream().map(ReplicationTracker::getPeerRecoveryRetentionLeaseId).collect(Collectors.toSet()); | ||
| final Map<Boolean, List<RetentionLease>> partitionByExpiration = retentionLeases | ||
| .leases() | ||
| .stream() | ||
| .collect(Collectors.groupingBy(lease -> currentTimeMillis - lease.timestamp() > retentionLeaseMillis)); | ||
| .collect(Collectors.groupingBy(lease -> { | ||
| if (lease.source().equals(PEER_RECOVERY_RETENTION_LEASE_SOURCE)) { | ||
|
Member
Can we make this check a method of RetentionLease?
Contributor
Author
As in #43190 (comment) I don't think …
||
| if (leaseIdsForCurrentPeers.contains(lease.id())) { | ||
| return false; | ||
| } | ||
| if (routingTable.allShardsStarted()) { | ||
| return true; | ||
| } | ||
| } | ||
| return currentTimeMillis - lease.timestamp() > retentionLeaseMillis; | ||
| })); | ||
| final Collection<RetentionLease> expiredLeases = partitionByExpiration.get(true); | ||
| if (expiredLeases == null) { | ||
| // early out as no retention leases have expired | ||
|
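The classifier inside the groupingBy above is easier to follow when pulled out into a standalone predicate. The following sketch restates the same decision logic for readability only; it is not code from this PR, and the method name isExpired is hypothetical:

```java
// Hypothetical restatement of the groupingBy classifier above (true == expired).
// A peer-recovery lease held by a currently assigned copy never expires; once every
// shard copy in the routing table has started, a peer-recovery lease whose holder is
// no longer assigned is treated as expired immediately; all other leases fall back
// to the usual timestamp-based expiry check.
private boolean isExpired(RetentionLease lease, long currentTimeMillis, long retentionLeaseMillis,
                          Set<String> leaseIdsForCurrentPeers, IndexShardRoutingTable routingTable) {
    if (lease.source().equals(PEER_RECOVERY_RETENTION_LEASE_SOURCE)) {
        if (leaseIdsForCurrentPeers.contains(lease.id())) {
            return false; // the holding copy is still part of the replication group
        }
        if (routingTable.allShardsStarted()) {
            return true; // the holder is gone and no recoveries are pending
        }
    }
    return currentTimeMillis - lease.timestamp() > retentionLeaseMillis;
}
```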
|
@@ -242,7 +255,7 @@ public synchronized Tuple<Boolean, RetentionLeases> getRetentionLeases(final boo | |
| * @param source the source of the retention lease | ||
| * @param listener the callback when the retention lease is successfully added and synced to replicas | ||
| * @return the new retention lease | ||
| * @throws IllegalArgumentException if the specified retention lease already exists | ||
| * @throws RetentionLeaseAlreadyExistsException if the specified retention lease already exists | ||
| */ | ||
| public RetentionLease addRetentionLease( | ||
| final String id, | ||
|
|
@@ -253,30 +266,46 @@ public RetentionLease addRetentionLease( | |
| final RetentionLease retentionLease; | ||
| final RetentionLeases currentRetentionLeases; | ||
| synchronized (this) { | ||
| assert primaryMode; | ||
| if (retentionLeases.contains(id)) { | ||
| throw new RetentionLeaseAlreadyExistsException(id); | ||
| } | ||
| retentionLease = new RetentionLease(id, retainingSequenceNumber, currentTimeMillisSupplier.getAsLong(), source); | ||
| logger.debug("adding new retention lease [{}] to current retention leases [{}]", retentionLease, retentionLeases); | ||
| retentionLeases = new RetentionLeases( | ||
| operationPrimaryTerm, | ||
| retentionLeases.version() + 1, | ||
| Stream.concat(retentionLeases.leases().stream(), Stream.of(retentionLease)).collect(Collectors.toList())); | ||
| retentionLease = innerAddRetentionLease(id, retainingSequenceNumber, source); | ||
| currentRetentionLeases = retentionLeases; | ||
| } | ||
| onSyncRetentionLeases.accept(currentRetentionLeases, listener); | ||
| return retentionLease; | ||
| } | ||
|
|
||
| /** | ||
| * Adds a new retention lease, but does not synchronise it with the rest of the replication group. | ||
| * | ||
| * @param id the identifier of the retention lease | ||
| * @param retainingSequenceNumber the retaining sequence number | ||
| * @param source the source of the retention lease | ||
| * @return the new retention lease | ||
| * @throws RetentionLeaseAlreadyExistsException if the specified retention lease already exists | ||
| */ | ||
| private RetentionLease innerAddRetentionLease(String id, long retainingSequenceNumber, String source) { | ||
| assert Thread.holdsLock(this); | ||
| assert primaryMode : id + "/" + retainingSequenceNumber + "/" + source; | ||
| if (retentionLeases.contains(id)) { | ||
| throw new RetentionLeaseAlreadyExistsException(id); | ||
| } | ||
| final RetentionLease retentionLease | ||
| = new RetentionLease(id, retainingSequenceNumber, currentTimeMillisSupplier.getAsLong(), source); | ||
| logger.debug("adding new retention lease [{}] to current retention leases [{}]", retentionLease, retentionLeases); | ||
| retentionLeases = new RetentionLeases( | ||
| operationPrimaryTerm, | ||
| retentionLeases.version() + 1, | ||
| Stream.concat(retentionLeases.leases().stream(), Stream.of(retentionLease)).collect(Collectors.toList())); | ||
| return retentionLease; | ||
| } | ||
|
|
||
| /** | ||
| * Renews an existing retention lease. | ||
| * | ||
| * @param id the identifier of the retention lease | ||
| * @param retainingSequenceNumber the retaining sequence number | ||
| * @param source the source of the retention lease | ||
| * @return the renewed retention lease | ||
| * @throws IllegalArgumentException if the specified retention lease does not exist | ||
| * @throws RetentionLeaseNotFoundException if the specified retention lease does not exist | ||
| */ | ||
| public synchronized RetentionLease renewRetentionLease(final String id, final long retainingSequenceNumber, final String source) { | ||
| assert primaryMode; | ||
|
|
@@ -390,6 +419,51 @@ public boolean assertRetentionLeasesPersisted(final Path path) throws IOExceptio | |
| return true; | ||
| } | ||
|
|
||
|
|
||
| /** | ||
| * Retention leases for peer recovery have source {@link ReplicationTracker#PEER_RECOVERY_RETENTION_LEASE_SOURCE}, a lease ID | ||
| * containing the persistent node ID calculated by {@link ReplicationTracker#getPeerRecoveryRetentionLeaseId}, and retain operations | ||
| * with sequence numbers strictly greater than the given global checkpoint. | ||
| */ | ||
| public void addPeerRecoveryRetentionLease(String nodeId, long globalCheckpoint, ActionListener<ReplicationResponse> listener) { | ||
|
Member
Can we remove this method and prepare these parameters in IndexShard instead?
Contributor
Author
We could but I think it's appropriate to do this here given that you need to do this when working with the …
||
| addRetentionLease(getPeerRecoveryRetentionLeaseId(nodeId), globalCheckpoint + 1, PEER_RECOVERY_RETENTION_LEASE_SOURCE, listener); | ||
| } | ||
|
|
||
| /** | ||
| * Source for peer recovery retention leases; see {@link ReplicationTracker#addPeerRecoveryRetentionLease}. | ||
| */ | ||
| public static final String PEER_RECOVERY_RETENTION_LEASE_SOURCE = "peer recovery"; | ||
|
Member
How about moving this constant and two related static methods to RetentionLease class instead?
Contributor
Author
I don't think …
||
|
|
||
| /** | ||
| * Id for a peer recovery retention lease for the given node. See {@link ReplicationTracker#addPeerRecoveryRetentionLease}. | ||
| */ | ||
| static String getPeerRecoveryRetentionLeaseId(String nodeId) { | ||
| return "peer_recovery/" + nodeId; | ||
| } | ||
|
|
||
| /** | ||
| * Id for a peer recovery retention lease for the given {@link ShardRouting}. | ||
| * See {@link ReplicationTracker#addPeerRecoveryRetentionLease}. | ||
| */ | ||
| public static String getPeerRecoveryRetentionLeaseId(ShardRouting shardRouting) { | ||
| return getPeerRecoveryRetentionLeaseId(shardRouting.currentNodeId()); | ||
| } | ||
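As a small usage illustration of the id scheme above (a sketch, not part of this diff):

```java
// Sketch only: the String overload is package-private, so this would live in the same
// package. The ShardRouting overload simply delegates to shardRouting.currentNodeId().
String leaseId = ReplicationTracker.getPeerRecoveryRetentionLeaseId("node-1");
assert "peer_recovery/node-1".equals(leaseId);
// A lease carrying this id is expected to have source
// PEER_RECOVERY_RETENTION_LEASE_SOURCE, i.e. "peer recovery".
```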
|
|
||
| /** | ||
| * Advance the peer-recovery retention lease for all tracked shard copies, for use in tests until advancing these leases is done | ||
| * properly. TODO remove this. | ||
| */ | ||
| public synchronized void advancePeerRecoveryRetentionLeasesToGlobalCheckpoints() { | ||
|
Contributor
why are we not automatically advancing the leases when the global checkpoints advance? Is it because it breaks some tests right now?
Contributor
Author
Mainly because I think this change is already large enough without this feature too, and we haven't settled for definite on whether these leases should be GCP-based. Advancing the leases is needed in the tests in very few places, but I haven't tried advancing them more eagerly.
||
| assert primaryMode; | ||
| for (ShardRouting shardRouting : routingTable) { | ||
| if (shardRouting.assignedToNode()) { | ||
| final CheckpointState checkpointState = checkpoints.get(shardRouting.allocationId().getId()); | ||
| renewRetentionLease(getPeerRecoveryRetentionLeaseId(shardRouting), checkpointState.globalCheckpoint + 1, | ||
| PEER_RECOVERY_RETENTION_LEASE_SOURCE); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| public static class CheckpointState implements Writeable { | ||
|
|
||
| /** | ||
|
|
@@ -616,6 +690,23 @@ private boolean invariant() { | |
| assert checkpoints.get(aId) != null : "aId [" + aId + "] is pending in sync but isn't tracked"; | ||
| } | ||
|
|
||
| if (primaryMode | ||
| && indexSettings.isSoftDeleteEnabled() | ||
| && indexSettings.getIndexMetaData().getState() == IndexMetaData.State.OPEN | ||
| && indexSettings.getIndexVersionCreated().onOrAfter(Version.V_8_0_0)) { | ||
| // all tracked shard copies have a corresponding peer-recovery retention lease | ||
| for (final ShardRouting shardRouting : routingTable.assignedShards()) { | ||
DaveCTurner marked this conversation as resolved.
|
||
| if (checkpoints.get(shardRouting.allocationId().getId()).tracked) { | ||
| assert retentionLeases.contains(getPeerRecoveryRetentionLeaseId(shardRouting)) | ||
| : "no retention lease for tracked shard [" + shardRouting + "] in " + retentionLeases; | ||
| assert PEER_RECOVERY_RETENTION_LEASE_SOURCE.equals( | ||
| retentionLeases.get(getPeerRecoveryRetentionLeaseId(shardRouting)).source()) | ||
| : "incorrect source [" + retentionLeases.get(getPeerRecoveryRetentionLeaseId(shardRouting)).source() | ||
| + "] for [" + shardRouting + "] in " + retentionLeases; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
|
|
@@ -669,6 +760,7 @@ public ReplicationTracker( | |
| this.pendingInSync = new HashSet<>(); | ||
| this.routingTable = null; | ||
| this.replicationGroup = null; | ||
| assert Version.V_EMPTY.equals(indexSettings.getIndexVersionCreated()) == false; | ||
|
Contributor
is this to catch issues where tests have not been properly set up?
Contributor
Author
Yes, if this is unset then the crucial assertions are skipped, which is Very Bad™.
||
| assert invariant(); | ||
| } | ||
|
|
||
|
|
@@ -772,6 +864,31 @@ public synchronized void activatePrimaryMode(final long localCheckpoint) { | |
| primaryMode = true; | ||
| updateLocalCheckpoint(shardAllocationId, checkpoints.get(shardAllocationId), localCheckpoint); | ||
| updateGlobalCheckpointOnPrimary(); | ||
|
|
||
| if (indexSettings.isSoftDeleteEnabled()) { | ||
| final ShardRouting primaryShard = routingTable.primaryShard(); | ||
| final String leaseId = getPeerRecoveryRetentionLeaseId(primaryShard); | ||
| if (retentionLeases.get(leaseId) == null) { | ||
| /* | ||
| * We might have got here here via a rolling upgrade from an older version that doesn't create peer recovery retention | ||
| * leases for every shard copy, but in this case we do not expect any leases to exist. | ||
|
Contributor
this might also be a recovery from store?
Contributor
Author
This comment is explaining the following: in a primary relocation the new primary, being a tracked replica, already has a lease.
Member
Did you mean there will be another change here? Why don't we do it now ;). The relocating target should not have a lease if the old primary was on an old version.
Contributor
Author
This change is already a substantial +665/-185, and I think it's unwise to bring BWC into scope at this time. Note that this PR is against a feature branch, not …
||
| */ | ||
| if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_8_0_0)) { | ||
| // We are starting up the whole replication group from scratch: if we were not (i.e. this is a replica promotion) then | ||
| // this copy must already be in-sync and active and therefore holds a retention lease for itself. | ||
| assert routingTable.activeShards().equals(Collections.singletonList(primaryShard)) : routingTable.activeShards(); | ||
| assert primaryShard.allocationId().getId().equals(shardAllocationId) | ||
| : routingTable.activeShards() + " vs " + shardAllocationId; | ||
| assert replicationGroup.getReplicationTargets().equals(Collections.singletonList(primaryShard)); | ||
|
|
||
| // Safe to call innerAddRetentionLease() without a subsequent sync since there are no other members of this replication | ||
|
Contributor
we don't need a sync, but why not do one anyway? This will persist the leases locally on disk.
Contributor
Author
Doing a sync on the cluster applier thread isn't possible as things stand because of the reroute phase; it also would mean waiting for the sync to return, which is something we try and avoid on the applier thread. We could explicitly persist the leases when calling …
||
| // group. | ||
| innerAddRetentionLease(leaseId, Math.max(0L, checkpoints.get(shardAllocationId).globalCheckpoint + 1), | ||
| PEER_RECOVERY_RETENTION_LEASE_SOURCE); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| assert invariant(); | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,6 +32,8 @@ | |
| import org.elasticsearch.ExceptionsHelper; | ||
| import org.elasticsearch.action.ActionListener; | ||
| import org.elasticsearch.action.StepListener; | ||
| import org.elasticsearch.action.support.replication.ReplicationResponse; | ||
| import org.elasticsearch.cluster.metadata.IndexMetaData; | ||
| import org.elasticsearch.cluster.routing.IndexShardRoutingTable; | ||
| import org.elasticsearch.cluster.routing.ShardRouting; | ||
| import org.elasticsearch.common.CheckedSupplier; | ||
|
|
@@ -49,6 +51,7 @@ | |
| import org.elasticsearch.index.engine.Engine; | ||
| import org.elasticsearch.index.engine.RecoveryEngineException; | ||
| import org.elasticsearch.index.seqno.LocalCheckpointTracker; | ||
| import org.elasticsearch.index.seqno.RetentionLeaseAlreadyExistsException; | ||
| import org.elasticsearch.index.seqno.RetentionLeases; | ||
| import org.elasticsearch.index.seqno.SequenceNumbers; | ||
| import org.elasticsearch.index.shard.IndexShard; | ||
|
|
@@ -188,10 +191,30 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) { | |
| } | ||
| assert startingSeqNo >= 0 : "startingSeqNo must be non negative. got: " + startingSeqNo; | ||
|
|
||
| final StepListener<ReplicationResponse> establishRetentionLeaseStep = new StepListener<>(); | ||
| if (shard.indexSettings().isSoftDeleteEnabled() | ||
| && shard.indexSettings().getIndexMetaData().getState() != IndexMetaData.State.CLOSE) { | ||
| runUnderPrimaryPermit(() -> { | ||
| try { | ||
| // conservative estimate of the GCP for creating the lease. TODO use the actual GCP once it's appropriate to do so | ||
| final long globalCheckpoint = startingSeqNo - 1; | ||
| // blindly create the lease. TODO integrate this with the recovery process | ||
|
Contributor
I'm not sure what you mean by "blindly" here and what integration you're referring to.
Contributor
Author
With this change, retention leases have no impact on the recovery process, nor do we make any attempt to add a lease for only the history we've any hope of retaining. E.g. with a file-based recovery we add a lease for all history. In due course the recovery process will be made more dependent on leases.
||
| shard.addPeerRecoveryRetentionLease(request.targetNode().getId(), globalCheckpoint, establishRetentionLeaseStep); | ||
| } catch (RetentionLeaseAlreadyExistsException e) { | ||
| logger.debug("peer-recovery retention lease already exists", e); | ||
| establishRetentionLeaseStep.onResponse(null); | ||
| } | ||
| }, shardId + " establishing retention lease for [" + request.targetAllocationId() + "]", shard, cancellableThreads, logger); | ||
| } else { | ||
| establishRetentionLeaseStep.onResponse(null); | ||
| } | ||
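To make the "conservative estimate" above concrete: addPeerRecoveryRetentionLease (shown earlier in this diff) creates a lease retaining operations with sequence numbers strictly greater than the global checkpoint it is given, so passing startingSeqNo - 1 retains everything from startingSeqNo upwards. A sketch of the arithmetic with illustrative values only:

```java
// Illustrative values only; not code from this PR.
long startingSeqNo = 42L;                            // first operation the target still needs
long globalCheckpointEstimate = startingSeqNo - 1L;  // conservative GCP handed to the lease
long retainingSeqNo = globalCheckpointEstimate + 1L; // addRetentionLease is called with gcp + 1
assert retainingSeqNo == startingSeqNo;              // every op with seq# >= 42 is retained
```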
|
|
||
| final StepListener<TimeValue> prepareEngineStep = new StepListener<>(); | ||
| // For a sequence based recovery, the target can keep its local translog | ||
| prepareTargetForTranslog(isSequenceNumberBasedRecovery == false, | ||
| shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo), prepareEngineStep); | ||
| establishRetentionLeaseStep.whenComplete(r -> { | ||
| // For a sequence based recovery, the target can keep its local translog | ||
| prepareTargetForTranslog(isSequenceNumberBasedRecovery == false, | ||
| shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo), prepareEngineStep); | ||
| }, onFailure); | ||
| final StepListener<SendSnapshotResult> sendSnapshotStep = new StepListener<>(); | ||
| prepareEngineStep.whenComplete(prepareEngineTime -> { | ||
| /* | ||
|
|
||
Can we instead assert the absence of CCR leases?

Not as robustly as I'd like, no. We could say there are no leases with source "ccr", but that's a lot weaker than saying the only remaining leases are PRRLs, similarly to how we previously asserted that there were no leases at all.

Can we use toMapExcludingPeerRecoveryRetentionLeases here?

Not very easily. Here we are on the other side of the high-level REST API, and this doesn't include indices stats so we don't have access to a RetentionLeases object. It would be quite some work to build one.

Ah, I did not realize that it's a REST test.