
Commit 1ca0b5e

Introduce a History UUID as a requirement for ops based recovery (#26577)
The new ops-based recovery, introduced as part of #10708, is built on the assumption that all operations below the global checkpoint known to the replica do not need to be synced with the primary. This rests on the guarantee that all ops below it are available on the primary and are identical on both copies. Under normal operations this guarantee holds. Sadly, it can be violated when a primary is restored from an old snapshot: at that point the restored primary can be missing operations below the replica's global checkpoint, or, even worse, can hold entirely different operations in the same slots.

This PR introduces the notion of a history UUID to be able to capture that a copy has diverged from a restored primary (in a follow-up PR). The history UUID is generated by a primary when it is first created and is synced to replicas that are recovered via file-based recovery. The PR adds a requirement to ops-based recovery that the history UUIDs of the source and the target must be equal. Under normal operations all shard copies keep the same history UUID for the rest of the index's lifetime, so this check is a noop. However, it gives us a place to guarantee that we fall back to file-based syncing in special events such as a restore from snapshot (to be done as a follow-up) or when someone runs the truncate translog command, which can go wrong when combined with primary recovery (handled in this PR).

In the past we considered using the translog UUID for this purpose (i.e., syncing it across copies) and thus avoiding an extra identifier. That idea was rejected because it removes the ability to verify that a specific translog really belongs to a specific Lucene index. We also feel that having a history UUID will serve us well in the future.
1 parent e69c39a commit 1ca0b5e
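
To make the check described in the commit message concrete, here is a rough, hypothetical sketch: a recovery source compares its own history UUID with the one stored in the target's last commit and only allows ops-based recovery when they match. The RecoveryPlanner class, the Plan enum, and the plan method below are illustrative names that do not appear in this commit; only Engine#getHistoryUUID and Store#getHistoryUUID are actually introduced here.

    // Hypothetical sketch, not part of this commit: pick a recovery mode by
    // comparing the history UUIDs of the source (primary) and target (replica).
    public final class RecoveryPlanner {

        public enum Plan { OPS_BASED, FILE_BASED }

        /**
         * @param sourceHistoryUUID the primary's history UUID (e.g. from Engine#getHistoryUUID)
         * @param targetHistoryUUID the history UUID read from the replica's last commit
         *                          (e.g. from Store#getHistoryUUID); may be null for indices
         *                          created before history UUIDs existed
         */
        public static Plan plan(String sourceHistoryUUID, String targetHistoryUUID) {
            // Ops-based recovery is only safe when both copies share the same history.
            // A restore from snapshot or a translog truncation yields a fresh UUID,
            // so a mismatch forces a full file-based sync.
            if (sourceHistoryUUID != null && sourceHistoryUUID.equals(targetHistoryUUID)) {
                return Plan.OPS_BASED;
            }
            return Plan.FILE_BASED;
        }
    }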

File tree

21 files changed (+385 / -156 lines)


build.gradle

Lines changed: 1 addition & 1 deletion
@@ -186,7 +186,7 @@ task verifyVersions {
  * after the backport of the backcompat code is complete.
  */
 allprojects {
-  ext.bwc_tests_enabled = true
+  ext.bwc_tests_enabled = false
 }

 task verifyBwcTestsEnabled {

core/src/main/java/org/elasticsearch/index/engine/Engine.java

Lines changed: 4 additions & 0 deletions
@@ -95,6 +95,7 @@
 public abstract class Engine implements Closeable {

     public static final String SYNC_COMMIT_ID = "sync_id";
+    public static final String HISTORY_UUID_KEY = "history_uuid";

     protected final ShardId shardId;
     protected final String allocationId;
@@ -183,6 +184,9 @@ public MergeStats getMergeStats() {
         return new MergeStats();
     }

+    /** returns the history uuid for the engine */
+    public abstract String getHistoryUUID();
+
     /**
      * A throttling class that can be activated, causing the
      * {@code acquireThrottle} method to block on a lock when throttling

core/src/main/java/org/elasticsearch/index/engine/InternalEngine.java

Lines changed: 69 additions & 26 deletions
@@ -48,6 +48,7 @@
 import org.elasticsearch.Version;
 import org.elasticsearch.action.index.IndexRequest;
 import org.elasticsearch.common.Nullable;
+import org.elasticsearch.common.UUIDs;
 import org.elasticsearch.common.lease.Releasable;
 import org.elasticsearch.common.lucene.LoggerInfoStream;
 import org.elasticsearch.common.lucene.Lucene;
@@ -142,6 +143,8 @@ public class InternalEngine extends Engine {
     private final CounterMetric numVersionLookups = new CounterMetric();
     private final CounterMetric numIndexVersionsLookups = new CounterMetric();

+    @Nullable
+    private final String historyUUID;

     public InternalEngine(EngineConfig engineConfig) throws EngineException {
         super(engineConfig);
@@ -174,15 +177,23 @@ public InternalEngine(EngineConfig engineConfig) throws EngineException {
             switch (openMode) {
                 case OPEN_INDEX_AND_TRANSLOG:
                     writer = createWriter(false);
+                    String existingHistoryUUID = loadHistoryUUIDFromCommit(writer);
+                    if (existingHistoryUUID == null) {
+                        historyUUID = UUIDs.randomBase64UUID();
+                    } else {
+                        historyUUID = existingHistoryUUID;
+                    }
                     final long globalCheckpoint = Translog.readGlobalCheckpoint(engineConfig.getTranslogConfig().getTranslogPath());
                     seqNoStats = store.loadSeqNoStats(globalCheckpoint);
                     break;
                 case OPEN_INDEX_CREATE_TRANSLOG:
                     writer = createWriter(false);
+                    historyUUID = loadHistoryUUIDFromCommit(writer);
                     seqNoStats = store.loadSeqNoStats(SequenceNumbers.UNASSIGNED_SEQ_NO);
                     break;
                 case CREATE_INDEX_AND_TRANSLOG:
                     writer = createWriter(true);
+                    historyUUID = UUIDs.randomBase64UUID();
                     seqNoStats = new SeqNoStats(
                         SequenceNumbers.NO_OPS_PERFORMED,
                         SequenceNumbers.NO_OPS_PERFORMED,
@@ -342,6 +353,12 @@ private void recoverFromTranslogInternal() throws IOException {
             flush(true, true);
         } else if (translog.isCurrent(translogGeneration) == false) {
             commitIndexWriter(indexWriter, translog, lastCommittedSegmentInfos.getUserData().get(Engine.SYNC_COMMIT_ID));
+            refreshLastCommittedSegmentInfos();
+        } else if (lastCommittedSegmentInfos.getUserData().containsKey(HISTORY_UUID_KEY) == false) {
+            assert historyUUID != null;
+            // put the history uuid into the index
+            commitIndexWriter(indexWriter, translog, lastCommittedSegmentInfos.getUserData().get(Engine.SYNC_COMMIT_ID));
+            refreshLastCommittedSegmentInfos();
         }
         // clean up what's not needed
         translog.trimUnreferencedReaders();
@@ -382,6 +399,11 @@ public Translog getTranslog() {
         return translog;
     }

+    @Override
+    public String getHistoryUUID() {
+        return historyUUID;
+    }
+
     /**
      * Reads the current stored translog ID from the IW commit data. If the id is not found, recommits the current
      * translog id into lucene and returns null.
@@ -401,6 +423,19 @@ private String loadTranslogUUIDFromCommit(IndexWriter writer) throws IOException
         }
     }

+    /**
+     * Reads the current stored history ID from the IW commit data. If the id is not found, returns null.
+     */
+    @Nullable
+    private String loadHistoryUUIDFromCommit(final IndexWriter writer) throws IOException {
+        String uuid = commitDataAsMap(writer).get(HISTORY_UUID_KEY);
+        if (uuid == null) {
+            assert config().getIndexSettings().getIndexVersionCreated().before(Version.V_6_0_0_rc1) :
+                "index was created after 6_0_0_rc1 but has no history uuid";
+        }
+        return uuid;
+    }
+
     private SearcherManager createSearcherManager() throws EngineException {
         boolean success = false;
         SearcherManager searcherManager = null;
@@ -1312,30 +1347,8 @@ public CommitId flush(boolean force, boolean waitIfOngoing) throws EngineExcepti
                 } catch (Exception e) {
                     throw new FlushFailedEngineException(shardId, e);
                 }
-                /*
-                 * we have to inc-ref the store here since if the engine is closed by a tragic event
-                 * we don't acquire the write lock and wait until we have exclusive access. This might also
-                 * dec the store reference which can essentially close the store and unless we can inc the reference
-                 * we can't use it.
-                 */
-                store.incRef();
-                try {
-                    // reread the last committed segment infos
-                    lastCommittedSegmentInfos = store.readLastCommittedSegmentsInfo();
-                } catch (Exception e) {
-                    if (isClosed.get() == false) {
-                        try {
-                            logger.warn("failed to read latest segment infos on flush", e);
-                        } catch (Exception inner) {
-                            e.addSuppressed(inner);
-                        }
-                        if (Lucene.isCorruptionException(e)) {
-                            throw new FlushFailedEngineException(shardId, e);
-                        }
-                    }
-                } finally {
-                    store.decRef();
-                }
+                refreshLastCommittedSegmentInfos();
+
             }
             newCommitId = lastCommittedSegmentInfos.getId();
         } catch (FlushFailedEngineException ex) {
@@ -1353,6 +1366,33 @@ public CommitId flush(boolean force, boolean waitIfOngoing) throws EngineExcepti
         return new CommitId(newCommitId);
     }

+    private void refreshLastCommittedSegmentInfos() {
+        /*
+         * we have to inc-ref the store here since if the engine is closed by a tragic event
+         * we don't acquire the write lock and wait until we have exclusive access. This might also
+         * dec the store reference which can essentially close the store and unless we can inc the reference
+         * we can't use it.
+         */
+        store.incRef();
+        try {
+            // reread the last committed segment infos
+            lastCommittedSegmentInfos = store.readLastCommittedSegmentsInfo();
+        } catch (Exception e) {
+            if (isClosed.get() == false) {
+                try {
+                    logger.warn("failed to read latest segment infos on flush", e);
+                } catch (Exception inner) {
+                    e.addSuppressed(inner);
+                }
+                if (Lucene.isCorruptionException(e)) {
+                    throw new FlushFailedEngineException(shardId, e);
+                }
+            }
+        } finally {
+            store.decRef();
+        }
+    }
+
     @Override
     public void rollTranslogGeneration() throws EngineException {
         try (ReleasableLock ignored = readLock.acquire()) {
@@ -1874,7 +1914,7 @@ protected void commitIndexWriter(final IndexWriter writer, final Translog transl
              * {@link IndexWriter#commit()} call flushes all documents, we defer computation of the maximum sequence number to the time
              * of invocation of the commit data iterator (which occurs after all documents have been flushed to Lucene).
              */
-            final Map<String, String> commitData = new HashMap<>(5);
+            final Map<String, String> commitData = new HashMap<>(6);
             commitData.put(Translog.TRANSLOG_GENERATION_KEY, translogFileGeneration);
             commitData.put(Translog.TRANSLOG_UUID_KEY, translogUUID);
             commitData.put(SequenceNumbers.LOCAL_CHECKPOINT_KEY, localCheckpointValue);
@@ -1883,6 +1923,9 @@ protected void commitIndexWriter(final IndexWriter writer, final Translog transl
             }
             commitData.put(SequenceNumbers.MAX_SEQ_NO, Long.toString(seqNoService().getMaxSeqNo()));
             commitData.put(MAX_UNSAFE_AUTO_ID_TIMESTAMP_COMMIT_ID, Long.toString(maxUnsafeAutoIdTimestamp.get()));
+            if (historyUUID != null) {
+                commitData.put(HISTORY_UUID_KEY, historyUUID);
+            }
             logger.trace("committing writer with commit data [{}]", commitData);
             return commitData.entrySet().iterator();
         });
@@ -1992,7 +2035,7 @@ public boolean isRecovering() {
     * Gets the commit data from {@link IndexWriter} as a map.
     */
    private static Map<String, String> commitDataAsMap(final IndexWriter indexWriter) {
-        Map<String, String> commitData = new HashMap<>(5);
+        Map<String, String> commitData = new HashMap<>(6);
        for (Map.Entry<String, String> entry : indexWriter.getLiveCommitData()) {
            commitData.put(entry.getKey(), entry.getValue());
        }

core/src/main/java/org/elasticsearch/index/shard/IndexShard.java

Lines changed: 4 additions & 0 deletions
@@ -1585,6 +1585,10 @@ public Translog getTranslog() {
         return getEngine().getTranslog();
     }

+    public String getHistoryUUID() {
+        return getEngine().getHistoryUUID();
+    }
+
     public IndexEventListener getIndexEventListener() {
         return indexEventListener;
     }

core/src/main/java/org/elasticsearch/index/shard/StoreRecovery.java

Lines changed: 4 additions & 1 deletion
@@ -35,10 +35,12 @@
 import org.elasticsearch.cluster.metadata.MappingMetaData;
 import org.elasticsearch.cluster.routing.RecoverySource;
 import org.elasticsearch.cluster.routing.RecoverySource.SnapshotRecoverySource;
+import org.elasticsearch.common.UUIDs;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.unit.ByteSizeValue;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.index.Index;
+import org.elasticsearch.index.engine.Engine;
 import org.elasticsearch.index.engine.EngineException;
 import org.elasticsearch.index.engine.InternalEngine;
 import org.elasticsearch.index.mapper.MapperService;
@@ -162,10 +164,11 @@ void addIndices(
              * document-level semantics.
              */
            writer.setLiveCommitData(() -> {
-                final HashMap<String, String> liveCommitData = new HashMap<>(2);
+                final HashMap<String, String> liveCommitData = new HashMap<>(4);
                liveCommitData.put(SequenceNumbers.MAX_SEQ_NO, Long.toString(maxSeqNo));
                liveCommitData.put(SequenceNumbers.LOCAL_CHECKPOINT_KEY, Long.toString(maxSeqNo));
                liveCommitData.put(InternalEngine.MAX_UNSAFE_AUTO_ID_TIMESTAMP_COMMIT_ID, Long.toString(maxUnsafeAutoIdTimestamp));
+                liveCommitData.put(Engine.HISTORY_UUID_KEY, UUIDs.randomBase64UUID());
                return liveCommitData.entrySet().iterator();
            });
            writer.commit();

core/src/main/java/org/elasticsearch/index/store/Store.java

Lines changed: 15 additions & 0 deletions
@@ -79,6 +79,7 @@
 import org.elasticsearch.index.shard.AbstractIndexShardComponent;
 import org.elasticsearch.index.shard.IndexShard;
 import org.elasticsearch.index.shard.ShardId;
+import org.elasticsearch.index.translog.Translog;

 import java.io.Closeable;
 import java.io.EOFException;
@@ -1027,6 +1028,20 @@ public Map<String, String> getCommitUserData() {
         return commitUserData;
     }

+    /**
+     * returns the history uuid the store points at, or null if not existant.
+     */
+    public String getHistoryUUID() {
+        return commitUserData.get(Engine.HISTORY_UUID_KEY);
+    }
+
+    /**
+     * returns the translog uuid the store points at
+     */
+    public String getTranslogUUID() {
+        return commitUserData.get(Translog.TRANSLOG_UUID_KEY);
+    }
+
     /**
      * Returns true iff this metadata contains the given file.
      */
