Skip to content

Commit f7ebe9d

Browse files
authored
Preserve multiple translog generations
Today when a flush is performed, the translog is committed and if there are no outstanding views, only the current translog generation is preserved. Yet for the purpose of sequence numbers, we need stronger guarantees than this. This commit migrates the preservation of translog generations to keep the minimum generation that would be needed to recover after the local checkpoint. Relates #24015
1 parent 8033c57 commit f7ebe9d

File tree

4 files changed

+435
-100
lines changed

4 files changed

+435
-100
lines changed

core/src/main/java/org/elasticsearch/index/engine/InternalEngine.java

Lines changed: 36 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,7 @@ private Translog openTranslog(EngineConfig engineConfig, IndexWriter writer, Lon
299299
throw new IllegalStateException("no translog generation present in commit data but translog is expected to exist");
300300
}
301301
if (generation.translogUUID == null) {
302-
throw new IndexFormatTooOldException("trasnlog", "translog has no generation nor a UUID - this might be an index from a previous version consider upgrading to N-1 first");
302+
throw new IndexFormatTooOldException("translog", "translog has no generation nor a UUID - this might be an index from a previous version consider upgrading to N-1 first");
303303
}
304304
}
305305
final Translog translog = new Translog(translogConfig, generation, globalCheckpointSupplier);
@@ -1233,12 +1233,12 @@ public CommitId flush(boolean force, boolean waitIfOngoing) throws EngineExcepti
12331233
try {
12341234
translog.prepareCommit();
12351235
logger.trace("starting commit for flush; commitTranslog=true");
1236-
commitIndexWriter(indexWriter, translog, null);
1236+
final long committedGeneration = commitIndexWriter(indexWriter, translog, null);
12371237
logger.trace("finished commit for flush");
12381238
// we need to refresh in order to clear older version values
12391239
refresh("version_table_flush");
12401240
// after refresh documents can be retrieved from the index so we can now commit the translog
1241-
translog.commit();
1241+
translog.commit(committedGeneration);
12421242
} catch (Exception e) {
12431243
throw new FlushFailedEngineException(shardId, e);
12441244
}
@@ -1734,55 +1734,65 @@ protected void doRun() throws Exception {
17341734
}
17351735
}
17361736

1737-
private void commitIndexWriter(IndexWriter writer, Translog translog, String syncId) throws IOException {
1737+
/**
1738+
* Commits the specified index writer.
1739+
*
1740+
* @param writer the index writer to commit
1741+
* @param translog the translog
1742+
* @param syncId the sync flush ID ({@code null} if not committing a synced flush)
1743+
* @return the minimum translog generation for the local checkpoint committed with the specified index writer
1744+
* @throws IOException if an I/O exception occurs committing the specified writer
1745+
*/
1746+
private long commitIndexWriter(final IndexWriter writer, final Translog translog, @Nullable final String syncId) throws IOException {
17381747
ensureCanFlush();
17391748
try {
1740-
Translog.TranslogGeneration translogGeneration = translog.getGeneration();
1741-
1742-
final String translogFileGen = Long.toString(translogGeneration.translogFileGeneration);
1749+
final long localCheckpoint = seqNoService().getLocalCheckpoint();
1750+
final Translog.TranslogGeneration translogGeneration = translog.getMinGenerationForSeqNo(localCheckpoint + 1);
1751+
final String translogFileGeneration = Long.toString(translogGeneration.translogFileGeneration);
17431752
final String translogUUID = translogGeneration.translogUUID;
1744-
final String localCheckpoint = Long.toString(seqNoService().getLocalCheckpoint());
1753+
final String localCheckpointValue = Long.toString(localCheckpoint);
17451754

17461755
writer.setLiveCommitData(() -> {
17471756
/*
17481757
* The user data captured above (e.g. local checkpoint) contains data that must be evaluated *before* Lucene flushes
1749-
* segments, including the local checkpoint amongst other values. The maximum sequence number is different - we never want
1758+
* segments, including the local checkpoint amongst other values. The maximum sequence number is different, we never want
17501759
* the maximum sequence number to be less than the last sequence number to go into a Lucene commit, otherwise we run the
17511760
* risk of re-using a sequence number for two different documents when restoring from this commit point and subsequently
1752-
* writing new documents to the index. Since we only know which Lucene documents made it into the final commit after the
1753-
* {@link IndexWriter#commit()} call flushes all documents, we defer computation of the max_seq_no to the time of invocation
1754-
* of the commit data iterator (which occurs after all documents have been flushed to Lucene).
1761+
* writing new documents to the index. Since we only know which Lucene documents made it into the final commit after the
1762+
* {@link IndexWriter#commit()} call flushes all documents, we defer computation of the maximum sequence number to the time
1763+
* of invocation of the commit data iterator (which occurs after all documents have been flushed to Lucene).
17551764
*/
1756-
final Map<String, String> commitData = new HashMap<>(6);
1757-
commitData.put(Translog.TRANSLOG_GENERATION_KEY, translogFileGen);
1765+
final Map<String, String> commitData = new HashMap<>(5);
1766+
commitData.put(Translog.TRANSLOG_GENERATION_KEY, translogFileGeneration);
17581767
commitData.put(Translog.TRANSLOG_UUID_KEY, translogUUID);
1759-
commitData.put(SequenceNumbers.LOCAL_CHECKPOINT_KEY, localCheckpoint);
1768+
commitData.put(SequenceNumbers.LOCAL_CHECKPOINT_KEY, localCheckpointValue);
17601769
if (syncId != null) {
17611770
commitData.put(Engine.SYNC_COMMIT_ID, syncId);
17621771
}
17631772
commitData.put(SequenceNumbers.MAX_SEQ_NO, Long.toString(seqNoService().getMaxSeqNo()));
1764-
if (logger.isTraceEnabled()) {
1765-
logger.trace("committing writer with commit data [{}]", commitData);
1766-
}
1773+
logger.trace("committing writer with commit data [{}]", commitData);
17671774
return commitData.entrySet().iterator();
17681775
});
17691776

17701777
writer.commit();
1771-
} catch (Exception ex) {
1778+
return translogGeneration.translogFileGeneration;
1779+
} catch (final Exception ex) {
17721780
try {
17731781
failEngine("lucene commit failed", ex);
1774-
} catch (Exception inner) {
1782+
} catch (final Exception inner) {
17751783
ex.addSuppressed(inner);
17761784
}
17771785
throw ex;
1778-
} catch (AssertionError e) {
1779-
// IndexWriter throws AssertionError on commit, if asserts are enabled, if any files don't exist, but tests that
1780-
// randomly throw FNFE/NSFE can also hit this:
1786+
} catch (final AssertionError e) {
1787+
/*
1788+
* If assertions are enabled, IndexWriter throws AssertionError on commit if any files don't exist, but tests that randomly
1789+
* throw FileNotFoundException or NoSuchFileException can also hit this.
1790+
*/
17811791
if (ExceptionsHelper.stackTrace(e).contains("org.apache.lucene.index.IndexWriter.filesExist")) {
1782-
EngineException engineException = new EngineException(shardId, "failed to commit engine", e);
1792+
final EngineException engineException = new EngineException(shardId, "failed to commit engine", e);
17831793
try {
17841794
failEngine("lucene commit failed", engineException);
1785-
} catch (Exception inner) {
1795+
} catch (final Exception inner) {
17861796
engineException.addSuppressed(inner);
17871797
}
17881798
throw engineException;
@@ -1866,7 +1876,7 @@ public boolean isRecovering() {
18661876
* Gets the commit data from {@link IndexWriter} as a map.
18671877
*/
18681878
private static Map<String, String> commitDataAsMap(final IndexWriter indexWriter) {
1869-
Map<String, String> commitData = new HashMap<>(6);
1879+
Map<String, String> commitData = new HashMap<>(5);
18701880
for (Map.Entry<String, String> entry : indexWriter.getLiveCommitData()) {
18711881
commitData.put(entry.getKey(), entry.getValue());
18721882
}

core/src/main/java/org/elasticsearch/index/translog/Translog.java

Lines changed: 75 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -85,14 +85,14 @@
8585
* When a translog is opened the checkpoint is used to retrieve the latest translog file generation and subsequently to open the last written file to recover operations.
8686
* The {@link org.elasticsearch.index.translog.Translog.TranslogGeneration}, given when the translog is opened / constructed is compared against
8787
* the latest generation and all consecutive translog files since the given generation and the last generation in the checkpoint will be recovered and preserved until the next
88-
* generation is committed using {@link Translog#commit()}. In the common case the translog file generation in the checkpoint and the generation passed to the translog on creation are
89-
* the same. The only situation when they can be different is when an actual translog commit fails in between {@link Translog#prepareCommit()} and {@link Translog#commit()}. In such a case
88+
* generation is committed using {@link Translog#commit(long)}. In the common case the translog file generation in the checkpoint and the generation passed to the translog on creation are
89+
* the same. The only situation when they can be different is when an actual translog commit fails in between {@link Translog#prepareCommit()} and {@link Translog#commit(long)}. In such a case
9090
* the currently being committed translog file will not be deleted since its commit was not successful. Yet, a new/current translog file is already opened at that point such that there is more than
9191
* one translog file present. Such an uncommitted translog file always has a <tt>translog-${gen}.ckp</tt> associated with it which is an fsynced copy of its last <tt>translog.ckp</tt> such that in
9292
* disaster recovery last fsynced offsets, number of operations etc. are still preserved.
9393
* </p>
9494
*/
95-
public class Translog extends AbstractIndexShardComponent implements IndexShardComponent, Closeable, TwoPhaseCommit {
95+
public class Translog extends AbstractIndexShardComponent implements IndexShardComponent, Closeable {
9696

9797
/*
9898
* TODO
@@ -804,6 +804,8 @@ public static Type fromId(byte id) {
804804

805805
long seqNo();
806806

807+
long primaryTerm();
808+
807809
/**
808810
* Reads the type and the operation from the given stream. The operation must be written with
809811
* {@link Operation#writeType(Operation, StreamOutput)}
@@ -953,6 +955,7 @@ public long seqNo() {
953955
return seqNo;
954956
}
955957

958+
@Override
956959
public long primaryTerm() {
957960
return primaryTerm;
958961
}
@@ -1104,6 +1107,7 @@ public long seqNo() {
11041107
return seqNo;
11051108
}
11061109

1110+
@Override
11071111
public long primaryTerm() {
11081112
return primaryTerm;
11091113
}
@@ -1180,6 +1184,7 @@ public long seqNo() {
11801184
return seqNo;
11811185
}
11821186

1187+
@Override
11831188
public long primaryTerm() {
11841189
return primaryTerm;
11851190
}
@@ -1347,6 +1352,31 @@ public static void writeOperationNoSize(BufferedChecksumStreamOutput out, Transl
13471352
out.writeInt((int) checksum);
13481353
}
13491354

1355+
/**
1356+
* Gets the minimum generation that could contain any sequence number at or after the specified sequence number, or the current generation if
1357+
* there is no generation that could contain any such sequence number.
1358+
*
1359+
* @param seqNo the sequence number
1360+
* @return the minimum generation for the sequence number
1361+
*/
1362+
public TranslogGeneration getMinGenerationForSeqNo(final long seqNo) {
1363+
try (ReleasableLock ignored = writeLock.acquire()) {
1364+
/*
1365+
* When flushing, the engine will ask the translog for the minimum generation that could contain any sequence number after the
1366+
* local checkpoint. Immediately after flushing, there will be no such generation, so this minimum generation in this case will
1367+
* be the current translog generation as we do not need any prior generations to have a complete history up to the current local
1368+
* checkpoint.
1369+
*/
1370+
long minTranslogFileGeneration = this.currentFileGeneration();
1371+
for (final TranslogReader reader : readers) {
1372+
if (seqNo <= reader.getCheckpoint().maxSeqNo) {
1373+
minTranslogFileGeneration = Math.min(minTranslogFileGeneration, reader.getGeneration());
1374+
}
1375+
}
1376+
return new TranslogGeneration(translogUUID, minTranslogFileGeneration);
1377+
}
1378+
}
1379+
13501380
/**
13511381
* Roll the current translog generation into a new generation. This does not commit the
13521382
* translog.
@@ -1375,54 +1405,78 @@ public void rollGeneration() throws IOException {
13751405
}
13761406
}
13771407

1378-
@Override
1379-
public long prepareCommit() throws IOException {
1408+
/**
1409+
* Prepares a translog commit by setting the current committing generation and rolling the translog generation.
1410+
*
1411+
* @throws IOException if an I/O exception occurred while rolling the translog generation
1412+
*/
1413+
public void prepareCommit() throws IOException {
13801414
try (ReleasableLock ignored = writeLock.acquire()) {
13811415
ensureOpen();
13821416
if (currentCommittingGeneration != NOT_SET_GENERATION) {
1383-
final String message = String.format(
1384-
Locale.ROOT,
1385-
"already committing a translog with generation [%d]",
1386-
currentCommittingGeneration);
1417+
final String message =
1418+
String.format(Locale.ROOT, "already committing a translog with generation [%d]", currentCommittingGeneration);
13871419
throw new IllegalStateException(message);
13881420
}
13891421
currentCommittingGeneration = current.getGeneration();
13901422
rollGeneration();
13911423
}
1392-
return 0;
13931424
}
13941425

1395-
@Override
1396-
public long commit() throws IOException {
1426+
/**
1427+
* Commits the translog and sets the last committed translog generation to the specified generation. The specified committed generation
1428+
* will be used when trimming unreferenced translog generations such that generations from the committed generation will be preserved.
1429+
*
1430+
* If {@link Translog#prepareCommit()} was not called before calling commit, it will be invoked by this method, causing the translog
1431+
* generation to be rolled.
1432+
*
1433+
* @param committedGeneration the minimum translog generation to preserve after trimming unreferenced generations
1434+
* @throws IOException if an I/O exception occurred preparing the translog commit
1435+
*/
1436+
public void commit(final long committedGeneration) throws IOException {
13971437
try (ReleasableLock ignored = writeLock.acquire()) {
13981438
ensureOpen();
1439+
assert assertCommittedGenerationIsInValidRange(committedGeneration);
13991440
if (currentCommittingGeneration == NOT_SET_GENERATION) {
14001441
prepareCommit();
14011442
}
14021443
assert currentCommittingGeneration != NOT_SET_GENERATION;
14031444
assert readers.stream().anyMatch(r -> r.getGeneration() == currentCommittingGeneration)
14041445
: "readers missing committing generation [" + currentCommittingGeneration + "]";
14051446
// set the last committed generation otherwise old files will not be cleaned up
1406-
lastCommittedTranslogFileGeneration = currentCommittingGeneration + 1;
1447+
lastCommittedTranslogFileGeneration = committedGeneration;
14071448
currentCommittingGeneration = NOT_SET_GENERATION;
14081449
trimUnreferencedReaders();
14091450
}
1410-
return 0;
14111451
}
14121452

1453+
private boolean assertCommittedGenerationIsInValidRange(final long committedGeneration) {
1454+
assert committedGeneration <= current.generation
1455+
: "tried to commit generation [" + committedGeneration + "] after current generation [" + current.generation + "]";
1456+
final long min = readers.stream().map(TranslogReader::getGeneration).min(Long::compareTo).orElse(Long.MIN_VALUE);
1457+
assert committedGeneration >= min
1458+
: "tried to commit generation [" + committedGeneration + "] before minimum generation [" + min + "]";
1459+
return true;
1460+
}
1461+
1462+
/**
1463+
* Trims unreferenced translog generations. The guarantee here is that translog generations will be preserved for all outstanding views
1464+
* and from the last committed translog generation defined by {@link Translog#lastCommittedTranslogFileGeneration}.
1465+
*/
14131466
void trimUnreferencedReaders() {
14141467
try (ReleasableLock ignored = writeLock.acquire()) {
14151468
if (closed.get()) {
1416-
// we're shutdown potentially on some tragic event - don't delete anything
1469+
// we're shutdown potentially on some tragic event, don't delete anything
14171470
return;
14181471
}
1419-
long minReferencedGen = outstandingViews.stream().mapToLong(View::minTranslogGeneration).min().orElse(Long.MAX_VALUE);
1420-
minReferencedGen = Math.min(lastCommittedTranslogFileGeneration, minReferencedGen);
1421-
final long finalMinReferencedGen = minReferencedGen;
1422-
List<TranslogReader> unreferenced = readers.stream().filter(r -> r.getGeneration() < finalMinReferencedGen).collect(Collectors.toList());
1472+
long minReferencedGen = Math.min(
1473+
lastCommittedTranslogFileGeneration,
1474+
outstandingViews.stream().mapToLong(View::minTranslogGeneration).min().orElse(Long.MAX_VALUE));
1475+
final List<TranslogReader> unreferenced =
1476+
readers.stream().filter(r -> r.getGeneration() < minReferencedGen).collect(Collectors.toList());
14231477
for (final TranslogReader unreferencedReader : unreferenced) {
1424-
Path translogPath = unreferencedReader.path();
1425-
logger.trace("delete translog file - not referenced and not current anymore {}", translogPath);
1478+
final Path translogPath = unreferencedReader.path();
1479+
logger.trace("delete translog file [{}], not referenced and not current anymore", translogPath);
14261480
IOUtils.closeWhileHandlingException(unreferencedReader);
14271481
IOUtils.deleteFilesIgnoringExceptions(translogPath,
14281482
translogPath.resolveSibling(getCommitCheckpointFileName(unreferencedReader.getGeneration())));
@@ -1442,13 +1496,6 @@ void closeFilesIfNoPendingViews() throws IOException {
14421496
}
14431497
}
14441498

1445-
1446-
@Override
1447-
public void rollback() throws IOException {
1448-
ensureOpen();
1449-
close();
1450-
}
1451-
14521499
/**
14531500
* References a transaction log generation
14541501
*/

0 commit comments

Comments
 (0)