Skip to content

Commit f96e00b

Browse files
authored
Add primary term to translog header (#29227)
This change adds the current primary term to the header of the current translog file. Having a term in the translog header is a prerequisite step that allows us to trim translog operations given the max valid seq# for that term. This commit also updates tests to conform to the primary term invariant, which guarantees that every operation in a translog file has a term no greater than the term stored in that file's translog header.
1 parent d72d3f9 commit f96e00b

31 files changed

+680
-482
lines changed

server/src/main/java/org/elasticsearch/index/IndexSettings.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ public final class IndexSettings {
185185
public static final Setting<ByteSizeValue> INDEX_TRANSLOG_FLUSH_THRESHOLD_SIZE_SETTING =
186186
Setting.byteSizeSetting("index.translog.flush_threshold_size", new ByteSizeValue(512, ByteSizeUnit.MB),
187187
/*
188-
* An empty translog occupies 43 bytes on disk. If the flush threshold is below this, the flush thread
188+
* An empty translog occupies 55 bytes on disk. If the flush threshold is below this, the flush thread
189189
* can get stuck in an infinite loop as the shouldPeriodicallyFlush can still be true after flushing.
190190
* However, small thresholds are useful for testing so we do not add a large lower bound here.
191191
*/
@@ -220,7 +220,7 @@ public final class IndexSettings {
220220
"index.translog.generation_threshold_size",
221221
new ByteSizeValue(64, ByteSizeUnit.MB),
222222
/*
223-
* An empty translog occupies 43 bytes on disk. If the generation threshold is
223+
* An empty translog occupies 55 bytes on disk. If the generation threshold is
224224
* below this, the flush thread can get stuck in an infinite loop repeatedly
225225
* rolling the generation as every new generation will already exceed the
226226
* generation threshold. However, small thresholds are useful for testing so we

server/src/main/java/org/elasticsearch/index/engine/Engine.java

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1066,14 +1066,13 @@ public Index(Term uid, ParsedDocument doc, long seqNo, long primaryTerm, long ve
10661066
this.autoGeneratedIdTimestamp = autoGeneratedIdTimestamp;
10671067
}
10681068

1069-
public Index(Term uid, ParsedDocument doc) {
1070-
this(uid, doc, Versions.MATCH_ANY);
1069+
public Index(Term uid, long primaryTerm, ParsedDocument doc) {
1070+
this(uid, primaryTerm, doc, Versions.MATCH_ANY);
10711071
} // TEST ONLY
10721072

1073-
Index(Term uid, ParsedDocument doc, long version) {
1074-
// use a primary term of 2 to allow tests to reduce it to a valid >0 term
1075-
this(uid, doc, SequenceNumbers.UNASSIGNED_SEQ_NO, 2, version, VersionType.INTERNAL,
1076-
Origin.PRIMARY, System.nanoTime(), -1, false);
1073+
Index(Term uid, long primaryTerm, ParsedDocument doc, long version) {
1074+
this(uid, doc, SequenceNumbers.UNASSIGNED_SEQ_NO, primaryTerm, version, VersionType.INTERNAL,
1075+
Origin.PRIMARY, System.nanoTime(), -1, false);
10771076
} // TEST ONLY
10781077

10791078
public ParsedDocument parsedDoc() {
@@ -1143,8 +1142,8 @@ public Delete(String type, String id, Term uid, long seqNo, long primaryTerm, lo
11431142
this.id = Objects.requireNonNull(id);
11441143
}
11451144

1146-
public Delete(String type, String id, Term uid) {
1147-
this(type, id, uid, SequenceNumbers.UNASSIGNED_SEQ_NO, 0, Versions.MATCH_ANY, VersionType.INTERNAL, Origin.PRIMARY, System.nanoTime());
1145+
public Delete(String type, String id, Term uid, long primaryTerm) {
1146+
this(type, id, uid, SequenceNumbers.UNASSIGNED_SEQ_NO, primaryTerm, Versions.MATCH_ANY, VersionType.INTERNAL, Origin.PRIMARY, System.nanoTime());
11481147
}
11491148

11501149
public Delete(Delete template, VersionType versionType) {

server/src/main/java/org/elasticsearch/index/engine/EngineConfig.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ public final class EngineConfig {
7979
@Nullable
8080
private final CircuitBreakerService circuitBreakerService;
8181
private final LongSupplier globalCheckpointSupplier;
82+
private final LongSupplier primaryTermSupplier;
8283

8384
/**
8485
* Index setting to change the low level lucene codec used for writing new segments.
@@ -125,7 +126,7 @@ public EngineConfig(ShardId shardId, String allocationId, ThreadPool threadPool,
125126
List<ReferenceManager.RefreshListener> externalRefreshListener,
126127
List<ReferenceManager.RefreshListener> internalRefreshListener, Sort indexSort,
127128
TranslogRecoveryRunner translogRecoveryRunner, CircuitBreakerService circuitBreakerService,
128-
LongSupplier globalCheckpointSupplier) {
129+
LongSupplier globalCheckpointSupplier, LongSupplier primaryTermSupplier) {
129130
this.shardId = shardId;
130131
this.allocationId = allocationId;
131132
this.indexSettings = indexSettings;
@@ -152,6 +153,7 @@ public EngineConfig(ShardId shardId, String allocationId, ThreadPool threadPool,
152153
this.translogRecoveryRunner = translogRecoveryRunner;
153154
this.circuitBreakerService = circuitBreakerService;
154155
this.globalCheckpointSupplier = globalCheckpointSupplier;
156+
this.primaryTermSupplier = primaryTermSupplier;
155157
}
156158

157159
/**
@@ -354,4 +356,11 @@ public Sort getIndexSort() {
354356
public CircuitBreakerService getCircuitBreakerService() {
355357
return this.circuitBreakerService;
356358
}
359+
360+
/**
361+
* Returns a supplier that supplies the latest primary term value of the associated shard.
362+
*/
363+
public LongSupplier getPrimaryTermSupplier() {
364+
return primaryTermSupplier;
365+
}
357366
}

server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,7 @@ private Translog openTranslog(EngineConfig engineConfig, TranslogDeletionPolicy
418418
final TranslogConfig translogConfig = engineConfig.getTranslogConfig();
419419
final String translogUUID = loadTranslogUUIDFromLastCommit();
420420
// We expect that this shard already exists, so it must already have an existing translog else something is badly wrong!
421-
return new Translog(translogConfig, translogUUID, translogDeletionPolicy, globalCheckpointSupplier);
421+
return new Translog(translogConfig, translogUUID, translogDeletionPolicy, globalCheckpointSupplier, engineConfig.getPrimaryTermSupplier());
422422
}
423423

424424
@Override

server/src/main/java/org/elasticsearch/index/shard/IndexShard.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2136,7 +2136,7 @@ private EngineConfig newEngineConfig() {
21362136
IndexingMemoryController.SHARD_INACTIVE_TIME_SETTING.get(indexSettings.getSettings()),
21372137
Collections.singletonList(refreshListeners),
21382138
Collections.singletonList(new RefreshMetricUpdater(refreshMetric)),
2139-
indexSort, this::runTranslogRecovery, circuitBreakerService, replicationTracker);
2139+
indexSort, this::runTranslogRecovery, circuitBreakerService, replicationTracker, this::getPrimaryTerm);
21402140
}
21412141

21422142
/**

server/src/main/java/org/elasticsearch/index/shard/StoreRecovery.java

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,8 @@ private void internalRecoverFromStore(IndexShard indexShard) throws IndexShardRe
393393
store.bootstrapNewHistory();
394394
final SegmentInfos segmentInfos = store.readLastCommittedSegmentsInfo();
395395
final long maxSeqNo = Long.parseLong(segmentInfos.userData.get(SequenceNumbers.MAX_SEQ_NO));
396-
final String translogUUID = Translog.createEmptyTranslog(indexShard.shardPath().resolveTranslog(), maxSeqNo, shardId);
396+
final String translogUUID = Translog.createEmptyTranslog(
397+
indexShard.shardPath().resolveTranslog(), maxSeqNo, shardId, indexShard.getPrimaryTerm());
397398
store.associateIndexWithNewTranslog(translogUUID);
398399
} else if (indexShouldExists) {
399400
// since we recover from local, just fill the files and size
@@ -407,8 +408,8 @@ private void internalRecoverFromStore(IndexShard indexShard) throws IndexShardRe
407408
}
408409
} else {
409410
store.createEmpty();
410-
final String translogUUID = Translog.createEmptyTranslog(indexShard.shardPath().resolveTranslog(),
411-
SequenceNumbers.NO_OPS_PERFORMED, shardId);
411+
final String translogUUID = Translog.createEmptyTranslog(
412+
indexShard.shardPath().resolveTranslog(), SequenceNumbers.NO_OPS_PERFORMED, shardId, indexShard.getPrimaryTerm());
412413
store.associateIndexWithNewTranslog(translogUUID);
413414
}
414415
indexShard.openEngineAndRecoverFromTranslog();
@@ -456,7 +457,8 @@ private void restore(final IndexShard indexShard, final Repository repository, f
456457
store.bootstrapNewHistory();
457458
final SegmentInfos segmentInfos = store.readLastCommittedSegmentsInfo();
458459
final long maxSeqNo = Long.parseLong(segmentInfos.userData.get(SequenceNumbers.MAX_SEQ_NO));
459-
final String translogUUID = Translog.createEmptyTranslog(indexShard.shardPath().resolveTranslog(), maxSeqNo, shardId);
460+
final String translogUUID = Translog.createEmptyTranslog(
461+
indexShard.shardPath().resolveTranslog(), maxSeqNo, shardId, indexShard.getPrimaryTerm());
460462
store.associateIndexWithNewTranslog(translogUUID);
461463
assert indexShard.shardRouting.primary() : "only primary shards can recover from store";
462464
indexShard.openEngineAndRecoverFromTranslog();

server/src/main/java/org/elasticsearch/index/translog/BaseTranslogReader.java

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,15 +35,15 @@ public abstract class BaseTranslogReader implements Comparable<BaseTranslogReade
3535
protected final long generation;
3636
protected final FileChannel channel;
3737
protected final Path path;
38-
protected final long firstOperationOffset;
38+
protected final TranslogHeader header;
3939

40-
public BaseTranslogReader(long generation, FileChannel channel, Path path, long firstOperationOffset) {
40+
public BaseTranslogReader(long generation, FileChannel channel, Path path, TranslogHeader header) {
4141
assert Translog.parseIdFromFileName(path) == generation : "generation mismatch. Path: " + Translog.parseIdFromFileName(path) + " but generation: " + generation;
4242

4343
this.generation = generation;
4444
this.path = path;
4545
this.channel = channel;
46-
this.firstOperationOffset = firstOperationOffset;
46+
this.header = header;
4747
}
4848

4949
public long getGeneration() {
@@ -57,7 +57,14 @@ public long getGeneration() {
5757
abstract Checkpoint getCheckpoint();
5858

5959
public final long getFirstOperationOffset() {
60-
return firstOperationOffset;
60+
return header.sizeInBytes();
61+
}
62+
63+
/**
64+
* Returns the primary term associated with this translog reader.
65+
*/
66+
public final long getPrimaryTerm() {
67+
return header.getPrimaryTerm();
6168
}
6269

6370
/** read the size of the op (i.e., number of bytes, including the op size) written at the given position */
@@ -100,7 +107,12 @@ protected final BufferedChecksumStreamInput checksummedStream(ByteBuffer reusabl
100107
}
101108

102109
protected Translog.Operation read(BufferedChecksumStreamInput inStream) throws IOException {
103-
return Translog.readOperation(inStream);
110+
final Translog.Operation op = Translog.readOperation(inStream);
111+
if (op.primaryTerm() > getPrimaryTerm() && getPrimaryTerm() != TranslogHeader.UNKNOWN_PRIMARY_TERM) {
112+
throw new TranslogCorruptedException("Operation's term is newer than translog header term; " +
113+
"operation term[" + op.primaryTerm() + "], translog header term [" + getPrimaryTerm() + "]");
114+
}
115+
return op;
104116
}
105117

106118
/**

0 commit comments

Comments
 (0)