21 | 21 | import org.apache.lucene.index.IndexWriter; |
22 | 22 | import org.apache.lucene.index.IndexWriterConfig; |
23 | 23 | import org.apache.lucene.index.LeafReaderContext; |
| 24 | +import org.apache.lucene.index.MergePolicy; |
| 25 | +import org.apache.lucene.index.NoMergePolicy; |
24 | 26 | import org.apache.lucene.index.SerialMergeScheduler; |
25 | 27 | import org.apache.lucene.index.Term; |
| 28 | +import org.apache.lucene.index.TieredMergePolicy; |
26 | 29 | import org.apache.lucene.search.DocIdSetIterator; |
27 | 30 | import org.apache.lucene.search.IndexSearcher; |
28 | 31 | import org.apache.lucene.search.Query; |
|
53 | 56 | import org.elasticsearch.common.util.ByteArray; |
54 | 57 | import org.elasticsearch.common.util.PageCacheRecycler; |
55 | 58 | import org.elasticsearch.common.xcontent.LoggingDeprecationHandler; |
56 | | -import org.elasticsearch.xcontent.NamedXContentRegistry; |
57 | | -import org.elasticsearch.xcontent.ToXContent; |
58 | | -import org.elasticsearch.xcontent.XContentBuilder; |
59 | | -import org.elasticsearch.xcontent.XContentFactory; |
60 | | -import org.elasticsearch.xcontent.XContentType; |
61 | 59 | import org.elasticsearch.core.CheckedConsumer; |
62 | 60 | import org.elasticsearch.core.Nullable; |
63 | 61 | import org.elasticsearch.core.Releasable; |
64 | 62 | import org.elasticsearch.core.Releasables; |
| 63 | +import org.elasticsearch.core.SuppressForbidden; |
65 | 64 | import org.elasticsearch.core.TimeValue; |
66 | 65 | import org.elasticsearch.core.internal.io.IOUtils; |
67 | 66 | import org.elasticsearch.env.NodeEnvironment; |
68 | 67 | import org.elasticsearch.env.NodeMetadata; |
69 | 68 | import org.elasticsearch.index.Index; |
| 69 | +import org.elasticsearch.xcontent.NamedXContentRegistry; |
| 70 | +import org.elasticsearch.xcontent.ToXContent; |
| 71 | +import org.elasticsearch.xcontent.XContentBuilder; |
| 72 | +import org.elasticsearch.xcontent.XContentFactory; |
| 73 | +import org.elasticsearch.xcontent.XContentType; |
70 | 74 |
71 | 75 | import java.io.Closeable; |
72 | 76 | import java.io.IOError; |
@@ -124,6 +128,9 @@ public class PersistedClusterStateService { |
124 | 128 | private static final String INDEX_UUID_FIELD_NAME = "index_uuid"; |
125 | 129 | private static final int COMMIT_DATA_SIZE = 4; |
126 | 130 |
| 131 | + private static final MergePolicy NO_MERGE_POLICY = noMergePolicy(); |
| 132 | + private static final MergePolicy DEFAULT_MERGE_POLICY = defaultMergePolicy(); |
| 133 | + |
127 | 134 | public static final String METADATA_DIRECTORY_NAME = MetadataStateFormat.STATE_DIR_NAME; |
128 | 135 |
129 | 136 | public static final Setting<TimeValue> SLOW_WRITE_LOGGING_THRESHOLD = Setting.timeSetting("gateway.slow_write_logging_threshold", |
@@ -193,10 +200,13 @@ private static IndexWriter createIndexWriter(Directory directory, boolean openEx |
193 | 200 | indexWriterConfig.setOpenMode(openExisting ? IndexWriterConfig.OpenMode.APPEND : IndexWriterConfig.OpenMode.CREATE); |
194 | 201 | // only commit when specifically instructed, we must not write any intermediate states |
195 | 202 | indexWriterConfig.setCommitOnClose(false); |
196 | | - // most of the data goes into stored fields which are not buffered, so we only really need a tiny buffer |
| 203 | + // most of the data goes into stored fields which are not buffered, so each doc written accounts for ~500B of indexing buffer |
| 204 | + // (see e.g. BufferedUpdates#BYTES_PER_DEL_TERM); a 1MB buffer therefore gets flushed every ~2000 docs. |
197 | 205 | indexWriterConfig.setRAMBufferSizeMB(1.0); |
198 | 206 | // merge on the write thread (e.g. while flushing) |
199 | 207 | indexWriterConfig.setMergeScheduler(new SerialMergeScheduler()); |
| 208 | + // apply the adjusted merge policy |
| 209 | + indexWriterConfig.setMergePolicy(DEFAULT_MERGE_POLICY); |
200 | 210 |
201 | 211 | return new IndexWriter(directory, indexWriterConfig); |
202 | 212 | } |
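
As a rough check on the arithmetic in that buffer comment: at ~500 B of indexing-buffer accounting per document, a 1 MB buffer fills after about 1,048,576 / 500 ≈ 2,100 documents, in line with the ~2,000-docs-per-flush estimate above (the ~500 B figure is the comment's own; the precise accounting lives in Lucene internals such as BufferedUpdates).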
@@ -481,6 +491,28 @@ private static void consumeFromType(IndexSearcher indexSearcher, String type, |
481 | 491 | FORMAT_PARAMS = new ToXContent.MapParams(params); |
482 | 492 | } |
483 | 493 |
| 494 | + @SuppressForbidden(reason = "merges are only temporarily suppressed, the merge scheduler does not need changing") |
| 495 | + private static MergePolicy noMergePolicy() { |
| 496 | + return NoMergePolicy.INSTANCE; |
| 497 | + } |
| 498 | + |
| 499 | + private static MergePolicy defaultMergePolicy() { |
| 500 | + final TieredMergePolicy mergePolicy = new TieredMergePolicy(); |
| 501 | + |
| 502 | + // don't worry about cleaning up deletes too much, segments will often get completely deleted once they're old enough |
| 503 | + mergePolicy.setDeletesPctAllowed(50.0); |
| 504 | + // more/smaller segments means there's a better chance they just get deleted before needing a merge |
| 505 | + mergePolicy.setSegmentsPerTier(100); |
| 506 | + // ... but if we do end up merging them then do them all |
| 507 | + mergePolicy.setMaxMergeAtOnce(100); |
| 508 | + // always use compound segments to avoid fsync overhead |
| 509 | + mergePolicy.setNoCFSRatio(1.0); |
| 510 | + // segments are mostly tiny, so don't pretend they are bigger |
| 511 | + mergePolicy.setFloorSegmentMB(0.001); |
| 512 | + |
| 513 | + return mergePolicy; |
| 514 | + } |
| 515 | + |
484 | 516 | /** |
485 | 517 | * Encapsulates a single {@link IndexWriter} with its {@link Directory} for ease of closing, and a {@link Logger}. There is one of these |
486 | 518 | * for each data path. |
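
The merge-policy swap used in the following hunks works because IndexWriter.getConfig() returns the writer's live configuration, and the merge policy is one of its live settings, so it can be changed without reopening the writer. A minimal self-contained sketch of that toggle (the class name and the ByteBuffersDirectory setup are illustrative, not part of this change):

    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.NoMergePolicy;
    import org.apache.lucene.index.TieredMergePolicy;
    import org.apache.lucene.store.ByteBuffersDirectory;
    import org.apache.lucene.store.Directory;

    public class LiveMergePolicyToggle {
        public static void main(String[] args) throws Exception {
            try (Directory directory = new ByteBuffersDirectory();
                 IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig())) {
                // suppress merges while writing; the policy is a live setting, no reopen needed
                writer.getConfig().setMergePolicy(NoMergePolicy.INSTANCE);
                // ... document updates would happen here with merging suppressed ...
                // restore a merging policy, then pick up any merges it now wants
                writer.getConfig().setMergePolicy(new TieredMergePolicy());
                writer.maybeMerge();
                writer.commit();
            }
        }
    }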
@@ -522,7 +554,15 @@ void flush() throws IOException { |
522 | 554 | this.indexWriter.flush(); |
523 | 555 | } |
524 | 556 |
| 557 | + void startWrite() { |
| 558 | + // Disable merges during indexing - many older segments will ultimately contain no live docs and simply get deleted. |
| 559 | + indexWriter.getConfig().setMergePolicy(NO_MERGE_POLICY); |
| 560 | + } |
| 561 | + |
525 | 562 | void prepareCommit(String nodeId, long currentTerm, long lastAcceptedVersion) throws IOException { |
| 563 | + indexWriter.getConfig().setMergePolicy(DEFAULT_MERGE_POLICY); |
| 564 | + indexWriter.maybeMerge(); |
| 565 | + |
526 | 566 | final Map<String, String> commitData = new HashMap<>(COMMIT_DATA_SIZE); |
527 | 567 | commitData.put(CURRENT_TERM_KEY, Long.toString(currentTerm)); |
528 | 568 | commitData.put(LAST_ACCEPTED_VERSION_KEY, Long.toString(lastAcceptedVersion)); |
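
Putting the pieces together, the per-write sequence that startWrite() and prepareCommit() establish looks roughly like this (a sketch with illustrative names and commit-data keys, not the actual method bodies; it assumes the Lucene imports from the hunks above plus org.apache.lucene.document.Document and java.util.HashMap/Map):

    static void writeAndCommit(IndexWriter writer, MergePolicy mergingPolicy, Iterable<Document> documents,
                               long currentTerm, long lastAcceptedVersion) throws IOException {
        writer.getConfig().setMergePolicy(NoMergePolicy.INSTANCE);  // startWrite(): no merges during indexing
        for (Document document : documents) {
            writer.addDocument(document);                           // stand-in for the real per-index doc updates
        }
        writer.getConfig().setMergePolicy(mergingPolicy);           // prepareCommit(): restore merging ...
        writer.maybeMerge();                                        // ... and merge now, on this thread (SerialMergeScheduler)
        Map<String, String> commitData = new HashMap<>();
        commitData.put("current_term", Long.toString(currentTerm));                 // illustrative keys
        commitData.put("last_accepted_version", Long.toString(lastAcceptedVersion));
        writer.setLiveCommitData(commitData.entrySet());
        writer.prepareCommit();
        writer.commit();
    }

Sequencing it this way means any merge cost is paid once per write, on the writing thread just before the commit, rather than racing with the document updates; segments whose documents were all replaced during the write can simply be dropped instead of merged.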
@@ -594,6 +634,11 @@ public void writeFullStateAndCommit(long currentTerm, ClusterState clusterState) |
594 | 634 | ensureOpen(); |
595 | 635 | try { |
596 | 636 | final long startTimeMillis = relativeTimeMillisSupplier.getAsLong(); |
| 637 | + |
| 638 | + for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) { |
| 639 | + metadataIndexWriter.startWrite(); |
| 640 | + } |
| 641 | + |
597 | 642 | final WriterStats stats = overwriteMetadata(clusterState.metadata()); |
598 | 643 | commit(currentTerm, clusterState.version()); |
599 | 644 | fullStateWritten = true; |
@@ -623,6 +668,11 @@ void writeIncrementalStateAndCommit(long currentTerm, ClusterState previousClust |
623 | 668 |
624 | 669 | try { |
625 | 670 | final long startTimeMillis = relativeTimeMillisSupplier.getAsLong(); |
| 671 | + |
| 672 | + for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) { |
| 673 | + metadataIndexWriter.startWrite(); |
| 674 | + } |
| 675 | + |
626 | 676 | final WriterStats stats = updateMetadata(previousClusterState.metadata(), clusterState.metadata()); |
627 | 677 | commit(currentTerm, clusterState.version()); |
628 | 678 | final long durationMillis = relativeTimeMillisSupplier.getAsLong() - startTimeMillis; |