From 65affe3fcc03ff341428e73e5a8d00891bf86567 Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Tue, 5 Oct 2021 13:21:39 -0400 Subject: [PATCH 1/2] Expand segment sorter for all timeseries indices PR #75195 added a segment sorter on @timestamp desc for datastream indices. This PR applies the segment sorter to all indices that have a @timestamp field. The presence of a @timestamp field can serve as a strong indication that we are dealing with timeseries indices. The most common type of query for timeseries indices is to get the latest data, that is, data sorted by @timestamp desc. This PR sorts segments by @timestamp desc, which speeds up this kind of query. Backport for #78639 Relates to #75195 --- .../search/380_sort_segments_on_timestamp.yml | 139 ++++++++++++++++++ .../cluster/metadata/DataStream.java | 16 +- .../index/mapper/MappingLookup.java | 14 ++ .../elasticsearch/index/shard/IndexShard.java | 13 +- ...1_sort_segments_migrate_to_data_stream.yml | 111 -------------- x-pack/qa/runtime-fields/build.gradle | 2 + 6 files changed, 176 insertions(+), 119 deletions(-) create mode 100644 rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/380_sort_segments_on_timestamp.yml delete mode 100644 x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/data_stream/131_sort_segments_migrate_to_data_stream.yml diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/380_sort_segments_on_timestamp.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/380_sort_segments_on_timestamp.yml new file mode 100644 index 0000000000000..866d3c8637101 --- /dev/null +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/380_sort_segments_on_timestamp.yml @@ -0,0 +1,139 @@ +--- +"Test that index segments are sorted on timestamp field if @timestamp field is defined in mapping": + - skip: + version: " - 7.15.99" + reason: "sorting segments was added in 7.16" + features: allowed_warnings + + - do: +
indices.create: + index: test_index1 + body: + mappings: + properties: + "@timestamp": + type: date + settings: + number_of_shards: 1 + number_of_replicas: 0 + + # 1st segment + - do: + index: + index: test_index1 + body: { "foo": "bar1", "@timestamp": "2021-08-01" } + refresh: true + + # 2nd segment + - do: + index: + index: test_index1 + body: { "foo": "bar2", "@timestamp": "2021-08-02" } + refresh: true + + # test that segments are sorted by @timestamp DESC + - do: + search: + index: test_index1 + body: + fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }] + - match: { hits.total.value: 2 } + - match: { hits.hits.0.fields.@timestamp: ["2021-08-02"] } + - match: { hits.hits.1.fields.@timestamp: ["2021-08-01"] } + +--- +"Test that index segments are NOT sorted on timestamp field when @timestamp field is dynamically added": + - skip: + version: " - 7.15.99" + reason: "sorting segments was added in 7.16" + features: allowed_warnings + + - do: + indices.create: + index: test_index2 + body: + settings: + number_of_shards: 1 + number_of_replicas: 0 + + # 1st segment + - do: + index: + index: test_index2 + body: { "foo": "bar1", "@timestamp": "2021-08-01" } + refresh: true + + # 2nd segment + - do: + index: + index: test_index2 + body: { "foo": "bar2", "@timestamp": "2021-08-02" } + refresh: true + + # test that segments are NOT sorted by @timestamp DESC as the field was not defined in the mapping when the index was created + - do: + search: + index: test_index2 + body: + fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }] + - match: { hits.hits.0.fields.@timestamp: ["2021-08-01"] } + - match: { hits.hits.1.fields.@timestamp: ["2021-08-02"] } + + # test that after we reopen the index, segments are sorted by @timestamp DESC + - do: + indices.close: + index: test_index2 + - is_true: acknowledged + - do: + indices.open: + index: test_index2 + - is_true: acknowledged + - do: + search: + index: test_index2 + body: + fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }] + - match: { hits.total.value: 2 } + -
match: { hits.hits.0.fields.@timestamp: ["2021-08-02"] } + - match: { hits.hits.1.fields.@timestamp: ["2021-08-01"] } + +--- +"Test if segments are missing @timestamp field we don't get errors": + - skip: + version: " - 7.15.99" + reason: "sorting segments was added in 7.16" + features: allowed_warnings + + - do: + indices.create: + index: test_index3 + body: + mappings: + properties: + "@timestamp": + type: date + settings: + number_of_shards: 1 + number_of_replicas: 0 + + # 1st segment missing @timestamp field + - do: + index: + index: test_index3 + body: { "foo": "bar1"} + refresh: true + + # 2nd segment + - do: + index: + index: test_index3 + body: { "foo": "bar2", "@timestamp": "2021-08-02" } + refresh: true + + - do: + search: + index: test_index3 + body: + fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }] + - match: { hits.hits.0.fields.@timestamp: ["2021-08-02"] } + - is_false: hits.hits.1.fields.@timestamp diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/DataStream.java b/server/src/main/java/org/elasticsearch/cluster/metadata/DataStream.java index b6a0e866ca2b0..1d743d19aa6a3 100644 --- a/server/src/main/java/org/elasticsearch/cluster/metadata/DataStream.java +++ b/server/src/main/java/org/elasticsearch/cluster/metadata/DataStream.java @@ -11,6 +11,7 @@ import org.apache.lucene.document.LongPoint; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.PointValues; +import org.elasticsearch.ElasticsearchException; import org.elasticsearch.cluster.AbstractDiffable; import org.elasticsearch.cluster.Diff; import org.elasticsearch.common.Strings; @@ -43,8 +44,8 @@ public final class DataStream extends AbstractDiffable implements To public static final String BACKING_INDEX_PREFIX = ".ds-"; public static final DateFormatter DATE_FORMATTER = DateFormatter.forPattern("uuuu.MM.dd"); - // Datastreams' leaf readers should be sorted by desc order of their timestamp field, as it allows search time optimizations - public 
static Comparator DATASTREAM_LEAF_READERS_SORTER = + // Timeseries indices' leaf readers should be sorted by desc order of their timestamp field, as it allows search time optimizations + public static Comparator TIMESERIES_LEAF_READERS_SORTER = Comparator.comparingLong( (LeafReader r) -> { try { @@ -52,14 +53,17 @@ public final class DataStream extends AbstractDiffable implements To if (points != null) { byte[] sortValue = points.getMaxPackedValue(); return LongPoint.decodeDimension(sortValue, 0); - } else if (r.numDocs() == 0) { - // points can be null if the segment contains only deleted documents + } else { + // As we apply this segment sorter to any timeseries indices, + // we don't have a guarantee that all docs contain @timestamp field. + // Some segments may have all docs without @timestamp field, in this + // case they will be sorted last. return Long.MIN_VALUE; } } catch (IOException e) { + throw new ElasticsearchException("Can't access [" + + DataStream.TimestampField.FIXED_TIMESTAMP_FIELD + "] field for the index!", e); } - throw new IllegalStateException("Can't access [" + - DataStream.TimestampField.FIXED_TIMESTAMP_FIELD + "] field for the data stream!"); }) .reversed(); diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MappingLookup.java b/server/src/main/java/org/elasticsearch/index/mapper/MappingLookup.java index 120f80f0645e7..e66a0bdf58e55 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/MappingLookup.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/MappingLookup.java @@ -9,6 +9,7 @@ package org.elasticsearch.index.mapper; import org.apache.lucene.codecs.PostingsFormat; +import org.elasticsearch.cluster.metadata.DataStream; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.IndexAnalyzers; import org.elasticsearch.index.analysis.NamedAnalyzer; @@ -391,6 +392,19 @@ public boolean isDataStreamTimestampFieldEnabled() { return dtfm != null && dtfm.isEnabled(); } + /** + * 
Returns whether this mapping contains a timestamp field that is of type date, indexed and has doc values. + * @return {@code true} if it contains a timestamp field of type date that is indexed and has doc values, {@code false} otherwise. + */ + public boolean hasTimestampField() { + final MappedFieldType mappedFieldType = fieldTypesLookup().get(DataStream.TimestampField.FIXED_TIMESTAMP_FIELD); + if (mappedFieldType instanceof DateFieldMapper.DateFieldType) { + return mappedFieldType.isSearchable() && mappedFieldType.hasDocValues(); + } else { + return false; + } + } + /** * Key for the lookup to be used in caches. */ diff --git a/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java b/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java index ec675e44ff484..f57507aaff74d 100644 --- a/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java @@ -188,7 +188,7 @@ import java.util.stream.Collectors; import java.util.stream.StreamSupport; -import static org.elasticsearch.cluster.metadata.DataStream.DATASTREAM_LEAF_READERS_SORTER; +import static org.elasticsearch.cluster.metadata.DataStream.TIMESERIES_LEAF_READERS_SORTER; import static org.elasticsearch.index.seqno.RetentionLeaseActions.RETAIN_ALL; import static org.elasticsearch.index.seqno.SequenceNumbers.UNASSIGNED_SEQ_NO; @@ -408,6 +408,14 @@ public Sort getIndexSort() { return indexSortSupplier.get(); } + /** + * Returns whether this shard is part of a data stream + * @return {@code true} if this shard is part of a data stream, {@code false} otherwise + */ + public boolean isDataStreamIndex() { + return isDataStreamIndex; + } + public ShardGetService getService() { return this.getService; } @@ -2990,6 +2998,7 @@ private EngineConfig newEngineConfig(LongSupplier globalCheckpointSupplier) { this.warmer.warm(reader); } }; + final boolean isTimeseriesIndex = mapperService == null ?
false : mapperService.mappingLookup().hasTimestampField(); return new EngineConfig( shardId, threadPool, @@ -3013,7 +3022,7 @@ private EngineConfig newEngineConfig(LongSupplier globalCheckpointSupplier) { replicationTracker::getRetentionLeases, this::getOperationPrimaryTerm, snapshotCommitSupplier, - isDataStreamIndex ? DATASTREAM_LEAF_READERS_SORTER : null); + isTimeseriesIndex ? TIMESERIES_LEAF_READERS_SORTER : null); } /** diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/data_stream/131_sort_segments_migrate_to_data_stream.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/data_stream/131_sort_segments_migrate_to_data_stream.yml deleted file mode 100644 index f7769ee4fc4dd..0000000000000 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/data_stream/131_sort_segments_migrate_to_data_stream.yml +++ /dev/null @@ -1,111 +0,0 @@ ---- -"Test that datastream index segments are sorted on timestamp field desc after data stream migration": - - skip: - version: " - 7.15.99" - reason: "sorted segments added in 7.16" - features: allowed_warnings - - - do: - allowed_warnings: - - "index template [my-template] has index patterns [my_ds] matching patterns from existing older templates [global] with patterns (global => [*]); this template [my-template] will take precedence during new index creation" - indices.put_index_template: - name: my-template - body: - index_patterns: [ my_ds ] - data_stream: { } - template: - settings: - number_of_shards: 1 - number_of_replicas: 0 - - - do: - indices.create: - index: test_index1 - body: - settings: - number_of_shards: 1 - number_of_replicas: 0 - aliases: - my_ds: - is_write_index: true - - # 1st segment - - do: - index: - index: my_ds - body: { "foo": "bar1", "@timestamp": "2021-08-01" } - refresh: true - - # 2nd segment - - do: - index: - index: my_ds - body: { "foo": "bar2", "@timestamp": "2021-08-02" } - refresh: true - - # test that segments are sorted as indexed by @timestamp ASC 
- - do: - search: - index: my_ds - body: - fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }] - - match: { hits.total.value: 2 } - - match: { hits.hits.0.fields.@timestamp: ["2021-08-01"] } - - match: { hits.hits.1.fields.@timestamp: ["2021-08-02"] } - - # migrate to data-stream - - do: - indices.migrate_to_data_stream: - name: my_ds - - is_true: acknowledged - - # test that segments are still sorted as indexed by @timestamp ASC - # as we don't reopen existing shards and index readers after migration - - do: - search: - index: my_ds - body: - fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }] - - match: { hits.total.value: 2 } - - match: { hits.hits.0.fields.@timestamp: ["2021-08-01"] } - - match: { hits.hits.1.fields.@timestamp: ["2021-08-02"] } - - # rollover data stream to create new backing index - - do: - indices.rollover: - alias: "my_ds" - - match: { rolled_over: true } - # save the new backing index names for later use - - set: { new_index: idx0name } - - # 1st segment in the new backing index - - do: - index: - index: my_ds - body: { "foo": "bar3", "@timestamp": "2021-08-03" } - refresh: true - - # 2nd segment in the new backing index - - do: - index: - index: my_ds - body: { "foo": "bar4", "@timestamp": "2021-08-04" } - refresh: true - - - # test that segments are sorted by @timestamp DESC in the new backing index, - # as the newly created index and shard pick up the index leaf sorter - - do: - search: - index: $idx0name - body: - fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }] - - match: { hits.total.value: 2 } - - match: { hits.hits.0.fields.@timestamp: ["2021-08-04"] } - - match: { hits.hits.1.fields.@timestamp: ["2021-08-03"] } - - - - do: - indices.delete_data_stream: - name: my_ds - - is_true: acknowledged diff --git a/x-pack/qa/runtime-fields/build.gradle b/x-pack/qa/runtime-fields/build.gradle index 568bd577ed175..b7abc0a8dfc9e 100644 --- a/x-pack/qa/runtime-fields/build.gradle +++ b/x-pack/qa/runtime-fields/build.gradle @@ 
-88,6 +88,8 @@ subprojects { // The error messages are different 'search/330_fetch_fields/error includes field name', 'search/330_fetch_fields/error includes glob pattern', + // we need a @timestamp field to be defined in index mapping + 'search/380_sort_segments_on_timestamp/*', /////// NOT SUPPORTED /////// ].join(',') } From 667a2a928b2a71512e8b7d7d56c1dc57ed491c52 Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Tue, 5 Oct 2021 14:45:44 -0400 Subject: [PATCH 2/2] Add warning --- .../test/search/380_sort_segments_on_timestamp.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/380_sort_segments_on_timestamp.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/380_sort_segments_on_timestamp.yml index 866d3c8637101..4bd019a494e54 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/380_sort_segments_on_timestamp.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/380_sort_segments_on_timestamp.yml @@ -83,6 +83,8 @@ - do: indices.close: index: test_index2 + allowed_warnings: + - "the default value for the ?wait_for_active_shards parameter will change from '0' to 'index-setting' in version 8; specify '?wait_for_active_shards=index-setting' to adopt the future default behaviour, or '?wait_for_active_shards=0' to preserve today's behaviour" - is_true: acknowledged - do: indices.open: