From 7a9632c199ad6d25c4899ffee066ef33d93bc152 Mon Sep 17 00:00:00 2001 From: Christos Soulios Date: Tue, 28 Apr 2020 15:50:55 +0300 Subject: [PATCH 1/5] Initial version of doc_count field mapper added tests Build fixes Added tests for doc_count fieldmapper doc count tests Resolve conflicts after merge from master Added yaml test for doc_count field type Minor changes to test Fix issue with not-registering field mapper Simplify terms agg test Add doc_count provider in the buckets aggregator Initialize doc_count provider once doc_count provider is initialized once in the buckets agg constructor. Added tests for FieldBasedDocCountProvider Added more tests to DocCountFieldMapper Updated branch to fix build after merge Added validation for single doc_count field Added validation to ensure only a single doc_count field exists in the mapping Added version skips to fix broken tests Added documentation for doc_count Changes to address review comments: - Minor doc change - Added yml test that merges template with multiple doc_count - Changed DocCountFieldType indexing to TextSearchInfo.NONE Use _doc_count as Lucene field for doc count Minor change: field rename Minor change to yml test. Fix errors from merge Converted _doc_count to metadata field type Throw an error if parsed value is not a number Make _doc_count field a metadata field Fixed broken tests Fix bug in low cardinality ordinal terms aggs Update docs that _doc_count is a metadata field Fix broken ML tests Fix errors after merge Addressed review comments Addressed reviewer comments Added DocCountFieldTypeTests Make composite agg respect _doc_count field Cleaned up/simplified DocCountProvider class DocCountProvider rethrows IOException instead of swallowing it Set familyTypeName of _doc_count to integer Revert changes to AggregatorTestCase Doc changes --- docs/reference/mapping/fields.asciidoc | 9 +- .../mapping/fields/doc-count-field.asciidoc | 118 ++++++++++++++ .../370_doc_count_field.yml | 150 ++++++++++++++++++ .../index/mapper/DocCountFieldMapper.java | 141 ++++++++++++++++ .../elasticsearch/indices/IndicesModule.java | 2 + .../search/aggregations/AggregatorBase.java | 4 +- .../bucket/BucketsAggregator.java | 36 +++-- .../aggregations/bucket/DocCountProvider.java | 49 ++++++ .../adjacency/AdjacencyMatrixAggregator.java | 4 +- .../bucket/composite/CompositeAggregator.java | 5 +- .../CompositeValuesCollectorQueue.java | 22 +-- .../bucket/composite/SortedDocsProducer.java | 6 +- .../bucket/nested/NestedAggregator.java | 3 +- .../GlobalOrdinalsStringTermsAggregator.java | 13 +- .../mapper/DocCountFieldMapperTests.java | 71 +++++++++ .../index/mapper/DocCountFieldTypeTests.java | 55 +++++++ .../indices/IndicesModuleTests.java | 4 +- .../bucket/DocCountProviderTests.java | 108 +++++++++++++ .../CompositeValuesCollectorQueueTests.java | 2 +- .../extractor/ExtractedFieldsDetector.java | 2 +- 20 files changed, 762 insertions(+), 42 deletions(-) create mode 100644 docs/reference/mapping/fields/doc-count-field.asciidoc create mode 100644 rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/370_doc_count_field.yml create mode 100644 server/src/main/java/org/elasticsearch/index/mapper/DocCountFieldMapper.java create mode 100644 server/src/main/java/org/elasticsearch/search/aggregations/bucket/DocCountProvider.java create mode 100644 server/src/test/java/org/elasticsearch/index/mapper/DocCountFieldMapperTests.java create mode 100644 server/src/test/java/org/elasticsearch/index/mapper/DocCountFieldTypeTests.java create mode 100644 
server/src/test/java/org/elasticsearch/search/aggregations/bucket/DocCountProviderTests.java
diff --git a/docs/reference/mapping/fields.asciidoc b/docs/reference/mapping/fields.asciidoc
index 75cb29abf89fd..78ac432e1f4f1 100644
--- a/docs/reference/mapping/fields.asciidoc
+++ b/docs/reference/mapping/fields.asciidoc
@@ -29,6 +29,13 @@ fields can be customized when a mapping is created.
   The size of the `_source` field in bytes, provided by the
   {plugins}/mapper-size.html[`mapper-size` plugin].
 
+[discrete]
+=== Doc count metadata field
+
+<<mapping-doc-count-field,`_doc_count` field>>::
+
+    A custom field used for storing doc counts when a document represents pre-aggregated data.
+
 [discrete]
 === Indexing metadata fields
 
@@ -55,6 +62,7 @@ fields can be customized when a mapping is created.
 
   Application specific metadata.
 
+include::fields/doc-count-field.asciidoc[]
 
 include::fields/field-names-field.asciidoc[]
 
@@ -69,4 +77,3 @@ include::fields/meta-field.asciidoc[]
 include::fields/routing-field.asciidoc[]
 
 include::fields/source-field.asciidoc[]
-
diff --git a/docs/reference/mapping/fields/doc-count-field.asciidoc b/docs/reference/mapping/fields/doc-count-field.asciidoc
new file mode 100644
index 0000000000000..52eb8ea52a71f
--- /dev/null
+++ b/docs/reference/mapping/fields/doc-count-field.asciidoc
@@ -0,0 +1,118 @@
+[[mapping-doc-count-field]]
+=== `_doc_count` field
+++++
+_doc_count
+++++
+
+Bucket aggregations always return a field named `doc_count` showing the number of documents that were aggregated and partitioned
+in each bucket. Computing this value is simple: `doc_count` is incremented by 1 for every document collected
+into the bucket.
+
+While this simple approach is effective when computing aggregations over individual documents, it fails to accurately represent
+documents that store pre-aggregated data (such as `histogram` or `aggregate_metric_double` fields), because one summary field may
+represent multiple documents.
+
+To allow for correct computation of the number of documents when working with pre-aggregated data, we have introduced a
+metadata field named `_doc_count`. `_doc_count` must always be a positive integer representing the number of documents
+aggregated in a single summary field.
+
+When the `_doc_count` field is added to a document, all bucket aggregations respect its value and increment the bucket `doc_count`
+by the value of the field. If a document does not contain a `_doc_count` field, `_doc_count = 1` is implied by default.
+
+[IMPORTANT]
+========
+* A `_doc_count` field can only store a single positive integer per document. Nested arrays are not allowed.
+* If a document contains no `_doc_count` field, aggregators increment the bucket `doc_count` by 1, which is the default behavior.
+========
+
+[[mapping-doc-count-field-example]]
+==== Example
+
+The following <<indices-create-index,create index>> API request creates a new index with the following field mappings:
+
+* `my_histogram`, a `histogram` field used to store percentile data
+* `my_text`, a `keyword` field used to store a title for the histogram
+
+[source,console]
+--------------------------------------------------
+PUT my_index
+{
+  "mappings" : {
+    "properties" : {
+      "my_histogram" : {
+        "type" : "histogram"
+      },
+      "my_text" : {
+        "type" : "keyword"
+      }
+    }
+  }
+}
+--------------------------------------------------
+
+The following <<docs-index_,index>> API requests store pre-aggregated data for
+two histograms: `histogram_1` and `histogram_2`.
+ +[source,console] +-------------------------------------------------- +PUT my_index/_doc/1 +{ + "my_text" : "histogram_1", + "my_histogram" : { + "values" : [0.1, 0.2, 0.3, 0.4, 0.5], + "counts" : [3, 7, 23, 12, 6] + }, + "_doc_count": 45 <1> +} + +PUT my_index/_doc/2 +{ + "my_text" : "histogram_2", + "my_histogram" : { + "values" : [0.1, 0.25, 0.35, 0.4, 0.45, 0.5], + "counts" : [8, 17, 8, 7, 6, 2] + }, + "_doc_count_": 62 <1> +} +-------------------------------------------------- +<1> Field `_doc_count` must be a positive integer storing the number of documents aggregated to produce each histogram. + +If we run the following <> on `my_index`: + +[source,console] +-------------------------------------------------- +GET /_search +{ + "aggs" : { + "histogram_titles" : { + "terms" : { "field" : "my_text" } + } + } +} +-------------------------------------------------- + +We will get the following response: + +[source,console-result] +-------------------------------------------------- +{ + ... + "aggregations" : { + "histogram_titles" : { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets" : [ + { + "key" : "histogram_2", + "doc_count" : 62 + }, + { + "key" : "histogram_1", + "doc_count" : 45 + } + ] + } + } +} +-------------------------------------------------- +// TESTRESPONSE[skip:test not setup] diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/370_doc_count_field.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/370_doc_count_field.yml new file mode 100644 index 0000000000000..8c19a418f0a8f --- /dev/null +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/370_doc_count_field.yml @@ -0,0 +1,150 @@ +setup: + - do: + indices.create: + index: test_1 + body: + settings: + number_of_replicas: 0 + mappings: + properties: + str: + type: keyword + number: + type: integer + + - do: + bulk: + index: test_1 + refresh: true + body: + - '{"index": {}}' + - '{"_doc_count": 10, "str": "abc", "number" : 500, "unmapped": "abc" }' + - '{"index": {}}' + - '{"_doc_count": 5, "str": "xyz", "number" : 100, "unmapped": "xyz" }' + - '{"index": {}}' + - '{"_doc_count": 7, "str": "foo", "number" : 100, "unmapped": "foo" }' + - '{"index": {}}' + - '{"_doc_count": 1, "str": "foo", "number" : 200, "unmapped": "foo" }' + - '{"index": {}}' + - '{"str": "abc", "number" : 500, "unmapped": "abc" }' + +--- +"Test numeric terms agg with doc_count": + - skip: + version: " - 7.99.99" + reason: "Doc count fields are only implemented in 8.0" + + - do: + search: + rest_total_hits_as_int: true + body: { "size" : 0, "aggs" : { "num_terms" : { "terms" : { "field" : "number" } } } } + + - match: { hits.total: 5 } + - length: { aggregations.num_terms.buckets: 3 } + - match: { aggregations.num_terms.buckets.0.key: 100 } + - match: { aggregations.num_terms.buckets.0.doc_count: 12 } + - match: { aggregations.num_terms.buckets.1.key: 500 } + - match: { aggregations.num_terms.buckets.1.doc_count: 11 } + - match: { aggregations.num_terms.buckets.2.key: 200 } + - match: { aggregations.num_terms.buckets.2.doc_count: 1 } + + +--- +"Test keyword terms agg with doc_count": + - skip: + version: " - 7.99.99" + reason: "Doc count fields are only implemented in 8.0" + - do: + search: + rest_total_hits_as_int: true + body: { "size" : 0, "aggs" : { "str_terms" : { "terms" : { "field" : "str" } } } } + + - match: { hits.total: 5 } + - length: { aggregations.str_terms.buckets: 3 } + - match: { aggregations.str_terms.buckets.0.key: "abc" } + - 
match: { aggregations.str_terms.buckets.0.doc_count: 11 } + - match: { aggregations.str_terms.buckets.1.key: "foo" } + - match: { aggregations.str_terms.buckets.1.doc_count: 8 } + - match: { aggregations.str_terms.buckets.2.key: "xyz" } + - match: { aggregations.str_terms.buckets.2.doc_count: 5 } + +--- + +"Test unmapped string terms agg with doc_count": + - skip: + version: " - 7.99.99" + reason: "Doc count fields are only implemented in 8.0" + - do: + bulk: + index: test_2 + refresh: true + body: + - '{"index": {}}' + - '{"_doc_count": 10, "str": "abc" }' + - '{"index": {}}' + - '{"str": "abc" }' + - do: + search: + index: test_2 + rest_total_hits_as_int: true + body: { "size" : 0, "aggs" : { "str_terms" : { "terms" : { "field" : "str.keyword" } } } } + + - match: { hits.total: 2 } + - length: { aggregations.str_terms.buckets: 1 } + - match: { aggregations.str_terms.buckets.0.key: "abc" } + - match: { aggregations.str_terms.buckets.0.doc_count: 11 } + +--- +"Test composite str_terms agg with doc_count": + - skip: + version: " - 7.99.99" + reason: "Doc count fields are only implemented in 8.0" + - do: + search: + rest_total_hits_as_int: true + body: { "size" : 0, "aggs" : + { "composite_agg" : { "composite" : + { + "sources": ["str_terms": { "terms": { "field": "str" } }] + } + } + } + } + + - match: { hits.total: 5 } + - length: { aggregations.composite_agg.buckets: 3 } + - match: { aggregations.composite_agg.buckets.0.key.str_terms: "abc" } + - match: { aggregations.composite_agg.buckets.0.doc_count: 11 } + - match: { aggregations.composite_agg.buckets.1.key.str_terms: "foo" } + - match: { aggregations.composite_agg.buckets.1.doc_count: 8 } + - match: { aggregations.composite_agg.buckets.2.key.str_terms: "xyz" } + - match: { aggregations.composite_agg.buckets.2.doc_count: 5 } + + +--- +"Test composite num_terms agg with doc_count": + - skip: + version: " - 7.99.99" + reason: "Doc count fields are only implemented in 8.0" + - do: + search: + rest_total_hits_as_int: true + body: { "size" : 0, "aggs" : + { "composite_agg" : + { "composite" : + { + "sources": ["num_terms" : { "terms" : { "field" : "number" } }] + } + } + } + } + + - match: { hits.total: 5 } + - length: { aggregations.composite_agg.buckets: 3 } + - match: { aggregations.composite_agg.buckets.0.key.num_terms: 100 } + - match: { aggregations.composite_agg.buckets.0.doc_count: 12 } + - match: { aggregations.composite_agg.buckets.1.key.num_terms: 200 } + - match: { aggregations.composite_agg.buckets.1.doc_count: 1 } + - match: { aggregations.composite_agg.buckets.2.key.num_terms: 500 } + - match: { aggregations.composite_agg.buckets.2.doc_count: 11 } + diff --git a/server/src/main/java/org/elasticsearch/index/mapper/DocCountFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/DocCountFieldMapper.java new file mode 100644 index 0000000000000..611bab88f66fe --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/mapper/DocCountFieldMapper.java @@ -0,0 +1,141 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.mapper; + +import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.search.DocValuesFieldExistsQuery; +import org.apache.lucene.search.Query; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.common.xcontent.XContentParserUtils; +import org.elasticsearch.index.query.QueryShardContext; +import org.elasticsearch.index.query.QueryShardException; +import org.elasticsearch.search.lookup.SearchLookup; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; + +/** Mapper for the doc_count field. */ +public class DocCountFieldMapper extends MetadataFieldMapper { + + public static final String NAME = "_doc_count"; + public static final String CONTENT_TYPE = "_doc_count"; + + public static final TypeParser PARSER = new ConfigurableTypeParser( + c -> new DocCountFieldMapper(), + c -> new DocCountFieldMapper.Builder()); + + static class Builder extends MetadataFieldMapper.Builder { + + Builder() { + super(NAME); + } + + @Override + protected List> getParameters() { + return Collections.emptyList(); + } + + @Override + public DocCountFieldMapper build(BuilderContext context) { + return new DocCountFieldMapper(); + } + } + + public static final class DocCountFieldType extends MappedFieldType { + + public static final DocCountFieldType INSTANCE = new DocCountFieldType(); + + private static final Long defaultValue = 1L; + + public DocCountFieldType() { + super(NAME, false, false, true, TextSearchInfo.NONE, Collections.emptyMap()); + } + + @Override + public String typeName() { + return CONTENT_TYPE; + } + + @Override + public String familyTypeName() { + return NumberFieldMapper.NumberType.INTEGER.typeName(); + } + + @Override + public Query existsQuery(QueryShardContext context) { + return new DocValuesFieldExistsQuery(NAME); + } + + @Override + public Query termQuery(Object value, QueryShardContext context) { + throw new QueryShardException(context, "Field [" + name() + "] of type [" + typeName() + "] is not searchable"); + } + + @Override + public ValueFetcher valueFetcher(MapperService mapperService, SearchLookup searchLookup, String format) { + if (format != null) { + throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] doesn't support formats."); + } + + return new SourceValueFetcher(name(), mapperService, defaultValue) { + @Override + protected Object parseSourceValue(Object value) { + if ("".equals(value)) { + return defaultValue; + } else { + return NumberFieldMapper.NumberType.objectToLong(value, false); + } + } + }; + } + } + + private DocCountFieldMapper() { + super(DocCountFieldType.INSTANCE); + } + + @Override + protected void parseCreateField(ParseContext context) throws IOException { + XContentParser parser = context.parser(); + XContentParserUtils.ensureExpectedToken(XContentParser.Token.VALUE_NUMBER, parser.currentToken(), parser); + + long value = parser.longValue(false); + if (value <= 0) { + throw new IllegalArgumentException("Field [" + fieldType().name() + "] 
must be a positive integer."); + } + final Field docCount = new NumericDocValuesField(NAME, value); + context.doc().add(docCount); + } + + @Override + public void preParse(ParseContext context) { } + + @Override + public DocCountFieldType fieldType() { + return (DocCountFieldType) super.fieldType(); + } + + @Override + protected String contentType() { + return CONTENT_TYPE; + } + +} diff --git a/server/src/main/java/org/elasticsearch/indices/IndicesModule.java b/server/src/main/java/org/elasticsearch/indices/IndicesModule.java index cade11a509b62..2ba32df753ea1 100644 --- a/server/src/main/java/org/elasticsearch/indices/IndicesModule.java +++ b/server/src/main/java/org/elasticsearch/indices/IndicesModule.java @@ -32,6 +32,7 @@ import org.elasticsearch.index.mapper.BooleanFieldMapper; import org.elasticsearch.index.mapper.CompletionFieldMapper; import org.elasticsearch.index.mapper.DateFieldMapper; +import org.elasticsearch.index.mapper.DocCountFieldMapper; import org.elasticsearch.index.mapper.FieldAliasMapper; import org.elasticsearch.index.mapper.FieldNamesFieldMapper; import org.elasticsearch.index.mapper.GeoPointFieldMapper; @@ -152,6 +153,7 @@ private static Map initBuiltInMetadataMa builtInMetadataMappers.put(NestedPathFieldMapper.NAME, NestedPathFieldMapper.PARSER); builtInMetadataMappers.put(VersionFieldMapper.NAME, VersionFieldMapper.PARSER); builtInMetadataMappers.put(SeqNoFieldMapper.NAME, SeqNoFieldMapper.PARSER); + builtInMetadataMappers.put(DocCountFieldMapper.NAME, DocCountFieldMapper.PARSER); //_field_names must be added last so that it has a chance to see all the other mappers builtInMetadataMappers.put(FieldNamesFieldMapper.NAME, FieldNamesFieldMapper.PARSER); return Collections.unmodifiableMap(builtInMetadataMappers); diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/AggregatorBase.java b/server/src/main/java/org/elasticsearch/search/aggregations/AggregatorBase.java index 31284edeaa478..be9c81b5759b9 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/AggregatorBase.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/AggregatorBase.java @@ -179,7 +179,7 @@ public Map metadata() { @Override public final LeafBucketCollector getLeafCollector(LeafReaderContext ctx) throws IOException { - preGetSubLeafCollectors(); + preGetSubLeafCollectors(ctx); final LeafBucketCollector sub = collectableSubAggregators.getLeafCollector(ctx); return getLeafCollector(ctx, sub); } @@ -188,7 +188,7 @@ public final LeafBucketCollector getLeafCollector(LeafReaderContext ctx) throws * Can be overridden by aggregator implementations that like the perform an operation before the leaf collectors * of children aggregators are instantiated for the next segment. 
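+     * @param ctx the {@link LeafReaderContext} of the segment for which leaf collectors are about to be created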
*/ - protected void preGetSubLeafCollectors() throws IOException { + protected void preGetSubLeafCollectors(LeafReaderContext ctx) throws IOException { } /** diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketsAggregator.java index 596aadd927e83..bb2e956a485b6 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketsAggregator.java @@ -18,9 +18,10 @@ */ package org.elasticsearch.search.aggregations.bucket; +import org.apache.lucene.index.LeafReaderContext; import org.elasticsearch.common.lease.Releasable; import org.elasticsearch.common.util.BigArrays; -import org.elasticsearch.common.util.IntArray; +import org.elasticsearch.common.util.LongArray; import org.elasticsearch.search.aggregations.AggregationExecutionException; import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.AggregatorBase; @@ -52,7 +53,8 @@ public abstract class BucketsAggregator extends AggregatorBase { private final BigArrays bigArrays; private final IntConsumer multiBucketConsumer; - private IntArray docCounts; + private LongArray docCounts; + protected final DocCountProvider docCountProvider; public BucketsAggregator(String name, AggregatorFactories factories, SearchContext context, Aggregator parent, CardinalityUpperBound bucketCardinality, Map metadata) throws IOException { @@ -63,7 +65,8 @@ public BucketsAggregator(String name, AggregatorFactories factories, SearchConte } else { multiBucketConsumer = (count) -> {}; } - docCounts = bigArrays.newIntArray(1, true); + docCounts = bigArrays.newLongArray(1, true); + docCountProvider = new DocCountProvider(); } /** @@ -92,7 +95,8 @@ public final void collectBucket(LeafBucketCollector subCollector, int doc, long * Same as {@link #collectBucket(LeafBucketCollector, int, long)}, but doesn't check if the docCounts needs to be re-sized. */ public final void collectExistingBucket(LeafBucketCollector subCollector, int doc, long bucketOrd) throws IOException { - if (docCounts.increment(bucketOrd, 1) == 1) { + long docCount = docCountProvider.getDocCount(doc); + if (docCounts.increment(bucketOrd, docCount) == docCount) { // We calculate the final number of buckets only during the reduce phase. But we still need to // trigger bucket consumer from time to time in order to give it a chance to check available memory and break // the execution if we are running out. To achieve that we are passing 0 as a bucket count. @@ -123,11 +127,11 @@ public final void mergeBuckets(long[] mergeMap, long newNumBuckets) { * merge the actual ordinals and doc ID deltas. 
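+     * Doc counts are tracked in a {@link LongArray} because, with the {@code _doc_count} field, a single
+     * document may contribute more than one count to a bucket.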
*/ public final void rewriteBuckets(long newNumBuckets, LongUnaryOperator mergeMap){ - try (IntArray oldDocCounts = docCounts) { - docCounts = bigArrays.newIntArray(newNumBuckets, true); + try (LongArray oldDocCounts = docCounts) { + docCounts = bigArrays.newLongArray(newNumBuckets, true); docCounts.fill(0, newNumBuckets, 0); for (long i = 0; i < oldDocCounts.size(); i++) { - int docCount = oldDocCounts.get(i); + long docCount = oldDocCounts.get(i); if(docCount == 0) continue; @@ -140,14 +144,14 @@ public final void rewriteBuckets(long newNumBuckets, LongUnaryOperator mergeMap) } } - public IntArray getDocCounts() { + public LongArray getDocCounts() { return docCounts; } /** * Utility method to increment the doc counts of the given bucket (identified by the bucket ordinal) */ - public final void incrementBucketDocCount(long bucketOrd, int inc) { + public final void incrementBucketDocCount(long bucketOrd, long inc) { docCounts = bigArrays.grow(docCounts, bucketOrd + 1); docCounts.increment(bucketOrd, inc); } @@ -155,7 +159,7 @@ public final void incrementBucketDocCount(long bucketOrd, int inc) { /** * Utility method to return the number of documents that fell in the given bucket (identified by the bucket ordinal) */ - public final int bucketDocCount(long bucketOrd) { + public final long bucketDocCount(long bucketOrd) { if (bucketOrd >= docCounts.size()) { // This may happen eg. if no document in the highest buckets is accepted by a sub aggregator. // For example, if there is a long terms agg on 3 terms 1,2,3 with a sub filter aggregator and if no document with 3 as a value @@ -295,7 +299,7 @@ protected final InternalAggregation[] buildAggregationsForFixedBucketCount(l } @FunctionalInterface protected interface BucketBuilderForFixedCount { - B build(int offsetInOwningOrd, int docCount, InternalAggregations subAggregationResults); + B build(int offsetInOwningOrd, long docCount, InternalAggregations subAggregationResults); } /** @@ -366,7 +370,7 @@ protected final InternalAggregation[] buildAggregationsForVariableBuckets(lo } @FunctionalInterface protected interface BucketBuilderForVariable { - B build(long bucketValue, int docCount, InternalAggregations subAggregationResults); + B build(long bucketValue, long docCount, InternalAggregations subAggregationResults); } @FunctionalInterface protected interface ResultBuilderForVariable { @@ -394,7 +398,7 @@ public BucketComparator bucketComparator(String key, SortOrder order) { return super.bucketComparator(key, order); } if (key == null || "doc_count".equals(key)) { - return (lhs, rhs) -> order.reverseMul() * Integer.compare(bucketDocCount(lhs), bucketDocCount(rhs)); + return (lhs, rhs) -> order.reverseMul() * Long.compare(bucketDocCount(lhs), bucketDocCount(rhs)); } throw new IllegalArgumentException("Ordering on a single-bucket aggregation can only be done on its doc_count. 
" + "Either drop the key (a la \"" + name() + "\") or change it to \"doc_count\" (a la \"" + name() + @@ -411,4 +415,10 @@ public static boolean descendsFromGlobalAggregator(Aggregator parent) { return false; } + @Override + protected void preGetSubLeafCollectors(LeafReaderContext ctx) throws IOException { + super.preGetSubLeafCollectors(ctx); + // Set LeafReaderContext to the doc_count provider + docCountProvider.setLeafReaderContext(ctx); + } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/DocCountProvider.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/DocCountProvider.java new file mode 100644 index 0000000000000..9cf25e098cb0f --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/DocCountProvider.java @@ -0,0 +1,49 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.aggregations.bucket; + +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; +import org.elasticsearch.index.mapper.DocCountFieldMapper; + +import java.io.IOException; + +/** + * An implementation of a doc_count provider that reads the value + * of the _doc_count field in the document. If a document does not have a + * _doc_count field the implementation will return 1 as the default value. 
+ */ +public class DocCountProvider { + + private NumericDocValues docCountValues; + + public long getDocCount(int doc) throws IOException { + if (docCountValues != null && docCountValues.advanceExact(doc)) { + return docCountValues.longValue(); + } else { + return 1L; + } + } + + public void setLeafReaderContext(LeafReaderContext ctx) throws IOException { + docCountValues = DocValues.getNumeric(ctx.reader(), DocCountFieldMapper.NAME); + } +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/adjacency/AdjacencyMatrixAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/adjacency/AdjacencyMatrixAggregator.java index afc1bea27c2f3..c12a6f002b418 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/adjacency/AdjacencyMatrixAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/adjacency/AdjacencyMatrixAggregator.java @@ -200,7 +200,7 @@ public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws I List buckets = new ArrayList<>(filters.length); for (int i = 0; i < keys.length; i++) { long bucketOrd = bucketOrd(owningBucketOrds[owningBucketOrdIdx], i); - int docCount = bucketDocCount(bucketOrd); + long docCount = bucketDocCount(bucketOrd); // Empty buckets are not returned because this aggregation will commonly be used under a // a date-histogram where we will look for transactions over time and can expect many // empty buckets. @@ -214,7 +214,7 @@ public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws I for (int i = 0; i < keys.length; i++) { for (int j = i + 1; j < keys.length; j++) { long bucketOrd = bucketOrd(owningBucketOrds[owningBucketOrdIdx], pos); - int docCount = bucketDocCount(bucketOrd); + long docCount = bucketDocCount(bucketOrd); // Empty buckets are not returned due to potential for very sparse matrices if (docCount > 0) { String intersectKey = keys[i] + separator + keys[j]; diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeAggregator.java index 5946507403925..5c21692f3ad5f 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeAggregator.java @@ -155,7 +155,7 @@ public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws I int slot = queue.pop(); CompositeKey key = queue.toCompositeKey(slot); InternalAggregations aggs = subAggsForBuckets[slot]; - int docCount = queue.getDocCount(slot); + long docCount = queue.getDocCount(slot); buckets[queue.size()] = new InternalComposite.InternalBucket(sourceNames, formats, key, reverseMuls, docCount, aggs); } CompositeKey lastBucket = num > 0 ? 
buckets[num-1].getRawKey() : null; @@ -430,7 +430,8 @@ private LeafBucketCollector getFirstPassCollector(RoaringDocIdSet.Builder builde @Override public void collect(int doc, long bucket) throws IOException { try { - if (queue.addIfCompetitive(indexSortPrefix)) { + long docCount = docCountProvider.getDocCount(doc); + if (queue.addIfCompetitive(indexSortPrefix, docCount)) { if (builder != null && lastDoc != doc) { builder.add(doc); lastDoc = doc; diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueue.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueue.java index 70887e0724c8a..35caefb495da4 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueue.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueue.java @@ -25,7 +25,7 @@ import org.elasticsearch.common.lease.Releasable; import org.elasticsearch.common.lease.Releasables; import org.elasticsearch.common.util.BigArrays; -import org.elasticsearch.common.util.IntArray; +import org.elasticsearch.common.util.LongArray; import org.elasticsearch.search.aggregations.LeafBucketCollector; import java.io.IOException; @@ -65,7 +65,7 @@ public int hashCode() { private final Map map; private final SingleDimensionValuesSource[] arrays; - private IntArray docCounts; + private LongArray docCounts; private boolean afterKeyIsSet = false; /** @@ -88,7 +88,7 @@ public int hashCode() { sources[i].setAfter(afterKey.get(i)); } } - this.docCounts = bigArrays.newIntArray(1, false); + this.docCounts = bigArrays.newLongArray(1, false); } @Override @@ -127,19 +127,19 @@ Comparable getUpperValueLeadSource() throws IOException { /** * Returns the document count in slot. */ - int getDocCount(int slot) { + long getDocCount(int slot) { return docCounts.get(slot); } /** * Copies the current value in slot. */ - private void copyCurrent(int slot) { + private void copyCurrent(int slot, long value) { for (int i = 0; i < arrays.length; i++) { arrays[i].copyCurrent(slot); } docCounts = bigArrays.grow(docCounts, slot+1); - docCounts.set(slot, 1); + docCounts.set(slot, value); } /** @@ -248,8 +248,8 @@ LeafBucketCollector getLeafCollector(Comparable forceLeadSourceValue, * Check if the current candidate should be added in the queue. * @return true if the candidate is competitive (added or already in the queue). */ - boolean addIfCompetitive() { - return addIfCompetitive(0); + boolean addIfCompetitive(long inc) { + return addIfCompetitive(0, inc); } @@ -263,12 +263,12 @@ boolean addIfCompetitive() { * * @throws CollectionTerminatedException if the current collection can be terminated early due to index sorting. 
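+     * @param inc the doc count increment to apply to the candidate; callers pass the value of the document's
+     *            {@code _doc_count} field as read by {@code DocCountProvider}, or 1 when the field is absent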
*/ - boolean addIfCompetitive(int indexSortSourcePrefix) { + boolean addIfCompetitive(int indexSortSourcePrefix, long inc) { // checks if the candidate key is competitive Integer topSlot = compareCurrent(); if (topSlot != null) { // this key is already in the top N, skip it - docCounts.increment(topSlot, 1); + docCounts.increment(topSlot, inc); return true; } if (afterKeyIsSet) { @@ -309,7 +309,7 @@ boolean addIfCompetitive(int indexSortSourcePrefix) { newSlot = size(); } // move the candidate key to its new slot - copyCurrent(newSlot); + copyCurrent(newSlot, inc); map.put(new Slot(newSlot), newSlot); add(newSlot); return true; diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/SortedDocsProducer.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/SortedDocsProducer.java index 01e2c0a3ae5b0..7a5cef87b6731 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/SortedDocsProducer.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/SortedDocsProducer.java @@ -27,6 +27,7 @@ import org.apache.lucene.util.DocIdSetBuilder; import org.elasticsearch.common.Nullable; import org.elasticsearch.search.aggregations.LeafBucketCollector; +import org.elasticsearch.search.aggregations.bucket.DocCountProvider; import java.io.IOException; @@ -54,6 +55,8 @@ protected boolean processBucket(CompositeValuesCollectorQueue queue, LeafReaderC Comparable leadSourceBucket, @Nullable DocIdSetBuilder builder) throws IOException { final int[] topCompositeCollected = new int[1]; final boolean[] hasCollected = new boolean[1]; + final DocCountProvider docCountProvider = new DocCountProvider(); + docCountProvider.setLeafReaderContext(context); final LeafBucketCollector queueCollector = new LeafBucketCollector() { int lastDoc = -1; @@ -66,7 +69,8 @@ protected boolean processBucket(CompositeValuesCollectorQueue queue, LeafReaderC @Override public void collect(int doc, long bucket) throws IOException { hasCollected[0] = true; - if (queue.addIfCompetitive()) { + long docCount = docCountProvider.getDocCount(doc); + if (queue.addIfCompetitive(docCount)) { topCompositeCollected[0]++; if (adder != null && doc != lastDoc) { if (remainingBits == 0) { diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/nested/NestedAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/nested/NestedAggregator.java index f481d30d8b0fd..6e40d7217fbcd 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/nested/NestedAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/nested/NestedAggregator.java @@ -107,7 +107,8 @@ public void collect(int parentDoc, long bucket) throws IOException { } @Override - protected void preGetSubLeafCollectors() throws IOException { + protected void preGetSubLeafCollectors(LeafReaderContext ctx) throws IOException { + super.preGetSubLeafCollectors(ctx); processBufferedDocs(); } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 4dad0ca1d4af6..f26dccf900315 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ 
b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -30,7 +30,6 @@ import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.lease.Releasable; import org.elasticsearch.common.lease.Releasables; -import org.elasticsearch.common.util.IntArray; import org.elasticsearch.common.util.LongArray; import org.elasticsearch.common.util.LongHash; import org.elasticsearch.common.xcontent.XContentBuilder; @@ -272,7 +271,7 @@ protected void doClose() { static class LowCardinality extends GlobalOrdinalsStringTermsAggregator { private LongUnaryOperator mapping; - private IntArray segmentDocCounts; + private LongArray segmentDocCounts; LowCardinality( String name, @@ -292,7 +291,7 @@ static class LowCardinality extends GlobalOrdinalsStringTermsAggregator { super(name, factories, resultStrategy, valuesSource, order, format, bucketCountThresholds, null, context, parent, remapGlobalOrds, collectionMode, showTermDocCountError, CardinalityUpperBound.ONE, metadata); assert factories == null || factories.countAggregators() == 0; - this.segmentDocCounts = context.bigArrays().newIntArray(1, true); + this.segmentDocCounts = context.bigArrays().newLongArray(1, true); } @Override @@ -316,7 +315,8 @@ public void collect(int doc, long owningBucketOrd) throws IOException { return; } int ord = singleValues.ordValue(); - segmentDocCounts.increment(ord + 1, 1); + long docCount = docCountProvider.getDocCount(doc); + segmentDocCounts.increment(ord + 1, docCount); } }); } @@ -329,7 +329,8 @@ public void collect(int doc, long owningBucketOrd) throws IOException { return; } for (long segmentOrd = segmentOrds.nextOrd(); segmentOrd != NO_MORE_ORDS; segmentOrd = segmentOrds.nextOrd()) { - segmentDocCounts.increment(segmentOrd + 1, 1); + long docCount = docCountProvider.getDocCount(doc); + segmentDocCounts.increment(segmentOrd + 1, docCount); } } }); @@ -353,7 +354,7 @@ private void mapSegmentCountsToGlobalCounts(LongUnaryOperator mapping) throws IO for (long i = 1; i < segmentDocCounts.size(); i++) { // We use set(...) here, because we need to reset the slow to 0. // segmentDocCounts get reused over the segments and otherwise counts would be too high. - int inc = segmentDocCounts.set(i, 0); + long inc = segmentDocCounts.set(i, 0); if (inc == 0) { continue; } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/DocCountFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/DocCountFieldMapperTests.java new file mode 100644 index 0000000000000..25cf8c064a847 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/mapper/DocCountFieldMapperTests.java @@ -0,0 +1,71 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.index.mapper; + +import org.apache.lucene.index.DocValuesType; +import org.apache.lucene.index.IndexableField; + +import static org.hamcrest.Matchers.containsString; + +public class DocCountFieldMapperTests extends MapperServiceTestCase { + + private static final String CONTENT_TYPE = DocCountFieldMapper.CONTENT_TYPE; + private static final String DOC_COUNT_FIELD = DocCountFieldMapper.NAME; + + /** + * Test parsing field mapping and adding simple field + */ + public void testParseValue() throws Exception { + DocumentMapper mapper = createDocumentMapper(mapping(b -> {})); + ParsedDocument doc = mapper.parse(source(b -> + b.field("foo", 500) + .field(CONTENT_TYPE, 100) + )); + + IndexableField field = doc.rootDoc().getField(DOC_COUNT_FIELD); + assertEquals(100L, field.numericValue()); + assertEquals(DocValuesType.NUMERIC, field.fieldType().docValuesType()); + assertEquals(1, doc.rootDoc().getFields(DOC_COUNT_FIELD).length); + } + + public void testInvalidDocument_NegativeDocCount() throws Exception { + DocumentMapper mapper = createDocumentMapper(mapping(b -> {})); + Exception e = expectThrows(MapperParsingException.class, () -> mapper.parse(source(b -> b.field(CONTENT_TYPE, -100)))); + assertThat(e.getCause().getMessage(), containsString("Field [_doc_count] must be a positive integer")); + } + + public void testInvalidDocument_ZeroDocCount() throws Exception { + DocumentMapper mapper = createDocumentMapper(mapping(b -> {})); + Exception e = expectThrows(MapperParsingException.class, () -> mapper.parse(source(b -> b.field(CONTENT_TYPE, 0)))); + assertThat(e.getCause().getMessage(), containsString("Field [_doc_count] must be a positive integer")); + } + + public void testInvalidDocument_NonNumericDocCount() throws Exception { + DocumentMapper mapper = createDocumentMapper(mapping(b -> {})); + Exception e = expectThrows(MapperParsingException.class, () -> mapper.parse(source(b -> b.field(CONTENT_TYPE, "foo")))); + assertThat(e.getCause().getMessage(), + containsString("Failed to parse object: expecting token of type [VALUE_NUMBER] but found [VALUE_STRING]")); + } + + public void testInvalidDocument_FractionalDocCount() throws Exception { + DocumentMapper mapper = createDocumentMapper(mapping(b -> {})); + Exception e = expectThrows(MapperParsingException.class, () -> mapper.parse(source(b -> b.field(CONTENT_TYPE, 100.23)))); + assertThat(e.getCause().getMessage(), containsString("100.23 cannot be converted to Long without data loss")); + } +} diff --git a/server/src/test/java/org/elasticsearch/index/mapper/DocCountFieldTypeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/DocCountFieldTypeTests.java new file mode 100644 index 0000000000000..f8f5d3c2e810c --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/mapper/DocCountFieldTypeTests.java @@ -0,0 +1,55 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.mapper; + +import org.apache.lucene.search.DocValuesFieldExistsQuery; +import org.elasticsearch.index.query.QueryShardException; + +import java.io.IOException; +import java.util.Arrays; + +public class DocCountFieldTypeTests extends FieldTypeTestCase { + + public void testTermQuery() { + MappedFieldType ft = new DocCountFieldMapper.DocCountFieldType(); + QueryShardException e = expectThrows(QueryShardException.class, + () -> ft.termQuery(10L, randomMockShardContext())); + assertEquals("Field [_doc_count] of type [_doc_count] is not searchable", e.getMessage()); + } + + public void testRangeQuery() { + MappedFieldType ft = new DocCountFieldMapper.DocCountFieldType(); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> ft.rangeQuery(null, null, randomBoolean(), randomBoolean(), null, null, null, null)); + assertEquals("Field [_doc_count] of type [_doc_count] does not support range queries", e.getMessage()); + } + + public void testExistsQuery() { + MappedFieldType ft = new DocCountFieldMapper.DocCountFieldType(); + assertTrue(ft.existsQuery(randomMockShardContext()) instanceof DocValuesFieldExistsQuery); + } + + public void testFetchSourceValue() throws IOException { + MappedFieldType fieldType = new DocCountFieldMapper.DocCountFieldType(); + assertEquals(Arrays.asList(14L), fetchSourceValue(fieldType, 14)); + assertEquals(Arrays.asList(14L), fetchSourceValue(fieldType, "14")); + assertEquals(Arrays.asList(1L), fetchSourceValue(fieldType, "")); + assertEquals(Arrays.asList(1L), fetchSourceValue(fieldType, null)); + } +} diff --git a/server/src/test/java/org/elasticsearch/indices/IndicesModuleTests.java b/server/src/test/java/org/elasticsearch/indices/IndicesModuleTests.java index 6ee6fb0cd3972..7c58d977154af 100644 --- a/server/src/test/java/org/elasticsearch/indices/IndicesModuleTests.java +++ b/server/src/test/java/org/elasticsearch/indices/IndicesModuleTests.java @@ -20,6 +20,7 @@ package org.elasticsearch.indices; import org.elasticsearch.Version; +import org.elasticsearch.index.mapper.DocCountFieldMapper; import org.elasticsearch.index.mapper.FieldNamesFieldMapper; import org.elasticsearch.index.mapper.IdFieldMapper; import org.elasticsearch.index.mapper.IgnoredFieldMapper; @@ -76,7 +77,8 @@ public Map getMetadataMappers() { private static final String[] EXPECTED_METADATA_FIELDS = new String[]{ IgnoredFieldMapper.NAME, IdFieldMapper.NAME, RoutingFieldMapper.NAME, IndexFieldMapper.NAME, SourceFieldMapper.NAME, - NestedPathFieldMapper.NAME, VersionFieldMapper.NAME, SeqNoFieldMapper.NAME, FieldNamesFieldMapper.NAME }; + NestedPathFieldMapper.NAME, VersionFieldMapper.NAME, SeqNoFieldMapper.NAME, DocCountFieldMapper.NAME, + FieldNamesFieldMapper.NAME }; public void testBuiltinMappers() { IndicesModule module = new IndicesModule(Collections.emptyList()); diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/DocCountProviderTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/DocCountProviderTests.java new file mode 100644 index 0000000000000..f7ba8db8a66f2 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/DocCountProviderTests.java @@ -0,0 +1,108 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.aggregations.bucket; + +import org.apache.lucene.document.IntPoint; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; +import org.elasticsearch.common.CheckedConsumer; +import org.elasticsearch.index.mapper.DocCountFieldMapper; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.NumberFieldMapper; +import org.elasticsearch.search.aggregations.AggregatorTestCase; +import org.elasticsearch.search.aggregations.bucket.global.GlobalAggregationBuilder; +import org.elasticsearch.search.aggregations.bucket.global.InternalGlobal; + +import java.io.IOException; +import java.util.List; +import java.util.function.Consumer; + +import static java.util.Collections.singleton; + + +public class DocCountProviderTests extends AggregatorTestCase { + + private static final String DOC_COUNT_FIELD = DocCountFieldMapper.NAME; + private static final String NUMBER_FIELD = "number"; + + public void testDocsWithDocCount() throws IOException { + testAggregation(new MatchAllDocsQuery(), iw -> { + iw.addDocument(List.of( + new NumericDocValuesField(DOC_COUNT_FIELD, 4), + new SortedNumericDocValuesField(NUMBER_FIELD, 1) + )); + iw.addDocument(List.of( + new NumericDocValuesField(DOC_COUNT_FIELD, 5), + new SortedNumericDocValuesField(NUMBER_FIELD, 7) + )); + iw.addDocument(List.of( + // Intentionally omit doc_count field + new SortedNumericDocValuesField(NUMBER_FIELD, 1) + )); + }, global -> { + assertEquals(10, global.getDocCount()); + }); + } + + public void testDocsWithoutDocCount() throws IOException { + testAggregation(new MatchAllDocsQuery(), iw -> { + iw.addDocument(singleton(new SortedNumericDocValuesField(NUMBER_FIELD, 1))); + iw.addDocument(singleton(new SortedNumericDocValuesField(NUMBER_FIELD, 7))); + iw.addDocument(singleton(new SortedNumericDocValuesField(NUMBER_FIELD, 1))); + }, global -> { + assertEquals(3, global.getDocCount()); + }); + } + + public void testQueryFiltering() throws IOException { + testAggregation(IntPoint.newRangeQuery(NUMBER_FIELD, 4, 5), iw -> { + iw.addDocument(List.of( + new NumericDocValuesField(DOC_COUNT_FIELD, 4), + new IntPoint(NUMBER_FIELD, 6) + )); + iw.addDocument(List.of( + new NumericDocValuesField(DOC_COUNT_FIELD, 2), + new IntPoint(NUMBER_FIELD, 5) + )); + iw.addDocument(List.of( + // Intentionally omit doc_count field + new IntPoint(NUMBER_FIELD, 1) + )); + iw.addDocument(List.of( + // Intentionally omit doc_count field + new IntPoint(NUMBER_FIELD, 5) + )); + }, global -> { + assertEquals(3, global.getDocCount()); + }); + } + + private void testAggregation(Query query, + 
CheckedConsumer indexer, + Consumer verify) throws IOException { + GlobalAggregationBuilder aggregationBuilder = new GlobalAggregationBuilder("_name"); + MappedFieldType fieldType = new NumberFieldMapper.NumberFieldType(NUMBER_FIELD, NumberFieldMapper.NumberType.LONG); + MappedFieldType docCountFieldType = new DocCountFieldMapper.DocCountFieldType(); + testCase(aggregationBuilder, query, indexer, verify, fieldType, docCountFieldType); + } +} diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueueTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueueTests.java index 8cb35b6b861b4..57f04d5192529 100644 --- a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueueTests.java +++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/composite/CompositeValuesCollectorQueueTests.java @@ -309,7 +309,7 @@ private void testRandomCase(boolean forceMerge, final LeafBucketCollector leafCollector = new LeafBucketCollector() { @Override public void collect(int doc, long bucket) throws IOException { - queue.addIfCompetitive(indexSortSourcePrefix); + queue.addIfCompetitive(indexSortSourcePrefix, 1); } }; final LeafBucketCollector queueCollector = queue.getLeafCollector(leafReaderContext, leafCollector); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetector.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetector.java index 57288547cf119..7ef9e7ddecd98 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetector.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/extractor/ExtractedFieldsDetector.java @@ -59,7 +59,7 @@ public class ExtractedFieldsDetector { */ private static final List IGNORE_FIELDS = Arrays.asList("_id", "_field_names", "_index", "_parent", "_routing", "_seq_no", "_source", "_type", "_uid", "_version", "_feature", "_ignored", "_nested_path", DestinationIndex.INCREMENTAL_ID, - "_data_stream_timestamp"); + "_data_stream_timestamp", "_doc_count"); private final DataFrameAnalyticsConfig config; private final int docValueFieldsLimit; From a46c8ea96aaefc62b289defa777dcab1e3f2a634 Mon Sep 17 00:00:00 2001 From: Christos Soulios Date: Mon, 2 Nov 2020 22:35:31 +0200 Subject: [PATCH 2/5] Exclude _doc_count from SQL results that made tests fail --- .../java/org/elasticsearch/xpack/ql/index/IndexResolver.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugin/ql/src/main/java/org/elasticsearch/xpack/ql/index/IndexResolver.java b/x-pack/plugin/ql/src/main/java/org/elasticsearch/xpack/ql/index/IndexResolver.java index e86ac79ee3dae..d6b76db19b088 100644 --- a/x-pack/plugin/ql/src/main/java/org/elasticsearch/xpack/ql/index/IndexResolver.java +++ b/x-pack/plugin/ql/src/main/java/org/elasticsearch/xpack/ql/index/IndexResolver.java @@ -158,7 +158,7 @@ public boolean equals(Object obj) { EnumSet.of(Option.ALLOW_NO_INDICES, Option.IGNORE_UNAVAILABLE), EnumSet.of(WildcardStates.OPEN)); - private static final List FIELD_NAMES_BLACKLIST = Arrays.asList("_size"); + private static final List FIELD_NAMES_BLACKLIST = Arrays.asList("_size", "_doc_count"); private static final String UNMAPPED = "unmapped"; private final Client client; From 29c39bbe0348597a624ae20f03e895f2dec79fd4 Mon Sep 17 00:00:00 2001 From: 
Christos Soulios Date: Mon, 2 Nov 2020 22:55:50 +0200 Subject: [PATCH 3/5] Converted list to set, since contains() is used --- .../java/org/elasticsearch/xpack/ql/index/IndexResolver.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/x-pack/plugin/ql/src/main/java/org/elasticsearch/xpack/ql/index/IndexResolver.java b/x-pack/plugin/ql/src/main/java/org/elasticsearch/xpack/ql/index/IndexResolver.java index d6b76db19b088..24328aca88db5 100644 --- a/x-pack/plugin/ql/src/main/java/org/elasticsearch/xpack/ql/index/IndexResolver.java +++ b/x-pack/plugin/ql/src/main/java/org/elasticsearch/xpack/ql/index/IndexResolver.java @@ -24,6 +24,7 @@ import org.elasticsearch.cluster.metadata.AliasMetadata; import org.elasticsearch.common.Strings; import org.elasticsearch.common.collect.ImmutableOpenMap; +import org.elasticsearch.common.util.set.Sets; import org.elasticsearch.index.IndexNotFoundException; import org.elasticsearch.xpack.ql.QlIllegalArgumentException; import org.elasticsearch.xpack.ql.type.ConstantKeywordEsField; @@ -158,7 +159,7 @@ public boolean equals(Object obj) { EnumSet.of(Option.ALLOW_NO_INDICES, Option.IGNORE_UNAVAILABLE), EnumSet.of(WildcardStates.OPEN)); - private static final List FIELD_NAMES_BLACKLIST = Arrays.asList("_size", "_doc_count"); + private static final Set FIELD_NAMES_BLACKLIST = Sets.newHashSet("_size", "_doc_count"); private static final String UNMAPPED = "unmapped"; private final Client client; From ad67789881683bdc6a4c57db6a18575b4f25b9b3 Mon Sep 17 00:00:00 2001 From: Christos Soulios Date: Mon, 2 Nov 2020 22:56:49 +0200 Subject: [PATCH 4/5] Changed family type of doc_count from int to long --- .../org/elasticsearch/index/mapper/DocCountFieldMapper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/DocCountFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/DocCountFieldMapper.java index 611bab88f66fe..9806d4e2d5915 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/DocCountFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/DocCountFieldMapper.java @@ -76,7 +76,7 @@ public String typeName() { @Override public String familyTypeName() { - return NumberFieldMapper.NumberType.INTEGER.typeName(); + return NumberFieldMapper.NumberType.LONG.typeName(); } @Override From b69c5e93d8cc4def7d2ab094b8a2ace576dd4bce Mon Sep 17 00:00:00 2001 From: Christos Soulios Date: Tue, 3 Nov 2020 14:56:20 +0200 Subject: [PATCH 5/5] Added _doc_count to SQL IndexResolverTests --- .../analysis/index/IndexResolverTests.java | 42 ++++++++++--------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/x-pack/plugin/sql/src/test/java/org/elasticsearch/xpack/sql/analysis/index/IndexResolverTests.java b/x-pack/plugin/sql/src/test/java/org/elasticsearch/xpack/sql/analysis/index/IndexResolverTests.java index d69384f32e53f..e899b06f01349 100644 --- a/x-pack/plugin/sql/src/test/java/org/elasticsearch/xpack/sql/analysis/index/IndexResolverTests.java +++ b/x-pack/plugin/sql/src/test/java/org/elasticsearch/xpack/sql/analysis/index/IndexResolverTests.java @@ -114,20 +114,21 @@ public void testMultiLevelObjectMappings() throws Exception { public void testMultiLevelNestedMappings() throws Exception { Map nestedMapping = loadMapping("mapping-nested.json", true); - + IndexResolution resolution = merge(new EsIndex("a", nestedMapping)); assertTrue(resolution.isValid()); assertEqualsMaps(nestedMapping, resolution.get().mapping()); } - + public void 
testMetaFieldsAreIgnored() throws Exception { Map> fieldCaps = new HashMap<>(); addFieldCaps(fieldCaps, "_version", "_version", false, false); addFieldCaps(fieldCaps, "_meta_field", "integer", true, true); addFieldCaps(fieldCaps, "_size", "integer", true, true); + addFieldCaps(fieldCaps, "_doc_count", "long", false, false); addFieldCaps(fieldCaps, "text", "keyword", true, true); - + String wildcard = "*"; IndexResolution resolution = mergedMappings(wildcard, new String[] { "index" }, fieldCaps); assertTrue(resolution.isValid()); @@ -136,10 +137,11 @@ public void testMetaFieldsAreIgnored() throws Exception { assertEquals(wildcard, esIndex.name()); assertNull(esIndex.mapping().get("_version")); assertNull(esIndex.mapping().get("_size")); + assertNull(esIndex.mapping().get("_doc_count")); assertEquals(INTEGER, esIndex.mapping().get("_meta_field").getDataType()); assertEquals(KEYWORD, esIndex.mapping().get("text").getDataType()); } - + public void testFlattenedHiddenSubfield() throws Exception { Map> fieldCaps = new HashMap<>(); addFieldCaps(fieldCaps, "some_field", "flattened", false, false); @@ -150,7 +152,7 @@ public void testFlattenedHiddenSubfield() throws Exception { addFieldCaps(fieldCaps, "nested_field.sub_field", "flattened", true, true); addFieldCaps(fieldCaps, "nested_field.sub_field._keyed", "flattened", true, true); addFieldCaps(fieldCaps, "text", "keyword", true, true); - + String wildcard = "*"; IndexResolution resolution = mergedMappings(wildcard, new String[] { "index" }, fieldCaps); assertTrue(resolution.isValid()); @@ -167,7 +169,7 @@ public void testFlattenedHiddenSubfield() throws Exception { assertEquals(OBJECT, esIndex.mapping().get("another_field").getDataType()); assertEquals(KEYWORD, esIndex.mapping().get("another_field").getProperties().get("_keyed").getDataType()); } - + public void testPropagateUnsupportedTypeToSubFields() throws Exception { // generate a field type having the name of the format "foobar43" String esFieldType = randomAlphaOfLengthBetween(5, 10) + randomIntBetween(-100, 100); @@ -177,7 +179,7 @@ public void testPropagateUnsupportedTypeToSubFields() throws Exception { addFieldCaps(fieldCaps, "a.b.c", "object", true, false); addFieldCaps(fieldCaps, "a.b.c.d", "keyword", true, false); addFieldCaps(fieldCaps, "a.b.c.e", "foo", true, true); - + String wildcard = "*"; IndexResolution resolution = mergedMappings(wildcard, new String[] { "index" }, fieldCaps); assertTrue(resolution.isValid()); @@ -192,11 +194,11 @@ public void testPropagateUnsupportedTypeToSubFields() throws Exception { assertEquals(UNSUPPORTED, esIndex.mapping().get("a").getProperties().get("b").getProperties().get("c") .getProperties().get("e").getDataType()); } - + public void testRandomMappingFieldTypeMappedAsUnsupported() throws Exception { // generate a field type having the name of the format "foobar43" String esFieldType = randomAlphaOfLengthBetween(5, 10) + randomIntBetween(-100, 100); - + Map> fieldCaps = new HashMap<>(); addFieldCaps(fieldCaps, "some_field", esFieldType, false, false); addFieldCaps(fieldCaps, "another_field", "object", true, false); @@ -208,7 +210,7 @@ public void testRandomMappingFieldTypeMappedAsUnsupported() throws Exception { // even if this is of a supported type, because it belongs to an UNSUPPORTED type parent, it should also be UNSUPPORTED addFieldCaps(fieldCaps, "nested_field.sub_field2.bar", "keyword", true, true); addFieldCaps(fieldCaps, "text", "keyword", true, true); - + String wildcard = "*"; IndexResolution resolution = mergedMappings(wildcard, new 
String[] { "index" }, fieldCaps); assertTrue(resolution.isValid()); @@ -299,19 +301,19 @@ public void testConstantKeywordTwoIndicesScenario() throws Exception { assertTrue(props.containsKey("name")); assertTrue(props.containsKey("wheels_count")); assertTrue(props.containsKey("motor")); - + esField = props.get("cycle_type"); assertTrue(esField instanceof KeywordEsField); assertEquals(KEYWORD, ((KeywordEsField) esField).getDataType()); - + esField = props.get("name"); assertTrue(esField instanceof TextEsField); assertEquals(TEXT, ((TextEsField) esField).getDataType()); - + esField = props.get("wheels_count"); assertTrue(esField instanceof ConstantKeywordEsField); assertEquals(CONSTANT_KEYWORD, ((ConstantKeywordEsField) esField).getDataType()); - + esField = props.get("motor"); assertTrue(esField instanceof KeywordEsField); assertEquals(KEYWORD, ((KeywordEsField) esField).getDataType()); @@ -347,7 +349,7 @@ public void testSeparateIncompatibleTypes() throws Exception { public void testMultipleCompatibleIndicesWithDifferentFields() { int indicesCount = randomIntBetween(2, 15); EsIndex[] expectedIndices = new EsIndex[indicesCount]; - + // each index will have one field with different name than all others for (int i = 0; i < indicesCount; i++) { Map mapping = new HashMap<>(1); @@ -355,7 +357,7 @@ public void testMultipleCompatibleIndicesWithDifferentFields() { mapping.put(fieldName, new KeywordEsField(fieldName)); expectedIndices[i] = new EsIndex("index" + (i + 1), mapping); } - + List actualIndices = separate(expectedIndices); assertEquals(indicesCount, actualIndices.size()); for (int i = 0; i < indicesCount; i++) { @@ -405,7 +407,7 @@ public static Map> fromMappings(EsIndex.. return merged; } - + private static void addFieldCaps(String parent, EsField field, String indexName, Map> merged) { String fieldName = parent != null ? parent + "." + field.getName() : field.getName(); Map map = merged.get(fieldName); @@ -471,19 +473,19 @@ private static void assertEqualsMaps(Map left, Map right) { assertEquals(format("Key [{}] has different values", entry.getKey()), entry.getValue(), rv); } } - + private void addFieldCaps(Map> fieldCaps, String name, String type, boolean isSearchable, boolean isAggregatable) { Map cap = new HashMap<>(); cap.put(type, new FieldCapabilities(name, type, isSearchable, isAggregatable, null, null, null, Collections.emptyMap())); fieldCaps.put(name, cap); } - + private static IndexResolution mergedMappings(String indexPattern, String[] indexNames, Map> fieldCaps) { return IndexResolver.mergedMappings(SqlDataTypeRegistry.INSTANCE, indexPattern, indexNames, fieldCaps); } - + private static List separateMappings(String javaRegex, String[] indexNames, Map> fieldCaps) { return IndexResolver.separateMappings(SqlDataTypeRegistry.INSTANCE, javaRegex, indexNames, fieldCaps, null);