diff --git a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalMappedSignificantTerms.java b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalMappedSignificantTerms.java
index da346a8a1e43f..e1bd4defd3e08 100644
--- a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalMappedSignificantTerms.java
+++ b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalMappedSignificantTerms.java
@@ -128,6 +128,7 @@ protected int doHashCode() {
     @Override
     public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException {
         builder.field(CommonFields.DOC_COUNT.getPreferredName(), subsetSize);
+        builder.field(BG_COUNT, supersetSize);
         builder.startArray(CommonFields.BUCKETS.getPreferredName());
         for (Bucket bucket : buckets) {
             //There is a condition (presumably when only one shard has a bucket?) where reduce is not called
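The one-line serializer change above is the heart of this patch: a mapped significant_terms aggregation now reports the background set size (supersetSize) as "bg_count" next to the foreground "doc_count". Bucket-level "bg_count" (the per-term superset document frequency) was already emitted; the new field is the aggregation-level total. A minimal sketch of the resulting shape, assuming the 5.x-era XContentBuilder API and counts borrowed from the significant_terms docs changed later in this patch (not part of the patch itself):

import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.json.JsonXContent;

public class BgCountShapeSketch {
    public static void main(String[] args) throws Exception {
        XContentBuilder builder = JsonXContent.contentBuilder();
        builder.startObject();
        builder.field("doc_count", 47347);   // subsetSize: docs in the foreground (query) set
        builder.field("bg_count", 5064554);  // supersetSize: docs in the background set, newly emitted
        builder.startArray("buckets").endArray(); // per-term buckets elided here
        builder.endObject();
        // Renders: {"doc_count":47347,"bg_count":5064554,"buckets":[]}
        System.out.println(builder.string()); // builder.string() in the 5.x line; later Strings.toString(builder)
    }
}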
diff --git a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/ParsedSignificantLongTerms.java b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/ParsedSignificantLongTerms.java
index 2d512632f50c2..9592d80c77625 100644
--- a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/ParsedSignificantLongTerms.java
+++ b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/ParsedSignificantLongTerms.java
@@ -39,9 +39,7 @@ public String getType() {
     }

     public static ParsedSignificantLongTerms fromXContent(XContentParser parser, String name) throws IOException {
-        ParsedSignificantLongTerms aggregation = PARSER.parse(parser, null);
-        aggregation.setName(name);
-        return aggregation;
+        return parseSignificantTermsXContent(() -> PARSER.parse(parser, null), name);
     }

     public static class ParsedBucket extends ParsedSignificantTerms.ParsedBucket {
diff --git a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/ParsedSignificantStringTerms.java b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/ParsedSignificantStringTerms.java
index fb1c7728e0c12..008a5a28e5d39 100644
--- a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/ParsedSignificantStringTerms.java
+++ b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/ParsedSignificantStringTerms.java
@@ -40,9 +40,7 @@ public String getType() {
     }

     public static ParsedSignificantStringTerms fromXContent(XContentParser parser, String name) throws IOException {
-        ParsedSignificantStringTerms aggregation = PARSER.parse(parser, null);
-        aggregation.setName(name);
-        return aggregation;
+        return parseSignificantTermsXContent(() -> PARSER.parse(parser, null), name);
    }

     public static class ParsedBucket extends ParsedSignificantTerms.ParsedBucket {
diff --git a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/ParsedSignificantTerms.java b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/ParsedSignificantTerms.java
index 56be0aa60711f..8991ca0993295 100644
--- a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/ParsedSignificantTerms.java
+++ b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/ParsedSignificantTerms.java
@@ -21,6 +21,8 @@

 import org.elasticsearch.common.CheckedBiConsumer;
 import org.elasticsearch.common.CheckedFunction;
+import org.elasticsearch.common.CheckedSupplier;
+import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.xcontent.ObjectParser;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser;
@@ -42,11 +44,16 @@ public abstract class ParsedSignificantTerms extends ParsedMultiBucketAggregation<ParsedSignificantTerms.ParsedBucket>
     private Map<String, ParsedBucket> bucketMap;

     protected long subsetSize;
+    protected long supersetSize;

     protected long getSubsetSize() {
         return subsetSize;
     }

+    protected long getSupersetSize() {
+        return supersetSize;
+    }
+
     @Override
     public List<? extends SignificantTerms.Bucket> getBuckets() {
         return buckets;
@@ -68,6 +75,7 @@ public Iterator<SignificantTerms.Bucket> iterator() {
     @Override
     protected XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException {
         builder.field(CommonFields.DOC_COUNT.getPreferredName(), subsetSize);
+        builder.field(InternalMappedSignificantTerms.BG_COUNT, supersetSize);
         builder.startArray(CommonFields.BUCKETS.getPreferredName());
         for (SignificantTerms.Bucket bucket : buckets) {
             bucket.toXContent(builder, params);
@@ -76,16 +84,31 @@ protected XContentBuilder doXContentBody(XContentBuilder builder, Params params)
         return builder;
     }

+    static <T extends ParsedSignificantTerms> T parseSignificantTermsXContent(final CheckedSupplier<T, IOException> aggregationSupplier,
+                                                                              final String name) throws IOException {
+        T aggregation = aggregationSupplier.get();
+        aggregation.setName(name);
+        for (ParsedBucket bucket : aggregation.buckets) {
+            bucket.subsetSize = aggregation.subsetSize;
+            bucket.supersetSize = aggregation.supersetSize;
+        }
+        return aggregation;
+    }
+
     static void declareParsedSignificantTermsFields(final ObjectParser<? extends ParsedSignificantTerms, Void> objectParser,
             final CheckedFunction<XContentParser, ParsedBucket, IOException> bucketParser) {
         declareMultiBucketAggregationFields(objectParser, bucketParser::apply, bucketParser::apply);
         objectParser.declareLong((parsedTerms, value) -> parsedTerms.subsetSize = value , CommonFields.DOC_COUNT);
+        objectParser.declareLong((parsedTerms, value) -> parsedTerms.supersetSize = value ,
+                new ParseField(InternalMappedSignificantTerms.BG_COUNT));
     }

     public abstract static class ParsedBucket extends ParsedMultiBucketAggregation.ParsedBucket implements SignificantTerms.Bucket {

         protected long subsetDf;
+        protected long subsetSize;
         protected long supersetDf;
+        protected long supersetSize;
         protected double score;

         @Override
@@ -110,12 +133,12 @@ public double getSignificanceScore() {

         @Override
         public long getSupersetSize() {
-            throw new UnsupportedOperationException();
+            return supersetSize;
         }

         @Override
         public long getSubsetSize() {
-            throw new UnsupportedOperationException();
+            return subsetSize;
         }

         @Override
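With ParsedBucket now returning real sizes instead of throwing UnsupportedOperationException, a consumer of the parsed response holds all four inputs a significance heuristic needs. A rough sketch of a JLH-style score computed from those getters, shown only to illustrate why the back-fill in parseSignificantTermsXContent matters; the authoritative implementation is JLHScore in the heuristics package, not this code:

import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms;

class SignificanceSketch {
    // JLH multiplies the absolute change in term frequency (which favors common
    // terms) by the relative change (which favors rare terms). Before this patch
    // it could not be recomputed from a parsed bucket: both size getters threw.
    static double jlhLikeScore(SignificantTerms.Bucket bucket) {
        double foregroundPercent = (double) bucket.getSubsetDf() / bucket.getSubsetSize();
        double backgroundPercent = (double) bucket.getSupersetDf() / bucket.getSupersetSize();
        if (foregroundPercent <= backgroundPercent) {
            return 0; // not more frequent in the foreground, so not significant
        }
        return (foregroundPercent - backgroundPercent) * (foregroundPercent / backgroundPercent);
    }
}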
diff --git a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTerms.java b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTerms.java
index 8c0da8b890e13..61cb4a9ca0a2d 100644
--- a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTerms.java
+++ b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTerms.java
@@ -29,17 +29,39 @@ public interface SignificantTerms extends MultiBucketsAggregation, Iterable<SignificantTerms.Bucket> {
diff --git a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTermsTestCase.java b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTermsTestCase.java
--- a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTermsTestCase.java
+++ b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTermsTestCase.java
 public abstract class InternalSignificantTermsTestCase extends InternalMultiBucketAggregationTestCase<InternalSignificantTerms<?, ?>> {

+    private SignificanceHeuristic significanceHeuristic;
+
+    @Override
+    public void setUp() throws Exception {
+        super.setUp();
+        significanceHeuristic = randomSignificanceHeuristic();
+    }
+
+    @Override
+    protected final InternalSignificantTerms<?, ?> createTestInstance(String name,
+                                                                      List<PipelineAggregator> pipelineAggregators,
+                                                                      Map<String, Object> metaData,
+                                                                      InternalAggregations aggregations) {
+        final int requiredSize = randomIntBetween(1, 5);
+        final int numBuckets = randomInt(requiredSize + 2);
+
+        long subsetSize = 0;
+        long supersetSize = 0;
+
+        int[] subsetDfs = new int[numBuckets];
+        int[] supersetDfs = new int[numBuckets];
+
+        for (int i = 0; i < numBuckets; ++i) {
+            int subsetDf = randomIntBetween(1, 10);
+            subsetDfs[i] = subsetDf;
+
+            int supersetDf = randomIntBetween(subsetDf, 20);
+            supersetDfs[i] = supersetDf;
+
+            subsetSize += subsetDf;
+            supersetSize += supersetDf;
+        }
+        return createTestInstance(name, pipelineAggregators, metaData, aggregations, requiredSize, numBuckets, subsetSize, subsetDfs,
+                supersetSize, supersetDfs, significanceHeuristic);
+    }
+
+    protected abstract InternalSignificantTerms<?, ?> createTestInstance(String name,
+                                                                         List<PipelineAggregator> pipelineAggregators,
+                                                                         Map<String, Object> metaData,
+                                                                         InternalAggregations aggregations,
+                                                                         int requiredSize, int numBuckets,
+                                                                         long subsetSize, int[] subsetDfs,
+                                                                         long supersetSize, int[] supersetDfs,
+                                                                         SignificanceHeuristic significanceHeuristic);
+
     @Override
     protected InternalSignificantTerms<?, ?> createUnmappedInstance(String name,
                                                                     List<PipelineAggregator> pipelineAggregators,
@@ -72,6 +123,7 @@ protected void assertMultiBucketsAggregation(MultiBucketsAggregation expected, M
         InternalSignificantTerms<?, ?> expectedSigTerms = (InternalSignificantTerms<?, ?>) expected;
         ParsedSignificantTerms actualSigTerms = (ParsedSignificantTerms) actual;
         assertEquals(expectedSigTerms.getSubsetSize(), actualSigTerms.getSubsetSize());
+        assertEquals(expectedSigTerms.getSupersetSize(), actualSigTerms.getSupersetSize());

         for (SignificantTerms.Bucket bucket : (SignificantTerms) expected) {
             String key = bucket.getKeyAsString();
@@ -91,14 +143,22 @@ protected void assertBucket(MultiBucketsAggregation.Bucket expected, MultiBucket
         assertEquals(expectedSigTerm.getSignificanceScore(), actualSigTerm.getSignificanceScore(), 0.0);
         assertEquals(expectedSigTerm.getSubsetDf(), actualSigTerm.getSubsetDf());
+        assertEquals(expectedSigTerm.getDocCount(), actualSigTerm.getSubsetDf());
         assertEquals(expectedSigTerm.getSupersetDf(), actualSigTerm.getSupersetDf());
-
-        expectThrows(UnsupportedOperationException.class, actualSigTerm::getSubsetSize);
-        expectThrows(UnsupportedOperationException.class, actualSigTerm::getSupersetSize);
+        assertEquals(expectedSigTerm.getSubsetSize(), actualSigTerm.getSubsetSize());
+        assertEquals(expectedSigTerm.getSupersetSize(), actualSigTerm.getSupersetSize());
     }

     private static Map<Object, Long> toCounts(Stream<? extends SignificantTerms.Bucket> buckets, Function<SignificantTerms.Bucket, Long> fn) {
         return buckets.collect(Collectors.toMap(SignificantTerms.Bucket::getKey, fn, Long::sum));
     }
+
+    private static SignificanceHeuristic randomSignificanceHeuristic() {
+        return randomFrom(
+                new JLHScore(),
+                new MutualInformation(randomBoolean(), randomBoolean()),
+                new GND(randomBoolean()),
+                new ChiSquare(randomBoolean(), randomBoolean()));
+    }
 }
diff --git a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsTests.java b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsTests.java
index 793c6aec5c31a..f41dc80c3eded 100644
--- a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsTests.java
+++ b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsTests.java
@@ -23,10 +23,6 @@
 import org.elasticsearch.search.DocValueFormat;
 import org.elasticsearch.search.aggregations.InternalAggregations;
 import org.elasticsearch.search.aggregations.ParsedMultiBucketAggregation;
-import org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSquare;
-import org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND;
-import org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore;
-import org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation;
 import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic;
 import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator;

@@ -38,13 +34,11 @@ public class SignificantLongTermsTests extends InternalSignificantTermsTestCase {

-    private SignificanceHeuristic significanceHeuristic;
     private DocValueFormat format;

     @Override
     public void setUp() throws Exception {
         super.setUp();
-        significanceHeuristic = randomSignificanceHeuristic();
         format = randomNumericDocValueFormat();
     }

@@ -52,30 +46,20 @@ public void setUp() throws Exception {
     protected InternalSignificantTerms<?, ?> createTestInstance(String name,
                                                                 List<PipelineAggregator> pipelineAggregators,
                                                                 Map<String, Object> metaData,
-                                                                InternalAggregations aggregations) {
-        int requiredSize = randomIntBetween(1, 5);
-        int shardSize = requiredSize + 2;
-        final int numBuckets = randomInt(shardSize);
-
-        long globalSubsetSize = 0;
-        long globalSupersetSize = 0;
+                                                                InternalAggregations aggs,
+                                                                int requiredSize, int numBuckets,
+                                                                long subsetSize, int[] subsetDfs,
+                                                                long supersetSize, int[] supersetDfs,
+                                                                SignificanceHeuristic significanceHeuristic) {
         List<SignificantLongTerms.Bucket> buckets = new ArrayList<>(numBuckets);
         Set<Long> terms = new HashSet<>();
         for (int i = 0; i < numBuckets; ++i) {
             long term = randomValueOtherThanMany(l -> terms.add(l) == false, random()::nextLong);
-
-            int subsetDf = randomIntBetween(1, 10);
-            int supersetDf = randomIntBetween(subsetDf, 20);
-            int supersetSize = randomIntBetween(supersetDf, 30);
-
-            globalSubsetSize += subsetDf;
-            globalSupersetSize += supersetSize;
-
-            buckets.add(new SignificantLongTerms.Bucket(subsetDf, subsetDf, supersetDf, supersetSize, term, aggregations, format));
+            buckets.add(new SignificantLongTerms.Bucket(subsetDfs[i], subsetSize, supersetDfs[i], supersetSize, term, aggs, format));
         }
-        return new SignificantLongTerms(name, requiredSize, 1L, pipelineAggregators, metaData, format, globalSubsetSize,
-                globalSupersetSize, significanceHeuristic, buckets);
+        return new SignificantLongTerms(name, requiredSize, 1L, pipelineAggregators, metaData, format, subsetSize,
+                supersetSize, significanceHeuristic, buckets);
     }

     @Override
@@ -87,12 +71,4 @@ protected InternalSignificantTerms createTestInstance(String name,
     protected Class<? extends ParsedMultiBucketAggregation> implementationClass() {
         return ParsedSignificantLongTerms.class;
     }
-
-    private static SignificanceHeuristic randomSignificanceHeuristic() {
-        return randomFrom(
-                new JLHScore(),
-                new MutualInformation(randomBoolean(), randomBoolean()),
-                new GND(randomBoolean()),
-                new ChiSquare(randomBoolean(), randomBoolean()));
-    }
 }
diff --git a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsTests.java b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsTests.java
index 762472e4be56a..e9c716751f704 100644
--- a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsTests.java
+++ b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsTests.java
@@ -24,10 +24,6 @@
 import org.elasticsearch.search.DocValueFormat;
 import org.elasticsearch.search.aggregations.InternalAggregations;
 import org.elasticsearch.search.aggregations.ParsedMultiBucketAggregation;
-import org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSquare;
-import org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND;
-import org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore;
-import org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation;
 import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic;
 import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator;

@@ -39,43 +35,24 @@ public class SignificantStringTermsTests extends InternalSignificantTermsTestCase {

-    private SignificanceHeuristic significanceHeuristic;
-
-    @Override
-    public void setUp() throws Exception {
-        super.setUp();
-        significanceHeuristic = randomSignificanceHeuristic();
-    }
-
     @Override
     protected InternalSignificantTerms<?, ?> createTestInstance(String name,
                                                                 List<PipelineAggregator> pipelineAggregators,
                                                                 Map<String, Object> metaData,
-                                                                InternalAggregations aggregations) {
+                                                                InternalAggregations aggs,
+                                                                int requiredSize, int numBuckets,
+                                                                long subsetSize, int[] subsetDfs,
+                                                                long supersetSize, int[] supersetDfs,
+                                                                SignificanceHeuristic significanceHeuristic) {
         DocValueFormat format = DocValueFormat.RAW;
-        int requiredSize = randomIntBetween(1, 5);
-        int shardSize = requiredSize + 2;
-        final int numBuckets = randomInt(shardSize);
-
-        long globalSubsetSize = 0;
-        long globalSupersetSize = 0;
-
         List<SignificantStringTerms.Bucket> buckets = new ArrayList<>(numBuckets);
         Set<BytesRef> terms = new HashSet<>();
         for (int i = 0; i < numBuckets; ++i) {
             BytesRef term = randomValueOtherThanMany(b -> terms.add(b) == false, () -> new BytesRef(randomAlphaOfLength(10)));
-
-            int subsetDf = randomIntBetween(1, 10);
-            int supersetDf = randomIntBetween(subsetDf, 20);
-            int supersetSize = randomIntBetween(supersetDf, 30);
-
-            globalSubsetSize += subsetDf;
-            globalSupersetSize += supersetSize;
-
-            buckets.add(new SignificantStringTerms.Bucket(term, subsetDf, subsetDf, supersetDf, supersetSize, aggregations, format));
+            buckets.add(new SignificantStringTerms.Bucket(term, subsetDfs[i], subsetSize, supersetDfs[i], supersetSize, aggs, format));
         }
-        return new SignificantStringTerms(name, requiredSize, 1L, pipelineAggregators, metaData, format, globalSubsetSize,
-                globalSupersetSize, significanceHeuristic, buckets);
+        return new SignificantStringTerms(name, requiredSize, 1L, pipelineAggregators, metaData, format, subsetSize,
+                supersetSize, significanceHeuristic, buckets);
     }

     @Override
@@ -87,12 +64,4 @@ protected InternalSignificantTerms createTestInstance(String name,
     protected Class<? extends ParsedMultiBucketAggregation> implementationClass() {
         return ParsedSignificantStringTerms.class;
     }
-
-    private static SignificanceHeuristic randomSignificanceHeuristic() {
-        return randomFrom(
-                new JLHScore(),
-                new MutualInformation(randomBoolean(), randomBoolean()),
-                new GND(randomBoolean()),
-                new ChiSquare(randomBoolean(), randomBoolean()));
-    }
 }
"aggregations": { "my_unbiased_sample": { - "doc_count": 1000,<1> + "doc_count": 151,<1> "keywords": {<2> - "doc_count": 1000, + "doc_count": 151, + "bg_count": 650, "buckets": [ { "key": "kibana", @@ -83,10 +84,9 @@ Response: } -------------------------------------------------- // TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/] -// TESTRESPONSE[s/1000/151/] // TESTRESPONSE[s/2.213/$body.aggregations.my_unbiased_sample.keywords.buckets.0.score/] -<1> 1000 documents were sampled in total because we asked for a maximum of 200 from an index with 5 shards. The cost of performing the nested significant_terms aggregation was therefore limited rather than unbounded. +<1> 151 documents were sampled in total. <2> The results of the significant_terms aggregation are not skewed by any single author's quirks because we asked for a maximum of one post from any one author in our sample. ==== Scripted example: @@ -136,9 +136,10 @@ Response: ... "aggregations": { "my_unbiased_sample": { - "doc_count": 1000, + "doc_count": 6, "keywords": { - "doc_count": 1000, + "doc_count": 6, + "bg_count": 650, "buckets": [ { "key": "logstash", @@ -159,7 +160,6 @@ Response: } -------------------------------------------------- // TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/] -// TESTRESPONSE[s/1000/6/] // TESTRESPONSE[s/2.213/$body.aggregations.my_unbiased_sample.keywords.buckets.0.score/] // TESTRESPONSE[s/1.34/$body.aggregations.my_unbiased_sample.keywords.buckets.1.score/] diff --git a/docs/reference/aggregations/bucket/sampler-aggregation.asciidoc b/docs/reference/aggregations/bucket/sampler-aggregation.asciidoc index 693df5e814423..4957901920e23 100644 --- a/docs/reference/aggregations/bucket/sampler-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/sampler-aggregation.asciidoc @@ -54,9 +54,10 @@ Response: ... "aggregations": { "sample": { - "doc_count": 1000,<1> + "doc_count": 200,<1> "keywords": { - "doc_count": 1000, + "doc_count": 200, + "bg_count": 650, "buckets": [ { "key": "elasticsearch", @@ -77,9 +78,9 @@ Response: } -------------------------------------------------- // TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/] -// TESTRESPONSE[s/1000/200/] -<1> 1000 documents were sampled in total because we asked for a maximum of 200 from an index with 5 shards. The cost of performing the nested significant_terms aggregation was therefore limited rather than unbounded. +<1> 200 documents were sampled in total. The cost of performing the nested significant_terms aggregation was +therefore limited rather than unbounded. Without the `sampler` aggregation the request query considers the full "long tail" of low-quality matches and therefore identifies @@ -117,13 +118,14 @@ Response: ... 
"aggregations": { "low_quality_keywords": { - "doc_count": 1000, + "doc_count": 600, + "bg_count": 650, "buckets": [ { "key": "angular", "doc_count": 200, "score": 0.02777, - "bg_count": 200 + "bg_count": 200 }, { "key": "jquery", @@ -143,7 +145,6 @@ Response: } -------------------------------------------------- // TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/] -// TESTRESPONSE[s/1000/600/] // TESTRESPONSE[s/0.02777/$body.aggregations.low_quality_keywords.buckets.0.score/] // TESTRESPONSE[s/0.0069/$body.aggregations.low_quality_keywords.buckets.2.score/] diff --git a/docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc b/docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc index 172a528fcb316..a927a2165ecaa 100644 --- a/docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc @@ -48,6 +48,7 @@ Response: "aggregations" : { "significantCrimeTypes" : { "doc_count": 47347, + "bg_count": 5064554, "buckets" : [ { "key": "Bicycle theft", @@ -111,6 +112,7 @@ Response: "doc_count": 894038, "significantCrimeTypes": { "doc_count": 894038, + "bg_count": 5064554, "buckets": [ { "key": "Robbery", @@ -127,6 +129,7 @@ Response: "doc_count": 47347, "significantCrimeTypes": { "doc_count": 47347, + "bg_count": 5064554, "buckets": [ { "key": "Bicycle theft",