Commit 79c7a57

matarrese authored and jtibshirani committed
Use the breadth first collection mode for significant terms aggs. (#29042)
This helps avoid memory issues when computing deep sub-aggregations. Because it should be rare to use sub-aggregations with significant terms, we opted to always choose breadth first as opposed to exposing a `collect_mode` option. Closes #28652.
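
For context on the tradeoff: in depth_first mode, sub-aggregation state is built for every candidate bucket during the single collection pass, so deep sub-aggregations under a high-cardinality significant terms aggregation can exhaust memory. In breadth_first mode the top buckets are selected first, and the matching documents are replayed only for the survivors. Below is a minimal, self-contained sketch of that two-phase idea; it is illustrative only, with invented data and names, not Elasticsearch's implementation.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

// Illustrative sketch of two-phase ("breadth first") collection. Phase 1 only
// counts docs per term, so memory stays proportional to the number of terms.
// Phase 2 replays the docs and computes the sub-aggregation (here, a sum)
// exclusively for the buckets that survived pruning.
public class BreadthFirstSketch {
    public static void main(String[] args) {
        String[] terms = {"a", "b", "a", "c", "b", "a"};   // one term per doc
        long[] values = {1, 2, 3, 4, 5, 6};                // sub-agg input per doc
        int topN = 2;

        // Phase 1: doc counts per term, no sub-aggregation state yet.
        Map<String, Long> counts = new HashMap<>();
        for (String term : terms) {
            counts.merge(term, 1L, Long::sum);
        }

        // Prune to the top N buckets by doc count.
        List<String> byCount = new ArrayList<>(counts.keySet());
        byCount.sort(Comparator.comparing(counts::get).reversed());
        Set<String> surviving = new HashSet<>(byCount.subList(0, Math.min(topN, byCount.size())));

        // Phase 2: deferred sub-aggregation, computed only for surviving buckets.
        Map<String, Long> sums = new HashMap<>();
        for (int doc = 0; doc < terms.length; doc++) {
            if (surviving.contains(terms[doc])) {
                sums.merge(terms[doc], values[doc], Long::sum);
            }
        }
        System.out.println("doc counts: " + counts + ", deferred sums: " + sums);
    }
}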
1 parent 0f49684 commit 79c7a57

File tree

6 files changed (+71, -8 lines)


docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc

Lines changed: 6 additions & 0 deletions

@@ -542,6 +542,12 @@ It is possible (although rarely required) to filter the values for which buckets
 `exclude` parameters which are based on a regular expression string or arrays of exact terms. This functionality mirrors the features
 described in the <<search-aggregations-bucket-terms-aggregation,terms aggregation>> documentation.
 
+==== Collect mode
+
+To avoid memory issues, the `significant_terms` aggregation always computes child aggregations in `breadth_first` mode.
+A description of the different collection modes can be found in the
+<<search-aggregations-bucket-terms-aggregation-collect, terms aggregation>> documentation.
+
 ==== Execution hint
 
 There are different mechanisms by which terms aggregations can be executed:
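
Nothing changes in how requests are written; the mode is applied automatically, and no `collect_mode` parameter is exposed on significant terms. As a usage sketch modeled on the integration test added in this commit (the index name, field names, and the client variable are placeholders):

import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.aggregations.AggregationBuilder;
import org.elasticsearch.search.aggregations.AggregationBuilders;

public class SignificantTermsUsageSketch {
    // A significant_terms aggregation with a child terms aggregation; the child
    // is now always collected in breadth_first mode with no extra configuration.
    static SearchResponse run(Client client) {
        AggregationBuilder agg = AggregationBuilders.significantTerms("significant_terms")
                .field("text")                                 // placeholder field name
                .subAggregation(AggregationBuilders.terms("class").field("class"));
        return client.prepareSearch("test")                    // placeholder index name
                .setQuery(QueryBuilders.termsQuery("text", "a", "b"))
                .addAggregation(agg)
                .get();
    }
}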

docs/reference/aggregations/bucket/terms-aggregation.asciidoc

Lines changed: 1 addition & 0 deletions

@@ -775,6 +775,7 @@ fields, then use `copy_to` in your mapping to create a new dedicated field at
 index time which contains the values from both fields. You can aggregate on
 this single field, which will benefit from the global ordinals optimization.
 
+[[search-aggregations-bucket-terms-aggregation-collect]]
 ==== Collect mode
 
 Deferring calculation of child aggregations

server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java

Lines changed: 9 additions & 2 deletions

@@ -65,7 +65,7 @@ public GlobalOrdinalsSignificantTermsAggregator(String name,
                                                     List<PipelineAggregator> pipelineAggregators,
                                                     Map<String, Object> metaData) throws IOException {
         super(name, factories, valuesSource, null, format, bucketCountThresholds, includeExclude, context, parent,
-                forceRemapGlobalOrds, SubAggCollectionMode.DEPTH_FIRST, false, pipelineAggregators, metaData);
+                forceRemapGlobalOrds, SubAggCollectionMode.BREADTH_FIRST, false, pipelineAggregators, metaData);
         this.significanceHeuristic = significanceHeuristic;
         this.termsAggFactory = termsAggFactory;
         this.numCollectedDocs = 0;
@@ -146,12 +146,19 @@ public SignificantStringTerms buildAggregation(long owningBucketOrdinal) throws
         }
 
         final SignificantStringTerms.Bucket[] list = new SignificantStringTerms.Bucket[ordered.size()];
+        final long[] survivingBucketOrds = new long[ordered.size()];
         for (int i = ordered.size() - 1; i >= 0; i--) {
             final SignificantStringTerms.Bucket bucket = ordered.pop();
+            survivingBucketOrds[i] = bucket.bucketOrd;
+            list[i] = bucket;
+        }
+
+        runDeferredCollections(survivingBucketOrds);
+
+        for (SignificantStringTerms.Bucket bucket : list) {
             // the terms are owned by the BytesRefHash, we need to pull a copy since the BytesRef hash data may be recycled at some point
             bucket.termBytes = BytesRef.deepCopyOf(bucket.termBytes);
             bucket.aggregations = bucketAggregations(bucket.bucketOrd);
-            list[i] = bucket;
         }
 
         return new SignificantStringTerms(name, bucketCountThresholds.getRequiredSize(), bucketCountThresholds.getMinDocCount(),
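
The same restructuring appears in all three aggregators in this commit: buildAggregation now records the ordinals of the surviving buckets, calls runDeferredCollections to replay document collection for exactly those buckets, and only then reads the sub-aggregation results through bucketAggregations. Reading them inside the first loop, as the old depth_first code did, would no longer work, because the deferred sub-aggregations have not been collected at that point. A self-contained sketch of that ordering, using stub types and stand-in methods rather than the real Aggregator API:

import java.util.ArrayDeque;
import java.util.Deque;

// Sketch (not Elasticsearch code) of the ordering used in buildAggregation:
// record surviving bucket ordinals, replay deferred collection for them, and
// only afterwards read the sub-aggregation results.
public class DeferredCollectionSketch {
    static class Bucket {
        final long bucketOrd;
        String aggregations;                 // stand-in for InternalAggregations
        Bucket(long ord) { this.bucketOrd = ord; }
    }

    // Stand-in for the aggregator's runDeferredCollections(long...).
    static void runDeferredCollections(long[] survivingBucketOrds) {
        System.out.println("replaying docs for " + survivingBucketOrds.length + " buckets");
    }

    // Stand-in for bucketAggregations(ord): only meaningful after the replay above.
    static String bucketAggregations(long ord) {
        return "sub-aggs for ord " + ord;
    }

    public static void main(String[] args) {
        Deque<Bucket> ordered = new ArrayDeque<>();
        for (long ord = 0; ord < 3; ord++) {
            ordered.push(new Bucket(ord));
        }

        Bucket[] list = new Bucket[ordered.size()];
        long[] survivingBucketOrds = new long[ordered.size()];
        for (int i = ordered.size() - 1; i >= 0; i--) {
            Bucket bucket = ordered.pop();
            survivingBucketOrds[i] = bucket.bucketOrd;   // remember which buckets survived pruning
            list[i] = bucket;
        }

        runDeferredCollections(survivingBucketOrds);     // replay deferred sub-agg collection

        for (Bucket bucket : list) {                     // now safe to read sub-agg results
            bucket.aggregations = bucketAggregations(bucket.bucketOrd);
        }
    }
}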

server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java

Lines changed: 11 additions & 3 deletions

@@ -50,7 +50,7 @@ public SignificantLongTermsAggregator(String name, AggregatorFactories factories
                                           List<PipelineAggregator> pipelineAggregators, Map<String, Object> metaData) throws IOException {
 
         super(name, factories, valuesSource, format, null, bucketCountThresholds, context, parent,
-                SubAggCollectionMode.DEPTH_FIRST, false, includeExclude, pipelineAggregators, metaData);
+                SubAggCollectionMode.BREADTH_FIRST, false, includeExclude, pipelineAggregators, metaData);
         this.significanceHeuristic = significanceHeuristic;
         this.termsAggFactory = termsAggFactory;
     }
@@ -106,12 +106,20 @@ public SignificantLongTerms buildAggregation(long owningBucketOrdinal) throws IO
             }
         }
 
-        final SignificantLongTerms.Bucket[] list = new SignificantLongTerms.Bucket[ordered.size()];
+        SignificantLongTerms.Bucket[] list = new SignificantLongTerms.Bucket[ordered.size()];
+        final long[] survivingBucketOrds = new long[ordered.size()];
         for (int i = ordered.size() - 1; i >= 0; i--) {
             final SignificantLongTerms.Bucket bucket = ordered.pop();
-            bucket.aggregations = bucketAggregations(bucket.bucketOrd);
+            survivingBucketOrds[i] = bucket.bucketOrd;
             list[i] = bucket;
         }
+
+        runDeferredCollections(survivingBucketOrds);
+
+        for (SignificantLongTerms.Bucket bucket : list) {
+            bucket.aggregations = bucketAggregations(bucket.bucketOrd);
+        }
+
         return new SignificantLongTerms(name, bucketCountThresholds.getRequiredSize(), bucketCountThresholds.getMinDocCount(),
                 pipelineAggregators(), metaData(), format, subsetSize, supersetSize, significanceHeuristic, Arrays.asList(list));
     }

server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java

Lines changed: 11 additions & 3 deletions

@@ -57,7 +57,7 @@ public SignificantStringTermsAggregator(String name, AggregatorFactories factori
                                             List<PipelineAggregator> pipelineAggregators, Map<String, Object> metaData) throws IOException {
 
         super(name, factories, valuesSource, null, format, bucketCountThresholds, includeExclude, aggregationContext, parent,
-                SubAggCollectionMode.DEPTH_FIRST, false, pipelineAggregators, metaData);
+                SubAggCollectionMode.BREADTH_FIRST, false, pipelineAggregators, metaData);
         this.significanceHeuristic = significanceHeuristic;
         this.termsAggFactory = termsAggFactory;
     }
@@ -113,12 +113,20 @@ public SignificantStringTerms buildAggregation(long owningBucketOrdinal) throws
         }
 
         final SignificantStringTerms.Bucket[] list = new SignificantStringTerms.Bucket[ordered.size()];
+        final long[] survivingBucketOrds = new long[ordered.size()];
         for (int i = ordered.size() - 1; i >= 0; i--) {
             final SignificantStringTerms.Bucket bucket = ordered.pop();
-            // the terms are owned by the BytesRefHash, we need to pull a copy since the BytesRef hash data may be recycled at some point
+            survivingBucketOrds[i] = bucket.bucketOrd;
+            list[i] = bucket;
+        }
+
+        runDeferredCollections(survivingBucketOrds);
+
+        for (SignificantStringTerms.Bucket bucket : list) {
+            // the terms are owned by the BytesRefHash, we need to pull a copy since the BytesRef hash data may be
+            // recycled at some point
             bucket.termBytes = BytesRef.deepCopyOf(bucket.termBytes);
             bucket.aggregations = bucketAggregations(bucket.bucketOrd);
-            list[i] = bucket;
         }
 
         return new SignificantStringTerms( name, bucketCountThresholds.getRequiredSize(),

server/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsSignificanceScoreIT.java

Lines changed: 33 additions & 0 deletions

@@ -30,6 +30,7 @@
 import org.elasticsearch.common.xcontent.XContentFactory;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.XContentType;
+import org.elasticsearch.index.query.QueryBuilder;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.index.query.QueryShardException;
 import org.elasticsearch.plugins.Plugin;
@@ -38,6 +39,7 @@
 import org.elasticsearch.script.Script;
 import org.elasticsearch.script.ScriptType;
 import org.elasticsearch.search.aggregations.Aggregation;
+import org.elasticsearch.search.aggregations.AggregationBuilder;
 import org.elasticsearch.search.aggregations.Aggregations;
 import org.elasticsearch.search.aggregations.bucket.filter.InternalFilter;
 import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms;
@@ -543,6 +545,37 @@ public void testScoresEqualForPositiveAndNegative(SignificanceHeuristic heuristi
         }
     }
 
+    /**
+     * A simple test that adds a sub-aggregation to a significant terms aggregation,
+     * to help check that sub-aggregation collection is handled correctly.
+     */
+    public void testSubAggregations() throws Exception {
+        indexEqualTestData();
+
+        QueryBuilder query = QueryBuilders.termsQuery(TEXT_FIELD, "a", "b");
+        AggregationBuilder subAgg = terms("class").field(CLASS_FIELD);
+        AggregationBuilder agg = significantTerms("significant_terms")
+            .field(TEXT_FIELD)
+            .executionHint(randomExecutionHint())
+            .significanceHeuristic(new ChiSquare(true, true))
+            .minDocCount(1).shardSize(1000).size(1000)
+            .subAggregation(subAgg);
+
+        SearchResponse response = client().prepareSearch("test")
+            .setQuery(query)
+            .addAggregation(agg)
+            .get();
+        assertSearchResponse(response);
+
+        SignificantTerms sigTerms = response.getAggregations().get("significant_terms");
+        assertThat(sigTerms.getBuckets().size(), equalTo(2));
+
+        for (SignificantTerms.Bucket bucket : sigTerms) {
+            StringTerms terms = bucket.getAggregations().get("class");
+            assertThat(terms.getBuckets().size(), equalTo(2));
+        }
+    }
+
     private void indexEqualTestData() throws ExecutionException, InterruptedException {
         assertAcked(prepareCreate("test")
             .setSettings(Settings.builder().put(SETTING_NUMBER_OF_SHARDS, 1).put(SETTING_NUMBER_OF_REPLICAS, 0))
