
Commit b9fe108

Make global ords terms simpler to understand (backport of #57241) (#57311)
When the `terms` aggregation operates on non-numeric data it can collect terms via global ordinals. It actually has two separate collection strategies: one "dense" and one "remapping". Each of *those* strategies in turn has two "iteration" strategies that it uses to build buckets, depending on whether or not we need buckets with `0` docs in them. Previously this was done with several `null` checks and never really explained. This change replaces those checks with two `CollectionStrategy` classes which come with proper documentation.
1 parent 24d605e commit b9fe108
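
To make the distinction concrete, here is a minimal sketch of the two collection strategies described above. It is illustrative only: `CollectionStrategySketch`, `DenseSketch`, and `RemappingSketch` are hypothetical names, not the `CollectionStrategy` classes this commit introduces. The dense strategy uses the global ordinal itself as the bucket ordinal, so every distinct term owns a slot; the remapping strategy hands out compact bucket ordinals only for terms that were actually collected.

```java
import java.util.HashMap;
import java.util.Map;

// Hypothetical sketch of the two collection strategies; not Elasticsearch code.
interface CollectionStrategySketch {
    // Which bucket a given global ordinal maps to.
    long bucketOrd(long globalOrd);

    // Visit every bucket this strategy knows about.
    void forEachBucket(BucketConsumer consumer);

    interface BucketConsumer {
        void accept(long globalOrd, long bucketOrd);
    }
}

// "Dense": identity mapping. Cheap per-document work, but the bucket table
// has one slot per distinct term, including terms no document matched.
class DenseSketch implements CollectionStrategySketch {
    private final long valueCount; // number of distinct terms (global ordinals)

    DenseSketch(long valueCount) {
        this.valueCount = valueCount;
    }

    @Override
    public long bucketOrd(long globalOrd) {
        return globalOrd;
    }

    @Override
    public void forEachBucket(BucketConsumer consumer) {
        for (long ord = 0; ord < valueCount; ord++) {
            consumer.accept(ord, ord);
        }
    }
}

// "Remapping": bucket ordinals are handed out lazily, so only terms that were
// actually seen occupy a slot, at the cost of a map lookup per term.
class RemappingSketch implements CollectionStrategySketch {
    private final Map<Long, Long> ords = new HashMap<>();

    @Override
    public long bucketOrd(long globalOrd) {
        return ords.computeIfAbsent(globalOrd, k -> (long) ords.size());
    }

    @Override
    public void forEachBucket(BucketConsumer consumer) {
        ords.forEach((globalOrd, bucketOrd) -> consumer.accept(globalOrd, bucketOrd));
    }
}
```

Roughly, dense is the better fit when most terms will be collected anyway (or when empty buckets must be returned), while remapping keeps the bucket table small when only a fraction of the terms show up in the matching documents.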

File tree

4 files changed, +246 -137 lines changed


rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/20_terms.yml (1 addition & 0 deletions)

```diff
@@ -822,6 +822,7 @@ setup:
   - match: { profile.shards.0.aggregations.0.description: str_terms }
   - match: { profile.shards.0.aggregations.0.breakdown.collect_count: 4 }
   - match: { profile.shards.0.aggregations.0.debug.deferred_aggregators: [ max_number ] }
+  - match: { profile.shards.0.aggregations.0.debug.collection_strategy: dense }
   - match: { profile.shards.0.aggregations.0.children.0.type: MaxAggregator }
   - match: { profile.shards.0.aggregations.0.children.0.description: max_number }
 
```

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsSignificantTermsAggregator.java (25 additions & 42 deletions)

```diff
@@ -96,50 +96,33 @@ public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws I
         long subsetSize = numCollectedDocs;
 
         BucketSignificancePriorityQueue<SignificantStringTerms.Bucket> ordered = new BucketSignificancePriorityQueue<>(size);
-        SignificantStringTerms.Bucket spare = null;
-        final boolean needsFullScan = bucketOrds == null || bucketCountThresholds.getMinDocCount() == 0;
-        final long maxId = needsFullScan ? valueCount : bucketOrds.size();
-        for (long ord = 0; ord < maxId; ord++) {
-            final long globalOrd;
-            final long bucketOrd;
-            if (needsFullScan) {
-                bucketOrd = bucketOrds == null ? ord : bucketOrds.find(ord);
-                globalOrd = ord;
-            } else {
-                assert bucketOrds != null;
-                bucketOrd = ord;
-                globalOrd = bucketOrds.get(ord);
-            }
-            if (includeExclude != null && !acceptedGlobalOrdinals.get(globalOrd)) {
-                continue;
-            }
-            final int bucketDocCount = bucketOrd < 0 ? 0 : bucketDocCount(bucketOrd);
-            if (bucketCountThresholds.getMinDocCount() > 0 && bucketDocCount == 0) {
-                continue;
-            }
-            if (bucketDocCount < bucketCountThresholds.getShardMinDocCount()) {
-                continue;
-            }
+        collectionStrategy.forEach(new BucketInfoConsumer() {
+            SignificantStringTerms.Bucket spare = null;
 
-            if (spare == null) {
-                spare = new SignificantStringTerms.Bucket(new BytesRef(), 0, 0, 0, 0, null, format, 0);
-            }
-            spare.bucketOrd = bucketOrd;
-            copy(lookupGlobalOrd.apply(globalOrd), spare.termBytes);
-            spare.subsetDf = bucketDocCount;
-            spare.subsetSize = subsetSize;
-            spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
-            spare.supersetSize = supersetSize;
-            // During shard-local down-selection we use subset/superset stats
-            // that are for this shard only
-            // Back at the central reducer these properties will be updated with
-            // global stats
-            spare.updateScore(significanceHeuristic);
-            spare = ordered.insertWithOverflow(spare);
-            if (spare == null) {
-                consumeBucketsAndMaybeBreak(1);
+            @Override
+            public void accept(long globalOrd, long bucketOrd, long docCount) throws IOException {
+                if (docCount >= bucketCountThresholds.getShardMinDocCount()) {
+                    if (spare == null) {
+                        spare = new SignificantStringTerms.Bucket(new BytesRef(), 0, 0, 0, 0, null, format, 0);
+                    }
+                    spare.bucketOrd = bucketOrd;
+                    copy(lookupGlobalOrd.apply(globalOrd), spare.termBytes);
+                    spare.subsetDf = docCount;
+                    spare.subsetSize = subsetSize;
+                    spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
+                    spare.supersetSize = supersetSize;
+                    // During shard-local down-selection we use subset/superset stats
+                    // that are for this shard only
+                    // Back at the central reducer these properties will be updated with
+                    // global stats
+                    spare.updateScore(significanceHeuristic);
+                    spare = ordered.insertWithOverflow(spare);
+                    if (spare == null) {
+                        consumeBucketsAndMaybeBreak(1);
+                    }
+                }
             }
-        }
+        });
 
         final SignificantStringTerms.Bucket[] list = new SignificantStringTerms.Bucket[ordered.size()];
         for (int i = ordered.size() - 1; i >= 0; i--) {
```

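The deleted loop's `needsFullScan` and `null` checks are what moves into the strategies' `forEach`. As a rough sketch of that contract (assumed names throughout; only the `accept(globalOrd, bucketOrd, docCount)` shape comes from the diff above), a dense strategy's `forEach` might look like this, with the old "emit empty buckets or skip them" decision, i.e. the two "iteration" strategies, reduced to a single doc-count check:

```java
import java.io.IOException;
import java.util.function.LongUnaryOperator;

// Hypothetical stand-in for a dense CollectionStrategy's forEach; not the
// code added by this commit.
class DenseForEachSketch {
    interface BucketInfoConsumerSketch {
        void accept(long globalOrd, long bucketOrd, long docCount) throws IOException;
    }

    private final long valueCount;              // distinct terms in the shard
    private final LongUnaryOperator docCountOf; // docs collected into each bucket
    private final long minDocCount;             // min_doc_count from the request

    DenseForEachSketch(long valueCount, LongUnaryOperator docCountOf, long minDocCount) {
        this.valueCount = valueCount;
        this.docCountOf = docCountOf;
        this.minDocCount = minDocCount;
    }

    void forEach(BucketInfoConsumerSketch consumer) throws IOException {
        // Dense strategy: the bucket ordinal is the global ordinal, so a full
        // scan visits every term in order.
        for (long ord = 0; ord < valueCount; ord++) {
            long docCount = docCountOf.applyAsLong(ord);
            if (docCount == 0 && minDocCount > 0) {
                // Skip empty buckets unless min_doc_count == 0 asks for them.
                continue;
            }
            consumer.accept(ord, ord, docCount);
        }
    }
}
```

With that filtering done inside `forEach`, a caller like the aggregator in the diff only has to express what a bucket looks like once a term survives, which is why the rewritten `accept` body above is nothing but bucket construction plus the shard-level `shard_min_doc_count` check.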