Skip to content

Commit d6c8d94

Browse files
authored
Give significance lookups their own home (backport of #57903) (#57959)
This moves the code to look up significance heuristics information like background frequency and superset size out of `SignificantTermsAggregatorFactory` and into its own home so that it is easier to pass around. This will: 1. Make us feel better about ourselves for not passing around the factory, which is really *supposed* to be a throw away thing. 2. Abstract the significance lookup logic so we can reuse it for the `significant_text` aggregation. 3. Make if very simple to cache the background frequencies which should speed up when the agg is a sub-agg. We had done this for numerics but not string-shaped significant terms.
1 parent 922423c commit d6c8d94

File tree

6 files changed

+262
-212
lines changed

6 files changed

+262
-212
lines changed

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
import org.elasticsearch.search.aggregations.InternalMultiBucketAggregation;
4444
import org.elasticsearch.search.aggregations.LeafBucketCollector;
4545
import org.elasticsearch.search.aggregations.LeafBucketCollectorBase;
46+
import org.elasticsearch.search.aggregations.bucket.terms.SignificanceLookup.BackgroundFrequencyForBytes;
4647
import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
4748
import org.elasticsearch.search.aggregations.support.ValuesSource;
4849
import org.elasticsearch.search.internal.SearchContext;
@@ -753,14 +754,19 @@ class SignificantTermsResults extends ResultStrategy<
753754
SignificantStringTerms.Bucket,
754755
SignificantStringTerms.Bucket> {
755756

756-
// TODO a reference to the factory is weird - probably should be reference to what we need from it.
757-
private final SignificantTermsAggregatorFactory termsAggFactory;
757+
private final BackgroundFrequencyForBytes backgroundFrequencies;
758+
private final long supersetSize;
758759
private final SignificanceHeuristic significanceHeuristic;
759760

760761
private LongArray subsetSizes = context.bigArrays().newLongArray(1, true);
761762

762-
SignificantTermsResults(SignificantTermsAggregatorFactory termsAggFactory, SignificanceHeuristic significanceHeuristic) {
763-
this.termsAggFactory = termsAggFactory;
763+
SignificantTermsResults(
764+
SignificanceLookup significanceLookup,
765+
SignificanceHeuristic significanceHeuristic,
766+
boolean collectsFromSingleBucket
767+
) {
768+
backgroundFrequencies = significanceLookup.bytesLookup(context.bigArrays(), collectsFromSingleBucket);
769+
supersetSize = significanceLookup.supersetSize();
764770
this.significanceHeuristic = significanceHeuristic;
765771
}
766772

@@ -804,8 +810,8 @@ BucketUpdater<SignificantStringTerms.Bucket> bucketUpdater(long owningBucketOrd)
804810
oversizedCopy(lookupGlobalOrd.apply(globalOrd), spare.termBytes);
805811
spare.subsetDf = docCount;
806812
spare.subsetSize = subsetSize;
807-
spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
808-
spare.supersetSize = termsAggFactory.getSupersetNumDocs();
813+
spare.supersetDf = backgroundFrequencies.freq(spare.termBytes);
814+
spare.supersetSize = supersetSize;
809815
/*
810816
* During shard-local down-selection we use subset/superset stats
811817
* that are for this shard only. Back at the central reducer these
@@ -839,7 +845,7 @@ SignificantStringTerms buildResult(long owningBucketOrd, long otherDocCount, Sig
839845
metadata(),
840846
format,
841847
subsetSizes.get(owningBucketOrd),
842-
termsAggFactory.getSupersetNumDocs(),
848+
supersetSize,
843849
significanceHeuristic,
844850
Arrays.asList(topBuckets)
845851
);
@@ -857,7 +863,7 @@ SignificantStringTerms buildNoValuesResult(long owningBucketOrdinal) {
857863

858864
@Override
859865
public void close() {
860-
Releasables.close(termsAggFactory, subsetSizes);
866+
Releasables.close(backgroundFrequencies, subsetSizes);
861867
}
862868

863869
/**

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import org.elasticsearch.search.aggregations.InternalOrder;
3737
import org.elasticsearch.search.aggregations.LeafBucketCollector;
3838
import org.elasticsearch.search.aggregations.LeafBucketCollectorBase;
39+
import org.elasticsearch.search.aggregations.bucket.terms.SignificanceLookup.BackgroundFrequencyForBytes;
3940
import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
4041
import org.elasticsearch.search.aggregations.support.ValuesSource;
4142
import org.elasticsearch.search.internal.SearchContext;
@@ -367,14 +368,19 @@ public void close() {}
367368
* Builds results for the {@code significant_terms} aggregation.
368369
*/
369370
class SignificantTermsResults extends ResultStrategy<SignificantStringTerms, SignificantStringTerms.Bucket> {
370-
// TODO a reference to the factory is weird - probably should be reference to what we need from it.
371-
private final SignificantTermsAggregatorFactory termsAggFactory;
371+
private final BackgroundFrequencyForBytes backgroundFrequencies;
372+
private final long supersetSize;
372373
private final SignificanceHeuristic significanceHeuristic;
373374

374375
private LongArray subsetSizes = context.bigArrays().newLongArray(1, true);
375376

376-
SignificantTermsResults(SignificantTermsAggregatorFactory termsAggFactory, SignificanceHeuristic significanceHeuristic) {
377-
this.termsAggFactory = termsAggFactory;
377+
SignificantTermsResults(
378+
SignificanceLookup significanceLookup,
379+
SignificanceHeuristic significanceHeuristic,
380+
boolean collectsFromSingleBucket
381+
) {
382+
backgroundFrequencies = significanceLookup.bytesLookup(context.bigArrays(), collectsFromSingleBucket);
383+
supersetSize = significanceLookup.supersetSize();
378384
this.significanceHeuristic = significanceHeuristic;
379385
}
380386

@@ -416,8 +422,8 @@ void updateBucket(SignificantStringTerms.Bucket spare, BytesKeyedBucketOrds.Buck
416422
ordsEnum.readValue(spare.termBytes);
417423
spare.bucketOrd = ordsEnum.ord();
418424
spare.subsetDf = docCount;
419-
spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
420-
spare.supersetSize = termsAggFactory.getSupersetNumDocs();
425+
spare.supersetDf = backgroundFrequencies.freq(spare.termBytes);
426+
spare.supersetSize = supersetSize;
421427
/*
422428
* During shard-local down-selection we use subset/superset stats
423429
* that are for this shard only. Back at the central reducer these
@@ -460,7 +466,7 @@ SignificantStringTerms buildResult(long owningBucketOrd, long otherDocCount, Sig
460466
metadata(),
461467
format,
462468
subsetSizes.get(owningBucketOrd),
463-
termsAggFactory.getSupersetNumDocs(),
469+
supersetSize,
464470
significanceHeuristic,
465471
Arrays.asList(topBuckets)
466472
);
@@ -473,7 +479,7 @@ SignificantStringTerms buildEmptyResult() {
473479

474480
@Override
475481
public void close() {
476-
Releasables.close(termsAggFactory, subsetSizes);
482+
Releasables.close(backgroundFrequencies, subsetSizes);
477483
}
478484
}
479485
}

server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/NumericTermsAggregator.java

Lines changed: 5 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,7 @@
2727
import org.elasticsearch.common.collect.List;
2828
import org.elasticsearch.common.lease.Releasable;
2929
import org.elasticsearch.common.lease.Releasables;
30-
import org.elasticsearch.common.util.BigArrays;
3130
import org.elasticsearch.common.util.LongArray;
32-
import org.elasticsearch.common.util.LongHash;
3331
import org.elasticsearch.index.fielddata.FieldData;
3432
import org.elasticsearch.search.DocValueFormat;
3533
import org.elasticsearch.search.aggregations.Aggregator;
@@ -42,6 +40,7 @@
4240
import org.elasticsearch.search.aggregations.LeafBucketCollectorBase;
4341
import org.elasticsearch.search.aggregations.bucket.terms.IncludeExclude.LongFilter;
4442
import org.elasticsearch.search.aggregations.bucket.terms.LongKeyedBucketOrds.BucketOrdsEnum;
43+
import org.elasticsearch.search.aggregations.bucket.terms.SignificanceLookup.BackgroundFrequencyForLong;
4544
import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
4645
import org.elasticsearch.search.aggregations.support.ValuesSource;
4746
import org.elasticsearch.search.internal.ContextIndexSearcher;
@@ -469,19 +468,18 @@ DoubleTerms buildEmptyResult() {
469468
}
470469

471470
class SignificantLongTermsResults extends ResultStrategy<SignificantLongTerms, SignificantLongTerms.Bucket> {
472-
private final BackgroundFrequencies backgroundFrequencies;
471+
private final BackgroundFrequencyForLong backgroundFrequencies;
473472
private final long supersetSize;
474473
private final SignificanceHeuristic significanceHeuristic;
475474
private LongArray subsetSizes;
476475

477476
SignificantLongTermsResults(
478-
SignificantTermsAggregatorFactory termsAggFactory,
477+
SignificanceLookup significanceLookup,
479478
SignificanceHeuristic significanceHeuristic,
480479
boolean collectsFromSingleBucket
481480
) {
482-
LookupBackgroundFrequencies lookup = new LookupBackgroundFrequencies(termsAggFactory);
483-
backgroundFrequencies = collectsFromSingleBucket ? lookup : new CacheBackgroundFrequencies(lookup, context.bigArrays());
484-
supersetSize = termsAggFactory.getSupersetNumDocs();
481+
backgroundFrequencies = significanceLookup.longLookup(context.bigArrays(), collectsFromSingleBucket);
482+
supersetSize = significanceLookup.supersetSize();
485483
this.significanceHeuristic = significanceHeuristic;
486484
subsetSizes = context.bigArrays().newLongArray(1, true);
487485
}
@@ -588,66 +586,5 @@ public void close() {
588586
}
589587
}
590588

591-
/**
592-
* Lookup frequencies for terms.
593-
*/
594-
private interface BackgroundFrequencies extends Releasable {
595-
long freq(long term) throws IOException;
596-
}
597-
598-
/**
599-
* Lookup frequencies for terms.
600-
*/
601-
private static class LookupBackgroundFrequencies implements BackgroundFrequencies {
602-
// TODO a reference to the factory is weird - probably should be reference to what we need from it.
603-
private final SignificantTermsAggregatorFactory termsAggFactory;
604-
605-
LookupBackgroundFrequencies(SignificantTermsAggregatorFactory termsAggFactory) {
606-
this.termsAggFactory = termsAggFactory;
607-
}
608589

609-
@Override
610-
public long freq(long term) throws IOException {
611-
return termsAggFactory.getBackgroundFrequency(term);
612-
}
613-
614-
@Override
615-
public void close() {
616-
termsAggFactory.close();
617-
}
618-
}
619-
620-
/**
621-
* Lookup and cache background frequencies for terms.
622-
*/
623-
private static class CacheBackgroundFrequencies implements BackgroundFrequencies {
624-
private final LookupBackgroundFrequencies lookup;
625-
private final BigArrays bigArrays;
626-
private final LongHash termToPosition;
627-
private LongArray positionToFreq;
628-
629-
CacheBackgroundFrequencies(LookupBackgroundFrequencies lookup, BigArrays bigArrays) {
630-
this.lookup = lookup;
631-
this.bigArrays = bigArrays;
632-
termToPosition = new LongHash(1, bigArrays);
633-
positionToFreq = bigArrays.newLongArray(1, false);
634-
}
635-
636-
@Override
637-
public long freq(long term) throws IOException {
638-
long position = termToPosition.add(term);
639-
if (position < 0) {
640-
return positionToFreq.get(-1 - position);
641-
}
642-
long freq = lookup.freq(term);
643-
positionToFreq = bigArrays.grow(positionToFreq, position + 1);
644-
positionToFreq.set(position, freq);
645-
return freq;
646-
}
647-
648-
@Override
649-
public void close() {
650-
Releasables.close(lookup, termToPosition, positionToFreq);
651-
}
652-
}
653590
}

0 commit comments

Comments
 (0)