|
20 | 20 |
|
21 | 21 | import org.apache.lucene.index.LeafReaderContext; |
22 | 22 | import org.apache.lucene.index.SortedNumericDocValues; |
23 | | -import org.apache.lucene.util.CollectionUtil; |
24 | 23 | import org.elasticsearch.common.lease.Releasables; |
25 | 24 | import org.elasticsearch.common.util.LongHash; |
| 25 | +import org.elasticsearch.common.util.SetBackedScalingCuckooFilter; |
26 | 26 | import org.elasticsearch.search.DocValueFormat; |
27 | 27 | import org.elasticsearch.search.aggregations.Aggregator; |
28 | 28 | import org.elasticsearch.search.aggregations.AggregatorFactories; |
|
34 | 34 |
|
35 | 35 | import java.io.IOException; |
36 | 36 | import java.util.ArrayList; |
| 37 | +import java.util.Arrays; |
37 | 38 | import java.util.List; |
38 | 39 | import java.util.Map; |
39 | 40 |
|
|
42 | 43 | /** |
43 | 44 | * An aggregator that finds "rare" long values (e.g. terms agg that orders ascending) |
44 | 45 | */ |
45 | | -public class LongRareTermsAggregator extends AbstractRareTermsAggregator<ValuesSource.Numeric, IncludeExclude.LongFilter, Long> { |
46 | | - |
47 | | - protected LongHash bucketOrds; |
48 | | - |
49 | | - LongRareTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Numeric valuesSource, DocValueFormat format, |
50 | | - SearchContext aggregationContext, Aggregator parent, IncludeExclude.LongFilter longFilter, |
51 | | - int maxDocCount, double precision, Map<String, Object> metadata) throws IOException { |
52 | | - super(name, factories, aggregationContext, parent, metadata, maxDocCount, precision, format, valuesSource, longFilter); |
53 | | - this.bucketOrds = new LongHash(1, aggregationContext.bigArrays()); |
| 46 | +public class LongRareTermsAggregator extends AbstractRareTermsAggregator { |
| 47 | + private final ValuesSource.Numeric valuesSource; |
| 48 | + private final IncludeExclude.LongFilter filter; |
| 49 | + private final LongKeyedBucketOrds bucketOrds; |
| 50 | + |
| 51 | + LongRareTermsAggregator( |
| 52 | + String name, |
| 53 | + AggregatorFactories factories, |
| 54 | + ValuesSource.Numeric valuesSource, |
| 55 | + DocValueFormat format, |
| 56 | + SearchContext aggregationContext, |
| 57 | + Aggregator parent, |
| 58 | + IncludeExclude.LongFilter filter, |
| 59 | + int maxDocCount, |
| 60 | + double precision, |
| 61 | + boolean collectsFromSingleBucket, |
| 62 | + Map<String, Object> metadata |
| 63 | + ) throws IOException { |
| 64 | + super( |
| 65 | + name, |
| 66 | + factories, |
| 67 | + aggregationContext, |
| 68 | + parent, |
| 69 | + metadata, |
| 70 | + maxDocCount, |
| 71 | + precision, |
| 72 | + format, |
| 73 | + collectsFromSingleBucket |
| 74 | + ); |
| 75 | + this.valuesSource = valuesSource; |
| 76 | + this.filter = filter; |
| 77 | + this.bucketOrds = LongKeyedBucketOrds.build(context.bigArrays(), collectsFromSingleBucket); |
54 | 78 | } |
55 | 79 |
|
56 | 80 | protected SortedNumericDocValues getValues(ValuesSource.Numeric valuesSource, LeafReaderContext ctx) throws IOException { |
57 | 81 | return valuesSource.longValues(ctx); |
58 | 82 | } |
59 | 83 |
|
60 | 84 | @Override |
61 | | - public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, |
62 | | - final LeafBucketCollector sub) throws IOException { |
63 | | - final SortedNumericDocValues values = getValues(valuesSource, ctx); |
| 85 | + public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException { |
| 86 | + SortedNumericDocValues values = getValues(valuesSource, ctx); |
64 | 87 | return new LeafBucketCollectorBase(sub, values) { |
65 | | - |
66 | 88 | @Override |
67 | | - public void collect(int docId, long owningBucketOrdinal) throws IOException { |
68 | | - if (values.advanceExact(docId)) { |
69 | | - final int valuesCount = values.docValueCount(); |
70 | | - long previous = Long.MAX_VALUE; |
71 | | - for (int i = 0; i < valuesCount; ++i) { |
72 | | - final long val = values.nextValue(); |
73 | | - if (previous != val || i == 0) { |
74 | | - if ((includeExclude == null) || (includeExclude.accept(val))) { |
75 | | - doCollect(sub, val, docId); |
76 | | - } |
77 | | - previous = val; |
78 | | - } |
| 89 | + public void collect(int docId, long owningBucketOrd) throws IOException { |
| 90 | + if (false == values.advanceExact(docId)) { |
| 91 | + return; |
| 92 | + } |
| 93 | + int valuesCount = values.docValueCount(); |
| 94 | + long previous = Long.MAX_VALUE; |
| 95 | + for (int i = 0; i < valuesCount; ++i) { |
| 96 | + long val = values.nextValue(); |
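| | +                    // SortedNumericDocValues yields values in sorted order, so duplicates within a doc are adjacent |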
| 97 | +                    if (i != 0 && previous == val) { |
| 98 | +                        continue; |
| 99 | +                    } |
| 100 | + previous = val; |
| 101 | + if (filter != null && false == filter.accept(val)) { |
| 102 | + continue; |
| 103 | + } |
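| | +                    // add() returns the ordinal of a newly seen key, or -1 - ord when the key was already present |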
| 104 | + long bucketOrdinal = bucketOrds.add(owningBucketOrd, val); |
| 105 | + if (bucketOrdinal < 0) { // already seen |
| 106 | + bucketOrdinal = -1 - bucketOrdinal; |
| 107 | + collectExistingBucket(sub, docId, bucketOrdinal); |
| 108 | + } else { |
| 109 | + collectBucket(sub, docId, bucketOrdinal); |
79 | 110 | } |
80 | 111 | } |
81 | 112 | } |
82 | 113 | }; |
83 | 114 | } |
84 | 115 |
|
85 | 116 | @Override |
86 | | - long addValueToOrds(Long value) { |
87 | | - return bucketOrds.add(value); |
88 | | - } |
89 | | - |
90 | | - /** |
91 | | - * Merges the ordinals to a minimal set, populates the CuckooFilter and |
92 | | - * generates a final set of buckets. |
93 | | - * |
94 | | - * If a term is below the maxDocCount, it is turned into a Bucket. Otherwise, |
95 | | - * the term is added to the filter, and pruned from the ordinal map. If |
96 | | - * necessary the ordinal map is merged down to a minimal set to remove deletions |
97 | | - */ |
98 | | - private List<LongRareTerms.Bucket> buildSketch() { |
99 | | - long deletionCount = 0; |
100 | | - LongHash newBucketOrds = new LongHash(1, context.bigArrays()); |
101 | | - List<LongRareTerms.Bucket> buckets = new ArrayList<>(); |
102 | | - try (LongHash oldBucketOrds = bucketOrds) { |
103 | | - |
104 | | - long[] mergeMap = new long[(int) oldBucketOrds.size()]; |
105 | | - for (int i = 0; i < oldBucketOrds.size(); i++) { |
106 | | - long oldKey = oldBucketOrds.get(i); |
107 | | - long newBucketOrd = -1; |
108 | | - |
109 | | - long docCount = bucketDocCount(i); |
110 | | - // if the key is below threshold, reinsert into the new ords |
111 | | - if (docCount <= maxDocCount) { |
112 | | - newBucketOrd = newBucketOrds.add(oldKey); |
113 | | - LongRareTerms.Bucket bucket = new LongRareTerms.Bucket(oldKey, docCount, null, format); |
114 | | - bucket.bucketOrd = newBucketOrd; |
115 | | - buckets.add(bucket); |
116 | | - } else { |
117 | | - // Make a note when one of the ords has been deleted |
118 | | - deletionCount += 1; |
119 | | - filter.add(oldKey); |
| 117 | + public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException { |
| 118 | + /* |
| 119 | + * Collect the list of buckets, populate the filter with terms |
| 120 | + * that are too frequent, and figure out how to merge sub-buckets. |
| 121 | + */ |
| 122 | + LongRareTerms.Bucket[][] rarestPerOrd = new LongRareTerms.Bucket[owningBucketOrds.length][]; |
| 123 | + SetBackedScalingCuckooFilter[] filters = new SetBackedScalingCuckooFilter[owningBucketOrds.length]; |
| 124 | + long keepCount = 0; |
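| | +        // Maps each collected (dense) bucket ordinal to its post-pruning ordinal; -1 marks terms pruned for being too frequent |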
| 125 | + long[] mergeMap = new long[(int) bucketOrds.size()]; |
| 126 | + Arrays.fill(mergeMap, -1); |
| 127 | + long offset = 0; |
| 128 | + for (int owningOrdIdx = 0; owningOrdIdx < owningBucketOrds.length; owningOrdIdx++) { |
| 129 | + try (LongHash bucketsInThisOwningBucketToCollect = new LongHash(1, context.bigArrays())) { |
| 130 | + filters[owningOrdIdx] = newFilter(); |
| 131 | + List<LongRareTerms.Bucket> builtBuckets = new ArrayList<>(); |
| 132 | + LongKeyedBucketOrds.BucketOrdsEnum collectedBuckets = bucketOrds.ordsEnum(owningBucketOrds[owningOrdIdx]); |
| 133 | + while (collectedBuckets.next()) { |
| 134 | + long docCount = bucketDocCount(collectedBuckets.ord()); |
| 135 | +                        // If the term's doc count is at or below the threshold, keep it as a bucket with a new dense ordinal |
| 136 | + if (docCount <= maxDocCount) { |
| 137 | + LongRareTerms.Bucket bucket = new LongRareTerms.Bucket(collectedBuckets.value(), docCount, null, format); |
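| | +                        // offset keeps the new ordinals dense across all owning buckets |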
| 138 | + bucket.bucketOrd = offset + bucketsInThisOwningBucketToCollect.add(collectedBuckets.value()); |
| 139 | + mergeMap[(int) collectedBuckets.ord()] = bucket.bucketOrd; |
| 140 | + builtBuckets.add(bucket); |
| 141 | + keepCount++; |
| 142 | + } else { |
| 143 | + filters[owningOrdIdx].add(collectedBuckets.value()); |
| 144 | + } |
120 | 145 | } |
121 | | - mergeMap[i] = newBucketOrd; |
| 146 | + rarestPerOrd[owningOrdIdx] = builtBuckets.toArray(new LongRareTerms.Bucket[0]); |
| 147 | + offset += bucketsInThisOwningBucketToCollect.size(); |
122 | 148 | } |
| 149 | + } |
123 | 150 |
|
124 | | - // Only merge/delete the ordinals if we have actually deleted one, |
125 | | - // to save on some redundant work |
126 | | - if (deletionCount > 0) { |
127 | | - mergeBuckets(mergeMap, newBucketOrds.size()); |
128 | | - if (deferringCollector != null) { |
129 | | - deferringCollector.mergeBuckets(mergeMap); |
130 | | - } |
| 151 | + /* |
| 152 | + * Only merge/delete the ordinals if we have actually deleted one, |
| 153 | + * to save on some redundant work. |
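| | +         * Every collected ordinal was either remapped (counted in keepCount) |
| | +         * or pruned, so the counts differ exactly when something was pruned. |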
| 154 | + */ |
| 155 | + if (keepCount != mergeMap.length) { |
| 156 | + mergeBuckets(mergeMap, offset); |
| 157 | + if (deferringCollector != null) { |
| 158 | + deferringCollector.mergeBuckets(mergeMap); |
131 | 159 | } |
132 | 160 | } |
133 | | - bucketOrds = newBucketOrds; |
134 | | - return buckets; |
135 | | - } |
136 | 161 |
|
137 | | - @Override |
138 | | - public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException { |
139 | | - assert owningBucketOrds.length == 1 && owningBucketOrds[0] == 0; |
140 | | - List<LongRareTerms.Bucket> buckets = buildSketch(); |
141 | | - buildSubAggsForBuckets(buckets, b -> b.bucketOrd, (b, aggs) -> b.aggregations = aggs); |
142 | | - |
143 | | - CollectionUtil.introSort(buckets, ORDER.comparator()); |
144 | | - return new InternalAggregation[] {new LongRareTerms(name, ORDER, metadata(), format, buckets, maxDocCount, filter)}; |
| 162 | + /* |
| 163 | + * Now build the results! |
| 164 | + */ |
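| | +        // Build sub-aggregation results for every surviving bucket before assembling per-ordinal results |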
| 165 | + buildSubAggsForAllBuckets(rarestPerOrd, b -> b.bucketOrd, (b, aggs) -> b.aggregations = aggs); |
| 166 | + InternalAggregation[] result = new InternalAggregation[owningBucketOrds.length]; |
| 167 | + for (int ordIdx = 0; ordIdx < owningBucketOrds.length; ordIdx++) { |
| 168 | + Arrays.sort(rarestPerOrd[ordIdx], ORDER.comparator()); |
| 169 | + result[ordIdx] = new LongRareTerms( |
| 170 | + name, |
| 171 | + ORDER, |
| 172 | + metadata(), |
| 173 | + format, |
| 174 | + Arrays.asList(rarestPerOrd[ordIdx]), |
| 175 | + maxDocCount, |
| 176 | + filters[ordIdx] |
| 177 | + ); |
| 178 | + } |
| 179 | + return result; |
145 | 180 | } |
146 | 181 |
|
147 | 182 | @Override |
148 | 183 | public InternalAggregation buildEmptyAggregation() { |
149 | | - return new LongRareTerms(name, ORDER, metadata(), format, emptyList(), 0, filter); |
| 184 | + return new LongRareTerms(name, ORDER, metadata(), format, emptyList(), 0, newFilter()); |
150 | 185 | } |
151 | 186 |
|
152 | 187 | @Override |
|