Skip to content

Commit de93d95

Browse files
authored
Fix rate agg with custom _doc_count (#79346)
When running a rate aggregation without setting the field parameter, the result is computed based on the bucket doc_count. This PR adds support for a custom _doc_count field. Closes #77734
1 parent 12e2f55 commit de93d95

File tree

5 files changed

+114
-28
lines changed

5 files changed

+114
-28
lines changed

docs/reference/aggregations/metrics/rate-aggregation.asciidoc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
++++
88

99
A `rate` metrics aggregation can be used only inside a `date_histogram` or `composite` aggregation. It calculates a rate of documents
10-
or a field in each bucket. The field values can be generated extracted from specific numeric or
10+
or a field in each bucket. The field values can be extracted from specific numeric or
1111
<<histogram,histogram fields>> in the documents.
1212

1313
NOTE: For `composite` aggregations, there must be exactly one `date_histogram` source for the `rate` aggregation to be supported.
@@ -27,7 +27,7 @@ A `rate` aggregation looks like this in isolation:
2727
--------------------------------------------------
2828
// NOTCONSOLE
2929

30-
The following request will group all sales records into monthly bucket and than convert the number of sales transaction in each bucket
30+
The following request will group all sales records into monthly buckets and then convert the number of sales transactions in each bucket
3131
into per annual sales rate.
3232

3333
[source,console]
@@ -56,8 +56,8 @@ GET sales/_search
5656
<1> Histogram is grouped by month.
5757
<2> But the rate is converted into annual rate.
5858

59-
The response will return the annual rate of transaction in each bucket. Since there are 12 months per year, the annual rate will
60-
be automatically calculated by multiplying monthly rate by 12.
59+
The response will return the annual rate of transactions in each bucket. Since there are 12 months per year, the annual rate will
60+
be automatically calculated by multiplying the monthly rate by 12.
6161

6262
[source,console-result]
6363
--------------------------------------------------

x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/rate/AbstractRateAggregator.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ public abstract class AbstractRateAggregator extends NumericMetricsAggregator.Si
2929
private final Rounding.DateTimeUnit rateUnit;
3030
protected final RateMode rateMode;
3131
private final SizedBucketAggregator sizedBucketAggregator;
32+
protected final boolean computeWithDocCount;
3233

3334
protected DoubleArray sums;
3435
protected DoubleArray compensations;
@@ -55,6 +56,8 @@ public AbstractRateAggregator(
5556
this.rateUnit = rateUnit;
5657
this.rateMode = rateMode;
5758
this.sizedBucketAggregator = findSizedBucketAncestor();
59+
// If no fields or scripts have been defined in the agg, rate should be computed based on bucket doc_counts
60+
this.computeWithDocCount = valuesSourceConfig.fieldContext() == null && valuesSourceConfig.script() == null;
5861
}
5962

6063
private SizedBucketAggregator findSizedBucketAncestor() {
@@ -112,5 +115,4 @@ public InternalAggregation buildEmptyAggregation() {
112115
public void doClose() {
113116
Releasables.close(sums, compensations);
114117
}
115-
116118
}

x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/rate/NumericRateAggregator.java

Lines changed: 53 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import org.elasticsearch.search.aggregations.Aggregator;
1414
import org.elasticsearch.search.aggregations.LeafBucketCollector;
1515
import org.elasticsearch.search.aggregations.LeafBucketCollectorBase;
16+
import org.elasticsearch.search.aggregations.bucket.DocCountProvider;
1617
import org.elasticsearch.search.aggregations.metrics.CompensatedSum;
1718
import org.elasticsearch.search.aggregations.support.AggregationContext;
1819
import org.elasticsearch.search.aggregations.support.ValuesSource;
@@ -22,6 +23,9 @@
2223
import java.util.Map;
2324

2425
public class NumericRateAggregator extends AbstractRateAggregator {
26+
27+
private final DocCountProvider docCountProvider;
28+
2529
public NumericRateAggregator(
2630
String name,
2731
ValuesSourceConfig valuesSourceConfig,
@@ -32,42 +36,68 @@ public NumericRateAggregator(
3236
Map<String, Object> metadata
3337
) throws IOException {
3438
super(name, valuesSourceConfig, rateUnit, rateMode, context, parent, metadata);
39+
docCountProvider = computeWithDocCount ? new DocCountProvider() : null;
3540
}
3641

3742
@Override
3843
public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, final LeafBucketCollector sub) throws IOException {
3944
final CompensatedSum kahanSummation = new CompensatedSum(0, 0);
40-
final SortedNumericDoubleValues values = ((ValuesSource.Numeric) valuesSource).doubleValues(ctx);
41-
return new LeafBucketCollectorBase(sub, values) {
42-
@Override
43-
public void collect(int doc, long bucket) throws IOException {
44-
sums = bigArrays().grow(sums, bucket + 1);
45-
compensations = bigArrays().grow(compensations, bucket + 1);
46-
47-
if (values.advanceExact(doc)) {
48-
final int valuesCount = values.docValueCount();
45+
if (computeWithDocCount) {
46+
// No field or script has been set at the rate agg. So, rate will be computed based on the doc_counts.
47+
// This implementation hard-wires the DocCountProvider and reads the _doc_count fields when available.
48+
// A better approach would be to create a DOC_COUNT ValuesSource type and use that as valuesSource
49+
// In that case the computeRateOnDocs variable and this branch of the if-statement are not required.
50+
docCountProvider.setLeafReaderContext(ctx);
51+
return new LeafBucketCollectorBase(sub, null) {
52+
@Override
53+
public void collect(int doc, long bucket) throws IOException {
54+
sums = bigArrays().grow(sums, bucket + 1);
55+
compensations = bigArrays().grow(compensations, bucket + 1);
4956
// Compute the sum of double values with Kahan summation algorithm which is more
5057
// accurate than naive summation.
5158
double sum = sums.get(bucket);
5259
double compensation = compensations.get(bucket);
5360
kahanSummation.reset(sum, compensation);
54-
switch (rateMode) {
55-
case SUM:
56-
for (int i = 0; i < valuesCount; i++) {
57-
kahanSummation.add(values.nextValue());
58-
}
59-
break;
60-
case VALUE_COUNT:
61-
kahanSummation.add(valuesCount);
62-
break;
63-
default:
64-
throw new IllegalArgumentException("Unsupported rate mode " + rateMode);
65-
}
6661

62+
final int docCount = docCountProvider.getDocCount(doc);
63+
kahanSummation.add(docCount);
6764
compensations.set(bucket, kahanSummation.delta());
6865
sums.set(bucket, kahanSummation.value());
6966
}
70-
}
71-
};
67+
};
68+
} else {
69+
final SortedNumericDoubleValues values = ((ValuesSource.Numeric) valuesSource).doubleValues(ctx);
70+
return new LeafBucketCollectorBase(sub, values) {
71+
@Override
72+
public void collect(int doc, long bucket) throws IOException {
73+
sums = bigArrays().grow(sums, bucket + 1);
74+
compensations = bigArrays().grow(compensations, bucket + 1);
75+
76+
if (values.advanceExact(doc)) {
77+
final int valuesCount = values.docValueCount();
78+
// Compute the sum of double values with Kahan summation algorithm which is more
79+
// accurate than naive summation.
80+
double sum = sums.get(bucket);
81+
double compensation = compensations.get(bucket);
82+
kahanSummation.reset(sum, compensation);
83+
switch (rateMode) {
84+
case SUM:
85+
for (int i = 0; i < valuesCount; i++) {
86+
kahanSummation.add(values.nextValue());
87+
}
88+
break;
89+
case VALUE_COUNT:
90+
kahanSummation.add(valuesCount);
91+
break;
92+
default:
93+
throw new IllegalArgumentException("Unsupported rate mode " + rateMode);
94+
}
95+
96+
compensations.set(bucket, kahanSummation.delta());
97+
sums.set(bucket, kahanSummation.value());
98+
}
99+
}
100+
};
101+
}
72102
}
73103
}

x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/rate/RateAggregatorTests.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import org.elasticsearch.common.settings.Settings;
2525
import org.elasticsearch.core.CheckedConsumer;
2626
import org.elasticsearch.index.fielddata.ScriptDocValues;
27+
import org.elasticsearch.index.mapper.CustomTermFreqField;
2728
import org.elasticsearch.index.mapper.DateFieldMapper;
2829
import org.elasticsearch.index.mapper.KeywordFieldMapper;
2930
import org.elasticsearch.index.mapper.MappedFieldType;
@@ -859,6 +860,18 @@ public void testModeWithoutField() {
859860
assertEquals("The mode parameter is only supported with field or script", ex.getMessage());
860861
}
861862

863+
public void testWithCustomDocCount() throws IOException {
864+
testCase(new MatchAllDocsQuery(), "month", true, "month", null, iw -> {
865+
iw.addDocument(doc("2010-03-12T01:07:45", new CustomTermFreqField("_doc_count", "_doc_count", 10)));
866+
iw.addDocument(doc("2010-04-01T03:43:34"));
867+
iw.addDocument(doc("2010-04-27T03:43:34", new CustomTermFreqField("_doc_count", "_doc_count", 5)));
868+
}, dh -> {
869+
assertThat(dh.getBuckets(), hasSize(2));
870+
assertThat(((InternalRate) dh.getBuckets().get(0).getAggregations().asList().get(0)).value(), closeTo(10.0, 0.000001));
871+
assertThat(((InternalRate) dh.getBuckets().get(1).getAggregations().asList().get(0)).value(), closeTo(6.0, 0.000001));
872+
});
873+
}
874+
862875
private static AbstractAggregationBuilder<?> randomValidMultiBucketAggBuilder(
863876
RateAggregationBuilder rateAggregationBuilder,
864877
DateHistogramInterval interval

x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/analytics/rate.yml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,44 @@ setup:
3434
- length: { aggregations.by_date.buckets: 2 }
3535
- match: { aggregations.by_date.buckets.0.rate.value: 1.0 }
3636
- match: { aggregations.by_date.buckets.1.rate.value: 2.0 }
37+
38+
39+
---
40+
"rate with doc_count":
41+
- skip:
42+
version: " - 7.99.99"
43+
reason: bug fixed in 8.0.0
44+
- do:
45+
bulk:
46+
index: test2
47+
refresh: true
48+
body:
49+
- '{"index": {}}'
50+
- '{"timestamp": "2021-09-14T22:33:37.477Z", "_doc_count": 10}'
51+
- '{"index": {}}'
52+
- '{"timestamp": "2021-09-14T22:35:37.477Z", "_doc_count": 5}'
53+
- '{"index": {}}'
54+
- '{"timestamp": "2021-09-14T22:35:38.477Z", "_doc_count": 1}'
55+
- '{"index": {}}'
56+
- '{"timestamp": "2021-09-14T22:36:08.477Z"}'
57+
- do:
58+
search:
59+
size: 0
60+
index: "test2"
61+
body:
62+
aggs:
63+
by_date:
64+
date_histogram:
65+
field: timestamp
66+
fixed_interval: 60s
67+
aggs:
68+
rate:
69+
rate:
70+
unit: minute
71+
72+
- length: { aggregations.by_date.buckets: 4 }
73+
- match: { aggregations.by_date.buckets.0.rate.value: 10.0 }
74+
- match: { aggregations.by_date.buckets.1.rate.value: 0.0 }
75+
- match: { aggregations.by_date.buckets.2.rate.value: 6.0 }
76+
- match: { aggregations.by_date.buckets.3.rate.value: 1.0 }
77+

0 commit comments

Comments
 (0)