Skip to content

Commit 491a5a0

Browse files
authored
[7.x] Add supports for upper and lower values on boxplot based on the IQR value (#63617) (#64611)
* Add supports for upper and lower values on boxplot based on the IQR value (#63617) * fix List.of usage
1 parent f7511d4 commit 491a5a0

File tree

3 files changed

+158
-32
lines changed

3 files changed

+158
-32
lines changed

docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,21 @@ The response will look like this:
5959
"max": 990.0,
6060
"q1": 165.0,
6161
"q2": 445.0,
62-
"q3": 725.0
62+
"q3": 725.0,
63+
"lower": 0.0,
64+
"upper": 990.0
6365
}
6466
}
6567
}
6668
--------------------------------------------------
6769
// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/]
6870

71+
In this case, the lower and upper whisker values are equal to the min and max. In general, these values mark the ends of the 1.5 *
72+
IQR range, which is to say the values nearest to `q1 - (1.5 * IQR)` and `q3 + (1.5 * IQR)` that still lie inside that range. Since this is an approximation, the given values
73+
may not actually be observed values from the data, but should be within a reasonable error bound of them. While the Boxplot aggregation
74+
doesn't directly return outlier points, you can check if `lower > min` or `upper < max` to see if outliers exist on either side, and then
75+
query for them directly.
76+
6977
==== Script
7078

7179
The boxplot metric supports scripting. For example, if our load times

x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java

Lines changed: 128 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
package org.elasticsearch.xpack.analytics.boxplot;
88

9+
import com.tdunning.math.stats.Centroid;
10+
911
import org.elasticsearch.common.io.stream.StreamInput;
1012
import org.elasticsearch.common.io.stream.StreamOutput;
1113
import org.elasticsearch.common.xcontent.XContentBuilder;
@@ -24,9 +26,91 @@
2426

2527
public class InternalBoxplot extends InternalNumericMetricsAggregation.MultiValue implements Boxplot {
2628

29+
/**
30+
* This value is used in determining the width of the whiskers of the boxplot. After the IQR value is calculated, it gets multiplied
31+
* by this multiplier to decide how far out from the q1 and q3 points to extend the whiskers. The value of 1.5 is traditional.
32+
* See https://en.wikipedia.org/wiki/Box_plot
33+
*/
34+
public static final double IQR_MULTIPLIER = 1.5;
35+
2736
enum Metrics {
37+
MIN {
38+
@Override
39+
double value(InternalBoxplot boxplot) {
40+
return boxplot.getMin();
41+
}
42+
43+
@Override
44+
double value(TDigestState state) {
45+
return state == null ? Double.NEGATIVE_INFINITY : state.getMin();
46+
}
47+
},
48+
MAX {
49+
@Override
50+
double value(InternalBoxplot boxplot) {
51+
return boxplot.getMax();
52+
}
53+
54+
@Override
55+
double value(TDigestState state) {
56+
return state == null ? Double.POSITIVE_INFINITY : state.getMax();
57+
}
58+
},
59+
Q1 {
60+
@Override
61+
double value(InternalBoxplot boxplot) {
62+
return boxplot.getQ1();
63+
}
64+
65+
@Override
66+
double value(TDigestState state) {
67+
return state == null ? Double.NaN : state.quantile(0.25);
68+
}
69+
},
70+
Q2 {
71+
@Override
72+
double value(InternalBoxplot boxplot) {
73+
return boxplot.getQ2();
74+
}
75+
76+
@Override
77+
double value(TDigestState state) {
78+
return state == null ? Double.NaN : state.quantile(0.5);
79+
}
80+
},
81+
Q3 {
82+
@Override
83+
double value(InternalBoxplot boxplot) {
84+
return boxplot.getQ3();
85+
}
86+
87+
@Override
88+
double value(TDigestState state) {
89+
return state == null ? Double.NaN : state.quantile(0.75);
90+
}
91+
},
92+
LOWER {
93+
@Override
94+
double value(InternalBoxplot boxplot) {
95+
return whiskers(boxplot.state)[0];
96+
}
2897

29-
MIN, MAX, Q1, Q2, Q3;
98+
@Override
99+
double value(TDigestState state) {
100+
return whiskers(state)[0];
101+
}
102+
},
103+
UPPER {
104+
@Override
105+
double value(InternalBoxplot boxplot) {
106+
return whiskers(boxplot.state)[1];
107+
}
108+
109+
@Override
110+
double value(TDigestState state) {
111+
return whiskers(state)[1];
112+
}
113+
};
30114

31115
public static Metrics resolve(String name) {
32116
return Metrics.valueOf(name.toUpperCase(Locale.ROOT));
@@ -36,39 +120,48 @@ public String value() {
36120
return name().toLowerCase(Locale.ROOT);
37121
}
38122

39-
double value(InternalBoxplot boxplot) {
40-
switch (this) {
41-
case MIN:
42-
return boxplot.getMin();
43-
case MAX:
44-
return boxplot.getMax();
45-
case Q1:
46-
return boxplot.getQ1();
47-
case Q2:
48-
return boxplot.getQ2();
49-
case Q3:
50-
return boxplot.getQ3();
51-
default:
52-
throw new IllegalArgumentException("Unknown value [" + this.value() + "] in the boxplot aggregation");
53-
}
123+
abstract double value(InternalBoxplot boxplot);
124+
125+
abstract double value(TDigestState state);
126+
}
127+
128+
/**
129+
* For a given TDigest, find the "whisker" valeus, such that the upper whisker is (close to) the highest observed value less than
130+
* q3 + 1.5 * IQR and the lower whisker is (close to) the lowest observed value greater than q1 - 1.5 * IQR. Since we don't track
131+
* observed values directly, this function returns the centroid according to the above logic.
132+
*
133+
* @param state - an initialized TDigestState representing the observed data.
134+
* @return - two doubles in an array, where whiskers[0] is the lower whisker and whiskers[1] is the upper whisker.
135+
*/
136+
public static double[] whiskers(TDigestState state) {
137+
double[] results = new double[2];
138+
results[0] = Double.NaN;
139+
results[1] = Double.NaN;
140+
if (state == null) {
141+
return results;
54142
}
55143

56-
double value(TDigestState state) {
57-
switch (this) {
58-
case MIN:
59-
return state == null ? Double.NEGATIVE_INFINITY : state.getMin();
60-
case MAX:
61-
return state == null ? Double.POSITIVE_INFINITY : state.getMax();
62-
case Q1:
63-
return state == null ? Double.NaN : state.quantile(0.25);
64-
case Q2:
65-
return state == null ? Double.NaN : state.quantile(0.5);
66-
case Q3:
67-
return state == null ? Double.NaN : state.quantile(0.75);
68-
default:
69-
throw new IllegalArgumentException("Unknown value [" + this.value() + "] in the boxplot aggregation");
144+
double q3 = state.quantile(0.75);
145+
double q1 = state.quantile(0.25);
146+
double iqr = q3 - q1;
147+
double upper = q3 + (IQR_MULTIPLIER * iqr);
148+
double lower = q1 - (IQR_MULTIPLIER * iqr);
149+
Centroid prev = null;
150+
// Does this iterate in ascending order? if not, we might need to sort...
151+
for (Centroid c : state.centroids()) {
152+
if (Double.isNaN(results[0]) && c.mean() > lower) {
153+
results[0] = c.mean();
70154
}
155+
if (c.mean() > upper) {
156+
results[1] = prev.mean();
157+
break;
158+
}
159+
prev = c;
160+
}
161+
if (Double.isNaN(results[1])) {
162+
results[1] = state.getMax();
71163
}
164+
return results;
72165
}
73166

74167
public static List<String> metricNames = Stream.of(Metrics.values())
@@ -188,17 +281,22 @@ public InternalBoxplot reduce(List<InternalAggregation> aggregations, ReduceCont
188281

189282
@Override
190283
public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException {
284+
double[] whiskers = whiskers(state);
191285
builder.field("min", getMin());
192286
builder.field("max", getMax());
193287
builder.field("q1", getQ1());
194288
builder.field("q2", getQ2());
195289
builder.field("q3", getQ3());
290+
builder.field("lower", whiskers[0]);
291+
builder.field("upper", whiskers[1]);
196292
if (format != DocValueFormat.RAW) {
197293
builder.field("min_as_string", format.format(getMin()));
198294
builder.field("max_as_string", format.format(getMax()));
199295
builder.field("q1_as_string", format.format(getQ1()));
200296
builder.field("q2_as_string", format.format(getQ2()));
201297
builder.field("q3_as_string", format.format(getQ3()));
298+
builder.field("lower_as_string", format.format(whiskers[0]));
299+
builder.field("upper_as_string", format.format(whiskers[1]));
202300
}
203301
return builder;
204302
}

x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,15 +113,35 @@ protected List<NamedXContentRegistry.Entry> getNamedXContents() {
113113
return extendedNamedXContents;
114114
}
115115

116+
public void testIQR() {
117+
double epsilon = 0.00001; // tolerance on equality for doubles
118+
TDigestState state = new TDigestState(100);
119+
for (double value : org.elasticsearch.common.collect.List.of(52, 57, 57, 58, 63, 66, 66, 67, 67, 68, 69, 70, 70, 70, 70, 72, 73, 75,
120+
75, 76, 76, 78, 79, 89)) {
121+
state.add(value);
122+
}
123+
double[] actual = InternalBoxplot.whiskers(state);
124+
assertEquals(57.0, actual[0], epsilon);
125+
assertEquals(79.0, actual[1], epsilon);
126+
127+
// Test null state
128+
actual = InternalBoxplot.whiskers(null);
129+
assertNotNull(actual);
130+
assertTrue(Double.isNaN(actual[0]));
131+
assertTrue(Double.isNaN(actual[1]));
132+
}
133+
116134
public void testIterator() {
117135
InternalBoxplot aggregation = createTestInstance("test", emptyMap());
118136
List<String> names = StreamSupport.stream(aggregation.valueNames().spliterator(), false).collect(Collectors.toList());
119137

120-
assertEquals(5, names.size());
138+
assertEquals(7, names.size());
121139
assertTrue(names.contains("min"));
122140
assertTrue(names.contains("max"));
123141
assertTrue(names.contains("q1"));
124142
assertTrue(names.contains("q2"));
125143
assertTrue(names.contains("q3"));
144+
assertTrue(names.contains("lower"));
145+
assertTrue(names.contains("upper"));
126146
}
127147
}

0 commit comments

Comments
 (0)