diff --git a/docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc b/docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc index b69c918484d6f..a4c2c4273726f 100644 --- a/docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc +++ b/docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc @@ -59,13 +59,21 @@ The response will look like this: "max": 990.0, "q1": 165.0, "q2": 445.0, - "q3": 725.0 + "q3": 725.0, + "lower": 0.0, + "upper": 990.0 } } } -------------------------------------------------- // TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/] +In this case, the lower and upper whisker values are equal to the min and max. In general, these values mark the ends of the 1.5 * +IQR range, that is, the values nearest to `q1 - (1.5 * IQR)` and `q3 + (1.5 * IQR)` that still fall inside that range. Since this is an approximation, the given values +may not actually be observed values from the data, but should be within a reasonable error bound of them. While the Boxplot aggregation +doesn't directly return outlier points, you can check if `lower > min` or `upper < max` to see if outliers exist on either side, and then +query for them directly. + ==== Script The boxplot metric supports scripting. 
For example, if our load times diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java index 5e16bf646a249..ac6509fe736a2 100644 --- a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java @@ -6,6 +6,8 @@ package org.elasticsearch.xpack.analytics.boxplot; +import com.tdunning.math.stats.Centroid; + import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.xcontent.XContentBuilder; @@ -24,9 +26,91 @@ public class InternalBoxplot extends InternalNumericMetricsAggregation.MultiValue implements Boxplot { + /** + * This value is used in determining the width of the whiskers of the boxplot. After the IQR value is calculated, it gets multiplied + * by this multiplier to decide how far out from the q1 and q3 points to extend the whiskers. The value of 1.5 is traditional. + * See https://en.wikipedia.org/wiki/Box_plot + */ + public static final double IQR_MULTIPLIER = 1.5; + enum Metrics { + MIN { + @Override + double value(InternalBoxplot boxplot) { + return boxplot.getMin(); + } + + @Override + double value(TDigestState state) { + return state == null ? Double.NEGATIVE_INFINITY : state.getMin(); + } + }, + MAX { + @Override + double value(InternalBoxplot boxplot) { + return boxplot.getMax(); + } + + @Override + double value(TDigestState state) { + return state == null ? Double.POSITIVE_INFINITY : state.getMax(); + } + }, + Q1 { + @Override + double value(InternalBoxplot boxplot) { + return boxplot.getQ1(); + } + + @Override + double value(TDigestState state) { + return state == null ? 
Double.NaN : state.quantile(0.25); + } + }, + Q2 { + @Override + double value(InternalBoxplot boxplot) { + return boxplot.getQ2(); + } + + @Override + double value(TDigestState state) { + return state == null ? Double.NaN : state.quantile(0.5); + } + }, + Q3 { + @Override + double value(InternalBoxplot boxplot) { + return boxplot.getQ3(); + } + + @Override + double value(TDigestState state) { + return state == null ? Double.NaN : state.quantile(0.75); + } + }, + LOWER { + @Override + double value(InternalBoxplot boxplot) { + return whiskers(boxplot.state)[0]; + } - MIN, MAX, Q1, Q2, Q3; + @Override + double value(TDigestState state) { + return whiskers(state)[0]; + } + }, + UPPER { + @Override + double value(InternalBoxplot boxplot) { + return whiskers(boxplot.state)[1]; + } + + @Override + double value(TDigestState state) { + return whiskers(state)[1]; + } + }; public static Metrics resolve(String name) { return Metrics.valueOf(name.toUpperCase(Locale.ROOT)); @@ -36,39 +120,48 @@ public String value() { return name().toLowerCase(Locale.ROOT); } - double value(InternalBoxplot boxplot) { - switch (this) { - case MIN: - return boxplot.getMin(); - case MAX: - return boxplot.getMax(); - case Q1: - return boxplot.getQ1(); - case Q2: - return boxplot.getQ2(); - case Q3: - return boxplot.getQ3(); - default: - throw new IllegalArgumentException("Unknown value [" + this.value() + "] in the boxplot aggregation"); - } + abstract double value(InternalBoxplot boxplot); + + abstract double value(TDigestState state); + } + + /** + * For a given TDigest, find the "whisker" values, such that the upper whisker is (close to) the highest observed value less than + * q3 + 1.5 * IQR and the lower whisker is (close to) the lowest observed value greater than q1 - 1.5 * IQR. Since we don't track + * observed values directly, this function returns the centroid according to the above logic. + * + * @param state - an initialized TDigestState representing the observed data. 
+ * @return - an array holding the lower whisker at [0] and the upper whisker at [1]; both are NaN when state is null. + */ + public static double[] whiskers(TDigestState state) { + double[] results = new double[2]; + results[0] = Double.NaN; + results[1] = Double.NaN; + if (state == null) { + return results; } - double value(TDigestState state) { - switch (this) { - case MIN: - return state == null ? Double.NEGATIVE_INFINITY : state.getMin(); - case MAX: - return state == null ? Double.POSITIVE_INFINITY : state.getMax(); - case Q1: - return state == null ? Double.NaN : state.quantile(0.25); - case Q2: - return state == null ? Double.NaN : state.quantile(0.5); - case Q3: - return state == null ? Double.NaN : state.quantile(0.75); - default: - throw new IllegalArgumentException("Unknown value [" + this.value() + "] in the boxplot aggregation"); + double q3 = state.quantile(0.75); + double q1 = state.quantile(0.25); + double iqr = q3 - q1; + double upper = q3 + (IQR_MULTIPLIER * iqr); + double lower = q1 - (IQR_MULTIPLIER * iqr); + Centroid prev = null; + // NOTE(review): relies on centroids() iterating in ascending mean order; prev is null while no centroid <= upper has been seen 
+ for (Centroid c : state.centroids()) { + if (Double.isNaN(results[0]) && c.mean() > lower) { + results[0] = c.mean(); } + if (c.mean() > upper) { + results[1] = prev == null ? c.mean() : prev.mean(); + break; + } + prev = c; + } + if (Double.isNaN(results[1])) { + results[1] = state.getMax(); } + return results; } public static List metricNames = Stream.of(Metrics.values()) @@ -188,17 +281,22 @@ public InternalBoxplot reduce(List aggregations, ReduceCont @Override public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { + double[] whiskers = whiskers(state); builder.field("min", getMin()); builder.field("max", getMax()); builder.field("q1", getQ1()); builder.field("q2", getQ2()); builder.field("q3", getQ3()); + builder.field("lower", whiskers[0]); + builder.field("upper", whiskers[1]); if (format != DocValueFormat.RAW) { builder.field("min_as_string", format.format(getMin())); builder.field("max_as_string", format.format(getMax())); builder.field("q1_as_string", format.format(getQ1())); builder.field("q2_as_string", format.format(getQ2())); builder.field("q3_as_string", format.format(getQ3())); + builder.field("lower_as_string", format.format(whiskers[0])); + builder.field("upper_as_string", format.format(whiskers[1])); } return builder; } diff --git a/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java index ff236a113aada..715737fa189e2 100644 --- a/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java +++ b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java @@ -112,15 +112,34 @@ protected List getNamedXContents() { return extendedNamedXContents; } + public void testIQR() { + double epsilon = 0.00001; // tolerance on equality for doubles + TDigestState state = new TDigestState(100); 
+ for (double value : List.of(52, 57, 57, 58, 63, 66, 66, 67, 67, 68, 69, 70, 70, 70, 70, 72, 73, 75, 75, 76, 76, 78, 79, 89)) { + state.add(value); + } + double[] actual = InternalBoxplot.whiskers(state); + assertEquals(57.0, actual[0], epsilon); + assertEquals(79.0, actual[1], epsilon); + + // A null state must still yield a non-null array, with NaN for both whiskers + actual = InternalBoxplot.whiskers(null); + assertNotNull(actual); + assertTrue(Double.isNaN(actual[0])); + assertTrue(Double.isNaN(actual[1])); + } + public void testIterator() { InternalBoxplot aggregation = createTestInstance("test", emptyMap()); List names = StreamSupport.stream(aggregation.valueNames().spliterator(), false).collect(Collectors.toList()); - assertEquals(5, names.size()); + assertEquals(7, names.size()); assertTrue(names.contains("min")); assertTrue(names.contains("max")); assertTrue(names.contains("q1")); assertTrue(names.contains("q2")); assertTrue(names.contains("q3")); + assertTrue(names.contains("lower")); + assertTrue(names.contains("upper")); } }