From bcd29a2f3c95ee7013509c797c8d543f935d1f99 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Mon, 31 Aug 2020 09:14:44 -0400 Subject: [PATCH 1/7] experimenting --- .../xpack/analytics/boxplot/InternalBoxplot.java | 6 ++++++ .../xpack/analytics/boxplot/InternalBoxplotTests.java | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java index 0280bbe66d6ce..418427f1aaf30 100644 --- a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java @@ -69,6 +69,12 @@ public String value() { } } + public static double upper(TDigestState state) { + double q3 = state.quantile(0.75); + double iqr = q3 - state.quantile(0.25); + return state.quantile(state.cdf(q3 + (1.5 * iqr))); + } + private final TDigestState state; InternalBoxplot(String name, TDigestState state, DocValueFormat formatter, Map metadata) { diff --git a/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java index fcc251238e915..4d28a1060f6cd 100644 --- a/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java +++ b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java @@ -107,4 +107,13 @@ protected List getNamedXContents() { )); return extendedNamedXContents; } + + public void testIQR() { + double epsilon = 0.00001; // tolerance on equality for doubles + TDigestState state = new TDigestState(100); + for (double value : List.of(52, 57, 57, 58, 63, 66, 66, 67, 67, 68, 69, 70, 70, 70, 70, 72, 73, 75, 75, 76, 76, 78, 79, 89)) { + state.add(value); + } + assertEquals(79.0, InternalBoxplot.upper(state), epsilon); + } } From bad48b3b340ab259906acfabe2052ab9c7a0b25a Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Mon, 14 Sep 2020 11:17:35 -0400 Subject: [PATCH 2/7] function to find IQR based whisker values --- .../analytics/boxplot/InternalBoxplot.java | 33 +++++++++++++++++-- .../boxplot/InternalBoxplotTests.java | 4 ++- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java index 418427f1aaf30..e2cc2e62fbe36 100644 --- a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java @@ -6,6 +6,7 @@ package org.elasticsearch.xpack.analytics.boxplot; +import com.tdunning.math.stats.Centroid; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.xcontent.XContentBuilder; @@ -69,10 +70,36 @@ public String value() { } } - public static double upper(TDigestState state) { + /** + * For a given TDigest, find the "whisker" valeus, such that the upper whisker is (close to) the highest observed value less than + * q3 + 1.5 * IQR and the lower whisker is (close to) the lowest observed value greater than q1 - 1.5 * IQR. Since we don't track + * observed values directly, this function returns the centroid according to the above logic. + * + * @param state - an initialized TDigestState representing the observed data. + * @return - two doubles in an array, where whiskers[0] is the lower whisker and whiskers[1] is the upper whisker. + */ + public static double[] whiskers(TDigestState state) { double q3 = state.quantile(0.75); - double iqr = q3 - state.quantile(0.25); - return state.quantile(state.cdf(q3 + (1.5 * iqr))); + double q1 = state.quantile(0.25); + double iqr = q3 - q1; + double upper = q3 + (1.5 * iqr); + double lower = q1 - (1.5 * iqr); + Centroid prev = null; + double[] results = new double[2]; + results[0] = Double.NaN; + results[1] = Double.NaN; + // Does this iterate in ascending order? if not, we might need to sort... + for (Centroid c : state.centroids()) { + if (Double.isNaN(results[0]) && c.mean() > lower) { + results[0] = c.mean(); + } + if (c.mean() > upper) { + results[1] = prev.mean(); + break; + } + prev = c; + } + return results; } private final TDigestState state; diff --git a/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java index 4d28a1060f6cd..f58310bb09976 100644 --- a/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java +++ b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java @@ -114,6 +114,8 @@ public void testIQR() { for (double value : List.of(52, 57, 57, 58, 63, 66, 66, 67, 67, 68, 69, 70, 70, 70, 70, 72, 73, 75, 75, 76, 76, 78, 79, 89)) { state.add(value); } - assertEquals(79.0, InternalBoxplot.upper(state), epsilon); + double[] actual = InternalBoxplot.whiskers(state); + assertEquals(57.0, actual[0], epsilon); + assertEquals(79.0, actual[1], epsilon); } } From 05e6d1f9c7cd66d4d85096b5e8cb0992442a7a22 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Tue, 15 Sep 2020 09:22:47 -0400 Subject: [PATCH 3/7] replace enum switch with abstract method --- .../analytics/boxplot/InternalBoxplot.java | 99 ++++++++++++------- 1 file changed, 66 insertions(+), 33 deletions(-) diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java index e2cc2e62fbe36..707c3489a2b6a 100644 --- a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java @@ -25,49 +25,82 @@ public class InternalBoxplot extends InternalNumericMetricsAggregation.MultiValu enum Metrics { - MIN, MAX, Q1, Q2, Q3; + MIN + { + @Override + double value(InternalBoxplot boxplot) { + return boxplot.getMin(); + } - public static Metrics resolve(String name) { - return Metrics.valueOf(name.toUpperCase(Locale.ROOT)); - } + @Override + double value(TDigestState state) { + return state == null ? Double.NEGATIVE_INFINITY : state.getMin(); + } + }, + MAX + { + @Override + double value(InternalBoxplot boxplot) { + return boxplot.getMax(); + } - public String value() { - return name().toLowerCase(Locale.ROOT); - } + @Override + double value(TDigestState state) { + return state == null ? Double.POSITIVE_INFINITY : state.getMax(); + } + }, - double value(InternalBoxplot boxplot) { - switch (this) { - case MIN: - return boxplot.getMin(); - case MAX: - return boxplot.getMax(); - case Q1: + Q1 + { + @Override + double value(InternalBoxplot boxplot) { return boxplot.getQ1(); - case Q2: - return boxplot.getQ2(); - case Q3: - return boxplot.getQ3(); - default: - throw new IllegalArgumentException("Unknown value [" + this.value() + "] in the boxplot aggregation"); - } - } + } - double value(TDigestState state) { - switch (this) { - case MIN: - return state == null ? Double.NEGATIVE_INFINITY : state.getMin(); - case MAX: - return state == null ? Double.POSITIVE_INFINITY : state.getMax(); - case Q1: + @Override + double value(TDigestState state) { return state == null ? Double.NaN : state.quantile(0.25); - case Q2: + } + }, + + Q2 + { + @Override + double value(InternalBoxplot boxplot) { + return boxplot.getQ2(); + } + + @Override + double value(TDigestState state) { return state == null ? Double.NaN : state.quantile(0.5); - case Q3: + } + }, + + Q3 + { + @Override + double value(InternalBoxplot boxplot) { + return boxplot.getQ3(); + } + + @Override + double value(TDigestState state) { return state == null ? Double.NaN : state.quantile(0.75); - default: - throw new IllegalArgumentException("Unknown value [" + this.value() + "] in the boxplot aggregation"); + } } + ; + + public static Metrics resolve(String name) { + return Metrics.valueOf(name.toUpperCase(Locale.ROOT)); + } + + public String value() { + return name().toLowerCase(Locale.ROOT); } + + abstract double value(InternalBoxplot boxplot); + + abstract double value(TDigestState state); } /** From 2ad2e6ff7925b5ad71758af719207c1fccc480c1 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Tue, 15 Sep 2020 13:42:24 -0400 Subject: [PATCH 4/7] null handling --- .../analytics/boxplot/InternalBoxplot.java | 149 ++++++++++-------- .../boxplot/InternalBoxplotTests.java | 6 + 2 files changed, 89 insertions(+), 66 deletions(-) diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java index 707c3489a2b6a..0ec029bbefe77 100644 --- a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java @@ -7,6 +7,7 @@ package org.elasticsearch.xpack.analytics.boxplot; import com.tdunning.math.stats.Centroid; + import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.xcontent.XContentBuilder; @@ -24,71 +25,83 @@ public class InternalBoxplot extends InternalNumericMetricsAggregation.MultiValue implements Boxplot { enum Metrics { + MIN { + @Override + double value(InternalBoxplot boxplot) { + return boxplot.getMin(); + } + + @Override + double value(TDigestState state) { + return state == null ? Double.NEGATIVE_INFINITY : state.getMin(); + } + }, + MAX { + @Override + double value(InternalBoxplot boxplot) { + return boxplot.getMax(); + } + + @Override + double value(TDigestState state) { + return state == null ? Double.POSITIVE_INFINITY : state.getMax(); + } + }, + Q1 { + @Override + double value(InternalBoxplot boxplot) { + return boxplot.getQ1(); + } + + @Override + double value(TDigestState state) { + return state == null ? Double.NaN : state.quantile(0.25); + } + }, + Q2 { + @Override + double value(InternalBoxplot boxplot) { + return boxplot.getQ2(); + } + + @Override + double value(TDigestState state) { + return state == null ? Double.NaN : state.quantile(0.5); + } + }, + Q3 { + @Override + double value(InternalBoxplot boxplot) { + return boxplot.getQ3(); + } - MIN - { - @Override - double value(InternalBoxplot boxplot) { - return boxplot.getMin(); - } - - @Override - double value(TDigestState state) { - return state == null ? Double.NEGATIVE_INFINITY : state.getMin(); - } - }, - MAX - { - @Override - double value(InternalBoxplot boxplot) { - return boxplot.getMax(); - } - - @Override - double value(TDigestState state) { - return state == null ? Double.POSITIVE_INFINITY : state.getMax(); - } - }, - - Q1 - { - @Override - double value(InternalBoxplot boxplot) { - return boxplot.getQ1(); - } - - @Override - double value(TDigestState state) { - return state == null ? Double.NaN : state.quantile(0.25); - } - }, - - Q2 - { - @Override - double value(InternalBoxplot boxplot) { - return boxplot.getQ2(); - } - - @Override - double value(TDigestState state) { - return state == null ? Double.NaN : state.quantile(0.5); - } - }, - - Q3 - { - @Override - double value(InternalBoxplot boxplot) { - return boxplot.getQ3(); - } - - @Override - double value(TDigestState state) { - return state == null ? Double.NaN : state.quantile(0.75); - } + @Override + double value(TDigestState state) { + return state == null ? Double.NaN : state.quantile(0.75); + } + }, + LOWER { + @Override + double value(InternalBoxplot boxplot) { + return whiskers(boxplot.state)[0]; + } + + @Override + double value(TDigestState state) { + return whiskers(state)[0]; + } + }, + UPPER { + @Override + double value(InternalBoxplot boxplot) { + return whiskers(boxplot.state)[1]; } - ; + + @Override + double value(TDigestState state) { + return whiskers(state)[1]; + } + }; public static Metrics resolve(String name) { return Metrics.valueOf(name.toUpperCase(Locale.ROOT)); @@ -112,15 +125,19 @@ public String value() { * @return - two doubles in an array, where whiskers[0] is the lower whisker and whiskers[1] is the upper whisker. */ public static double[] whiskers(TDigestState state) { + double[] results = new double[2]; + results[0] = Double.NaN; + results[1] = Double.NaN; + if (state == null) { + return results; + } + double q3 = state.quantile(0.75); double q1 = state.quantile(0.25); double iqr = q3 - q1; double upper = q3 + (1.5 * iqr); double lower = q1 - (1.5 * iqr); Centroid prev = null; - double[] results = new double[2]; - results[0] = Double.NaN; - results[1] = Double.NaN; // Does this iterate in ascending order? if not, we might need to sort... for (Centroid c : state.centroids()) { if (Double.isNaN(results[0]) && c.mean() > lower) { diff --git a/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java index f58310bb09976..73d3ce31d438a 100644 --- a/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java +++ b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java @@ -117,5 +117,11 @@ public void testIQR() { double[] actual = InternalBoxplot.whiskers(state); assertEquals(57.0, actual[0], epsilon); assertEquals(79.0, actual[1], epsilon); + + // Test null state + actual = InternalBoxplot.whiskers(null); + assertNotNull(actual); + assertTrue(Double.isNaN(actual[0])); + assertTrue(Double.isNaN(actual[1])); } } From 3ef7773611ed5d434f54423bfd47bebb1a505ab6 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Thu, 24 Sep 2020 14:40:22 -0400 Subject: [PATCH 5/7] add output format testing --- .../xpack/analytics/boxplot/InternalBoxplot.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java index 0ec029bbefe77..ee070c47cf86f 100644 --- a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java @@ -260,17 +260,22 @@ public InternalBoxplot reduce(List aggregations, ReduceCont @Override public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { + double[] whiskers = whiskers(state); builder.field("min", getMin()); builder.field("max", getMax()); builder.field("q1", getQ1()); builder.field("q2", getQ2()); builder.field("q3", getQ3()); + builder.field("lower", whiskers[0]); + builder.field("upper", whiskers[1]); if (format != DocValueFormat.RAW) { builder.field("min_as_string", format.format(getMin())); builder.field("max_as_string", format.format(getMax())); builder.field("q1_as_string", format.format(getQ1())); builder.field("q2_as_string", format.format(getQ2())); builder.field("q3_as_string", format.format(getQ3())); + builder.field("lower_as_string", format.format(whiskers[0])); + builder.field("upper_as_string", format.format(whiskers[1])); } return builder; } From 940ff1e447ff74e37e9e7bc7a686f418d268d38e Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Mon, 19 Oct 2020 15:29:43 -0400 Subject: [PATCH 6/7] Update docs and fix doc tests --- .../aggregations/metrics/boxplot-aggregation.asciidoc | 10 +++++++++- .../xpack/analytics/boxplot/InternalBoxplot.java | 3 +++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc b/docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc index d6abee0255bf4..8d5175a935777 100644 --- a/docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc +++ b/docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc @@ -56,13 +56,21 @@ The response will look like this: "max": 990.0, "q1": 165.0, "q2": 445.0, - "q3": 725.0 + "q3": 725.0, + "lower": 0.0, + "upper": 990.0 } } } -------------------------------------------------- // TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/] +In this case, the lower and upper whisker values are equal to the min and max. In general, these values are the 1.5 * +IQR range, which is to say the nearest values to `q1 - (1.5 * IQR)` and `q3 + (1.5 * IQR)`. Since this is an approximation, the given values +may not actually be observed values from the data, but should be within a reasonable error bound of them. While the Boxplot aggregation +doesn't directly return outlier points, you can check if `lower > min` or `upper < max` to see if outliers exist on either side, and then +query for them directly. + ==== Script The boxplot metric supports scripting. For example, if our load times diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java index ee070c47cf86f..b67b0b4cc83ab 100644 --- a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java @@ -149,6 +149,9 @@ public static double[] whiskers(TDigestState state) { } prev = c; } + if (Double.isNaN(results[1])) { + results[1] = state.getMax(); + } return results; } From 2d78449be213b6981313a69c751be97654f261b7 Mon Sep 17 00:00:00 2001 From: Mark Tozzi Date: Wed, 4 Nov 2020 11:31:55 -0500 Subject: [PATCH 7/7] response to PR feedback --- .../xpack/analytics/boxplot/InternalBoxplot.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java index 3658b128c5e6d..ac6509fe736a2 100644 --- a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java @@ -26,6 +26,13 @@ public class InternalBoxplot extends InternalNumericMetricsAggregation.MultiValue implements Boxplot { + /** + * This value is used in determining the width of the whiskers of the boxplot. After the IQR value is calculated, it gets multiplied + * by this multiplier to decide how far out from the q1 and q3 points to extend the whiskers. The value of 1.5 is traditional. + * See https://en.wikipedia.org/wiki/Box_plot + */ + public static final double IQR_MULTIPLIER = 1.5; + enum Metrics { MIN { @Override @@ -137,8 +144,8 @@ public static double[] whiskers(TDigestState state) { double q3 = state.quantile(0.75); double q1 = state.quantile(0.25); double iqr = q3 - q1; - double upper = q3 + (1.5 * iqr); - double lower = q1 - (1.5 * iqr); + double upper = q3 + (IQR_MULTIPLIER * iqr); + double lower = q1 - (IQR_MULTIPLIER * iqr); Centroid prev = null; // Does this iterate in ascending order? if not, we might need to sort... for (Centroid c : state.centroids()) {