From e239ac5bd3e11e8e1f31222253fe4077c1b80a01 Mon Sep 17 00:00:00 2001 From: Igor Motov Date: Tue, 4 Feb 2020 17:06:01 -0500 Subject: [PATCH 1/2] Add Boxplot Aggregation Adds a `boxplot` aggregation that calculates min, max, medium and the first and the third quartiles of the given data set. Closes #33112 --- .../metrics/boxplot-aggregation.asciidoc | 204 +++++++++++ .../metrics/MinAggregatorTests.java | 2 +- .../xpack/analytics/AnalyticsPlugin.java | 13 +- .../xpack/analytics/boxplot/Boxplot.java | 63 ++++ .../boxplot/BoxplotAggregationBuilder.java | 133 ++++++++ .../analytics/boxplot/BoxplotAggregator.java | 162 +++++++++ .../boxplot/BoxplotAggregatorFactory.java | 59 ++++ .../analytics/boxplot/InternalBoxplot.java | 200 +++++++++++ .../BoxplotAggregationBuilderTests.java | 65 ++++ .../boxplot/BoxplotAggregatorTests.java | 323 ++++++++++++++++++ .../boxplot/InternalBoxplotTests.java | 113 ++++++ .../rest-api-spec/test/analytics/boxplot.yml | 56 +++ 12 files changed, 1390 insertions(+), 3 deletions(-) create mode 100644 docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/Boxplot.java create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregationBuilder.java create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregator.java create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregatorFactory.java create mode 100644 x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java create mode 100644 x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregationBuilderTests.java create mode 100644 x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregatorTests.java create mode 100644 x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java create mode 100644 x-pack/plugin/src/test/resources/rest-api-spec/test/analytics/boxplot.yml diff --git a/docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc b/docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc new file mode 100644 index 0000000000000..262a53d650e24 --- /dev/null +++ b/docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc @@ -0,0 +1,204 @@ +[role="xpack"] +[testenv="basic"] +[[search-aggregations-metrics-boxplot-aggregation]] +=== Boxplot Aggregation + +A `boxplot` metrics aggregation that computes boxplot of numeric values extracted from the aggregated documents. +These values can be extracted either from specific numeric fields in the documents, or be generated by a provided script. + +The `boxplot` aggregation returns essential information for making a https://en.wikipedia.org/wiki/Box_plot[box plot]: minimum, maximum +median, first quartile (25th percentile) and third quartile (75th percentile) values. + +==== Syntax + +A `boxplot` aggregation looks like this in isolation: + +[source,js] +-------------------------------------------------- +{ + "boxplot": { + "buckets_path": "my_cardinality_agg" + } +} +-------------------------------------------------- +// NOTCONSOLE + +Let's look at a boxplot representing load time: + +[source,console] +-------------------------------------------------- +GET latency/_search +{ + "size": 0, + "aggs" : { + "load_time_boxplot" : { + "boxplot" : { + "field" : "load_time" <1> + } + } + } +} +-------------------------------------------------- +// TEST[setup:latency] +<1> The field `load_time` must be a numeric field + +The response will look like this: + +[source,console-result] +-------------------------------------------------- +{ + ... + + "aggregations": { + "load_time_boxplot": { + "min": 0.0, + "max": 990.0, + "q1": 165.0, + "q2": 445.0, + "q3": 725.0 + } + } +} +-------------------------------------------------- +// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/] + +As you can see, the aggregation will return a calculated value for each percentile +in the default range. If we assume response times are in milliseconds, it is +immediately obvious that the webpage normally loads in 10-725ms, but occasionally +spikes to 945-985ms. + +==== Script + +The boxplot metric supports scripting. For example, if our load times +are in milliseconds but we want percentiles calculated in seconds, we could use +a script to convert them on-the-fly: + +[source,console] +-------------------------------------------------- +GET latency/_search +{ + "size": 0, + "aggs" : { + "load_time_boxplot" : { + "boxplot" : { + "script" : { + "lang": "painless", + "source": "doc['load_time'].value / params.timeUnit", <1> + "params" : { + "timeUnit" : 1000 <2> + } + } + } + } + } +} +-------------------------------------------------- +// TEST[setup:latency] + +<1> The `field` parameter is replaced with a `script` parameter, which uses the +script to generate values which percentiles are calculated on +<2> Scripting supports parameterized input just like any other script + +This will interpret the `script` parameter as an `inline` script with the `painless` script language and no script parameters. To use a +stored script use the following syntax: + +[source,console] +-------------------------------------------------- +GET latency/_search +{ + "size": 0, + "aggs" : { + "load_time_boxplot" : { + "boxplot" : { + "script" : { + "id": "my_script", + "params": { + "field": "load_time" + } + } + } + } + } +} +-------------------------------------------------- +// TEST[setup:latency,stored_example_script] + +[[search-aggregations-metrics-boxplot-aggregation-approximation]] +==== Boxplot values are (usually) approximate + +The algorithm used by the `boxplot` metric is called TDigest (introduced by +Ted Dunning in +https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf[Computing Accurate Quantiles using T-Digests]). + +[WARNING] +==== +Boxplot as other percentile aggregations are also +https://en.wikipedia.org/wiki/Nondeterministic_algorithm[non-deterministic]. +This means you can get slightly different results using the same data. +==== + +[[search-aggregations-metrics-boxplot-aggregation-compression]] +==== Compression + +Approximate algorithms must balance memory utilization with estimation accuracy. +This balance can be controlled using a `compression` parameter: + +[source,console] +-------------------------------------------------- +GET latency/_search +{ + "size": 0, + "aggs" : { + "load_time_boxplot" : { + "boxplot" : { + "field" : "load_time", + "compression" : 200 <1> + } + } + } +} +-------------------------------------------------- +// TEST[setup:latency] + +<1> Compression controls memory usage and approximation error + +The TDigest algorithm uses a number of "nodes" to approximate percentiles -- the +more nodes available, the higher the accuracy (and large memory footprint) proportional +to the volume of data. The `compression` parameter limits the maximum number of +nodes to `20 * compression`. + +Therefore, by increasing the compression value, you can increase the accuracy of +your percentiles at the cost of more memory. Larger compression values also +make the algorithm slower since the underlying tree data structure grows in size, +resulting in more expensive operations. The default compression value is +`100`. + +A "node" uses roughly 32 bytes of memory, so under worst-case scenarios (large amount +of data which arrives sorted and in-order) the default settings will produce a +TDigest roughly 64KB in size. In practice data tends to be more random and +the TDigest will use less memory. + +==== Missing value + +The `missing` parameter defines how documents that are missing a value should be treated. +By default they will be ignored but it is also possible to treat them as if they +had a value. + +[source,console] +-------------------------------------------------- +GET latency/_search +{ + "size": 0, + "aggs" : { + "grade_boxplot" : { + "boxplot" : { + "field" : "grade", + "missing": 10 <1> + } + } + } +} +-------------------------------------------------- +// TEST[setup:latency] + +<1> Documents without a value in the `grade` field will fall into the same bucket as documents that have the value `10`. diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/metrics/MinAggregatorTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/metrics/MinAggregatorTests.java index d6b65dfc62d74..72781f332a007 100644 --- a/server/src/test/java/org/elasticsearch/search/aggregations/metrics/MinAggregatorTests.java +++ b/server/src/test/java/org/elasticsearch/search/aggregations/metrics/MinAggregatorTests.java @@ -378,7 +378,7 @@ public void testGetProperty() throws IOException { iw.addDocument(singleton(new NumericDocValuesField("number", 7))); iw.addDocument(singleton(new NumericDocValuesField("number", 1))); }, (Consumer) global -> { - assertEquals(1.0, global.getDocCount(), 2); + assertEquals(2, global.getDocCount()); assertTrue(AggregationInspectionHelper.hasValue(global)); assertNotNull(global.getAggregations().asMap().get("min")); diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/AnalyticsPlugin.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/AnalyticsPlugin.java index 40f8ac55be89c..26722d5aa3cbe 100644 --- a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/AnalyticsPlugin.java +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/AnalyticsPlugin.java @@ -7,12 +7,15 @@ import org.elasticsearch.action.ActionRequest; import org.elasticsearch.action.ActionResponse; +import org.elasticsearch.common.xcontent.ContextParser; import org.elasticsearch.index.mapper.Mapper; import org.elasticsearch.license.XPackLicenseState; import org.elasticsearch.plugins.ActionPlugin; import org.elasticsearch.plugins.MapperPlugin; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.plugins.SearchPlugin; +import org.elasticsearch.search.aggregations.AggregationBuilder; +import org.elasticsearch.xpack.analytics.boxplot.InternalBoxplot; import org.elasticsearch.xpack.analytics.mapper.HistogramFieldMapper; import org.elasticsearch.xpack.core.XPackPlugin; import org.elasticsearch.xpack.core.action.XPackInfoFeatureAction; @@ -21,6 +24,7 @@ import org.elasticsearch.xpack.analytics.action.AnalyticsInfoTransportAction; import org.elasticsearch.xpack.analytics.action.AnalyticsUsageTransportAction; import org.elasticsearch.xpack.analytics.action.TransportAnalyticsStatsAction; +import org.elasticsearch.xpack.analytics.boxplot.BoxplotAggregationBuilder; import org.elasticsearch.xpack.analytics.cumulativecardinality.CumulativeCardinalityPipelineAggregationBuilder; import org.elasticsearch.xpack.analytics.cumulativecardinality.CumulativeCardinalityPipelineAggregator; import org.elasticsearch.xpack.analytics.stringstats.InternalStringStats; @@ -56,11 +60,16 @@ public List getPipelineAggregations() { @Override public List getAggregations() { - return singletonList( + return Arrays.asList( new AggregationSpec( StringStatsAggregationBuilder.NAME, StringStatsAggregationBuilder::new, - StringStatsAggregationBuilder::parse).addResultReader(InternalStringStats::new) + StringStatsAggregationBuilder::parse).addResultReader(InternalStringStats::new), + new AggregationSpec( + BoxplotAggregationBuilder.NAME, + BoxplotAggregationBuilder::new, + (ContextParser) (p, c) -> BoxplotAggregationBuilder.parse(c, p)) + .addResultReader(InternalBoxplot::new) ); } diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/Boxplot.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/Boxplot.java new file mode 100644 index 0000000000000..3e4ea2b5270f0 --- /dev/null +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/Boxplot.java @@ -0,0 +1,63 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.analytics.boxplot; + +import org.elasticsearch.search.aggregations.metrics.NumericMetricsAggregation; + +public interface Boxplot extends NumericMetricsAggregation.MultiValue { + + /** + * @return The minimum value of all aggregated values. + */ + double getMin(); + + /** + * @return The maximum value of all aggregated values. + */ + double getMax(); + + /** + * @return The first quartile of all aggregated values. + */ + double getQ1(); + + /** + * @return The second quartile of all aggregated values. + */ + double getQ2(); + + /** + * @return The third quartile of all aggregated values. + */ + double getQ3(); + + /** + * @return The minimum value of all aggregated values as a String. + */ + String getMinAsString(); + + /** + * @return The maximum value of all aggregated values as a String. + */ + String getMaxAsString(); + + /** + * @return The first quartile of all aggregated values as a String. + */ + String getQ1AsString(); + + /** + * @return The second quartile of all aggregated values as a String. + */ + String getQ2AsString(); + + /** + * @return The third quartile of all aggregated values as a String. + */ + String getQ3AsString(); + +} diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregationBuilder.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregationBuilder.java new file mode 100644 index 0000000000000..a39b799762c6a --- /dev/null +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregationBuilder.java @@ -0,0 +1,133 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.analytics.boxplot; + +import org.elasticsearch.common.ParseField; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.xcontent.ObjectParser; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.index.query.QueryShardContext; +import org.elasticsearch.search.aggregations.AggregationBuilder; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.AggregatorFactory; +import org.elasticsearch.search.aggregations.metrics.PercentilesMethod; +import org.elasticsearch.search.aggregations.support.CoreValuesSourceType; +import org.elasticsearch.search.aggregations.support.ValueType; +import org.elasticsearch.search.aggregations.support.ValuesSource; +import org.elasticsearch.search.aggregations.support.ValuesSourceAggregationBuilder; +import org.elasticsearch.search.aggregations.support.ValuesSourceConfig; +import org.elasticsearch.search.aggregations.support.ValuesSourceParserHelper; + +import java.io.IOException; +import java.util.Map; +import java.util.Objects; + +public class BoxplotAggregationBuilder extends ValuesSourceAggregationBuilder.LeafOnly { + public static final String NAME = "boxplot"; + + public static final ParseField COMPRESSION_FIELD = new ParseField("compression"); + + private static final ObjectParser PARSER; + + static { + PARSER = new ObjectParser<>(BoxplotAggregationBuilder.NAME); + ValuesSourceParserHelper.declareNumericFields(PARSER, true, true, false); + PARSER.declareDouble((builder, compression) -> builder.compression = compression, COMPRESSION_FIELD); + } + + public static AggregationBuilder parse(String aggregationName, XContentParser parser) throws IOException { + return PARSER.parse(parser, new BoxplotAggregationBuilder(aggregationName), null); + } + + private double compression = 100.0; + + public BoxplotAggregationBuilder(String name) { + super(name, CoreValuesSourceType.NUMERIC, ValueType.NUMERIC); + } + + protected BoxplotAggregationBuilder(BoxplotAggregationBuilder clone, + AggregatorFactories.Builder factoriesBuilder, Map metaData) { + super(clone, factoriesBuilder, metaData); + this.compression = clone.compression; + } + + @Override + protected AggregationBuilder shallowCopy(AggregatorFactories.Builder factoriesBuilder, Map metaData) { + return new BoxplotAggregationBuilder(this, factoriesBuilder, metaData); + } + + /** + * Read from a stream. + */ + public BoxplotAggregationBuilder(StreamInput in) throws IOException { + super(in, CoreValuesSourceType.NUMERIC, ValueType.NUMERIC); + compression = in.readDouble(); + } + + @Override + protected void innerWriteTo(StreamOutput out) throws IOException { + out.writeDouble(compression); + } + + /** + * Expert: set the compression. Higher values improve accuracy but also + * memory usage. Only relevant when using {@link PercentilesMethod#TDIGEST}. + */ + public BoxplotAggregationBuilder compression(double compression) { + if (compression < 0.0) { + throw new IllegalArgumentException( + "[compression] must be greater than or equal to 0. Found [" + compression + "] in [" + name + "]"); + } + this.compression = compression; + return this; + } + + /** + * Expert: get the compression. Higher values improve accuracy but also + * memory usage. Only relevant when using {@link PercentilesMethod#TDIGEST}. + */ + public double compression() { + return compression; + } + + @Override + protected BoxplotAggregatorFactory innerBuild(QueryShardContext queryShardContext, + ValuesSourceConfig config, + AggregatorFactory parent, + AggregatorFactories.Builder subFactoriesBuilder) throws IOException { + return new BoxplotAggregatorFactory(name, config, compression, queryShardContext, parent, subFactoriesBuilder, metaData); + } + + @Override + public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { + builder.field(COMPRESSION_FIELD.getPreferredName(), compression); + return builder; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null || getClass() != obj.getClass()) return false; + if (super.equals(obj) == false) return false; + BoxplotAggregationBuilder other = (BoxplotAggregationBuilder) obj; + return Objects.equals(compression, other.compression); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), compression); + } + + @Override + public String getType() { + return NAME; + } +} + diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregator.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregator.java new file mode 100644 index 0000000000000..c7b35df22d22e --- /dev/null +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregator.java @@ -0,0 +1,162 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.analytics.boxplot; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.ScoreMode; +import org.elasticsearch.common.lease.Releasables; +import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.common.util.ObjectArray; +import org.elasticsearch.index.fielddata.SortedNumericDoubleValues; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.LeafBucketCollector; +import org.elasticsearch.search.aggregations.LeafBucketCollectorBase; +import org.elasticsearch.search.aggregations.metrics.NumericMetricsAggregator; +import org.elasticsearch.search.aggregations.metrics.TDigestState; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; +import org.elasticsearch.search.aggregations.support.ValuesSource; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +public class BoxplotAggregator extends NumericMetricsAggregator.MultiValue { + + private final ValuesSource.Numeric valuesSource; + private final DocValueFormat format; + protected ObjectArray states; + protected final double compression; + + BoxplotAggregator(String name, ValuesSource.Numeric valuesSource, DocValueFormat formatter, double compression, + SearchContext context, Aggregator parent, List pipelineAggregators, + Map metaData) throws IOException { + super(name, context, parent, pipelineAggregators, metaData); + this.valuesSource = valuesSource; + this.format = formatter; + this.compression = compression; + if (valuesSource != null) { + states = context.bigArrays().newObjectArray(1); + } + } + + @Override + public ScoreMode scoreMode() { + return valuesSource != null && valuesSource.needsScores() ? ScoreMode.COMPLETE : ScoreMode.COMPLETE_NO_SCORES; + } + + @Override + public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, + final LeafBucketCollector sub) throws IOException { + if (valuesSource == null) { + return LeafBucketCollector.NO_OP_COLLECTOR; + } + final BigArrays bigArrays = context.bigArrays(); + final SortedNumericDoubleValues values = valuesSource.doubleValues(ctx); + return new LeafBucketCollectorBase(sub, values) { + @Override + public void collect(int doc, long bucket) throws IOException { + states = bigArrays.grow(states, bucket + 1); + + if (values.advanceExact(doc)) { + TDigestState state = getExistingOrNewHistogram(bigArrays, bucket); + if (values.advanceExact(doc)) { + final int valueCount = values.docValueCount(); + for (int i = 0; i < valueCount; i++) { + state.add(values.nextValue()); + } + } + } + } + }; + } + + private TDigestState getExistingOrNewHistogram(final BigArrays bigArrays, long bucket) { + states = bigArrays.grow(states, bucket + 1); + TDigestState state = states.get(bucket); + if (state == null) { + state = new TDigestState(compression); + states.set(bucket, state); + } + return state; + } + + @Override + public boolean hasMetric(String name) { + try { + InternalBoxplot.Metrics.resolve(name); + return true; + } catch (IllegalArgumentException iae) { + return false; + } + } + + @Override + public double metric(String name, long owningBucketOrd) { + if (valuesSource != null && owningBucketOrd < states.size()) { + TDigestState state = states.get(owningBucketOrd); + if (state != null) { + switch (InternalBoxplot.Metrics.resolve(name)) { + case min: + return state.getMin(); + case max: + return state.getMax(); + case q1: + return state.quantile(0.25); + case q2: + return state.quantile(0.5); + case q3: + return state.quantile(0.75); + default: + throw new IllegalArgumentException("Unknown value [" + name + "] in boxplot aggregation"); + } + } + } + switch (InternalBoxplot.Metrics.resolve(name)) { + case min: + return Double.NEGATIVE_INFINITY; + case max: + return Double.POSITIVE_INFINITY; + case q1: + case q2: + case q3: + return Double.NaN; + default: + throw new IllegalArgumentException("Unknown value [" + name + "] in boxplot aggregation"); + } + } + + + @Override + public InternalAggregation buildAggregation(long owningBucketOrdinal) { + TDigestState state = getState(owningBucketOrdinal); + if (state == null) { + return buildEmptyAggregation(); + } else { + return new InternalBoxplot(name, state, format, pipelineAggregators(), metaData()); + } + } + + TDigestState getState(long bucketOrd) { + if (valuesSource == null || bucketOrd >= states.size()) { + return null; + } + return states.get(bucketOrd); + } + + @Override + public InternalAggregation buildEmptyAggregation() { + return new InternalBoxplot(name, new TDigestState(compression), format, pipelineAggregators(), metaData()); + } + + @Override + public void doClose() { + Releasables.close(states); + } +} diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregatorFactory.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregatorFactory.java new file mode 100644 index 0000000000000..190f65137f4be --- /dev/null +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregatorFactory.java @@ -0,0 +1,59 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.analytics.boxplot; + +import org.elasticsearch.index.query.QueryShardContext; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.AggregatorFactory; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; +import org.elasticsearch.search.aggregations.support.ValuesSource; +import org.elasticsearch.search.aggregations.support.ValuesSourceAggregatorFactory; +import org.elasticsearch.search.aggregations.support.ValuesSourceConfig; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +public class BoxplotAggregatorFactory extends ValuesSourceAggregatorFactory { + + private final double compression; + + BoxplotAggregatorFactory(String name, + ValuesSourceConfig config, + double compression, + QueryShardContext queryShardContext, + AggregatorFactory parent, + AggregatorFactories.Builder subFactoriesBuilder, + Map metaData) throws IOException { + super(name, config, queryShardContext, parent, subFactoriesBuilder, metaData); + this.compression = compression; + } + + @Override + protected Aggregator createUnmapped(SearchContext searchContext, + Aggregator parent, + List pipelineAggregators, + Map metaData) + throws IOException { + return new BoxplotAggregator(name, null, config.format(), compression, searchContext, parent, + pipelineAggregators, metaData); + } + + @Override + protected Aggregator doCreateInternal(ValuesSource.Numeric valuesSource, + SearchContext searchContext, + Aggregator parent, + boolean collectsFromSingleBucket, + List pipelineAggregators, + Map metaData) throws IOException { + return new BoxplotAggregator(name, valuesSource, config.format(), compression, searchContext, parent, + pipelineAggregators, metaData); + } + +} diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java new file mode 100644 index 0000000000000..7ead3fa07d486 --- /dev/null +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java @@ -0,0 +1,200 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.analytics.boxplot; + +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.metrics.InternalNumericMetricsAggregation; +import org.elasticsearch.search.aggregations.metrics.TDigestState; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +public class InternalBoxplot extends InternalNumericMetricsAggregation.MultiValue implements Boxplot { + + enum Metrics { + + min, max, q1, q2, q3; + + public static Metrics resolve(String name) { + return Metrics.valueOf(name); + } + } + + private final TDigestState state; + + InternalBoxplot(String name, TDigestState state, DocValueFormat formatter, List pipelineAggregators, + Map metaData) { + super(name, pipelineAggregators, metaData); + this.state = state; + this.format = formatter; + } + + /** + * Read from a stream. + */ + public InternalBoxplot(StreamInput in) throws IOException { + super(in); + format = in.readNamedWriteable(DocValueFormat.class); + state = TDigestState.read(in); + } + + @Override + protected void doWriteTo(StreamOutput out) throws IOException { + out.writeNamedWriteable(format); + TDigestState.write(state, out); + } + + @Override + public String getWriteableName() { + return BoxplotAggregationBuilder.NAME; + } + + @Override + public double getMin() { + return state.getMin(); + } + + @Override + public double getMax() { + return state.getMax(); + } + + @Override + public double getQ1() { + return state.quantile(0.25); + } + + @Override + public double getQ2() { + return state.quantile(0.5); + } + + @Override + public double getQ3() { + return state.quantile(0.75); + } + + @Override + public String getMinAsString() { + return valueAsString(Metrics.min.name()); + } + + @Override + public String getMaxAsString() { + return valueAsString(Metrics.max.name()); + } + + @Override + public String getQ1AsString() { + return valueAsString(Metrics.q1.name()); + } + + @Override + public String getQ2AsString() { + return valueAsString(Metrics.q2.name()); + } + + @Override + public String getQ3AsString() { + return valueAsString(Metrics.q3.name()); + } + + @Override + public double value(String name) { + Metrics metrics = Metrics.valueOf(name); + switch (metrics) { + case min: + return getMin(); + case max: + return getMax(); + case q1: + return getQ1(); + case q2: + return getQ2(); + case q3: + return getQ3(); + default: + throw new IllegalArgumentException("Unknown value [" + name + "] in common stats aggregation"); + } + } + + // for testing only + DocValueFormat format() { + return format; + } + + // for testing only + TDigestState state() { + return state; + } + + @Override + public InternalBoxplot reduce(List aggregations, ReduceContext reduceContext) { + TDigestState merged = null; + for (InternalAggregation aggregation : aggregations) { + final InternalBoxplot percentiles = (InternalBoxplot) aggregation; + if (merged == null) { + merged = new TDigestState(percentiles.state.compression()); + } + merged.add(percentiles.state); + } + return new InternalBoxplot(name, merged, format, pipelineAggregators(), metaData); + } + + static class Fields { + public static final String MIN = "min"; + public static final String MIN_AS_STRING = "min_as_string"; + public static final String MAX = "max"; + public static final String MAX_AS_STRING = "max_as_string"; + public static final String Q1 = "q1"; + public static final String Q1_AS_STRING = "q1_as_string"; + public static final String Q2 = "q2"; + public static final String Q2_AS_STRING = "q2_as_string"; + public static final String Q3 = "q3"; + public static final String Q3_AS_STRING = "q3_as_string"; + } + + @Override + public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { + builder.field(Fields.MIN, getMin()); + builder.field(Fields.MAX, getMax()); + builder.field(Fields.Q1, getQ1()); + builder.field(Fields.Q2, getQ2()); + builder.field(Fields.Q3, getQ3()); + if (format != DocValueFormat.RAW) { + builder.field(Fields.MIN_AS_STRING, format.format(getMin())); + builder.field(Fields.MAX_AS_STRING, format.format(getMax())); + builder.field(Fields.Q1_AS_STRING, format.format(getQ1())); + builder.field(Fields.Q2_AS_STRING, format.format(getQ2())); + builder.field(Fields.Q3_AS_STRING, format.format(getQ3())); + } + return builder; + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), state); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null || getClass() != obj.getClass()) return false; + if (super.equals(obj) == false) return false; + + InternalBoxplot that = (InternalBoxplot) obj; + return Objects.equals(state, that.state); + } +} + diff --git a/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregationBuilderTests.java b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregationBuilderTests.java new file mode 100644 index 0000000000000..fc9f88b7c80d3 --- /dev/null +++ b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregationBuilderTests.java @@ -0,0 +1,65 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.analytics.boxplot; + +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.xcontent.NamedXContentRegistry; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.search.SearchModule; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.test.AbstractSerializingTestCase; +import org.elasticsearch.xpack.analytics.AnalyticsPlugin; +import org.junit.Before; + +import java.io.IOException; +import java.util.Collections; + +import static org.hamcrest.Matchers.hasSize; + +public class BoxplotAggregationBuilderTests extends AbstractSerializingTestCase { + String aggregationName; + + @Before + public void setupName() { + aggregationName = randomAlphaOfLength(10); + } + + @Override + protected NamedXContentRegistry xContentRegistry() { + SearchModule searchModule = new SearchModule(Settings.EMPTY, Collections.singletonList(new AnalyticsPlugin())); + return new NamedXContentRegistry(searchModule.getNamedXContents()); + } + + @Override + protected BoxplotAggregationBuilder doParseInstance(XContentParser parser) throws IOException { + assertSame(XContentParser.Token.START_OBJECT, parser.nextToken()); + AggregatorFactories.Builder parsed = AggregatorFactories.parseAggregators(parser); + assertThat(parsed.getAggregatorFactories(), hasSize(1)); + assertThat(parsed.getPipelineAggregatorFactories(), hasSize(0)); + BoxplotAggregationBuilder agg = (BoxplotAggregationBuilder) parsed.getAggregatorFactories().iterator().next(); + assertNull(parser.nextToken()); + assertNotNull(agg); + return agg; + } + + @Override + protected BoxplotAggregationBuilder createTestInstance() { + BoxplotAggregationBuilder aggregationBuilder = new BoxplotAggregationBuilder(aggregationName) + .field(randomAlphaOfLength(10)); + if (randomBoolean()) { + aggregationBuilder.compression(randomDoubleBetween(0, 100, true)); + } + return aggregationBuilder; + } + + @Override + protected Writeable.Reader instanceReader() { + return BoxplotAggregationBuilder::new; + } +} + diff --git a/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregatorTests.java b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregatorTests.java new file mode 100644 index 0000000000000..31a532d5314be --- /dev/null +++ b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregatorTests.java @@ -0,0 +1,323 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.analytics.boxplot; + +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.search.DocValuesFieldExistsQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.CheckedConsumer; +import org.elasticsearch.index.mapper.KeywordFieldMapper; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.NumberFieldMapper; +import org.elasticsearch.search.aggregations.AggregationBuilder; +import org.elasticsearch.search.aggregations.AggregatorTestCase; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.bucket.global.GlobalAggregationBuilder; +import org.elasticsearch.search.aggregations.bucket.global.InternalGlobal; +import org.elasticsearch.search.aggregations.bucket.histogram.HistogramAggregationBuilder; +import org.elasticsearch.search.aggregations.bucket.histogram.InternalHistogram; +import org.elasticsearch.search.aggregations.support.AggregationInspectionHelper; + +import java.io.IOException; +import java.util.function.Consumer; + +import static java.util.Collections.singleton; +import static org.hamcrest.Matchers.equalTo; + +public class BoxplotAggregatorTests extends AggregatorTestCase { + + public void testNoMatchingField() throws IOException { + testCase(new MatchAllDocsQuery(), iw -> { + iw.addDocument(singleton(new SortedNumericDocValuesField("wrong_number", 7))); + iw.addDocument(singleton(new SortedNumericDocValuesField("wrong_number", 3))); + }, boxplot -> { + assertEquals(Double.POSITIVE_INFINITY, boxplot.getMin(), 0); + assertEquals(Double.NEGATIVE_INFINITY, boxplot.getMax(), 0); + assertEquals(Double.NaN, boxplot.getQ1(), 0); + assertEquals(Double.NaN, boxplot.getQ2(), 0); + assertEquals(Double.NaN, boxplot.getQ3(), 0); + }); + } + + public void testMatchesSortedNumericDocValues() throws IOException { + testCase(new MatchAllDocsQuery(), iw -> { + iw.addDocument(singleton(new SortedNumericDocValuesField("number", 2))); + iw.addDocument(singleton(new SortedNumericDocValuesField("number", 2))); + iw.addDocument(singleton(new SortedNumericDocValuesField("number", 3))); + iw.addDocument(singleton(new SortedNumericDocValuesField("number", 4))); + iw.addDocument(singleton(new SortedNumericDocValuesField("number", 5))); + iw.addDocument(singleton(new SortedNumericDocValuesField("number", 10))); + }, boxplot -> { + assertEquals(2, boxplot.getMin(), 0); + assertEquals(10, boxplot.getMax(), 0); + assertEquals(2, boxplot.getQ1(), 0); + assertEquals(3.5, boxplot.getQ2(), 0); + assertEquals(5, boxplot.getQ3(), 0); + }); + } + + public void testMatchesNumericDocValues() throws IOException { + testCase(new MatchAllDocsQuery(), iw -> { + iw.addDocument(singleton(new NumericDocValuesField("number", 2))); + iw.addDocument(singleton(new NumericDocValuesField("number", 2))); + iw.addDocument(singleton(new NumericDocValuesField("number", 3))); + iw.addDocument(singleton(new NumericDocValuesField("number", 4))); + iw.addDocument(singleton(new NumericDocValuesField("number", 5))); + iw.addDocument(singleton(new NumericDocValuesField("number", 10))); + }, boxplot -> { + assertEquals(2, boxplot.getMin(), 0); + assertEquals(10, boxplot.getMax(), 0); + assertEquals(2, boxplot.getQ1(), 0); + assertEquals(3.5, boxplot.getQ2(), 0); + assertEquals(5, boxplot.getQ3(), 0); + }); + } + + public void testSomeMatchesSortedNumericDocValues() throws IOException { + testCase(new DocValuesFieldExistsQuery("number"), iw -> { + iw.addDocument(singleton(new SortedNumericDocValuesField("number", 2))); + iw.addDocument(singleton(new SortedNumericDocValuesField("number", 2))); + iw.addDocument(singleton(new SortedNumericDocValuesField("number2", 2))); + iw.addDocument(singleton(new SortedNumericDocValuesField("number", 3))); + iw.addDocument(singleton(new SortedNumericDocValuesField("number", 4))); + iw.addDocument(singleton(new SortedNumericDocValuesField("number", 5))); + iw.addDocument(singleton(new SortedNumericDocValuesField("number", 10))); + }, boxplot -> { + assertEquals(2, boxplot.getMin(), 0); + assertEquals(10, boxplot.getMax(), 0); + assertEquals(2, boxplot.getQ1(), 0); + assertEquals(3.5, boxplot.getQ2(), 0); + assertEquals(5, boxplot.getQ3(), 0); + }); + } + + public void testSomeMatchesNumericDocValues() throws IOException { + testCase(new DocValuesFieldExistsQuery("number"), iw -> { + iw.addDocument(singleton(new NumericDocValuesField("number", 2))); + iw.addDocument(singleton(new NumericDocValuesField("number", 2))); + iw.addDocument(singleton(new NumericDocValuesField("number2", 2))); + iw.addDocument(singleton(new NumericDocValuesField("number", 3))); + iw.addDocument(singleton(new NumericDocValuesField("number", 4))); + iw.addDocument(singleton(new NumericDocValuesField("number", 5))); + iw.addDocument(singleton(new NumericDocValuesField("number", 10))); + }, boxplot -> { + assertEquals(2, boxplot.getMin(), 0); + assertEquals(10, boxplot.getMax(), 0); + assertEquals(2, boxplot.getQ1(), 0); + assertEquals(3.5, boxplot.getQ2(), 0); + assertEquals(5, boxplot.getQ3(), 0); + }); + } + + public void testUnmappedWithMissingField() throws IOException { + BoxplotAggregationBuilder aggregationBuilder = new BoxplotAggregationBuilder("boxplot") + .field("does_not_exist").missing(0L); + + MappedFieldType fieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType.setName("number"); + + testCase(aggregationBuilder, new MatchAllDocsQuery(), iw -> { + iw.addDocument(singleton(new NumericDocValuesField("number", 7))); + iw.addDocument(singleton(new NumericDocValuesField("number", 1))); + }, (Consumer) boxplot -> { + assertEquals(0, boxplot.getMin(), 0); + assertEquals(0, boxplot.getMax(), 0); + assertEquals(0, boxplot.getQ1(), 0); + assertEquals(0, boxplot.getQ2(), 0); + assertEquals(0, boxplot.getQ3(), 0); + }, fieldType); + } + + public void testUnsupportedType() { + BoxplotAggregationBuilder aggregationBuilder = new BoxplotAggregationBuilder("boxplot").field("not_a_number"); + + MappedFieldType fieldType = new KeywordFieldMapper.KeywordFieldType(); + fieldType.setName("not_a_number"); + fieldType.setHasDocValues(true); + + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> testCase(aggregationBuilder, new MatchAllDocsQuery(), iw -> { + iw.addDocument(singleton(new SortedSetDocValuesField("string", new BytesRef("foo")))); + }, (Consumer) boxplot -> { + fail("Should have thrown exception"); + }, fieldType)); + assertEquals(e.getMessage(), "Expected numeric type on field [not_a_number], but got [keyword]"); + } + + public void testBadMissingField() { + BoxplotAggregationBuilder aggregationBuilder = new BoxplotAggregationBuilder("boxplot").field("number") + .missing("not_a_number"); + + MappedFieldType fieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType.setName("number"); + + expectThrows(NumberFormatException.class, + () -> testCase(aggregationBuilder, new MatchAllDocsQuery(), iw -> { + iw.addDocument(singleton(new NumericDocValuesField("number", 2))); + iw.addDocument(singleton(new NumericDocValuesField("number", 2))); + iw.addDocument(singleton(new NumericDocValuesField("number", 3))); + iw.addDocument(singleton(new NumericDocValuesField("number", 4))); + iw.addDocument(singleton(new NumericDocValuesField("number", 5))); + iw.addDocument(singleton(new NumericDocValuesField("number", 10))); + }, (Consumer) boxplot -> { + fail("Should have thrown exception"); + }, fieldType)); + } + + public void testUnmappedWithBadMissingField() { + BoxplotAggregationBuilder aggregationBuilder = new BoxplotAggregationBuilder("boxplot") + .field("does_not_exist").missing("not_a_number"); + + MappedFieldType fieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType.setName("number"); + + expectThrows(NumberFormatException.class, + () -> testCase(aggregationBuilder, new MatchAllDocsQuery(), iw -> { + iw.addDocument(singleton(new NumericDocValuesField("number", 2))); + iw.addDocument(singleton(new NumericDocValuesField("number", 2))); + iw.addDocument(singleton(new NumericDocValuesField("number", 3))); + iw.addDocument(singleton(new NumericDocValuesField("number", 4))); + iw.addDocument(singleton(new NumericDocValuesField("number", 5))); + iw.addDocument(singleton(new NumericDocValuesField("number", 10))); + }, (Consumer) boxplot -> { + fail("Should have thrown exception"); + }, fieldType)); + } + + public void testEmptyBucket() throws IOException { + HistogramAggregationBuilder histogram = new HistogramAggregationBuilder("histo").field("number").interval(10).minDocCount(0) + .subAggregation(new BoxplotAggregationBuilder("boxplot").field("number")); + + MappedFieldType fieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType.setName("number"); + + testCase(histogram, new MatchAllDocsQuery(), iw -> { + iw.addDocument(singleton(new NumericDocValuesField("number", 1))); + iw.addDocument(singleton(new NumericDocValuesField("number", 3))); + iw.addDocument(singleton(new NumericDocValuesField("number", 21))); + iw.addDocument(singleton(new NumericDocValuesField("number", 23))); + }, (Consumer) histo -> { + assertThat(histo.getBuckets().size(), equalTo(3)); + + assertNotNull(histo.getBuckets().get(0).getAggregations().asMap().get("boxplot")); + InternalBoxplot boxplot = (InternalBoxplot) histo.getBuckets().get(0).getAggregations().asMap().get("boxplot"); + assertEquals(1, boxplot.getMin(), 0); + assertEquals(3, boxplot.getMax(), 0); + assertEquals(1, boxplot.getQ1(), 0); + assertEquals(2, boxplot.getQ2(), 0); + assertEquals(3, boxplot.getQ3(), 0); + + assertNotNull(histo.getBuckets().get(1).getAggregations().asMap().get("boxplot")); + boxplot = (InternalBoxplot) histo.getBuckets().get(1).getAggregations().asMap().get("boxplot"); + assertEquals(Double.POSITIVE_INFINITY, boxplot.getMin(), 0); + assertEquals(Double.NEGATIVE_INFINITY, boxplot.getMax(), 0); + assertEquals(Double.NaN, boxplot.getQ1(), 0); + assertEquals(Double.NaN, boxplot.getQ2(), 0); + assertEquals(Double.NaN, boxplot.getQ3(), 0); + + assertNotNull(histo.getBuckets().get(2).getAggregations().asMap().get("boxplot")); + boxplot = (InternalBoxplot) histo.getBuckets().get(2).getAggregations().asMap().get("boxplot"); + assertEquals(21, boxplot.getMin(), 0); + assertEquals(23, boxplot.getMax(), 0); + assertEquals(21, boxplot.getQ1(), 0); + assertEquals(22, boxplot.getQ2(), 0); + assertEquals(23, boxplot.getQ3(), 0); + }, fieldType); + } + + public void testFormatter() throws IOException { + BoxplotAggregationBuilder aggregationBuilder = new BoxplotAggregationBuilder("boxplot").field("number") + .format("0000.0"); + + MappedFieldType fieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType.setName("number"); + + testCase(aggregationBuilder, new MatchAllDocsQuery(), iw -> { + iw.addDocument(singleton(new NumericDocValuesField("number", 1))); + iw.addDocument(singleton(new NumericDocValuesField("number", 2))); + iw.addDocument(singleton(new NumericDocValuesField("number", 3))); + iw.addDocument(singleton(new NumericDocValuesField("number", 4))); + iw.addDocument(singleton(new NumericDocValuesField("number", 5))); + }, (Consumer) boxplot -> { + assertEquals(1, boxplot.getMin(), 0); + assertEquals(5, boxplot.getMax(), 0); + assertEquals(1.75, boxplot.getQ1(), 0); + assertEquals(3, boxplot.getQ2(), 0); + assertEquals(4.25, boxplot.getQ3(), 0); + assertEquals("0001.0", boxplot.getMinAsString()); + assertEquals("0005.0", boxplot.getMaxAsString()); + assertEquals("0001.8", boxplot.getQ1AsString()); + assertEquals("0003.0", boxplot.getQ2AsString()); + assertEquals("0004.2", boxplot.getQ3AsString()); + }, fieldType); + } + + public void testGetProperty() throws IOException { + GlobalAggregationBuilder globalBuilder = new GlobalAggregationBuilder("global") + .subAggregation(new BoxplotAggregationBuilder("boxplot").field("number")); + + MappedFieldType fieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType.setName("number"); + + testCase(globalBuilder, new MatchAllDocsQuery(), iw -> { + iw.addDocument(singleton(new NumericDocValuesField("number", 1))); + iw.addDocument(singleton(new NumericDocValuesField("number", 2))); + iw.addDocument(singleton(new NumericDocValuesField("number", 3))); + iw.addDocument(singleton(new NumericDocValuesField("number", 4))); + iw.addDocument(singleton(new NumericDocValuesField("number", 5))); + }, (Consumer) global -> { + assertEquals(5, global.getDocCount()); + assertTrue(AggregationInspectionHelper.hasValue(global)); + assertNotNull(global.getAggregations().asMap().get("boxplot")); + InternalBoxplot boxplot = (InternalBoxplot) global.getAggregations().asMap().get("boxplot"); + assertThat(global.getProperty("boxplot"), equalTo(boxplot)); + assertThat(global.getProperty("boxplot.min"), equalTo(1.0)); + assertThat(global.getProperty("boxplot.max"), equalTo(5.0)); + assertThat(boxplot.getProperty("min"), equalTo(1.0)); + assertThat(boxplot.getProperty("max"), equalTo(5.0)); + }, fieldType); + } + + private void testCase(Query query, + CheckedConsumer buildIndex, + Consumer verify) throws IOException { + MappedFieldType fieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + fieldType.setName("number"); + BoxplotAggregationBuilder aggregationBuilder = new BoxplotAggregationBuilder("boxplot").field("number"); + testCase(aggregationBuilder, query, buildIndex, verify, fieldType); + } + + private void testCase( + T aggregationBuilder, Query query, + CheckedConsumer buildIndex, + Consumer verify, MappedFieldType fieldType) throws IOException { + try (Directory directory = newDirectory()) { + RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory); + buildIndex.accept(indexWriter); + indexWriter.close(); + + try (IndexReader indexReader = DirectoryReader.open(directory)) { + IndexSearcher indexSearcher = newSearcher(indexReader, true, true); + + V agg = searchAndReduce(indexSearcher, query, aggregationBuilder, fieldType); + verify.accept(agg); + + } + } + } + + +} diff --git a/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java new file mode 100644 index 0000000000000..b9400bbd07be7 --- /dev/null +++ b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java @@ -0,0 +1,113 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +package org.elasticsearch.xpack.analytics.boxplot; + +import org.elasticsearch.common.ParseField; +import org.elasticsearch.common.io.stream.BytesStreamOutput; +import org.elasticsearch.common.io.stream.NamedWriteableAwareStreamInput; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.common.xcontent.NamedXContentRegistry; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.Aggregation; +import org.elasticsearch.search.aggregations.ParsedAggregation; +import org.elasticsearch.search.aggregations.metrics.TDigestState; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; +import org.elasticsearch.test.InternalAggregationTestCase; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class InternalBoxplotTests extends InternalAggregationTestCase { + @Override + protected InternalBoxplot createTestInstance(String name, List pipelineAggregators, + Map metaData) { + int numValues = frequently() ? randomInt(100) : 0; + double[] values = new double[numValues]; + TDigestState state = new TDigestState(100); + for (int i = 0; i < numValues; ++i) { + state.add(randomDouble()); + } + DocValueFormat formatter = randomNumericDocValueFormat(); + + return new InternalBoxplot(name, state, formatter, pipelineAggregators, metaData); + } + + @Override + protected Writeable.Reader instanceReader() { + return InternalBoxplot::new; + } + + @Override + protected void assertReduced(InternalBoxplot reduced, List inputs) { + TDigestState expected = new TDigestState(reduced.state().compression()); + for (InternalBoxplot input : inputs) { + expected.add(input.state()); + } + assertNotNull(expected); + // The final calculated result may very depending on the order, which requires higher delta + assertEquals(expected.getMax(), reduced.getMax(), 0); + assertEquals(expected.getMin(), reduced.getMin(), 0); + } + + @Override + protected void assertFromXContent(InternalBoxplot min, ParsedAggregation parsedAggregation) { + // There is no ParsedBoxplot yet so we cannot test it here + } + + @Override + protected InternalBoxplot mutateInstance(InternalBoxplot instance) { + String name = instance.getName(); + TDigestState state; + try (BytesStreamOutput output = new BytesStreamOutput()) { + TDigestState.write(instance.state(), output); + try (StreamInput in = new NamedWriteableAwareStreamInput(output.bytes().streamInput(), getNamedWriteableRegistry())) { + state = TDigestState.read(in); + } + } catch (IOException ex) { + throw new IllegalStateException(ex); + } + DocValueFormat formatter = instance.format(); + List pipelineAggregators = instance.pipelineAggregators(); + Map metaData = instance.getMetaData(); + switch (between(0, 2)) { + case 0: + name += randomAlphaOfLength(5); + break; + case 1: + state.add(randomDouble()); + break; + case 2: + if (metaData == null) { + metaData = new HashMap<>(1); + } else { + metaData = new HashMap<>(instance.getMetaData()); + } + metaData.put(randomAlphaOfLength(15), randomInt()); + break; + default: + throw new AssertionError("Illegal randomisation branch"); + } + return new InternalBoxplot(name, state, formatter, pipelineAggregators, metaData); + } + + @Override + protected List getNamedXContents() { + List extendedNamedXContents = new ArrayList<>(super.getNamedXContents()); + extendedNamedXContents.add(new NamedXContentRegistry.Entry(Aggregation.class, + new ParseField(BoxplotAggregationBuilder.NAME), + (p, c) -> { + assumeTrue("There is no ParsedBoxlot yet", false); + return null; + } + )); + return extendedNamedXContents; + } +} diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/analytics/boxplot.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/analytics/boxplot.yml new file mode 100644 index 0000000000000..f65b9923f1948 --- /dev/null +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/analytics/boxplot.yml @@ -0,0 +1,56 @@ +setup: + - skip: + features: headers + - do: + indices.create: + index: latency + body: + mappings: + properties: + load_time: + type: double + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + bulk: + refresh: true + body: + - index: + _index: "latency" + - load_time: 2 + + - index: + _index: "latency" + - load_time: 3 + + - index: + _index: "latency" + - load_time: 5 + + - index: + _index: "latency" + - load_time: 10 + + - index: + _index: "latency" + - load_time: 15 +--- +"Basic Search": + + - do: + search: + index: "latency" + body: + size: 0 + aggs: + plot: + boxplot: + field: "load_time" + + - match: { aggregations.plot.min: 2.0 } + - match: { aggregations.plot.max: 15.0 } + - match: { aggregations.plot.q1: 2.75 } + - match: { aggregations.plot.q2: 5.0 } + - match: { aggregations.plot.q3: 11.25 } + From caaec5a23a28529e0ca0353df24de8d905ce8611 Mon Sep 17 00:00:00 2001 From: Igor Motov Date: Thu, 6 Feb 2020 16:38:46 -0500 Subject: [PATCH 2/2] Address review comments --- .../metrics/boxplot-aggregation.asciidoc | 25 +---- .../metrics/percentile-aggregation.asciidoc | 2 + .../boxplot/BoxplotAggregationBuilder.java | 7 +- .../analytics/boxplot/BoxplotAggregator.java | 32 +----- .../analytics/boxplot/InternalBoxplot.java | 102 ++++++++++-------- .../boxplot/InternalBoxplotTests.java | 1 - 6 files changed, 68 insertions(+), 101 deletions(-) diff --git a/docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc b/docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc index 262a53d650e24..74c20e805fbfa 100644 --- a/docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc +++ b/docs/reference/aggregations/metrics/boxplot-aggregation.asciidoc @@ -17,7 +17,7 @@ A `boxplot` aggregation looks like this in isolation: -------------------------------------------------- { "boxplot": { - "buckets_path": "my_cardinality_agg" + "field": "load_time" } } -------------------------------------------------- @@ -62,15 +62,10 @@ The response will look like this: -------------------------------------------------- // TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/] -As you can see, the aggregation will return a calculated value for each percentile -in the default range. If we assume response times are in milliseconds, it is -immediately obvious that the webpage normally loads in 10-725ms, but occasionally -spikes to 945-985ms. - ==== Script The boxplot metric supports scripting. For example, if our load times -are in milliseconds but we want percentiles calculated in seconds, we could use +are in milliseconds but we want values calculated in seconds, we could use a script to convert them on-the-fly: [source,console] @@ -162,21 +157,7 @@ GET latency/_search <1> Compression controls memory usage and approximation error -The TDigest algorithm uses a number of "nodes" to approximate percentiles -- the -more nodes available, the higher the accuracy (and large memory footprint) proportional -to the volume of data. The `compression` parameter limits the maximum number of -nodes to `20 * compression`. - -Therefore, by increasing the compression value, you can increase the accuracy of -your percentiles at the cost of more memory. Larger compression values also -make the algorithm slower since the underlying tree data structure grows in size, -resulting in more expensive operations. The default compression value is -`100`. - -A "node" uses roughly 32 bytes of memory, so under worst-case scenarios (large amount -of data which arrives sorted and in-order) the default settings will produce a -TDigest roughly 64KB in size. In practice data tends to be more random and -the TDigest will use less memory. +include::percentile-aggregation.asciidoc[tags=t-digest] ==== Missing value diff --git a/docs/reference/aggregations/metrics/percentile-aggregation.asciidoc b/docs/reference/aggregations/metrics/percentile-aggregation.asciidoc index e5744fec768b0..7e9869a003948 100644 --- a/docs/reference/aggregations/metrics/percentile-aggregation.asciidoc +++ b/docs/reference/aggregations/metrics/percentile-aggregation.asciidoc @@ -285,6 +285,7 @@ GET latency/_search <1> Compression controls memory usage and approximation error +// tag::[t-digest] The TDigest algorithm uses a number of "nodes" to approximate percentiles -- the more nodes available, the higher the accuracy (and large memory footprint) proportional to the volume of data. The `compression` parameter limits the maximum number of @@ -300,6 +301,7 @@ A "node" uses roughly 32 bytes of memory, so under worst-case scenarios (large a of data which arrives sorted and in-order) the default settings will produce a TDigest roughly 64KB in size. In practice data tends to be more random and the TDigest will use less memory. +// tag::[t-digest] ==== HDR Histogram diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregationBuilder.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregationBuilder.java index a39b799762c6a..0b59331b517e2 100644 --- a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregationBuilder.java +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregationBuilder.java @@ -6,7 +6,6 @@ package org.elasticsearch.xpack.analytics.boxplot; -import org.elasticsearch.common.ParseField; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.xcontent.ObjectParser; @@ -28,18 +27,18 @@ import java.util.Map; import java.util.Objects; +import static org.elasticsearch.search.aggregations.metrics.PercentilesAggregationBuilder.COMPRESSION_FIELD; + public class BoxplotAggregationBuilder extends ValuesSourceAggregationBuilder.LeafOnly { public static final String NAME = "boxplot"; - public static final ParseField COMPRESSION_FIELD = new ParseField("compression"); - private static final ObjectParser PARSER; static { PARSER = new ObjectParser<>(BoxplotAggregationBuilder.NAME); ValuesSourceParserHelper.declareNumericFields(PARSER, true, true, false); - PARSER.declareDouble((builder, compression) -> builder.compression = compression, COMPRESSION_FIELD); + PARSER.declareDouble(BoxplotAggregationBuilder::compression, COMPRESSION_FIELD); } public static AggregationBuilder parse(String aggregationName, XContentParser parser) throws IOException { diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregator.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregator.java index c7b35df22d22e..1c3a01b773d4f 100644 --- a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregator.java +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/BoxplotAggregator.java @@ -99,37 +99,11 @@ public boolean hasMetric(String name) { @Override public double metric(String name, long owningBucketOrd) { + TDigestState state = null; if (valuesSource != null && owningBucketOrd < states.size()) { - TDigestState state = states.get(owningBucketOrd); - if (state != null) { - switch (InternalBoxplot.Metrics.resolve(name)) { - case min: - return state.getMin(); - case max: - return state.getMax(); - case q1: - return state.quantile(0.25); - case q2: - return state.quantile(0.5); - case q3: - return state.quantile(0.75); - default: - throw new IllegalArgumentException("Unknown value [" + name + "] in boxplot aggregation"); - } - } - } - switch (InternalBoxplot.Metrics.resolve(name)) { - case min: - return Double.NEGATIVE_INFINITY; - case max: - return Double.POSITIVE_INFINITY; - case q1: - case q2: - case q3: - return Double.NaN; - default: - throw new IllegalArgumentException("Unknown value [" + name + "] in boxplot aggregation"); + state = states.get(owningBucketOrd); } + return InternalBoxplot.Metrics.resolve(name).value(state); } diff --git a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java index 7ead3fa07d486..c42f04832c57b 100644 --- a/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java +++ b/x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplot.java @@ -17,6 +17,7 @@ import java.io.IOException; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Objects; @@ -24,10 +25,48 @@ public class InternalBoxplot extends InternalNumericMetricsAggregation.MultiValu enum Metrics { - min, max, q1, q2, q3; + MIN, MAX, Q1, Q2, Q3; public static Metrics resolve(String name) { - return Metrics.valueOf(name); + return Metrics.valueOf(name.toUpperCase(Locale.ROOT)); + } + + public String value() { + return name().toLowerCase(Locale.ROOT); + } + + double value(InternalBoxplot boxplot) { + switch (this) { + case MIN: + return boxplot.getMin(); + case MAX: + return boxplot.getMax(); + case Q1: + return boxplot.getQ1(); + case Q2: + return boxplot.getQ2(); + case Q3: + return boxplot.getQ3(); + default: + throw new IllegalArgumentException("Unknown value [" + this.value() + "] in the boxplot aggregation"); + } + } + + double value(TDigestState state) { + switch (this) { + case MIN: + return state == null ? Double.NEGATIVE_INFINITY : state.getMin(); + case MAX: + return state == null ? Double.POSITIVE_INFINITY : state.getMax(); + case Q1: + return state == null ? Double.NaN : state.quantile(0.25); + case Q2: + return state == null ? Double.NaN : state.quantile(0.5); + case Q3: + return state == null ? Double.NaN : state.quantile(0.75); + default: + throw new IllegalArgumentException("Unknown value [" + this.value() + "] in the boxplot aggregation"); + } } } @@ -87,46 +126,32 @@ public double getQ3() { @Override public String getMinAsString() { - return valueAsString(Metrics.min.name()); + return valueAsString(Metrics.MIN.name()); } @Override public String getMaxAsString() { - return valueAsString(Metrics.max.name()); + return valueAsString(Metrics.MAX.name()); } @Override public String getQ1AsString() { - return valueAsString(Metrics.q1.name()); + return valueAsString(Metrics.Q1.name()); } @Override public String getQ2AsString() { - return valueAsString(Metrics.q2.name()); + return valueAsString(Metrics.Q2.name()); } @Override public String getQ3AsString() { - return valueAsString(Metrics.q3.name()); + return valueAsString(Metrics.Q3.name()); } @Override public double value(String name) { - Metrics metrics = Metrics.valueOf(name); - switch (metrics) { - case min: - return getMin(); - case max: - return getMax(); - case q1: - return getQ1(); - case q2: - return getQ2(); - case q3: - return getQ3(); - default: - throw new IllegalArgumentException("Unknown value [" + name + "] in common stats aggregation"); - } + return Metrics.resolve(name).value(this); } // for testing only @@ -152,32 +177,19 @@ public InternalBoxplot reduce(List aggregations, ReduceCont return new InternalBoxplot(name, merged, format, pipelineAggregators(), metaData); } - static class Fields { - public static final String MIN = "min"; - public static final String MIN_AS_STRING = "min_as_string"; - public static final String MAX = "max"; - public static final String MAX_AS_STRING = "max_as_string"; - public static final String Q1 = "q1"; - public static final String Q1_AS_STRING = "q1_as_string"; - public static final String Q2 = "q2"; - public static final String Q2_AS_STRING = "q2_as_string"; - public static final String Q3 = "q3"; - public static final String Q3_AS_STRING = "q3_as_string"; - } - @Override public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { - builder.field(Fields.MIN, getMin()); - builder.field(Fields.MAX, getMax()); - builder.field(Fields.Q1, getQ1()); - builder.field(Fields.Q2, getQ2()); - builder.field(Fields.Q3, getQ3()); + builder.field("min", getMin()); + builder.field("max", getMax()); + builder.field("q1", getQ1()); + builder.field("q2", getQ2()); + builder.field("q3", getQ3()); if (format != DocValueFormat.RAW) { - builder.field(Fields.MIN_AS_STRING, format.format(getMin())); - builder.field(Fields.MAX_AS_STRING, format.format(getMax())); - builder.field(Fields.Q1_AS_STRING, format.format(getQ1())); - builder.field(Fields.Q2_AS_STRING, format.format(getQ2())); - builder.field(Fields.Q3_AS_STRING, format.format(getQ3())); + builder.field("min_as_string", format.format(getMin())); + builder.field("max_as_string", format.format(getMax())); + builder.field("q1_as_string", format.format(getQ1())); + builder.field("q2_as_string", format.format(getQ2())); + builder.field("q3_as_string", format.format(getQ3())); } return builder; } diff --git a/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java index b9400bbd07be7..31fbbe024af76 100644 --- a/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java +++ b/x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/boxplot/InternalBoxplotTests.java @@ -52,7 +52,6 @@ protected void assertReduced(InternalBoxplot reduced, List inpu expected.add(input.state()); } assertNotNull(expected); - // The final calculated result may very depending on the order, which requires higher delta assertEquals(expected.getMax(), reduced.getMax(), 0); assertEquals(expected.getMin(), reduced.getMin(), 0); }