From cee5ce31a0a4feedfe2f5d6017f398757dac7ad1 Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Mon, 19 Nov 2018 15:23:38 -0500 Subject: [PATCH 01/25] Add RareTerms aggregation This adds a `rare_terms` aggregation. It is an aggregation designed to identify the long-tail of keywords, e.g. terms that are "rare" or have low doc counts. This aggregation is designed to be more memory efficient than the alternative, which is setting a terms aggregation to size: LONG_MAX (or worse, ordering a terms agg by count ascending, which has unbounded error). This aggregation works by maintaining a map of terms that have been seen. A counter associated with each value is incremented when we see the term again. If the counter surpasses a predefined threshold, the term is removed from the map and inserted into a bloom filter. If a future term is found in the bloom filter we assume it was previously removed from the map and is "common". The map keys are the "rare" terms after collection is done. --- docs/reference/aggregations/bucket.asciidoc | 1 + .../bucket/rare-terms-aggregation.asciidoc | 295 ++++++++ .../search.aggregation/280_rare_terms.yml | 453 ++++++++++++ .../common/util/BloomFilter.java | 381 ++++++++++ .../elasticsearch/search/SearchModule.java | 11 + .../bucket/BucketsAggregator.java | 4 +- .../MergingBucketsDeferringCollector.java | 63 +- .../terms/AbstractRareTermsAggregator.java | 122 +++ .../terms/AbstractStringTermsAggregator.java | 2 +- .../bucket/terms/DoubleRareTerms.java | 115 +++ .../terms/DoubleRareTermsAggregator.java | 77 ++ .../bucket/terms/IncludeExclude.java | 8 +- .../bucket/terms/InternalMappedRareTerms.java | 132 ++++ .../bucket/terms/InternalMappedTerms.java | 2 +- .../bucket/terms/LongRareTerms.java | 114 +++ .../bucket/terms/LongRareTermsAggregator.java | 204 ++++++ .../terms/RareTermsAggregationBuilder.java | 172 +++++ .../terms/RareTermsAggregatorFactory.java | 165 +++++ .../bucket/terms/StringRareTerms.java | 80 ++ .../terms/StringRareTermsAggregator.java | 213 ++++++ .../bucket/terms/StringTermsAggregator.java | 13 +- .../bucket/terms/UnmappedRareTerms.java | 131 ++++ .../aggregations/bucket/RareTermsTests.java | 101 +++ .../terms/RareTermsAggregatorTests.java | 692 ++++++++++++++++++ .../aggregations/AggregatorTestCase.java | 8 +- 25 files changed, 3541 insertions(+), 18 deletions(-) create mode 100644 docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc create mode 100644 rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml create mode 100644 server/src/main/java/org/elasticsearch/common/util/BloomFilter.java create mode 100644 server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java create mode 100644 server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleRareTerms.java create mode 100644 server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleRareTermsAggregator.java create mode 100644 server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java create mode 100644 server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java create mode 100644 server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java create mode 100644 server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java create mode 100644 
server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorFactory.java
 create mode 100644 server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java
 create mode 100644 server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java
 create mode 100644 server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedRareTerms.java
 create mode 100644 server/src/test/java/org/elasticsearch/search/aggregations/bucket/RareTermsTests.java
 create mode 100644 server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java

diff --git a/docs/reference/aggregations/bucket.asciidoc b/docs/reference/aggregations/bucket.asciidoc
index 52b27c578929f..6afb4aba3fa4f 100644
--- a/docs/reference/aggregations/bucket.asciidoc
+++ b/docs/reference/aggregations/bucket.asciidoc
@@ -63,3 +63,4 @@
 include::bucket/significanttext-aggregation.asciidoc[]
 include::bucket/terms-aggregation.asciidoc[]
+include::bucket/rare-terms-aggregation.asciidoc[]

diff --git a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc
new file mode 100644
index 0000000000000..a0a64fef8fe87
--- /dev/null
+++ b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc
@@ -0,0 +1,295 @@
+[[search-aggregations-bucket-rare-terms-aggregation]]
+=== Rare Terms Aggregation
+
+A multi-bucket value source based aggregation which finds "rare" terms -- terms that are at the long tail
+of the distribution and are not frequent. Conceptually, this is like a `terms` aggregation that is
+sorted by `_count` ascending. As noted in the <<search-aggregations-bucket-terms-aggregation,`terms` aggregation documentation>>,
+actually ordering a `terms` agg by count ascending has unbounded error. Instead, you should use the
+`rare_terms` aggregation.
+
+//////////////////////////
+
+[source,js]
+--------------------------------------------------
+PUT /products
+{
+    "mappings": {
+        "product": {
+            "properties": {
+                "genre": {
+                    "type": "keyword"
+                },
+                "product": {
+                    "type": "keyword"
+                }
+            }
+        }
+    }
+}
+
+POST /products/product/_bulk?refresh
+{"index":{"_id":0}}
+{"genre": "rock", "product": "Product A"}
+{"index":{"_id":1}}
+{"genre": "rock"}
+{"index":{"_id":2}}
+{"genre": "rock"}
+{"index":{"_id":3}}
+{"genre": "jazz", "product": "Product Z"}
+{"index":{"_id":4}}
+{"genre": "jazz"}
+{"index":{"_id":5}}
+{"genre": "electronic"}
+{"index":{"_id":6}}
+{"genre": "electronic"}
+{"index":{"_id":7}}
+{"genre": "electronic"}
+{"index":{"_id":8}}
+{"genre": "electronic"}
+{"index":{"_id":9}}
+{"genre": "electronic"}
+{"index":{"_id":10}}
+{"genre": "swing"}
+--------------------------------------------------
+// NOTCONSOLE
+// TESTSETUP
+
+//////////////////////////
+
+==== Syntax
+
+A `rare_terms` aggregation looks like this in isolation:
+
+[source,js]
+--------------------------------------------------
+{
+    "rare_terms": {
+        "field": "the_field",
+        "max_doc_count": 1
+    }
+}
+--------------------------------------------------
+// NOTCONSOLE
+
+.`rare_terms` Parameters
+|===
+|Parameter Name |Description |Required |Default Value
+|`field` |The field we wish to find rare terms in |Required |
+|`max_doc_count` |The maximum number of documents a term may appear in.
|Optional |`1`
+|`include` |Terms that should be included in the aggregation |Optional |
+|`exclude` |Terms that should be excluded from the aggregation |Optional |
+|`missing` |The value that should be used if a document does not have the field being aggregated |Optional |
+|===
+
+
+Example:
+
+[source,js]
+--------------------------------------------------
+GET /_search
+{
+    "aggs" : {
+        "genres" : {
+            "rare_terms" : {
+                "field" : "genre"
+            }
+        }
+    }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[s/_search/_search\?filter_path=aggregations/]
+
+Response:
+
+[source,js]
+--------------------------------------------------
+{
+    ...
+    "aggregations" : {
+        "genres" : {
+            "doc_count_error_upper_bound": 0,
+            "sum_other_doc_count": 0,
+            "buckets" : [
+                {
+                    "key" : "swing",
+                    "doc_count" : 1
+                }
+            ]
+        }
+    }
+}
+--------------------------------------------------
+// TESTRESPONSE[s/\.\.\.//]
+
+In this example, the only bucket that we see is the "swing" bucket, because it is the only term that appears in
+one document. If we increase the `max_doc_count` to `2`, we'll see some more buckets:
+
+[source,js]
+--------------------------------------------------
+GET /_search
+{
+    "aggs" : {
+        "genres" : {
+            "rare_terms" : {
+                "field" : "genre",
+                "max_doc_count": 2
+            }
+        }
+    }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[s/_search/_search\?filter_path=aggregations/]
+
+This now shows the "jazz" term, which has a `doc_count` of 2:
+
+[source,js]
+--------------------------------------------------
+{
+    ...
+    "aggregations" : {
+        "genres" : {
+            "doc_count_error_upper_bound": 0,
+            "sum_other_doc_count": 0,
+            "buckets" : [
+                {
+                    "key" : "swing",
+                    "doc_count" : 1
+                },
+                {
+                    "key" : "jazz",
+                    "doc_count" : 2
+                }
+            ]
+        }
+    }
+}
+--------------------------------------------------
+// TESTRESPONSE[s/\.\.\.//]
+
+[[search-aggregations-bucket-rare-terms-aggregation-max-doc-count]]
+==== Maximum document count
+
+The `max_doc_count` parameter is used to control the upper bound of document counts that a term can have. There
+is no size limitation on the `rare_terms` agg like the `terms` agg has. This means that _all_ terms
+which match the `max_doc_count` criteria will be returned. The aggregation functions in this manner to avoid
+the order-by-ascending issues that afflict the `terms` aggregation.
+
+This does, however, mean that a large number of results can be returned if `max_doc_count` is chosen poorly.
+To limit the danger of this setting, the maximum allowed `max_doc_count` is 10.
+
+[[search-aggregations-bucket-rare-terms-aggregation-approximate-counts]]
+==== Document counts are approximate
+
+The naive way to determine the "rare" terms in a dataset is to place all the values in a map, incrementing counts
+as each document is visited, then return the bottom `n` rows. This does not scale beyond even modestly sized data
+sets. A sharded approach where only the "top n" values are retained from each shard (a la the `terms` aggregation)
+fails because the long-tail nature of the problem means it is impossible to find the "top n" bottom values without
+simply collecting all the values from all shards.
+
+Instead, the Rare Terms aggregation uses a different approximate algorithm:
+
+1. Values are placed in a map the first time they are seen.
+2. Each additional occurrence of the term increments a counter in the map.
+3. If the counter exceeds the `max_doc_count` threshold, the term is removed from the map and placed in a bloom filter.
+4. The bloom filter is consulted on each term.
If the value is inside the bloom filter, it is known to be above the
+threshold already and is skipped.
+
+After execution, the map of values is the map of "rare" terms under the `max_doc_count` threshold. This map and bloom
+filter are then merged with those of all other shards. If a term is greater than the threshold (or appears in
+a different shard's bloom filter), it is removed from the merged list. The final map of values is returned
+to the user as the "rare" terms.
+
+Bloom filters have the possibility of returning false positives (they can report that a value exists in their
+collection when it does not). Since the bloom filter is being used to see if a term is over the threshold, a false positive
+from the bloom filter will mistakenly say a value is common when it is not (and thus exclude it from the final list of buckets).
+
+
+==== Filtering Values
+
+It is possible to filter the values for which buckets will be created. This can be done using the `include` and
+`exclude` parameters which are based on regular expression strings or arrays of exact values. Additionally,
+`include` clauses can filter using `partition` expressions.
+
+===== Filtering Values with regular expressions
+
+[source,js]
+--------------------------------------------------
+GET /_search
+{
+    "aggs" : {
+        "genres" : {
+            "rare_terms" : {
+                "field" : "genre",
+                "include" : "swi*",
+                "exclude" : "electro*"
+            }
+        }
+    }
+}
+--------------------------------------------------
+// CONSOLE
+
+In the above example, buckets will be created for all the tags that start with `swi`, except those starting
+with `electro` (so the tag `swing` will be aggregated but not `electro_swing`). The `include` regular expression will determine what
+values are "allowed" to be aggregated, while the `exclude` determines the values that should not be aggregated. When
+both are defined, the `exclude` has precedence, meaning the `include` is evaluated first and only then the `exclude`.
+
+The syntax is the same as <<regexp-syntax,regexp queries>>.
+
+===== Filtering Values with exact values
+
+For matching based on exact values the `include` and `exclude` parameters can simply take an array of
+strings that represent the terms as they are found in the index:
+
+[source,js]
+--------------------------------------------------
+GET /_search
+{
+    "aggs" : {
+        "genres" : {
+            "rare_terms" : {
+                "field" : "genre",
+                "include" : ["swing", "rock"],
+                "exclude" : ["jazz"]
+            }
+        }
+    }
+}
+--------------------------------------------------
+// CONSOLE
+
+
+==== Missing value
+
+The `missing` parameter defines how documents that are missing a value should be treated.
+By default they will be ignored, but it is also possible to treat them as if they
+had a value.
+
+[source,js]
+--------------------------------------------------
+GET /_search
+{
+    "aggs" : {
+        "genres" : {
+            "rare_terms" : {
+                "field" : "genre",
+                "missing": "N/A" <1>
+            }
+        }
+    }
+}
+--------------------------------------------------
+// CONSOLE
+
+<1> Documents without a value in the `genre` field will fall into the same bucket as documents that have the value `N/A`.
+
+
+==== Mixing field types
+
+WARNING: When aggregating on multiple indices the type of the aggregated field may not be the same in all indices.
+Some types are compatible with each other (`integer` and `long` or `float` and `double`) but when the types are a mix
+of decimal and non-decimal numbers, the aggregation will promote the non-decimal numbers to decimal numbers.
+This can result in a loss of precision in the bucket values.
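+
+==== Implementation sketch
+
+The following is a minimal, self-contained Java sketch of the collection strategy described in
+<<search-aggregations-bucket-rare-terms-aggregation-approximate-counts,Document counts are approximate>>.
+It is illustrative only: the class and method names are hypothetical, a plain `HashSet` stands in for
+the bloom filter (so it has no false positives and no memory bound), and it does not correspond to the
+actual aggregator classes added in this patch.
+
+[source,java]
+--------------------------------------------------
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+class RareTermsSketch {
+    private final long maxDocCount;
+    // Terms currently considered "rare", with their observed doc counts
+    private final Map<String, Long> rareCounts = new HashMap<>();
+    // Terms known to be over the threshold. The real implementation uses a
+    // bloom filter here to bound memory, accepting occasional false positives
+    // (a rare term mistakenly treated as common and dropped).
+    private final Set<String> overThreshold = new HashSet<>();
+
+    RareTermsSketch(long maxDocCount) {
+        this.maxDocCount = maxDocCount;
+    }
+
+    void collect(String term) {
+        if (overThreshold.contains(term)) {
+            return; // already known to be common; skip it
+        }
+        long count = rareCounts.merge(term, 1L, Long::sum);
+        if (count > maxDocCount) {
+            // The term just crossed the threshold: evict it from the map
+            // and remember it as "common" from now on.
+            rareCounts.remove(term);
+            overThreshold.add(term);
+        }
+    }
+
+    // After collection, whatever survives in the map is the set of rare terms
+    Map<String, Long> rareTerms() {
+        return rareCounts;
+    }
+}
+--------------------------------------------------
+// NOTCONSOLE
+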
diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml new file mode 100644 index 0000000000000..b0450013a37ae --- /dev/null +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml @@ -0,0 +1,453 @@ +setup: + - do: + indices.create: + index: test_1 + body: + settings: + number_of_replicas: 0 + mappings: + test: + properties: + str: + type: keyword + ip: + type: ip + boolean: + type: boolean + integer: + type: long + double: + type: double + number: + type: long + date: + type: date + + - do: + indices.create: + index: test_2 + body: + settings: + number_of_replicas: 0 + mappings: + test: + properties: + number: + type: double + + - do: + cluster.health: + wait_for_status: green + +--- +"Basic test": + - do: + index: + index: test_1 + type: test + id: 1 + body: { "str" : "abc" } + + - do: + index: + index: test_1 + type: test + id: 2 + body: { "str": "abc" } + + - do: + index: + index: test_1 + type: test + id: 3 + body: { "str": "bcd" } + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "str_terms" : { "rare_terms" : { "field" : "str", "max_doc_count" : 1 } } } } + + - match: { hits.total: 3 } + - length: { aggregations.str_terms.buckets: 1 } + - match: { aggregations.str_terms.buckets.0.key: "bcd" } + - is_false: aggregations.str_terms.buckets.0.key_as_string + - match: { aggregations.str_terms.buckets.0.doc_count: 1 } + +--- +"IP test": + - do: + index: + index: test_1 + type: test + id: 1 + body: { "ip": "::1" } + + - do: + index: + index: test_1 + type: test + id: 2 + body: { "ip": "127.0.0.1" } + + - do: + index: + index: test_1 + type: test + id: 3 + body: { "ip": "::1" } + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "ip_terms" : { "rare_terms" : { "field" : "ip" } } } } + + - match: { hits.total: 3 } + - length: { aggregations.ip_terms.buckets: 1 } + - match: { aggregations.ip_terms.buckets.0.key: "127.0.0.1" } + - is_false: aggregations.ip_terms.buckets.0.key_as_string + - match: { aggregations.ip_terms.buckets.0.doc_count: 1 } + + - do: + search: + body: { "size" : 0, "aggs" : { "ip_terms" : { "rare_terms" : { "field" : "ip", "include" : [ "127.0.0.1" ] } } } } + + - match: { hits.total: 3 } + - length: { aggregations.ip_terms.buckets: 1 } + - match: { aggregations.ip_terms.buckets.0.key: "127.0.0.1" } + - is_false: aggregations.ip_terms.buckets.0.key_as_string + - match: { aggregations.ip_terms.buckets.0.doc_count: 1 } + + - do: + search: + body: { "size" : 0, "aggs" : { "ip_terms" : { "rare_terms" : { "field" : "ip", "exclude" : [ "127.0.0.1" ] } } } } + + - match: { hits.total: 3 } + - length: { aggregations.ip_terms.buckets: 0 } + + - do: + catch: request + search: + index: test_1 + body: { "size" : 0, "aggs" : { "ip_terms" : { "rare_terms" : { "field" : "ip", "exclude" : "127.*" } } } } + + + +--- +"Boolean test": + - do: + index: + index: test_1 + type: test + id: 1 + body: { "boolean": true } + + - do: + index: + index: test_1 + type: test + id: 2 + body: { "boolean": false } + + - do: + index: + index: test_1 + type: test + id: 3 + body: { "boolean": true } + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "boolean_terms" : { "rare_terms" : { "field" : "boolean" } } } } + + - match: { hits.total: 3 } + - length: { aggregations.boolean_terms.buckets: 1 } + - match: { 
aggregations.boolean_terms.buckets.0.key: 0 } + - match: { aggregations.boolean_terms.buckets.0.key_as_string: "false" } + - match: { aggregations.boolean_terms.buckets.0.doc_count: 1 } + +--- +"Integer test": + - do: + index: + index: test_1 + type: test + id: 1 + body: { "integer": 1234 } + + - do: + index: + index: test_1 + type: test + id: 2 + body: { "integer": 5678 } + + - do: + index: + index: test_1 + type: test + id: 3 + body: { "integer": 1234 } + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "integer_terms" : { "rare_terms" : { "field" : "integer" } } } } + + - match: { hits.total: 3 } + + - length: { aggregations.integer_terms.buckets: 1 } + + - match: { aggregations.integer_terms.buckets.0.key: 5678 } + - is_false: aggregations.integer_terms.buckets.0.key_as_string + - match: { aggregations.integer_terms.buckets.0.doc_count: 1 } + + +--- +"Double test": + - do: + index: + index: test_1 + type: test + id: 1 + body: { "double": 1234.5 } + + - do: + index: + index: test_1 + type: test + id: 2 + body: { "double": 5678.5 } + + - do: + index: + index: test_1 + type: test + id: 3 + body: { "double": 1234.5 } + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "double_terms" : { "rare_terms" : { "field" : "double" } } } } + + - match: { hits.total: 3 } + - length: { aggregations.double_terms.buckets: 1 } + - match: { aggregations.double_terms.buckets.0.key: 5678.5 } + - is_false: aggregations.double_terms.buckets.0.key_as_string + - match: { aggregations.double_terms.buckets.0.doc_count: 1 } + +--- +"Date test": + - do: + index: + index: test_1 + type: test + id: 1 + body: { "date": "2016-05-03" } + + - do: + index: + index: test_1 + type: test + id: 2 + body: { "date": "2014-09-01" } + + - do: + index: + index: test_1 + type: test + id: 3 + body: { "date": "2016-05-03" } + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "date_terms" : { "rare_terms" : { "field" : "date" } } } } + + - match: { hits.total: 3 } + + - length: { aggregations.date_terms.buckets: 1 } + - match: { aggregations.date_terms.buckets.0.key: 1409529600000 } + - match: { aggregations.date_terms.buckets.0.key_as_string: "2014-09-01T00:00:00.000Z" } + - match: { aggregations.date_terms.buckets.0.doc_count: 1 } + + - do: + search: + body: { "size" : 0, "aggs" : { "date_terms" : { "rare_terms" : { "field" : "date", "include" : [ "2014-09-01" ] } } } } + + - match: { hits.total: 3 } + - length: { aggregations.date_terms.buckets: 1 } + - match: { aggregations.date_terms.buckets.0.key_as_string: "2014-09-01T00:00:00.000Z" } + - match: { aggregations.date_terms.buckets.0.doc_count: 1 } + + - do: + search: + body: { "size" : 0, "aggs" : { "date_terms" : { "rare_terms" : { "field" : "date", "exclude" : [ "2014-09-01" ] } } } } + + - match: { hits.total: 3 } + - length: { aggregations.date_terms.buckets: 0 } + +--- +"Unmapped strings": + + - do: + index: + index: test_1 + type: test + id: 1 + body: {} + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "string_terms" : { "rare_terms" : { "field" : "unmapped_string"} } } } + + - match: { hits.total: 1 } + - length: { aggregations.string_terms.buckets: 0 } + +--- +"Unmapped booleans": + + - do: + index: + index: test_1 + type: test + id: 1 + body: {} + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "boolean_terms" : { "rare_terms" : { "field" : "unmapped_boolean" } } } } + + - match: { hits.total: 1 } + - 
length: { aggregations.boolean_terms.buckets: 0 } + +--- +"Unmapped dates": + + - do: + index: + index: test_1 + type: test + id: 1 + body: {} + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "date_terms" : { "rare_terms" : { "field" : "unmapped_date"} } } } + + - match: { hits.total: 1 } + - length: { aggregations.date_terms.buckets: 0 } + +--- +"Unmapped longs": + + - do: + index: + index: test_1 + type: test + id: 1 + body: {} + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "long_terms" : { "rare_terms" : { "field" : "unmapped_long", "value_type" : "long" } } } } + + - match: { hits.total: 1 } + - length: { aggregations.long_terms.buckets: 0 } + +--- +"Unmapped doubles": + + - do: + index: + index: test_1 + type: test + id: 1 + body: {} + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "double_terms" : { "rare_terms" : { "field" : "unmapped_double" } } } } + + - match: { hits.total: 1 } + - length: { aggregations.double_terms.buckets: 0 } + +--- +"Mixing longs and doubles": + + - do: + index: + index: test_1 + type: test + id: 1 + body: {"number": 100} + + - do: + index: + index: test_1 + type: test + id: 2 + body: {"number": 10} + + - do: + index: + index: test_2 + type: test + id: 3 + body: {"number": 100.0} + + - do: + index: + index: test_2 + type: test + id: 1 + body: {"number": 10.0} + + - do: + index: + index: test_2 + type: test + id: 2 + body: {"number": 14.6} + + - do: + indices.refresh: {} + + - do: + search: + body: { "size" : 0, "aggs" : { "number_terms" : { "rare_terms" : { "field" : "number" } } } } + + - match: { hits.total: 5 } + + - length: { aggregations.number_terms.buckets: 1 } + - match: { aggregations.number_terms.buckets.0.key: 14.6 } + - match: { aggregations.number_terms.buckets.0.doc_count: 1 } + diff --git a/server/src/main/java/org/elasticsearch/common/util/BloomFilter.java b/server/src/main/java/org/elasticsearch/common/util/BloomFilter.java new file mode 100644 index 0000000000000..64e24abb78c4b --- /dev/null +++ b/server/src/main/java/org/elasticsearch/common/util/BloomFilter.java @@ -0,0 +1,381 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.common.util; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.elasticsearch.common.Numbers; +import org.elasticsearch.common.hash.MurmurHash3; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.common.lease.Releasable; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Objects; + +/** + * A bloom filter. 
Inspired by the Guava bloom filter implementation, though with some optimizations.
+ */
+public class BloomFilter implements Writeable, Releasable {
+
+    // Some numbers:
+    // 10k =0.001: 140.4kb , 10 Hashes
+    // 10k =0.01 : 93.6kb , 6 Hashes
+    // 100k=0.01 : 936.0kb , 6 Hashes
+    // 100k=0.03 : 712.7kb , 5 Hashes
+    // 500k=0.01 : 4.5mb , 6 Hashes
+    // 500k=0.03 : 3.4mb , 5 Hashes
+    // 500k=0.05 : 2.9mb , 4 Hashes
+    // 1m=0.01 : 9.1mb , 6 Hashes
+    // 1m=0.03 : 6.9mb , 5 Hashes
+    // 1m=0.05 : 5.9mb , 4 Hashes
+    // 5m=0.01 : 45.7mb , 6 Hashes
+    // 5m=0.03 : 34.8mb , 5 Hashes
+    // 5m=0.05 : 29.7mb , 4 Hashes
+    // 50m=0.01 : 457.0mb , 6 Hashes
+    // 50m=0.03 : 297.3mb , 4 Hashes
+    // 50m=0.10 : 228.5mb , 3 Hashes
+
+    /**
+     * The bit set of the BloomFilter (not necessarily power of 2!)
+     */
+    private final BitArray bits;
+
+    /**
+     * Number of hashes per element
+     */
+    private final int numHashFunctions;
+
+    private final Hashing hashing = Hashing.V1;
+
+    /**
+     * Creates a bloom filter based on the expected number of insertions
+     * and the expected false positive probability.
+     *
+     * @param expectedInsertions the number of expected insertions to the constructed bloom filter
+     * @param fpp the desired false positive probability (must be positive and less than 1.0)
+     */
+    public BloomFilter(int expectedInsertions, double fpp) {
+        this(expectedInsertions, fpp, -1);
+    }
+
+    /**
+     * Creates a bloom filter based on the expected number of insertions, expected false positive probability,
+     * and number of hash functions.
+     *
+     * @param expectedInsertions the number of expected insertions to the constructed bloom filter
+     * @param fpp the desired false positive probability (must be positive and less than 1.0)
+     * @param numHashFunctions the number of hash functions to use (must be less than or equal to 255)
+     */
+    public BloomFilter(int expectedInsertions, double fpp, int numHashFunctions) {
+        if (expectedInsertions == 0) {
+            expectedInsertions = 1;
+        }
+        /*
+         * TODO(user): Put a warning in the javadoc about tiny fpp values,
+         * since the resulting size is proportional to -log(p), but there is not
+         * much of a point after all, e.g. optimalM(1000, 0.0000000000000001) = 76680
+         * which is less than 10kb. Who cares!
+ */ + long numBits = optimalNumOfBits(expectedInsertions, fpp); + + // calculate the optimal number of hash functions + if (numHashFunctions == -1) { + numHashFunctions = optimalNumOfHashFunctions(expectedInsertions, numBits); + } + + if (numHashFunctions > 255) { + throw new IllegalArgumentException("BloomFilters with more than 255 hash functions are not allowed."); + } + + this.bits = new BitArray(numBits); + this.numHashFunctions = numHashFunctions; + } + + public BloomFilter(StreamInput in) throws IOException { + int numLongs = in.readVInt(); + long[] data = new long[numLongs]; + for (int i = 0; i < numLongs; i++) { + data[i] = in.readLong(); + } + this.numHashFunctions = in.readVInt(); + this.bits = new BitArray(data); + } + + public void merge(BloomFilter other) { + this.bits.putAll(other.bits); + } + + public boolean put(BytesRef value) { + return hashing.put(value, numHashFunctions, bits); + } + + public boolean put(byte[] value) { + return hashing.put(value, 0, value.length, numHashFunctions, bits); + } + + public boolean put(long value) { + return put(Numbers.longToBytes(value)); + } + + public boolean mightContain(BytesRef value) { + return hashing.mightContain(value, numHashFunctions, bits); + } + + public boolean mightContain(byte[] value) { + return hashing.mightContain(value, 0, value.length, numHashFunctions, bits); + } + + public boolean mightContain(long value) { + return mightContain(Numbers.longToBytes(value)); + } + + public int getNumHashFunctions() { + return this.numHashFunctions; + } + + public long getSizeInBytes() { + return bits.ramBytesUsed(); + } + + @Override + public int hashCode() { + return bits.hashCode() + numHashFunctions; + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + if (other == null || getClass() != other.getClass()) { + return false; + } + + final BloomFilter that = (BloomFilter) other; + return Objects.equals(this.bits, that.bits) + && Objects.equals(this.hashing, that.hashing) + && Objects.equals(this.numHashFunctions, that.numHashFunctions); + } + + public void writeTo(StreamOutput out) throws IOException { + out.writeVInt(bits.data.length); + for (long l : bits.data) { + out.writeLong(l); + } + out.writeVInt(numHashFunctions); + } + + @Override + public void close() { + + } + + /* + * Cheat sheet: + * + * m: total bits + * n: expected insertions + * b: m/n, bits per insertion + + * p: expected false positive probability + * + * 1) Optimal k = b * ln2 + * 2) p = (1 - e ^ (-kn/m))^k + * 3) For optimal k: p = 2 ^ (-k) ~= 0.6185^b + * 4) For optimal k: m = -nlnp / ((ln2) ^ 2) + */ + + /** + * Computes the optimal k (number of hashes per element inserted in Bloom filter), given the + * expected insertions and total number of bits in the Bloom filter. + *

+ * See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula.
+     *
+     * @param n expected insertions (must be positive)
+     * @param m total number of bits in Bloom filter (must be positive)
+     */
+    private static int optimalNumOfHashFunctions(long n, long m) {
+        // cast to double so m/n keeps its fractional part; integer division
+        // would truncate before multiplying by ln(2) and skew k downwards
+        return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
+    }
+
+    /**
+     * Computes m (total bits of Bloom filter) which is expected to achieve, for the specified
+     * expected insertions, the required false positive probability.
+     *

+ * See http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives for the formula. + * + * @param n expected insertions (must be positive) + * @param p false positive rate (must be 0 < p < 1) + */ + private static long optimalNumOfBits(long n, double p) { + if (p == 0) { + p = Double.MIN_VALUE; + } + return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2))); + } + + // Note: We use this instead of java.util.BitSet because we need access to the long[] data field + static final class BitArray { + final long[] data; + final long bitSize; + long bitCount; + + BitArray(long bits) { + this(new long[size(bits)]); + } + + private static int size(long bits) { + long quotient = bits / 64; + long remainder = bits - quotient * 64; + return Math.toIntExact(remainder == 0 ? quotient : 1 + quotient); + } + + // Used by serialization + BitArray(long[] data) { + this.data = data; + long bitCount = 0; + for (long value : data) { + bitCount += Long.bitCount(value); + } + this.bitCount = bitCount; + this.bitSize = data.length * Long.SIZE; + } + + /** + * Returns true if the bit changed value. + */ + boolean set(long index) { + if (!get(index)) { + data[(int) (index >>> 6)] |= (1L << index); + bitCount++; + return true; + } + return false; + } + + boolean get(long index) { + return (data[(int) (index >>> 6)] & (1L << index)) != 0; + } + + /** + * Number of bits + */ + long bitSize() { + return bitSize; + } + + /** + * Number of set bits (1s) + */ + long bitCount() { + return bitCount; + } + + BitArray copy() { + return new BitArray(data.clone()); + } + + /** + * Combines the two BitArrays using bitwise OR. + */ + void putAll(BitArray array) { + bitCount = 0; + for (int i = 0; i < data.length; i++) { + data[i] |= array.data[i]; + bitCount += Long.bitCount(data[i]); + } + } + + @Override + public boolean equals(Object o) { + if (o instanceof BitArray) { + BitArray bitArray = (BitArray) o; + return Arrays.equals(data, bitArray.data); + } + return false; + } + + @Override + public int hashCode() { + return Arrays.hashCode(data); + } + + + public long ramBytesUsed() { + return Long.BYTES * data.length + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + 16; + } + } + + enum Hashing { + V1() { + @Override + protected boolean put(byte[] bytes, int offset, int length, int numHashFunctions, BitArray bits) { + long bitSize = bits.bitSize(); + MurmurHash3.Hash128 hash128 = MurmurHash3.hash128(bytes, offset, length, 0, new MurmurHash3.Hash128()); + + boolean bitsChanged = false; + long combinedHash = hash128.h1; + for (int i = 0; i < numHashFunctions; i++) { + // Make the combined hash positive and indexable + bitsChanged |= bits.set((combinedHash & Long.MAX_VALUE) % bitSize); + combinedHash += hash128.h2; + } + return bitsChanged; + } + + @Override + protected boolean mightContain(byte[] bytes, int offset, int length, int numHashFunctions, BitArray bits) { + long bitSize = bits.bitSize(); + MurmurHash3.Hash128 hash128 = MurmurHash3.hash128(bytes, offset, length, 0, new MurmurHash3.Hash128()); + + long combinedHash = hash128.h1; + for (int i = 0; i < numHashFunctions; i++) { + // Make the combined hash positive and indexable + if (!bits.get((combinedHash & Long.MAX_VALUE) % bitSize)) { + return false; + } + combinedHash += hash128.h2; + } + return true; + } + + @Override + protected int type() { + return 1; + } + }; + + protected boolean put(BytesRef value, int numHashFunctions, BitArray bits) { + return put(value.bytes, value.offset, value.length, numHashFunctions, bits); + } + + protected abstract boolean 
put(byte[] bytes, int offset, int length, int numHashFunctions, BitArray bits); + + protected boolean mightContain(BytesRef value, int numHashFunctions, BitArray bits) { + return mightContain(value.bytes, value.offset, value.length, numHashFunctions, bits); + } + + protected abstract boolean mightContain(byte[] bytes, int offset, int length, int numHashFunctions, BitArray bits); + + protected abstract int type(); + } +} diff --git a/server/src/main/java/org/elasticsearch/search/SearchModule.java b/server/src/main/java/org/elasticsearch/search/SearchModule.java index 539f2de529f23..91c22bd401117 100644 --- a/server/src/main/java/org/elasticsearch/search/SearchModule.java +++ b/server/src/main/java/org/elasticsearch/search/SearchModule.java @@ -146,10 +146,15 @@ import org.elasticsearch.search.aggregations.bucket.significant.heuristics.ScriptHeuristic; import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic; import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristicParser; +import org.elasticsearch.search.aggregations.bucket.terms.DoubleRareTerms; import org.elasticsearch.search.aggregations.bucket.terms.DoubleTerms; +import org.elasticsearch.search.aggregations.bucket.terms.LongRareTerms; import org.elasticsearch.search.aggregations.bucket.terms.LongTerms; +import org.elasticsearch.search.aggregations.bucket.terms.RareTermsAggregationBuilder; +import org.elasticsearch.search.aggregations.bucket.terms.StringRareTerms; import org.elasticsearch.search.aggregations.bucket.terms.StringTerms; import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder; +import org.elasticsearch.search.aggregations.bucket.terms.UnmappedRareTerms; import org.elasticsearch.search.aggregations.bucket.terms.UnmappedTerms; import org.elasticsearch.search.aggregations.metrics.AvgAggregationBuilder; import org.elasticsearch.search.aggregations.metrics.InternalAvg; @@ -394,6 +399,12 @@ private void registerAggregations(List plugins) { .addResultReader(UnmappedTerms.NAME, UnmappedTerms::new) .addResultReader(LongTerms.NAME, LongTerms::new) .addResultReader(DoubleTerms.NAME, DoubleTerms::new)); + registerAggregation(new AggregationSpec(RareTermsAggregationBuilder.NAME, RareTermsAggregationBuilder::new, + RareTermsAggregationBuilder::parse) + .addResultReader(StringRareTerms.NAME, StringRareTerms::new) + .addResultReader(UnmappedRareTerms.NAME, UnmappedRareTerms::new) + .addResultReader(LongRareTerms.NAME, LongRareTerms::new) + .addResultReader(DoubleRareTerms.NAME, DoubleRareTerms::new)); registerAggregation(new AggregationSpec(SignificantTermsAggregationBuilder.NAME, SignificantTermsAggregationBuilder::new, SignificantTermsAggregationBuilder.getParser(significanceHeuristicParserRegistry)) .addResultReader(SignificantStringTerms.NAME, SignificantStringTerms::new) diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketsAggregator.java index 7b09ac9d61895..ee0c8e5518695 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/BucketsAggregator.java @@ -90,7 +90,9 @@ public final void mergeBuckets(long[] mergeMap, long newNumBuckets) { docCounts.fill(0, newNumBuckets, 0); for (int i = 0; i < oldDocCounts.size(); i++) { int docCount = oldDocCounts.get(i); - if (docCount != 0) { + + // Skip any 
in the map which have been "removed", signified with -1 + if (docCount != 0 && mergeMap[i] != -1) { docCounts.increment(mergeMap[i], docCount); } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java index 53049d0301c2d..91ba01e311e22 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java @@ -109,16 +109,48 @@ public void collect(int doc, long bucket) { }; } + /** + * Merges/prunes the existing bucket ordinals and docDeltas according to the provided mergeMap. + * + * The mergeMap is an array where the index position represents the current bucket ordinal, and + * the value at that position represents the ordinal the bucket should be merged with. If + * the value is set to -1 it is removed entirely. + * + * For example, if the mergeMap [1,1,3,-1,3] is provided: + * - Buckets `0` and `1` will be merged to bucket ordinal `1` + * - Bucket `2` and `4` will be merged to ordinal `3` + * - Bucket `3` will be removed entirely + * + * This process rebuilds the ordinals and docDeltas according to the mergeMap, so it should + * not be called unless there are actually changes to be made, to avoid unnecessary work. + */ public void mergeBuckets(long[] mergeMap) { - List newEntries = new ArrayList<>(entries.size()); for (Entry sourceEntry : entries) { PackedLongValues.Builder newBuckets = PackedLongValues.packedBuilder(PackedInts.DEFAULT); + PackedLongValues.Builder newDocDeltas = PackedLongValues.packedBuilder(PackedInts.DEFAULT); + PackedLongValues.Iterator docDeltasItr = sourceEntry.docDeltas.iterator(); + + long lastGoodDelta = 0; for (PackedLongValues.Iterator itr = sourceEntry.buckets.iterator(); itr.hasNext();) { long bucket = itr.next(); - newBuckets.add(mergeMap[Math.toIntExact(bucket)]); + assert docDeltasItr.hasNext(); + long delta = docDeltasItr.next(); + + // Only merge in the ordinal if it hasn't been "removed", signified with -1 + long ordinal = mergeMap[Math.toIntExact(bucket)]; + + if (ordinal != -1) { + newBuckets.add(ordinal); + newDocDeltas.add(delta + lastGoodDelta); + lastGoodDelta = 0; + } else { + // we are skipping this ordinal, which means we need to accumulate the + // doc delta's since the last "good" delta + lastGoodDelta += delta; + } } - newEntries.add(new Entry(sourceEntry.context, sourceEntry.docDeltas, newBuckets.build())); + newEntries.add(new Entry(sourceEntry.context, newDocDeltas.build(), newBuckets.build())); } entries = newEntries; @@ -127,10 +159,33 @@ public void mergeBuckets(long[] mergeMap) { if (buckets.size() > 0) { PackedLongValues currentBuckets = buckets.build(); PackedLongValues.Builder newBuckets = PackedLongValues.packedBuilder(PackedInts.DEFAULT); + PackedLongValues.Builder newDocDeltas = PackedLongValues.packedBuilder(PackedInts.DEFAULT); + + // The current segment's deltas aren't built yet, so build to a temp object + PackedLongValues currentDeltas = docDeltas.build(); + PackedLongValues.Iterator docDeltasItr = currentDeltas.iterator(); + + long lastGoodDelta = 0; for (PackedLongValues.Iterator itr = currentBuckets.iterator(); itr.hasNext();) { long bucket = itr.next(); - newBuckets.add(mergeMap[Math.toIntExact(bucket)]); + assert docDeltasItr.hasNext(); + long delta = docDeltasItr.next(); + long ordinal = 
mergeMap[Math.toIntExact(bucket)]; + + // Only merge in the ordinal if it hasn't been "removed", signified with -1 + if (ordinal != -1) { + newBuckets.add(ordinal); + newDocDeltas.add(delta + lastGoodDelta); + lastGoodDelta = 0; + } else { + // we are skipping this ordinal, which means we need to accumulate the + // doc delta's since the last "good" delta. + // The first is skipped because the original deltas are stored as offsets from first doc, + // not offsets from 0 + lastGoodDelta += delta; + } } + docDeltas = newDocDeltas; buckets = newBuckets; } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java new file mode 100644 index 0000000000000..fb92bf63c904e --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java @@ -0,0 +1,122 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.elasticsearch.common.util.BloomFilter; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.bucket.DeferableBucketAggregator; +import org.elasticsearch.search.aggregations.bucket.DeferringBucketCollector; +import org.elasticsearch.search.aggregations.bucket.MergingBucketsDeferringCollector; +import org.elasticsearch.search.aggregations.bucket.nested.NestedAggregator; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; +import org.elasticsearch.search.aggregations.support.ValuesSource; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +public abstract class AbstractRareTermsAggregator + extends DeferableBucketAggregator { + + // TODO review question: What to set this at? 
+ /** + Sets the number of "removed" values to accumulate before we purge ords + via the MergingBucketCollector's mergeBuckets() method + */ + final long GC_THRESHOLD = 10; + + MergingBucketsDeferringCollector deferringCollector; + protected final BloomFilter bloom; + protected final long maxDocCount; + protected final DocValueFormat format; + protected final T valuesSource; + protected final U includeExclude; + + AbstractRareTermsAggregator(String name, AggregatorFactories factories, SearchContext context, + Aggregator parent, List pipelineAggregators, + Map metaData, long maxDocCount, DocValueFormat format, + T valuesSource, U includeExclude) throws IOException { + super(name, factories, context, parent, pipelineAggregators, metaData); + + // TODO review: should we expose the BF settings? What's a good default? + this.bloom = new BloomFilter(1000000, 0.03); // ~7mb + this.maxDocCount = maxDocCount; + this.format = format; + this.valuesSource = valuesSource; + this.includeExclude = includeExclude; + String scoringAgg = subAggsNeedScore(); + String nestedAgg = descendsFromNestedAggregator(parent); + if (scoringAgg != null && nestedAgg != null) { + /* + * Terms agg would force the collect mode to depth_first here, because + * we need to access the score of nested documents in a sub-aggregation + * and we are not able to generate this score while replaying deferred documents. + * + * But the RareTerms agg _must_ execute in breadth first since it relies on + * deferring execution, so we just have to throw up our hands and refuse + */ + throw new IllegalStateException("RareTerms agg [" + name() + "] is the child of the nested agg [" + nestedAgg + + "], and also has a scoring child agg [" + scoringAgg + "]. This combination is not supported because " + + "it requires executing in [depth_first] mode, which the RareTerms agg cannot do."); + } + } + + @Override + protected boolean shouldDefer(Aggregator aggregator) { + return true; + } + + @Override + public DeferringBucketCollector getDeferringCollector() { + deferringCollector = new MergingBucketsDeferringCollector(context); + return deferringCollector; + } + + @Override + protected void doPostCollection() { + // Make sure we do one final GC to clean up any deleted ords + // that may be lingering (but still below GC threshold) + gcDeletedEntries(); + } + + private String subAggsNeedScore() { + for (Aggregator subAgg : subAggregators) { + if (subAgg.scoreMode().needsScores()) { + return subAgg.name(); + } + } + return null; + } + + private String descendsFromNestedAggregator(Aggregator parent) { + while (parent != null) { + if (parent.getClass() == NestedAggregator.class) { + return parent.name(); + } + parent = parent.parent(); + } + return null; + } + + protected abstract void gcDeletedEntries(); +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractStringTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractStringTermsAggregator.java index edbf2aef25fec..78bdd0b8c7155 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractStringTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractStringTermsAggregator.java @@ -33,7 +33,7 @@ import static java.util.Collections.emptyList; -abstract class AbstractStringTermsAggregator extends TermsAggregator { +public abstract class AbstractStringTermsAggregator extends TermsAggregator { protected final boolean showTermDocCountError; 
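The mergeMap convention used by `BucketsAggregator#mergeBuckets` and
`MergingBucketsDeferringCollector#mergeBuckets` above can be made concrete with a small standalone
sketch. This is a hypothetical illustration, not code from the patch, and it omits the doc-delta
re-accumulation that the real collector performs when pruning:

[source,java]
--------------------------------------------------
import java.util.Arrays;

class MergeMapDemo {
    // The index is the current bucket ordinal, the value is the ordinal it
    // merges into, and -1 means the bucket is pruned entirely.
    static long[] applyMergeMap(long[] mergeMap, long[] collectedOrds) {
        return Arrays.stream(collectedOrds)
            .map(ord -> mergeMap[Math.toIntExact(ord)])
            .filter(ord -> ord != -1)
            .toArray();
    }

    public static void main(String[] args) {
        long[] mergeMap = {1, 1, 3, -1, 3};
        long[] collected = {0, 1, 2, 3, 4};
        // Prints [1, 1, 3, 3]: ordinals 0 and 1 merge into 1, ordinals 2 and 4
        // merge into 3, and ordinal 3 is removed, matching the javadoc example.
        System.out.println(Arrays.toString(applyMergeMap(mergeMap, collected)));
    }
}
--------------------------------------------------
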
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleRareTerms.java new file mode 100644 index 0000000000000..d3f964246c4af --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleRareTerms.java @@ -0,0 +1,115 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.apache.lucene.util.NumericUtils; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.util.BloomFilter; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.BucketOrder; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.InternalAggregations; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * Result of the RareTerms aggregation when the field is some kind of decimal number like a float, double, or distance. + */ +public class DoubleRareTerms extends InternalMappedRareTerms { + public static final String NAME = "drareterms"; + + DoubleRareTerms(String name, BucketOrder order, List pipelineAggregators, + Map metaData, DocValueFormat format, + List buckets, long maxDocCount, BloomFilter bloom) { + super(name, order, pipelineAggregators, metaData, format, buckets, maxDocCount, bloom); + } + + /** + * Read from a stream. 
+ */ + public DoubleRareTerms(StreamInput in) throws IOException { + super(in, DoubleTerms.Bucket::new); + } + + @Override + public String getWriteableName() { + return NAME; + } + + @Override + public DoubleRareTerms create(List buckets) { + return new DoubleRareTerms(name, order, pipelineAggregators(), metaData, format, buckets, maxDocCount, bloom); + } + + @Override + public DoubleTerms.Bucket createBucket(InternalAggregations aggregations, DoubleTerms.Bucket prototype) { + return new DoubleTerms.Bucket((double)prototype.getKey(), prototype.docCount, aggregations, + prototype.showDocCountError, prototype.docCountError, prototype.format); + } + + @Override + protected DoubleRareTerms create(String name, List buckets, long docCountError, long otherDocCount) { + return new DoubleRareTerms(name, order, pipelineAggregators(), getMetaData(), format, + buckets, maxDocCount, bloom); + } + + @Override + protected DoubleTerms.Bucket[] createBucketsArray(int size) { + return new DoubleTerms.Bucket[size]; + } + + @Override + public InternalAggregation doReduce(List aggregations, ReduceContext reduceContext) { + boolean promoteToDouble = false; + for (InternalAggregation agg : aggregations) { + if (agg instanceof LongRareTerms && ((LongRareTerms) agg).format == DocValueFormat.RAW) { + /* + * this agg mixes longs and doubles, we must promote longs to doubles to make the internal aggs + * compatible + */ + promoteToDouble = true; + break; + } + } + if (promoteToDouble == false) { + return super.doReduce(aggregations, reduceContext); + } + List newAggs = new ArrayList<>(aggregations.size()); + for (InternalAggregation agg : aggregations) { + if (agg instanceof LongRareTerms) { + DoubleRareTerms dTerms = LongRareTerms.convertLongRareTermsToDouble((LongRareTerms) agg, format); + newAggs.add(dTerms); + } else if (agg instanceof DoubleRareTerms) { + newAggs.add(agg); + } else { + throw new IllegalStateException("Encountered a non-RareTerms numeric agg when reducing RareTerms."); + } + } + return newAggs.get(0).doReduce(newAggs, reduceContext); + } + + @Override + public boolean containsTerm(BloomFilter bloom, DoubleTerms.Bucket bucket) { + return bloom.mightContain(NumericUtils.doubleToSortableLong((double) bucket.getKey())); + } +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleRareTermsAggregator.java new file mode 100644 index 0000000000000..f1c431c26d39c --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleRareTermsAggregator.java @@ -0,0 +1,77 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.util.NumericUtils; +import org.elasticsearch.index.fielddata.FieldData; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; +import org.elasticsearch.search.aggregations.support.ValuesSource; +import org.elasticsearch.search.aggregations.support.ValuesSource.Numeric; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class DoubleRareTermsAggregator extends LongRareTermsAggregator { + + DoubleRareTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Numeric valuesSource, + DocValueFormat format, SearchContext aggregationContext, Aggregator parent, + IncludeExclude.LongFilter longFilter, int maxDocCount, List pipelineAggregators, + Map metaData) throws IOException { + super(name, factories, valuesSource, format, aggregationContext, parent, + longFilter, maxDocCount, pipelineAggregators, metaData); + } + + @Override + protected SortedNumericDocValues getValues(Numeric valuesSource, LeafReaderContext ctx) throws IOException { + return FieldData.toSortableLongBits(valuesSource.doubleValues(ctx)); + } + + @Override + public DoubleRareTerms buildAggregation(long owningBucketOrdinal) throws IOException { + final LongRareTerms terms = (LongRareTerms) super.buildAggregation(owningBucketOrdinal); + return convertToDouble(terms); + } + + @Override + public DoubleRareTerms buildEmptyAggregation() { + final LongRareTerms terms = (LongRareTerms) super.buildEmptyAggregation(); + return convertToDouble(terms); + } + + private static DoubleRareTerms convertToDouble(LongRareTerms terms) { + List buckets = terms.buckets.stream().map(DoubleRareTermsAggregator::convertToDouble) + .collect(Collectors.toList()); + return new DoubleRareTerms(terms.getName(), terms.order, terms.pipelineAggregators(), + terms.getMetaData(), terms.format, buckets, terms.getMaxDocCount(), terms.getBloom()); + } + + private static DoubleTerms.Bucket convertToDouble(LongTerms.Bucket bucket) { + double value = NumericUtils.sortableLongToDouble(bucket.term); + return new DoubleTerms.Bucket(value, bucket.docCount, bucket.aggregations, bucket.showDocCountError, bucket.docCountError, + bucket.format); + } +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/IncludeExclude.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/IncludeExclude.java index 8154108f9f0bc..30653f04a355a 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/IncludeExclude.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/IncludeExclude.java @@ -137,10 +137,12 @@ public static IncludeExclude parseExclude(XContentParser parser) throws IOExcept } } + public abstract static class Filter {} + // The includeValue and excludeValue ByteRefs which are the result of the parsing // process are converted into a LongFilter when used on numeric fields // in the index. 
- public abstract static class LongFilter { + public abstract static class LongFilter extends Filter { public abstract boolean accept(long value); } @@ -183,7 +185,7 @@ private void addReject(long val) { } // Only used for the 'map' execution mode (ie. scripts) - public abstract static class StringFilter { + public abstract static class StringFilter extends Filter { public abstract boolean accept(BytesRef value); } @@ -231,7 +233,7 @@ public boolean accept(BytesRef value) { } } - public abstract static class OrdinalsFilter { + public abstract static class OrdinalsFilter extends Filter { public abstract LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException; } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java new file mode 100644 index 0000000000000..e42f4d0fda391 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java @@ -0,0 +1,132 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.apache.lucene.util.CollectionUtil; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.util.BloomFilter; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.AggregationExecutionException; +import org.elasticsearch.search.aggregations.BucketOrder; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public abstract class InternalMappedRareTerms, B extends InternalTerms.Bucket> + extends InternalMappedTerms { + + final long maxDocCount; + final BloomFilter bloom; + + InternalMappedRareTerms(String name, BucketOrder order, List pipelineAggregators, + Map metaData, DocValueFormat format, + List buckets, long maxDocCount, BloomFilter bloom) { + // TODO is there a way to determine sum_other_doc_count and doc_count_error_upper_bound equivalents for rare based on bloom? + super(name, order, 0, 1, pipelineAggregators, metaData, format, 0, false, 0, buckets, 0); + this.maxDocCount = maxDocCount; + this.bloom = bloom; + } + + public long getMaxDocCount() { + return maxDocCount; + } + + BloomFilter getBloom() { + return bloom; + } + + /** + * Read from a stream. 
+     */
+    InternalMappedRareTerms(StreamInput in, Bucket.Reader<B> bucketReader) throws IOException {
+        super(in, bucketReader);
+        maxDocCount = in.readLong();
+        bloom = new BloomFilter(in);
+    }
+
+    @Override
+    protected void writeTermTypeInfoTo(StreamOutput out) throws IOException {
+        super.writeTermTypeInfoTo(out);
+        out.writeLong(maxDocCount);
+        bloom.writeTo(out);
+    }
+
+    @Override
+    public InternalAggregation doReduce(List<InternalAggregation> aggregations, ReduceContext reduceContext) {
+        Map<Object, List<B>> buckets = new HashMap<>();
+        InternalTerms<A, B> referenceTerms = null;
+        BloomFilter bloomFilter = null;
+
+        for (InternalAggregation aggregation : aggregations) {
+
+            // Unmapped rare terms don't have a bloom filter so we'll skip all this work
+            // and save some type casting headaches later.
+            if (aggregation.isMapped() == false) {
+                continue;
+            }
+
+            @SuppressWarnings("unchecked")
+            InternalTerms<A, B> terms = (InternalTerms<A, B>) aggregation;
+            if (referenceTerms == null && aggregation.getClass().equals(UnmappedRareTerms.class) == false) {
+                referenceTerms = terms;
+            }
+            if (referenceTerms != null &&
+                referenceTerms.getClass().equals(terms.getClass()) == false &&
+                terms.getClass().equals(UnmappedRareTerms.class) == false) {
+                // control gets into this branch when the same field name against which the query is executed
+                // is of different types in different indices.
+                throw new AggregationExecutionException("Merging/Reducing the aggregations failed when computing the aggregation ["
+                    + referenceTerms.getName() + "] because the field you gave in the aggregation query existed as two different "
+                    + "types in two different indices");
+            }
+            for (B bucket : terms.getBuckets()) {
+                List<B> bucketList = buckets.computeIfAbsent(bucket.getKey(), k -> new ArrayList<>());
+                bucketList.add(bucket);
+            }
+
+            if (bloomFilter == null) {
+                bloomFilter = ((InternalMappedRareTerms<?, ?>) aggregation).bloom;
+            } else {
+                bloomFilter.merge(((InternalMappedRareTerms<?, ?>) aggregation).bloom);
+            }
+        }
+
+        // Always return all results, so just proactively size the array to num buckets
+        final int size = buckets.size();
+        final List<B> rare = new ArrayList<>(size);
+        for (List<B> sameTermBuckets : buckets.values()) {
+            final B b = sameTermBuckets.get(0).reduce(sameTermBuckets, reduceContext);
+            // Only prune if this is the final reduction, otherwise we may remove a term that shows
+            // up in a later incremental reduction and looks "rare" even though it isn't.
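+            // (Illustration: a term that occurs once on each of three shards arrives here as
+            // three doc_count == 1 buckets; only the final reduce sees the merged count of 3,
+            // so applying max_doc_count or the bloom check any earlier could classify a term
+            // as rare when it actually isn't.)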
+            if (reduceContext.isFinalReduce() == false || (b.getDocCount() <= maxDocCount && containsTerm(bloom, b) == false)) {
+                rare.add(b);
+            }
+        }
+        CollectionUtil.introSort(rare, order.comparator(null));
+        return create(name, rare, 0, 0);
+    }
+
+    public abstract boolean containsTerm(BloomFilter bloom, B b);
+}
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedTerms.java
index 547c9d0a80ec6..5622b2fa29230 100644
--- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedTerms.java
+++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedTerms.java
@@ -72,7 +72,7 @@ protected InternalMappedTerms(StreamInput in, Bucket.Reader<B> bucketReader) thr
     }
 
     @Override
-    protected final void writeTermTypeInfoTo(StreamOutput out) throws IOException {
+    protected void writeTermTypeInfoTo(StreamOutput out) throws IOException {
         out.writeZLong(docCountError);
         out.writeNamedWriteable(format);
         writeSize(shardSize, out);
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java
new file mode 100644
index 0000000000000..ac576c3e4b294
--- /dev/null
+++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.search.aggregations.bucket.terms;
+
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.util.BloomFilter;
+import org.elasticsearch.search.DocValueFormat;
+import org.elasticsearch.search.aggregations.BucketOrder;
+import org.elasticsearch.search.aggregations.InternalAggregation;
+import org.elasticsearch.search.aggregations.InternalAggregations;
+import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Result of the RareTerms aggregation when the field is some kind of whole number like an integer, long, or a date.
+ */
+public class LongRareTerms extends InternalMappedRareTerms<LongRareTerms, LongTerms.Bucket> {
+    public static final String NAME = "lrareterms";
+
+    LongRareTerms(String name, BucketOrder order, List<PipelineAggregator> pipelineAggregators,
+                  Map<String, Object> metaData, DocValueFormat format,
+                  List<LongTerms.Bucket> buckets, long maxDocCount, BloomFilter bloom) {
+        super(name, order, pipelineAggregators, metaData, format, buckets, maxDocCount, bloom);
+    }
+
+    /**
+     * Read from a stream.
+ */ + public LongRareTerms(StreamInput in) throws IOException { + super(in, LongTerms.Bucket::new); + } + + @Override + public String getWriteableName() { + return NAME; + } + + @Override + public LongRareTerms create(List buckets) { + return new LongRareTerms(name, order, pipelineAggregators(), metaData, format, + buckets, maxDocCount, bloom); + } + + @Override + public LongTerms.Bucket createBucket(InternalAggregations aggregations, LongTerms.Bucket prototype) { + return new LongTerms.Bucket(prototype.term, prototype.getDocCount(), aggregations, prototype.showDocCountError, + prototype.docCountError, prototype.format); + } + + @Override + protected LongRareTerms create(String name, List buckets, long docCountError, long otherDocCount) { + return new LongRareTerms(name, order, pipelineAggregators(), getMetaData(), format, + buckets, maxDocCount, bloom); + } + + @Override + protected LongTerms.Bucket[] createBucketsArray(int size) { + return new LongTerms.Bucket[size]; + } + + @Override + public InternalAggregation doReduce(List aggregations, ReduceContext reduceContext) { + for (InternalAggregation agg : aggregations) { + if (agg instanceof DoubleRareTerms) { + return agg.doReduce(aggregations, reduceContext); + } + } + return super.doReduce(aggregations, reduceContext); + } + + @Override + public boolean containsTerm(BloomFilter bloom, LongTerms.Bucket bucket) { + return bloom.mightContain((long) bucket.getKey()); + } + + /** + * Converts a {@link LongRareTerms} into a {@link DoubleRareTerms}, returning the + * value of the specified long terms as doubles. + */ + static DoubleRareTerms convertLongRareTermsToDouble(LongRareTerms longTerms, DocValueFormat decimalFormat) { + List buckets = longTerms.getBuckets(); + List newBuckets = new ArrayList<>(); + for (Terms.Bucket bucket : buckets) { + newBuckets.add(new DoubleTerms.Bucket(bucket.getKeyAsNumber().doubleValue(), + bucket.getDocCount(), (InternalAggregations) bucket.getAggregations(), longTerms.showTermDocCountError, + longTerms.showTermDocCountError ? bucket.getDocCountError() : 0, decimalFormat)); + } + return new DoubleRareTerms(longTerms.getName(), longTerms.order, + longTerms.pipelineAggregators(), + longTerms.metaData, longTerms.format, + newBuckets, longTerms.getMaxDocCount(), longTerms.getBloom()); + } +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java new file mode 100644 index 0000000000000..5be6d6044cef8 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java @@ -0,0 +1,204 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.elasticsearch.search.aggregations.bucket.terms;
+
+import com.carrotsearch.hppc.LongLongHashMap;
+import com.carrotsearch.hppc.cursors.LongLongCursor;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.SortedNumericDocValues;
+import org.apache.lucene.util.CollectionUtil;
+import org.elasticsearch.common.lease.Releasables;
+import org.elasticsearch.common.util.LongHash;
+import org.elasticsearch.search.DocValueFormat;
+import org.elasticsearch.search.aggregations.Aggregator;
+import org.elasticsearch.search.aggregations.AggregatorFactories;
+import org.elasticsearch.search.aggregations.BucketOrder;
+import org.elasticsearch.search.aggregations.InternalAggregation;
+import org.elasticsearch.search.aggregations.LeafBucketCollector;
+import org.elasticsearch.search.aggregations.LeafBucketCollectorBase;
+import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator;
+import org.elasticsearch.search.aggregations.support.ValuesSource;
+import org.elasticsearch.search.internal.SearchContext;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import static java.util.Collections.emptyList;
+
+/**
+ * An aggregator that finds "rare" numeric values (e.g. a terms agg that orders ascending)
+ */
+public class LongRareTermsAggregator extends AbstractRareTermsAggregator {
+
+    static final BucketOrder ORDER = BucketOrder.compound(BucketOrder.count(true), BucketOrder.key(true)); // sort by count ascending
+
+    //TODO review question: is LongLong map ok?
+    protected LongLongHashMap map;
+    protected LongHash bucketOrds;
+
+    private LeafBucketCollector subCollectors;
+
+    LongRareTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Numeric valuesSource, DocValueFormat format,
+                            SearchContext aggregationContext, Aggregator parent, IncludeExclude.LongFilter longFilter,
+                            int maxDocCount, List<PipelineAggregator> pipelineAggregators,
+                            Map<String, Object> metaData) throws IOException {
+        super(name, factories, aggregationContext, parent, pipelineAggregators, metaData, maxDocCount, format, valuesSource, longFilter);
+        this.map = new LongLongHashMap();
+        this.bucketOrds = new LongHash(1, aggregationContext.bigArrays());
+    }
+
+    protected SortedNumericDocValues getValues(ValuesSource.Numeric valuesSource, LeafReaderContext ctx) throws IOException {
+        return valuesSource.longValues(ctx);
+    }
+
+    @Override
+    public LeafBucketCollector getLeafCollector(LeafReaderContext ctx,
+                                                final LeafBucketCollector sub) throws IOException {
+        final SortedNumericDocValues values = getValues(valuesSource, ctx);
+        if (subCollectors == null) {
+            subCollectors = sub;
+        }
+        return new LeafBucketCollectorBase(sub, values) {
+            private long numDeleted = 0;
+
+            @Override
+            public void collect(int docId, long owningBucketOrdinal) throws IOException {
+                if (values.advanceExact(docId)) {
+                    final int valuesCount = values.docValueCount();
+
+                    long previous = Long.MAX_VALUE;
+                    for (int i = 0; i < valuesCount; ++i) {
+                        final long val = values.nextValue();
+                        if (previous != val || i == 0) {
+                            if ((includeExclude == null) || (includeExclude.accept(val))) {
+                                if (bloom.mightContain(val) == false) {
+                                    long termCount = map.get(val);
+                                    if (termCount == 0) {
+                                        // Brand new term, save into map
+                                        map.put(val, 1L);
+                                        long bucketOrdinal = bucketOrds.add(val);
+                                        if (bucketOrdinal < 0) { // already seen
+                                            bucketOrdinal = -1 - bucketOrdinal;
+                                            collectExistingBucket(subCollectors, docId, bucketOrdinal);
+                                        } else {
+                                            collectBucket(subCollectors, docId,
bucketOrdinal); + } + } else { + // We've seen this term before, but less than the threshold + // so just increment its counter + if (termCount < maxDocCount) { + // TODO if we only need maxDocCount==1, we could specialize + // and use a bitset instead of a counter scheme + map.put(val, termCount + 1); + } else { + // Otherwise we've breached the threshold, remove from + // the map and add to the bloom filter + map.remove(val); + bloom.put(val); + numDeleted += 1; + + if (numDeleted > GC_THRESHOLD) { + gcDeletedEntries(); + } + } + } + } + } + previous = val; + } + } + } + } + }; + } + + protected void gcDeletedEntries() { + boolean hasDeletedEntry = false; + LongHash newBucketOrds = new LongHash(1, context.bigArrays()); + try (LongHash oldBucketOrds = bucketOrds) { + + long[] mergeMap = new long[(int) oldBucketOrds.size()]; + + for (int i = 0; i < oldBucketOrds.size(); i++) { + long oldKey = oldBucketOrds.get(i); + long newBucketOrd = -1; + + // if the key still exists in our map, reinsert into the new ords + if (map.containsKey(oldKey)) { + newBucketOrd = newBucketOrds.add(oldKey); + } else { + // Make a note when one of the ords has been deleted + hasDeletedEntry = true; + } + mergeMap[i] = newBucketOrd; + } + // Only merge/delete the ordinals if we have actually deleted one, + // to save on some redundant work + if (hasDeletedEntry) { + mergeBuckets(mergeMap, newBucketOrds.size()); + if (deferringCollector != null) { + deferringCollector.mergeBuckets(mergeMap); + } + } + } + bucketOrds = newBucketOrds; + } + + @Override + public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOException { + assert owningBucketOrdinal == 0; + List buckets = new ArrayList<>(map.size()); + + for (LongLongCursor cursor : map) { + // The collection managed pruning unwanted terms, so any + // terms that made it this far are "rare" and we want buckets + long bucketOrdinal = bucketOrds.find(cursor.key); + LongTerms.Bucket bucket = new LongTerms.Bucket(0, 0, null, false, 0, format); + bucket.term = cursor.key; + bucket.docCount = cursor.value; + bucket.bucketOrd = bucketOrdinal; + buckets.add(bucket); + + consumeBucketsAndMaybeBreak(1); + } + + runDeferredCollections(buckets.stream().mapToLong(b -> b.bucketOrd).toArray()); + + // Finalize the buckets + for (LongTerms.Bucket bucket : buckets) { + bucket.aggregations = bucketAggregations(bucket.bucketOrd); + bucket.docCountError = 0; + } + + CollectionUtil.introSort(buckets, ORDER.comparator(this)); + return new LongRareTerms(name, ORDER, pipelineAggregators(), metaData(), format, buckets, maxDocCount, bloom); + } + + @Override + public InternalAggregation buildEmptyAggregation() { + return new LongRareTerms(name, ORDER, pipelineAggregators(), metaData(), format, emptyList(), 0, bloom); + } + + @Override + public void doClose() { + Releasables.close(bloom, bucketOrds); + } +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java new file mode 100644 index 0000000000000..73ecedf363fe9 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java @@ -0,0 +1,172 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. 
Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.search.aggregations.bucket.terms;
+
+import org.elasticsearch.common.ParseField;
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.common.xcontent.ObjectParser;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.search.aggregations.AggregationBuilder;
+import org.elasticsearch.search.aggregations.AggregatorFactories.Builder;
+import org.elasticsearch.search.aggregations.AggregatorFactory;
+import org.elasticsearch.search.aggregations.support.ValueType;
+import org.elasticsearch.search.aggregations.support.ValuesSource;
+import org.elasticsearch.search.aggregations.support.ValuesSourceAggregationBuilder;
+import org.elasticsearch.search.aggregations.support.ValuesSourceAggregatorFactory;
+import org.elasticsearch.search.aggregations.support.ValuesSourceConfig;
+import org.elasticsearch.search.aggregations.support.ValuesSourceParserHelper;
+import org.elasticsearch.search.aggregations.support.ValuesSourceType;
+import org.elasticsearch.search.internal.SearchContext;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Objects;
+
+public class RareTermsAggregationBuilder extends ValuesSourceAggregationBuilder<ValuesSource, RareTermsAggregationBuilder> {
+    public static final String NAME = "rare_terms";
+
+    private static final ParseField MAX_DOC_COUNT_FIELD_NAME = new ParseField("max_doc_count");
+
+    private static final int MAX_MAX_DOC_COUNT = 10;
+    private static final ObjectParser<RareTermsAggregationBuilder, Void> PARSER;
+    static {
+        PARSER = new ObjectParser<>(RareTermsAggregationBuilder.NAME);
+        ValuesSourceParserHelper.declareAnyFields(PARSER, true, true);
+        PARSER.declareLong(RareTermsAggregationBuilder::maxDocCount, MAX_DOC_COUNT_FIELD_NAME);
+
+        PARSER.declareField((b, v) -> b.includeExclude(IncludeExclude.merge(v, b.includeExclude())),
+            IncludeExclude::parseInclude, IncludeExclude.INCLUDE_FIELD, ObjectParser.ValueType.OBJECT_ARRAY_OR_STRING);
+
+        PARSER.declareField((b, v) -> b.includeExclude(IncludeExclude.merge(b.includeExclude(), v)),
+            IncludeExclude::parseExclude, IncludeExclude.EXCLUDE_FIELD, ObjectParser.ValueType.STRING_ARRAY);
+    }
+
+    public static AggregationBuilder parse(String aggregationName, XContentParser parser) throws IOException {
+        return PARSER.parse(parser, new RareTermsAggregationBuilder(aggregationName, null), null);
+    }
+
+    private IncludeExclude includeExclude = null;
+    private int maxDocCount = 1;
+
+    public RareTermsAggregationBuilder(String name, ValueType valueType) {
+        super(name, ValuesSourceType.ANY, valueType);
+    }
+
+    private RareTermsAggregationBuilder(RareTermsAggregationBuilder clone, Builder factoriesBuilder, Map<String, Object> metaData) {
+        super(clone, factoriesBuilder, metaData);
+        this.includeExclude = clone.includeExclude;
+        this.maxDocCount = clone.maxDocCount; // carry over max_doc_count as well, matching equals()/serialization
+    }
+
+    @Override
+    protected AggregationBuilder shallowCopy(Builder factoriesBuilder, Map<String, Object> metaData) {
+        return new RareTermsAggregationBuilder(this, factoriesBuilder, metaData);
+    }
+
+    /**
+     * Read from a stream.
+     */
+    public RareTermsAggregationBuilder(StreamInput in) throws IOException {
+        super(in, ValuesSourceType.ANY);
+        includeExclude = in.readOptionalWriteable(IncludeExclude::new);
+        maxDocCount = in.readVInt();
+    }
+
+    @Override
+    protected boolean serializeTargetValueType() {
+        return true;
+    }
+
+    @Override
+    protected void innerWriteTo(StreamOutput out) throws IOException {
+        out.writeOptionalWriteable(includeExclude);
+        out.writeVInt(maxDocCount);
+    }
+
+    /**
+     * Set the maximum document count terms should have in order to appear in
+     * the response.
+     */
+    public RareTermsAggregationBuilder maxDocCount(long maxDocCount) {
+        if (maxDocCount <= 0) {
+            throw new IllegalArgumentException(
+                "[" + MAX_DOC_COUNT_FIELD_NAME.getPreferredName() + "] must be greater than 0. Found ["
+                    + maxDocCount + "] in [" + name + "]");
+        }
+        //TODO review: what size cap should we put on this?
+        if (maxDocCount > MAX_MAX_DOC_COUNT) {
+            throw new IllegalArgumentException("[" + MAX_DOC_COUNT_FIELD_NAME.getPreferredName() + "] must be smaller "
+                + "than " + MAX_MAX_DOC_COUNT + " in [" + name + "]");
+        }
+        this.maxDocCount = (int) maxDocCount;
+        return this;
+    }
+
+    /**
+     * Set terms to include and exclude from the aggregation results
+     */
+    public RareTermsAggregationBuilder includeExclude(IncludeExclude includeExclude) {
+        this.includeExclude = includeExclude;
+        return this;
+    }
+
+    /**
+     * Get terms to include and exclude from the aggregation results
+     */
+    public IncludeExclude includeExclude() {
+        return includeExclude;
+    }
+
+    @Override
+    protected ValuesSourceAggregatorFactory<ValuesSource, ?> innerBuild(SearchContext context,
+                                                                        ValuesSourceConfig<ValuesSource> config,
+                                                                        AggregatorFactory<?> parent,
+                                                                        Builder subFactoriesBuilder) throws IOException {
+        return new RareTermsAggregatorFactory(name, config, includeExclude,
+            context, parent, subFactoriesBuilder, metaData, maxDocCount);
+    }
+
+    @Override
+    protected XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException {
+        if (includeExclude != null) {
+            includeExclude.toXContent(builder, params);
+        }
+        builder.field(MAX_DOC_COUNT_FIELD_NAME.getPreferredName(), maxDocCount);
+        return builder;
+    }
+
+    @Override
+    protected int innerHashCode() {
+        return Objects.hash(includeExclude, maxDocCount);
+    }
+
+    @Override
+    protected boolean innerEquals(Object obj) {
+        RareTermsAggregationBuilder other = (RareTermsAggregationBuilder) obj;
+        return Objects.equals(includeExclude, other.includeExclude)
+            && Objects.equals(maxDocCount, other.maxDocCount);
+    }
+
+    @Override
+    public String getType() {
+        return NAME;
+    }
+
+}
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorFactory.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorFactory.java
new file mode 100644
index 0000000000000..02dfaabe2420d
--- /dev/null
+++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorFactory.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.elasticsearch.common.ParseField; +import org.elasticsearch.common.logging.DeprecationLogger; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.AggregationExecutionException; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.AggregatorFactory; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.NonCollectingAggregator; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; +import org.elasticsearch.search.aggregations.support.ValuesSource; +import org.elasticsearch.search.aggregations.support.ValuesSourceAggregatorFactory; +import org.elasticsearch.search.aggregations.support.ValuesSourceConfig; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +public class RareTermsAggregatorFactory extends ValuesSourceAggregatorFactory { + private final IncludeExclude includeExclude; + private final int maxDocCount; + + RareTermsAggregatorFactory(String name, ValuesSourceConfig config, + IncludeExclude includeExclude, + SearchContext context, + AggregatorFactory parent, AggregatorFactories.Builder subFactoriesBuilder, + Map metaData, int maxDocCount) throws IOException { + super(name, config, context, parent, subFactoriesBuilder, metaData); + this.includeExclude = includeExclude; + this.maxDocCount = maxDocCount; + } + + @Override + protected Aggregator createUnmapped(Aggregator parent, List pipelineAggregators, Map metaData) + throws IOException { + final InternalAggregation aggregation = new UnmappedRareTerms(name, pipelineAggregators, metaData); + return new NonCollectingAggregator(name, context, parent, factories, pipelineAggregators, metaData) { + @Override + public InternalAggregation buildEmptyAggregation() { + return aggregation; + } + }; + } + + @Override + protected Aggregator doCreateInternal(ValuesSource valuesSource, Aggregator parent, boolean collectsFromSingleBucket, + List pipelineAggregators, Map metaData) throws IOException { + if (collectsFromSingleBucket == false) { + return asMultiBucketAggregator(this, context, parent); + } + if (valuesSource instanceof ValuesSource.Bytes) { + ExecutionMode execution = ExecutionMode.MAP; //TODO global ords not implemented yet, only supports "map" + + DocValueFormat format = config.format(); + if ((includeExclude != null) && (includeExclude.isRegexBased()) && format != DocValueFormat.RAW) { + throw new AggregationExecutionException("Aggregation [" + name + "] cannot support " + + "regular expression style include/exclude settings as they can only be applied to string fields. 
" + + "Use an array of values for include/exclude clauses"); + } + + return execution.create(name, factories, valuesSource, format, + includeExclude, context, parent, pipelineAggregators, metaData, maxDocCount); + } + + if ((includeExclude != null) && (includeExclude.isRegexBased())) { + throw new AggregationExecutionException("Aggregation [" + name + "] cannot support regular expression style include/exclude " + + "settings as they can only be applied to string fields. Use an array of numeric values for include/exclude clauses " + + "used to filter numeric fields"); + } + + if (valuesSource instanceof ValuesSource.Numeric) { + IncludeExclude.LongFilter longFilter = null; + if (((ValuesSource.Numeric) valuesSource).isFloatingPoint()) { + if (includeExclude != null) { + longFilter = includeExclude.convertToDoubleFilter(); + } + return new DoubleRareTermsAggregator(name, factories, (ValuesSource.Numeric) valuesSource, + config.format(), context, parent, longFilter, maxDocCount, pipelineAggregators, metaData); + } + if (includeExclude != null) { + longFilter = includeExclude.convertToLongFilter(config.format()); + } + return new LongRareTermsAggregator(name, factories, (ValuesSource.Numeric) valuesSource, config.format(), + context, parent, longFilter, maxDocCount, pipelineAggregators, metaData); + } + + throw new AggregationExecutionException("terms aggregation cannot be applied to field [" + config.fieldContext().field() + + "]. It can only be applied to numeric or string fields."); + } + + public enum ExecutionMode { + + MAP(new ParseField("map")) { + + @Override + Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource, + DocValueFormat format, IncludeExclude includeExclude, + SearchContext context, Aggregator parent, + List pipelineAggregators, Map metaData, long maxDocCount) + throws IOException { + final IncludeExclude.StringFilter filter = includeExclude == null ? null : includeExclude.convertToStringFilter(format); + return new StringRareTermsAggregator(name, factories, (ValuesSource.Bytes) valuesSource, format, filter, + context, parent, pipelineAggregators, metaData, maxDocCount); + } + + @Override + boolean needsGlobalOrdinals() { + return false; + } + + }; + + public static ExecutionMode fromString(String value, final DeprecationLogger deprecationLogger) { + switch (value) { + case "map": + return MAP; + default: + throw new IllegalArgumentException("Unknown `execution_hint`: [" + value + "], expected any of [map]"); + } + } + + private final ParseField parseField; + + ExecutionMode(ParseField parseField) { + this.parseField = parseField; + } + + abstract Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource, + DocValueFormat format, IncludeExclude includeExclude, + SearchContext context, Aggregator parent, + List pipelineAggregators, Map metaData, + long maxDocCount) + throws IOException; + + abstract boolean needsGlobalOrdinals(); + + @Override + public String toString() { + return parseField.getPreferredName(); + } + } + +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java new file mode 100644 index 0000000000000..0acf2d786efe0 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java @@ -0,0 +1,80 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.util.BloomFilter; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.BucketOrder; +import org.elasticsearch.search.aggregations.InternalAggregations; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + + +public class StringRareTerms extends InternalMappedRareTerms { + public static final String NAME = "srareterms"; + + public StringRareTerms(String name, BucketOrder order, List pipelineAggregators, + Map metaData, DocValueFormat format, + List buckets, long maxDocCount, BloomFilter bloom) { + super(name, order, pipelineAggregators, metaData, format, buckets, maxDocCount, bloom); + } + + /** + * Read from a stream. + */ + public StringRareTerms(StreamInput in) throws IOException { + super(in, StringTerms.Bucket::new); + } + + @Override + public String getWriteableName() { + return NAME; + } + + @Override + public StringRareTerms create(List buckets) { + return new StringRareTerms(name, order, pipelineAggregators(), metaData, format, buckets, maxDocCount, bloom); + } + + @Override + public StringTerms.Bucket createBucket(InternalAggregations aggregations, StringTerms.Bucket prototype) { + return new StringTerms.Bucket(prototype.termBytes, prototype.getDocCount(), aggregations, false, + prototype.docCountError, prototype.format); + } + + @Override + protected StringRareTerms create(String name, List buckets, long docCountError, long otherDocCount) { + return new StringRareTerms(name, order, pipelineAggregators(), metaData, format, + buckets, maxDocCount, bloom); + } + + @Override + protected StringTerms.Bucket[] createBucketsArray(int size) { + return new StringTerms.Bucket[size]; + } + + @Override + public boolean containsTerm(BloomFilter bloom, StringTerms.Bucket bucket) { + return bloom.mightContain(bucket.termBytes); + } +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java new file mode 100644 index 0000000000000..7967d53900f11 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java @@ -0,0 +1,213 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. 
Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import com.carrotsearch.hppc.ObjectLongHashMap; +import com.carrotsearch.hppc.cursors.ObjectLongCursor; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.CollectionUtil; +import org.elasticsearch.common.lease.Releasables; +import org.elasticsearch.common.util.BytesRefHash; +import org.elasticsearch.index.fielddata.SortedBinaryDocValues; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.LeafBucketCollector; +import org.elasticsearch.search.aggregations.LeafBucketCollectorBase; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; +import org.elasticsearch.search.aggregations.support.ValuesSource; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import static java.util.Collections.emptyList; + +/** + * An aggregator that finds "rare" string values (e.g. terms agg that orders ascending) + */ +public class StringRareTermsAggregator extends AbstractRareTermsAggregator { + // TODO review question: is there equivalent to LongObjectPagedHashMap like used in LongRareTerms? + protected ObjectLongHashMap map; + protected BytesRefHash bucketOrds; + private LeafBucketCollector subCollectors; + + // TODO review question: What to set this at? 
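+    // Review note: this is a speed/memory trade-off. A larger threshold batches more
+    // deletions into a single gcDeletedEntries() pass (fewer bucket-ordinal merges),
+    // at the cost of letting stale ordinals linger in bucketOrds between purges.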
+    /**
+     * Sets the number of "removed" values to accumulate before we purge ords
+     * via the MergingBucketsDeferringCollector's mergeBuckets() method
+     */
+    private final long GC_THRESHOLD = 10;
+
+    StringRareTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes valuesSource,
+                              DocValueFormat format, IncludeExclude.StringFilter stringFilter,
+                              SearchContext context, Aggregator parent, List<PipelineAggregator> pipelineAggregators,
+                              Map<String, Object> metaData, long maxDocCount) throws IOException {
+        super(name, factories, context, parent, pipelineAggregators, metaData, maxDocCount, format, valuesSource, stringFilter);
+        this.map = new ObjectLongHashMap<>();
+        this.bucketOrds = new BytesRefHash(1, context.bigArrays());
+    }
+
+    @Override
+    public LeafBucketCollector getLeafCollector(LeafReaderContext ctx,
+                                                final LeafBucketCollector sub) throws IOException {
+        final SortedBinaryDocValues values = valuesSource.bytesValues(ctx);
+        if (subCollectors == null) {
+            subCollectors = sub;
+        }
+        return new LeafBucketCollectorBase(sub, values) {
+            final BytesRefBuilder previous = new BytesRefBuilder();
+            private long numDeleted = 0;
+
+            @Override
+            public void collect(int docId, long bucket) throws IOException {
+                assert bucket == 0;
+                if (values.advanceExact(docId)) {
+                    final int valuesCount = values.docValueCount();
+                    previous.clear();
+
+                    // SortedBinaryDocValues don't guarantee uniqueness so we
+                    // need to take care of dups
+                    for (int i = 0; i < valuesCount; ++i) {
+                        final BytesRef bytes = values.nextValue();
+                        if (includeExclude != null && !includeExclude.accept(bytes)) {
+                            continue;
+                        }
+                        if (i > 0 && previous.get().equals(bytes)) {
+                            continue;
+                        }
+
+                        if (bloom.mightContain(bytes) == false) {
+                            long valueCount = map.get(bytes);
+                            if (valueCount == 0) {
+                                // Brand new term, save into map
+                                map.put(BytesRef.deepCopyOf(bytes), 1L);
+                                long bucketOrdinal = bucketOrds.add(bytes);
+                                if (bucketOrdinal < 0) { // already seen
+                                    bucketOrdinal = -1 - bucketOrdinal;
+                                    collectExistingBucket(subCollectors, docId, bucketOrdinal);
+                                } else {
+                                    collectBucket(subCollectors, docId, bucketOrdinal);
+                                }
+                            } else {
+                                // We've seen this term before, but less than the threshold
+                                // so just increment its counter
+                                if (valueCount < maxDocCount) {
+                                    map.put(bytes, valueCount + 1);
+                                } else {
+                                    // Otherwise we've breached the threshold, remove from
+                                    // the map and add to the bloom filter
+                                    map.remove(bytes);
+                                    bloom.put(bytes);
+                                    numDeleted += 1;
+
+                                    if (numDeleted > GC_THRESHOLD) {
+                                        gcDeletedEntries();
+                                    }
+                                }
+                            }
+                        }
+                        previous.copyBytes(bytes);
+                    }
+                }
+            }
+        };
+    }
+
+    protected void gcDeletedEntries() {
+        boolean hasDeletedEntry = false;
+        BytesRefHash newBucketOrds = new BytesRefHash(1, context.bigArrays());
+        try (BytesRefHash oldBucketOrds = bucketOrds) {
+
+            long[] mergeMap = new long[(int) oldBucketOrds.size()];
+            BytesRef scratch = new BytesRef();
+            for (int i = 0; i < oldBucketOrds.size(); i++) {
+                BytesRef oldKey = oldBucketOrds.get(i, scratch);
+                long newBucketOrd = -1;
+
+                // if the key still exists in our map, reinsert into the new ords
+                if (map.containsKey(oldKey)) {
+                    newBucketOrd = newBucketOrds.add(oldKey);
+                } else {
+                    // Make a note when one of the ords has been deleted
+                    hasDeletedEntry = true;
+                }
+                mergeMap[i] = newBucketOrd;
+            }
+            // Only merge/delete the ordinals if we have actually deleted one,
+            // to save on some redundant work
+            if (hasDeletedEntry) {
+                mergeBuckets(mergeMap, newBucketOrds.size());
+                if (deferringCollector != null) {
+                    deferringCollector.mergeBuckets(mergeMap);
+                }
+            }
+        }
+        bucketOrds =
newBucketOrds; + } + + @Override + public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOException { + assert owningBucketOrdinal == 0; + + List buckets = new ArrayList<>(map.size()); + + for (ObjectLongCursor cursor : map) { + StringTerms.Bucket bucket = new StringTerms.Bucket(new BytesRef(), 0, null, false, 0, format); + + // The collection managed pruning unwanted terms, so any + // terms that made it this far are "rare" and we want buckets + long bucketOrdinal = bucketOrds.find(cursor.key); + bucket.termBytes = BytesRef.deepCopyOf(cursor.key); + bucket.docCount = cursor.value; + bucket.bucketOrd = bucketOrdinal; + buckets.add(bucket); + + consumeBucketsAndMaybeBreak(1); + } + + runDeferredCollections(buckets.stream().mapToLong(b -> b.bucketOrd).toArray()); + + // Finalize the buckets + for (StringTerms.Bucket bucket : buckets) { + bucket.aggregations = bucketAggregations(bucket.bucketOrd); + bucket.docCountError = 0; + } + + CollectionUtil.introSort(buckets, LongRareTermsAggregator.ORDER.comparator(this)); + return new StringRareTerms(name, LongRareTermsAggregator.ORDER, pipelineAggregators(), metaData(), + format, buckets, maxDocCount, bloom); + } + + @Override + public InternalAggregation buildEmptyAggregation() { + return new StringRareTerms(name, LongRareTermsAggregator.ORDER, pipelineAggregators(), metaData(), format, emptyList(), 0, bloom); + } + + @Override + public void doClose() { + Releasables.close(bloom, bucketOrds); + } +} + diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java index 6458b0066dabe..446aafa22d36b 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java @@ -114,7 +114,9 @@ public void collect(int doc, long bucket) throws IOException { public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOException { assert owningBucketOrdinal == 0; - if (bucketCountThresholds.getMinDocCount() == 0 && (InternalOrder.isCountDesc(order) == false || bucketOrds.size() < bucketCountThresholds.getRequiredSize())) { + if (bucketCountThresholds.getMinDocCount() == 0 + && (InternalOrder.isCountDesc(order) == false + || bucketOrds.size() < bucketCountThresholds.getRequiredSize())) { // we need to fill-in the blanks for (LeafReaderContext ctx : context.searcher().getTopReaderContext().leaves()) { final SortedBinaryDocValues values = valuesSource.bytesValues(ctx); @@ -167,11 +169,10 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOE runDeferredCollections(survivingBucketOrds); // Now build the aggs - for (int i = 0; i < list.length; i++) { - final StringTerms.Bucket bucket = list[i]; - bucket.termBytes = BytesRef.deepCopyOf(bucket.termBytes); - bucket.aggregations = bucketAggregations(bucket.bucketOrd); - bucket.docCountError = 0; + for (final StringTerms.Bucket bucket : list) { + bucket.termBytes = BytesRef.deepCopyOf(bucket.termBytes); + bucket.aggregations = bucketAggregations(bucket.bucketOrd); + bucket.docCountError = 0; } return new StringTerms(name, order, bucketCountThresholds.getRequiredSize(), bucketCountThresholds.getMinDocCount(), diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedRareTerms.java 
b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedRareTerms.java new file mode 100644 index 0000000000000..9c82f2dc09ded --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedRareTerms.java @@ -0,0 +1,131 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.InternalAggregations; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import static java.util.Collections.emptyList; + +/** + * Result of the RareTerms aggregation when the field is unmapped. + */ +public class UnmappedRareTerms extends InternalTerms { + public static final String NAME = "umrareterms"; + + UnmappedRareTerms(String name, List pipelineAggregators, + Map metaData) { + super(name, LongRareTermsAggregator.ORDER, 0, 0, pipelineAggregators, metaData); + } + + /** + * Read from a stream. 
+ */ + public UnmappedRareTerms(StreamInput in) throws IOException { + super(in); + } + + @Override + protected void writeTermTypeInfoTo(StreamOutput out) throws IOException { + // Nothing to write + } + + @Override + public String getWriteableName() { + return NAME; + } + + @Override + public String getType() { + return StringTerms.NAME; + } + + @Override + public UnmappedRareTerms create(List buckets) { + return new UnmappedRareTerms(name, pipelineAggregators(), metaData); + } + + @Override + public UnmappedTerms.Bucket createBucket(InternalAggregations aggregations, UnmappedTerms.Bucket prototype) { + throw new UnsupportedOperationException("not supported for UnmappedTerms"); + } + + @Override + protected UnmappedRareTerms create(String name, List buckets, long docCountError, long otherDocCount) { + throw new UnsupportedOperationException("not supported for UnmappedTerms"); + } + + @Override + public InternalAggregation doReduce(List aggregations, ReduceContext reduceContext) { + return new UnmappedRareTerms(name, pipelineAggregators(), metaData); + } + + @Override + public boolean isMapped() { + return false; + } + + @Override + public final XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { + return doXContentCommon(builder, params, 0, 0, Collections.emptyList()); + } + + @Override + protected void setDocCountError(long docCountError) { + } + + @Override + protected int getShardSize() { + return 0; + } + + @Override + public long getDocCountError() { + return 0; + } + + @Override + public long getSumOfOtherDocCounts() { + return 0; + } + + @Override + public List getBuckets() { + return emptyList(); + } + + @Override + public UnmappedTerms.Bucket getBucketByKey(String term) { + return null; + } + + @Override + protected UnmappedTerms.Bucket[] createBucketsArray(int size) { + return new UnmappedTerms.Bucket[size]; + } +} diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/RareTermsTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/RareTermsTests.java new file mode 100644 index 0000000000000..31382f5df8e5c --- /dev/null +++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/RareTermsTests.java @@ -0,0 +1,101 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.search.aggregations.bucket; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.RegExp; +import org.elasticsearch.search.aggregations.BaseAggregationTestCase; +import org.elasticsearch.search.aggregations.bucket.terms.IncludeExclude; +import org.elasticsearch.search.aggregations.bucket.terms.RareTermsAggregationBuilder; + +import java.util.SortedSet; +import java.util.TreeSet; + +public class RareTermsTests extends BaseAggregationTestCase { + + @Override + protected RareTermsAggregationBuilder createTestAggregatorBuilder() { + String name = randomAlphaOfLengthBetween(3, 20); + RareTermsAggregationBuilder factory = new RareTermsAggregationBuilder(name, null); + String field = randomAlphaOfLengthBetween(3, 20); + randomFieldOrScript(factory, field); + if (randomBoolean()) { + factory.missing("MISSING"); + } + if (randomBoolean()) { + factory.format("###.##"); + } + if (randomBoolean()) { + IncludeExclude incExc = null; + switch (randomInt(6)) { + case 0: + incExc = new IncludeExclude(new RegExp("foobar"), null); + break; + case 1: + incExc = new IncludeExclude(null, new RegExp("foobaz")); + break; + case 2: + incExc = new IncludeExclude(new RegExp("foobar"), new RegExp("foobaz")); + break; + case 3: + SortedSet includeValues = new TreeSet<>(); + int numIncs = randomIntBetween(1, 20); + for (int i = 0; i < numIncs; i++) { + includeValues.add(new BytesRef(randomAlphaOfLengthBetween(1, 30))); + } + SortedSet excludeValues = null; + incExc = new IncludeExclude(includeValues, excludeValues); + break; + case 4: + SortedSet includeValues2 = null; + SortedSet excludeValues2 = new TreeSet<>(); + int numExcs2 = randomIntBetween(1, 20); + for (int i = 0; i < numExcs2; i++) { + excludeValues2.add(new BytesRef(randomAlphaOfLengthBetween(1, 30))); + } + incExc = new IncludeExclude(includeValues2, excludeValues2); + break; + case 5: + SortedSet includeValues3 = new TreeSet<>(); + int numIncs3 = randomIntBetween(1, 20); + for (int i = 0; i < numIncs3; i++) { + includeValues3.add(new BytesRef(randomAlphaOfLengthBetween(1, 30))); + } + SortedSet excludeValues3 = new TreeSet<>(); + int numExcs3 = randomIntBetween(1, 20); + for (int i = 0; i < numExcs3; i++) { + excludeValues3.add(new BytesRef(randomAlphaOfLengthBetween(1, 30))); + } + incExc = new IncludeExclude(includeValues3, excludeValues3); + break; + case 6: + final int numPartitions = randomIntBetween(1, 100); + final int partition = randomIntBetween(0, numPartitions - 1); + incExc = new IncludeExclude(partition, numPartitions); + break; + default: + fail(); + } + factory.includeExclude(incExc); + } + return factory; + } + +} diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java new file mode 100644 index 0000000000000..e5656a47a8441 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java @@ -0,0 +1,692 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.LongPoint; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.search.DocValuesFieldExistsQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MatchNoDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.NumericUtils; +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.MockBigArrays; +import org.elasticsearch.common.util.MockPageCacheRecycler; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.mapper.IdFieldMapper; +import org.elasticsearch.index.mapper.KeywordFieldMapper; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.NumberFieldMapper; +import org.elasticsearch.index.mapper.SeqNoFieldMapper; +import org.elasticsearch.index.mapper.TypeFieldMapper; +import org.elasticsearch.index.mapper.Uid; +import org.elasticsearch.indices.breaker.NoneCircuitBreakerService; +import org.elasticsearch.search.SearchHit; +import org.elasticsearch.search.aggregations.Aggregation; +import org.elasticsearch.search.aggregations.Aggregations; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.AggregatorTestCase; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.InternalMultiBucketAggregation; +import org.elasticsearch.search.aggregations.MultiBucketConsumerService; +import org.elasticsearch.search.aggregations.bucket.MultiBucketsAggregation; +import org.elasticsearch.search.aggregations.bucket.global.GlobalAggregationBuilder; +import org.elasticsearch.search.aggregations.bucket.global.InternalGlobal; +import org.elasticsearch.search.aggregations.bucket.nested.InternalNested; +import org.elasticsearch.search.aggregations.bucket.nested.NestedAggregationBuilder; +import org.elasticsearch.search.aggregations.metrics.InternalTopHits; +import org.elasticsearch.search.aggregations.metrics.Max; +import org.elasticsearch.search.aggregations.metrics.MaxAggregationBuilder; +import org.elasticsearch.search.aggregations.metrics.TopHitsAggregationBuilder; +import org.elasticsearch.search.aggregations.support.ValueType; +import org.elasticsearch.search.sort.FieldSortBuilder; +import org.elasticsearch.search.sort.ScoreSortBuilder; +import org.junit.Assert; + 
+import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.function.Consumer; + +import static org.elasticsearch.index.mapper.SeqNoFieldMapper.PRIMARY_TERM_NAME; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.greaterThan; + +public class RareTermsAggregatorTests extends AggregatorTestCase { + + private static final String LONG_FIELD = "numeric"; + private static final String KEYWORD_FIELD = "keyword"; + private static final String DOUBLE_FIELD = "double"; + + private static final List dataset; + static { + List d = new ArrayList<>(45); + for (int i = 0; i < 10; i++) { + for (int j = 0; j < i; j++) { + d.add((long) i); + } + } + dataset = d; + } + + public void testMatchNoDocs() throws IOException { + testBothCases(new MatchNoDocsQuery(), dataset, + aggregation -> aggregation.field(KEYWORD_FIELD).maxDocCount(1), + agg -> assertEquals(0, agg.getBuckets().size()), ValueType.STRING + ); + testBothCases(new MatchNoDocsQuery(), dataset, + aggregation -> aggregation.field(LONG_FIELD).maxDocCount(1), + agg -> assertEquals(0, agg.getBuckets().size()), ValueType.NUMERIC + ); + testBothCases(new MatchNoDocsQuery(), dataset, + aggregation -> aggregation.field(DOUBLE_FIELD).maxDocCount(1), + agg -> assertEquals(0, agg.getBuckets().size()), ValueType.DOUBLE + ); + } + + public void testMatchAllDocs() throws IOException { + Query query = new MatchAllDocsQuery(); + + testBothCases(query, dataset, + aggregation -> aggregation.field(LONG_FIELD).maxDocCount(1), + agg -> { + assertEquals(1, agg.getBuckets().size()); + LongTerms.Bucket bucket = (LongTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKey(), equalTo(1L)); + assertThat(bucket.getDocCount(), equalTo(1L)); + }, ValueType.NUMERIC + ); + testBothCases(query, dataset, + aggregation -> aggregation.field(KEYWORD_FIELD).maxDocCount(1), + agg -> { + assertEquals(1, agg.getBuckets().size()); + StringTerms.Bucket bucket = (StringTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKeyAsString(), equalTo("1")); + assertThat(bucket.getDocCount(), equalTo(1L)); + }, ValueType.STRING + ); + testBothCases(query, dataset, + aggregation -> aggregation.field(DOUBLE_FIELD).maxDocCount(1), + agg -> { + assertEquals(1, agg.getBuckets().size()); + DoubleTerms.Bucket bucket = (DoubleTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKey(), equalTo(1.0)); + assertThat(bucket.getDocCount(), equalTo(1L)); + }, ValueType.DOUBLE + ); + } + + public void testIncludeExclude() throws IOException { + Query query = new MatchAllDocsQuery(); + + testBothCases(query, dataset, + aggregation -> aggregation.field(LONG_FIELD) + .maxDocCount(2) // bump to 2 since we're only including "2" + .includeExclude(new IncludeExclude(new long[]{2}, new long[]{})), + agg -> { + assertEquals(1, agg.getBuckets().size()); + LongTerms.Bucket bucket = (LongTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKey(), equalTo(2L)); + assertThat(bucket.getDocCount(), equalTo(2L)); + }, ValueType.NUMERIC + ); + testBothCases(query, dataset, + aggregation -> aggregation.field(KEYWORD_FIELD) + .maxDocCount(2) // bump to 2 since we're only including "2" + .includeExclude(new IncludeExclude(new String[]{"2"}, new String[]{})), + agg -> { + assertEquals(1, agg.getBuckets().size()); + StringTerms.Bucket bucket = (StringTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKeyAsString(), equalTo("2")); + assertThat(bucket.getDocCount(), equalTo(2L)); + }, 
ValueType.STRING + ); + testBothCases(query, dataset, + aggregation -> aggregation.field(DOUBLE_FIELD) + .maxDocCount(2) // bump to 2 since we're only including "2" + .includeExclude(new IncludeExclude(new double[]{2.0}, new double[]{})), + agg -> { + assertEquals(1, agg.getBuckets().size()); + DoubleTerms.Bucket bucket = (DoubleTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKey(), equalTo(2.0)); + assertThat(bucket.getDocCount(), equalTo(2L)); + }, ValueType.DOUBLE + ); + } + + public void testEmbeddedMaxAgg() throws IOException { + Query query = new MatchAllDocsQuery(); + + testBothCases(query, dataset, aggregation -> { + MaxAggregationBuilder max = new MaxAggregationBuilder("the_max").field(LONG_FIELD); + aggregation.field(LONG_FIELD).maxDocCount(1).subAggregation(max); + }, + agg -> { + assertEquals(1, agg.getBuckets().size()); + LongTerms.Bucket bucket = (LongTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKey(), equalTo(1L)); + assertThat(bucket.getDocCount(), equalTo(1L)); + + Aggregations children = bucket.getAggregations(); + assertThat(children.asList().size(), equalTo(1)); + assertThat(children.asList().get(0).getName(), equalTo("the_max")); + assertThat(((Max)(children.asList().get(0))).getValue(), equalTo(1.0)); + }, ValueType.NUMERIC + ); + testBothCases(query, dataset, aggregation -> { + MaxAggregationBuilder max = new MaxAggregationBuilder("the_max").field(LONG_FIELD); + aggregation.field(KEYWORD_FIELD).maxDocCount(1).subAggregation(max); + }, + agg -> { + assertEquals(1, agg.getBuckets().size()); + StringTerms.Bucket bucket = (StringTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKey(), equalTo("1")); + assertThat(bucket.getDocCount(), equalTo(1L)); + + Aggregations children = bucket.getAggregations(); + assertThat(children.asList().size(), equalTo(1)); + assertThat(children.asList().get(0).getName(), equalTo("the_max")); + assertThat(((Max)(children.asList().get(0))).getValue(), equalTo(1.0)); + }, ValueType.STRING + ); + testBothCases(query, dataset, aggregation -> { + MaxAggregationBuilder max = new MaxAggregationBuilder("the_max").field(LONG_FIELD); + aggregation.field(DOUBLE_FIELD).maxDocCount(1).subAggregation(max); + }, + agg -> { + assertEquals(1, agg.getBuckets().size()); + DoubleTerms.Bucket bucket = (DoubleTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKey(), equalTo(1.0)); + assertThat(bucket.getDocCount(), equalTo(1L)); + + Aggregations children = bucket.getAggregations(); + assertThat(children.asList().size(), equalTo(1)); + assertThat(children.asList().get(0).getName(), equalTo("the_max")); + assertThat(((Max)(children.asList().get(0))).getValue(), equalTo(1.0)); + }, ValueType.DOUBLE + ); + } + + public void testEmpty() throws IOException { + Query query = new MatchAllDocsQuery(); + + testSearchCase(query, Collections.emptyList(), + aggregation -> aggregation.field(LONG_FIELD).maxDocCount(1), + agg -> assertEquals(0, agg.getBuckets().size()), ValueType.NUMERIC + ); + testSearchCase(query, Collections.emptyList(), + aggregation -> aggregation.field(KEYWORD_FIELD).maxDocCount(1), + agg -> assertEquals(0, agg.getBuckets().size()), ValueType.STRING + ); + testSearchCase(query, Collections.emptyList(), + aggregation -> aggregation.field(DOUBLE_FIELD).maxDocCount(1), + agg -> assertEquals(0, agg.getBuckets().size()), ValueType.DOUBLE + ); + + // Note: the search and reduce test will generate no segments (due to no docs) + // and so will return a null agg because the aggs aren't run/reduced + 
testSearchAndReduceCase(query, Collections.emptyList(), + aggregation -> aggregation.field(LONG_FIELD).maxDocCount(1), + Assert::assertNull, ValueType.NUMERIC + ); + testSearchAndReduceCase(query, Collections.emptyList(), + aggregation -> aggregation.field(KEYWORD_FIELD).maxDocCount(1), + Assert::assertNull, ValueType.STRING + ); + testSearchAndReduceCase(query, Collections.emptyList(), + aggregation -> aggregation.field(DOUBLE_FIELD).maxDocCount(1), + Assert::assertNull, ValueType.DOUBLE + ); + } + + public void testUnmapped() throws Exception { + try (Directory directory = newDirectory()) { + try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { + Document document = new Document(); + document.add(new SortedDocValuesField("string", new BytesRef("a"))); + document.add(new NumericDocValuesField("long", 0L)); + document.add(new NumericDocValuesField("double", Double.doubleToRawLongBits(0L))); + indexWriter.addDocument(document); + MappedFieldType fieldType1 = new KeywordFieldMapper.KeywordFieldType(); + fieldType1.setName("another_string"); + fieldType1.setHasDocValues(true); + + MappedFieldType fieldType2 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG); + fieldType2.setName("another_long"); + fieldType2.setHasDocValues(true); + + MappedFieldType fieldType3 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.DOUBLE); + fieldType3.setName("another_double"); + fieldType3.setHasDocValues(true); + try (IndexReader indexReader = maybeWrapReaderEs(indexWriter.getReader())) { + IndexSearcher indexSearcher = newIndexSearcher(indexReader); + ValueType[] valueTypes = new ValueType[]{ValueType.STRING, ValueType.LONG, ValueType.DOUBLE}; + String[] fieldNames = new String[]{"string", "long", "double"}; + for (int i = 0; i < fieldNames.length; i++) { + RareTermsAggregationBuilder aggregationBuilder = new RareTermsAggregationBuilder("_name", valueTypes[i]) + .field(fieldNames[i]); + Aggregator aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType1, fieldType2, fieldType3); + aggregator.preCollection(); + indexSearcher.search(new MatchAllDocsQuery(), aggregator); + aggregator.postCollection(); + Terms result = (Terms) aggregator.buildAggregation(0L); + assertEquals("_name", result.getName()); + assertEquals(0, result.getBuckets().size()); + } + } + } + } + } + + public void testNestedTerms() throws IOException { + Query query = new MatchAllDocsQuery(); + + testBothCases(query, dataset, aggregation -> { + TermsAggregationBuilder terms = new TermsAggregationBuilder("the_terms", ValueType.STRING).field(KEYWORD_FIELD); + aggregation.field(LONG_FIELD).maxDocCount(1).subAggregation(terms); + }, + agg -> { + assertEquals(1, agg.getBuckets().size()); + LongTerms.Bucket bucket = (LongTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKey(), equalTo(1L)); + assertThat(bucket.getDocCount(), equalTo(1L)); + + Aggregations children = bucket.getAggregations(); + assertThat(children.asList().size(), equalTo(1)); + assertThat(children.asList().get(0).getName(), equalTo("the_terms")); + assertThat(((Terms)(children.asList().get(0))).getBuckets().size(), equalTo(1)); + assertThat(((Terms)(children.asList().get(0))).getBuckets().get(0).getKeyAsString(), equalTo("1")); + }, ValueType.NUMERIC + ); + + testBothCases(query, dataset, aggregation -> { + TermsAggregationBuilder terms = new TermsAggregationBuilder("the_terms", ValueType.STRING).field(KEYWORD_FIELD); + 
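+            // rare_terms and its terms sub-agg both target KEYWORD_FIELD here, so each rare
+            // bucket can only contain its own term and exactly one sub-bucket is expected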
+            aggregation.field(KEYWORD_FIELD).maxDocCount(1).subAggregation(terms);
+        },
+            agg -> {
+                assertEquals(1, agg.getBuckets().size());
+                StringTerms.Bucket bucket = (StringTerms.Bucket) agg.getBuckets().get(0);
+                assertThat(bucket.getKey(), equalTo("1"));
+                assertThat(bucket.getDocCount(), equalTo(1L));
+
+                Aggregations children = bucket.getAggregations();
+                assertThat(children.asList().size(), equalTo(1));
+                assertThat(children.asList().get(0).getName(), equalTo("the_terms"));
+                assertThat(((Terms)(children.asList().get(0))).getBuckets().size(), equalTo(1));
+                assertThat(((Terms)(children.asList().get(0))).getBuckets().get(0).getKeyAsString(), equalTo("1"));
+            }, ValueType.STRING
+        );
+        testBothCases(query, dataset, aggregation -> {
+            TermsAggregationBuilder terms = new TermsAggregationBuilder("the_terms", ValueType.STRING).field(KEYWORD_FIELD);
+            aggregation.field(DOUBLE_FIELD).maxDocCount(1).subAggregation(terms);
+        },
+            agg -> {
+                assertEquals(1, agg.getBuckets().size());
+                DoubleTerms.Bucket bucket = (DoubleTerms.Bucket) agg.getBuckets().get(0);
+                assertThat(bucket.getKey(), equalTo(1.0));
+                assertThat(bucket.getDocCount(), equalTo(1L));
+
+                Aggregations children = bucket.getAggregations();
+                assertThat(children.asList().size(), equalTo(1));
+                assertThat(children.asList().get(0).getName(), equalTo("the_terms"));
+                assertThat(((Terms)(children.asList().get(0))).getBuckets().size(), equalTo(1));
+                assertThat(((Terms)(children.asList().get(0))).getBuckets().get(0).getKeyAsString(), equalTo("1"));
+            }, ValueType.DOUBLE
+        );
+    }
+
+    public void testMixLongAndDouble() throws Exception {
+        RareTermsAggregationBuilder aggregationBuilder = new RareTermsAggregationBuilder("_name", ValueType.LONG)
+            .field(LONG_FIELD)
+            .maxDocCount(2);
+        List<InternalAggregation> aggs = new ArrayList<>();
+
+        try (Directory directory = newDirectory()) {
+            try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
+                Document document = new Document();
+                document.add(new NumericDocValuesField(LONG_FIELD, 1L));
+                indexWriter.addDocument(document);
+
+                try (IndexReader indexReader = maybeWrapReaderEs(indexWriter.getReader())) {
+                    IndexSearcher indexSearcher = newIndexSearcher(indexReader);
+                    MappedFieldType fieldType =
+                        new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG);
+                    fieldType.setName(LONG_FIELD);
+                    fieldType.setHasDocValues(true);
+                    aggs.add(buildInternalAggregation(aggregationBuilder, fieldType, indexSearcher));
+
+                }
+            }
+        }
+        try (Directory directory = newDirectory()) {
+            try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
+                Document document = new Document();
+                document.add(new SortedNumericDocValuesField(LONG_FIELD, NumericUtils.doubleToSortableLong(1.0d)));
+                indexWriter.addDocument(document);
+
+                try (IndexReader indexReader = maybeWrapReaderEs(indexWriter.getReader())) {
+                    IndexSearcher indexSearcher = newIndexSearcher(indexReader);
+                    MappedFieldType fieldType =
+                        new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.DOUBLE);
+                    fieldType.setName(LONG_FIELD);
+                    fieldType.setHasDocValues(true);
+                    aggs.add(buildInternalAggregation(aggregationBuilder, fieldType, indexSearcher));
+                }
+            }
+        }
+        InternalAggregation.ReduceContext ctx =
+            new InternalAggregation.ReduceContext(new MockBigArrays(new MockPageCacheRecycler(Settings.EMPTY),
+                new NoneCircuitBreakerService()), null, true);
+        InternalAggregation mergedAggs = aggs.get(0).doReduce(aggs, ctx);
+        assertTrue(mergedAggs instanceof DoubleRareTerms);
+        List<DoubleTerms.Bucket> buckets = ((DoubleRareTerms)
mergedAggs).getBuckets(); + assertThat(buckets.size(), equalTo(1)); + assertThat(buckets.get(0).getKeyAsString(), equalTo("1.0")); + } + + + public void testGlobalAggregationWithScore() throws IOException { + try (Directory directory = newDirectory()) { + try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { + Document document = new Document(); + document.add(new SortedDocValuesField("keyword", new BytesRef("a"))); + indexWriter.addDocument(document); + document = new Document(); + document.add(new SortedDocValuesField("keyword", new BytesRef("c"))); + indexWriter.addDocument(document); + document = new Document(); + document.add(new SortedDocValuesField("keyword", new BytesRef("e"))); + indexWriter.addDocument(document); + try (IndexReader indexReader = maybeWrapReaderEs(indexWriter.getReader())) { + IndexSearcher indexSearcher = newIndexSearcher(indexReader); + Aggregator.SubAggCollectionMode collectionMode = randomFrom(Aggregator.SubAggCollectionMode.values()); + GlobalAggregationBuilder globalBuilder = new GlobalAggregationBuilder("global") + .subAggregation( + new RareTermsAggregationBuilder("terms", ValueType.STRING) + .field("keyword") + .subAggregation( + new RareTermsAggregationBuilder("sub_terms", ValueType.STRING) + .field("keyword") + .subAggregation( + new TopHitsAggregationBuilder("top_hits") + .storedField("_none_") + ) + ) + ); + + MappedFieldType fieldType = new KeywordFieldMapper.KeywordFieldType(); + fieldType.setName("keyword"); + fieldType.setHasDocValues(true); + + InternalGlobal result = searchAndReduce(indexSearcher, new MatchAllDocsQuery(), globalBuilder, fieldType); + InternalMultiBucketAggregation terms = result.getAggregations().get("terms"); + assertThat(terms.getBuckets().size(), equalTo(3)); + for (MultiBucketsAggregation.Bucket bucket : terms.getBuckets()) { + InternalMultiBucketAggregation subTerms = bucket.getAggregations().get("sub_terms"); + assertThat(subTerms.getBuckets().size(), equalTo(1)); + MultiBucketsAggregation.Bucket subBucket = subTerms.getBuckets().get(0); + InternalTopHits topHits = subBucket.getAggregations().get("top_hits"); + assertThat(topHits.getHits().getHits().length, equalTo(1)); + for (SearchHit hit : topHits.getHits()) { + assertThat(hit.getScore(), greaterThan(0f)); + } + } + } + } + } + } + + public void testWithNestedAggregations() throws IOException { + try (Directory directory = newDirectory()) { + try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { + for (int i = 0; i < 10; i++) { + int[] nestedValues = new int[i]; + for (int j = 0; j < i; j++) { + nestedValues[j] = j; + } + indexWriter.addDocuments(generateDocsWithNested(Integer.toString(i), i, nestedValues)); + } + indexWriter.commit(); + + NestedAggregationBuilder nested = new NestedAggregationBuilder("nested", "nested_object") + .subAggregation(new RareTermsAggregationBuilder("terms", ValueType.LONG) + .field("nested_value") + .maxDocCount(1) + ); + MappedFieldType fieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG); + fieldType.setHasDocValues(true); + fieldType.setName("nested_value"); + try (IndexReader indexReader = wrap(DirectoryReader.open(directory))) { + InternalNested result = searchAndReduce(newIndexSearcher(indexReader), + // match root document only + new DocValuesFieldExistsQuery(PRIMARY_TERM_NAME), nested, fieldType); + InternalMultiBucketAggregation terms = result.getAggregations().get("terms"); + assertThat(terms.getBuckets().size(), equalTo(1)); + 
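+                    // root doc "i" carries nested values 0..i-1, so nested_value v occurs under
+                    // (9 - v) root docs; only v=8 appears once and survives maxDocCount(1)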
+                    assertThat(terms.getBuckets().get(0).getKeyAsString(), equalTo("8"));
+                }
+
+            }
+        }
+    }
+
+    public void testWithNestedScoringAggregations() throws IOException {
+        try (Directory directory = newDirectory()) {
+            try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
+                for (int i = 0; i < 10; i++) {
+                    int[] nestedValues = new int[i];
+                    for (int j = 0; j < i; j++) {
+                        nestedValues[j] = j;
+                    }
+                    indexWriter.addDocuments(generateDocsWithNested(Integer.toString(i), i, nestedValues));
+                }
+                indexWriter.commit();
+                for (boolean withScore : new boolean[]{true, false}) {
+                    NestedAggregationBuilder nested = new NestedAggregationBuilder("nested", "nested_object")
+                        .subAggregation(new RareTermsAggregationBuilder("terms", ValueType.LONG)
+                            .field("nested_value")
+                            .maxDocCount(2)
+                            .subAggregation(
+                                new TopHitsAggregationBuilder("top_hits")
+                                    .sort(withScore ? new ScoreSortBuilder() : new FieldSortBuilder("_doc"))
+                                    .storedField("_none_")
+                            )
+                        );
+                    MappedFieldType fieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG);
+                    fieldType.setHasDocValues(true);
+                    fieldType.setName("nested_value");
+                    try (IndexReader indexReader = wrap(DirectoryReader.open(directory))) {
+
+                        if (withScore) {
+
+                            IllegalStateException e = expectThrows(IllegalStateException.class,
+                                () -> searchAndReduce(newIndexSearcher(indexReader),
+                                    // match root document only
+                                    new DocValuesFieldExistsQuery(PRIMARY_TERM_NAME), nested, fieldType));
+                            assertThat(e.getMessage(), equalTo("RareTerms agg [terms] is the child of the nested agg [nested], " +
+                                "and also has a scoring child agg [top_hits]. This combination is not supported because it requires " +
+                                "executing in [depth_first] mode, which the RareTerms agg cannot do."));
+                        } else {
+                            InternalNested result = searchAndReduce(newIndexSearcher(indexReader),
+                                // match root document only
+                                new DocValuesFieldExistsQuery(PRIMARY_TERM_NAME), nested, fieldType);
+                            InternalMultiBucketAggregation terms = result.getAggregations().get("terms");
+                            assertThat(terms.getBuckets().size(), equalTo(2));
+                            for (MultiBucketsAggregation.Bucket bucket : terms.getBuckets()) {
+                                InternalTopHits topHits = bucket.getAggregations().get("top_hits");
+                                assertThat(topHits.getHits().totalHits, equalTo(1L));
+                                assertThat(topHits.getHits().getMaxScore(), equalTo(Float.NaN));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    private final SeqNoFieldMapper.SequenceIDFields sequenceIDFields = SeqNoFieldMapper.SequenceIDFields.emptySeqID();
+    private List<Document> generateDocsWithNested(String id, int value, int[] nestedValues) {
+        List<Document> documents = new ArrayList<>();
+
+        for (int nestedValue : nestedValues) {
+            Document document = new Document();
+            document.add(new Field(IdFieldMapper.NAME, Uid.encodeId(id), IdFieldMapper.Defaults.NESTED_FIELD_TYPE));
+            document.add(new Field(TypeFieldMapper.NAME, "__nested_object", TypeFieldMapper.Defaults.FIELD_TYPE));
+            document.add(new SortedNumericDocValuesField("nested_value", nestedValue));
+            documents.add(document);
+        }
+
+        Document document = new Document();
+        document.add(new Field(IdFieldMapper.NAME, Uid.encodeId(id), IdFieldMapper.Defaults.FIELD_TYPE));
+        document.add(new Field(TypeFieldMapper.NAME, "docs", TypeFieldMapper.Defaults.FIELD_TYPE));
+        document.add(new SortedNumericDocValuesField("value", value));
+        document.add(sequenceIDFields.primaryTerm);
+        documents.add(document);
+
+        return documents;
+    }
+
+    private InternalAggregation buildInternalAggregation(RareTermsAggregationBuilder builder, MappedFieldType fieldType,
+                                                         IndexSearcher
+                                                         searcher) throws IOException {
+        AbstractRareTermsAggregator aggregator = createAggregator(builder, searcher, fieldType);
+        aggregator.preCollection();
+        searcher.search(new MatchAllDocsQuery(), aggregator);
+        aggregator.postCollection();
+        return aggregator.buildAggregation(0L);
+    }
+
+    private void testSearchCase(Query query, List<Long> dataset,
+                                Consumer<RareTermsAggregationBuilder> configure,
+                                Consumer<InternalMappedRareTerms> verify, ValueType valueType) throws IOException {
+        executeTestCase(false, query, dataset, configure, verify, valueType);
+    }
+
+    private void testSearchAndReduceCase(Query query, List<Long> dataset,
+                                         Consumer<RareTermsAggregationBuilder> configure,
+                                         Consumer<InternalMappedRareTerms> verify, ValueType valueType) throws IOException {
+        executeTestCase(true, query, dataset, configure, verify, valueType);
+    }
+
+    private void testBothCases(Query query, List<Long> dataset,
+                               Consumer<RareTermsAggregationBuilder> configure,
+                               Consumer<InternalMappedRareTerms> verify, ValueType valueType) throws IOException {
+        testSearchCase(query, dataset, configure, verify, valueType);
+        testSearchAndReduceCase(query, dataset, configure, verify, valueType);
+    }
+
+    @Override
+    protected IndexSettings createIndexSettings() {
+        Settings nodeSettings = Settings.builder()
+            .put("search.max_buckets", 100000).build();
+        return new IndexSettings(
+            IndexMetaData.builder("_index").settings(Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT))
+                .numberOfShards(1)
+                .numberOfReplicas(0)
+                .creationDate(System.currentTimeMillis())
+                .build(),
+            nodeSettings
+        );
+    }
+
+    private void executeTestCase(boolean reduced, Query query, List<Long> dataset,
+                                 Consumer<RareTermsAggregationBuilder> configure,
+                                 Consumer<InternalMappedRareTerms> verify, ValueType valueType) throws IOException {
+
+        try (Directory directory = newDirectory()) {
+            try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) {
+                Document document = new Document();
+                for (Long value : dataset) {
+                    if (frequently()) {
+                        indexWriter.commit();
+                    }
+
+                    document.add(new SortedNumericDocValuesField(LONG_FIELD, value));
+                    document.add(new LongPoint(LONG_FIELD, value));
+                    document.add(new SortedSetDocValuesField(KEYWORD_FIELD, new BytesRef(Long.toString(value))));
+                    document.add(new SortedNumericDocValuesField(DOUBLE_FIELD, Double.doubleToRawLongBits((double) value)));
+                    indexWriter.addDocument(document);
+                    document.clear();
+                }
+            }
+
+            try (IndexReader indexReader = DirectoryReader.open(directory)) {
+                IndexSearcher indexSearcher = newIndexSearcher(indexReader);
+
+                RareTermsAggregationBuilder aggregationBuilder = new RareTermsAggregationBuilder("_name", valueType);
+                if (configure != null) {
+                    configure.accept(aggregationBuilder);
+                }
+
+                MappedFieldType keywordFieldType = new KeywordFieldMapper.KeywordFieldType();
+                keywordFieldType.setName(KEYWORD_FIELD);
+                keywordFieldType.setHasDocValues(true);
+
+                MappedFieldType longFieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG);
+                longFieldType.setName(LONG_FIELD);
+                longFieldType.setHasDocValues(true);
+
+                MappedFieldType doubleFieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.DOUBLE);
+                doubleFieldType.setName(DOUBLE_FIELD);
+                doubleFieldType.setHasDocValues(true);
+
+                InternalMappedRareTerms rareTerms;
+                if (reduced) {
+                    rareTerms = searchAndReduce(indexSearcher, query, aggregationBuilder, keywordFieldType, longFieldType, doubleFieldType);
+                } else {
+                    rareTerms = search(indexSearcher, query, aggregationBuilder, keywordFieldType, longFieldType, doubleFieldType);
+                }
+                verify.accept(rareTerms);
+            }
+        }
+    }
+
+    @Override
+    public void doAssertReducedMultiBucketConsumer(Aggregation agg,
MultiBucketConsumerService.MultiBucketConsumer bucketConsumer) { + /* + * No-op. + * + * This is used in the aggregator tests to check that after a reduction, we have the correct number of buckets. + * This can be done during incremental reduces, and the final reduce. Unfortunately, the number of buckets + * can _decrease_ during runtime as values are reduced together (e.g. 1 count on each shard, but when + * reduced it becomes 2 and is greater than the threshold). + * + * Because the incremental reduction test picks random subsets to reduce together, it's impossible + * to predict how the buckets will end up, and so this assertion will fail. + * + * If we want to put this assertion back in, we'll need this test to override the incremental reduce + * portion so that we can deterministically know which shards are being reduced together and which + * buckets we should have left after each reduction. + */ + } + + +} diff --git a/test/framework/src/main/java/org/elasticsearch/search/aggregations/AggregatorTestCase.java b/test/framework/src/main/java/org/elasticsearch/search/aggregations/AggregatorTestCase.java index cdf4a68b52f18..e790c53f8dad9 100644 --- a/test/framework/src/main/java/org/elasticsearch/search/aggregations/AggregatorTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/search/aggregations/AggregatorTestCase.java @@ -399,7 +399,7 @@ protected A searchAndReduc new InternalAggregation.ReduceContext(root.context().bigArrays(), null, reduceBucketConsumer, false); A reduced = (A) aggs.get(0).doReduce(toReduce, context); - InternalAggregationTestCase.assertMultiBucketConsumer(reduced, reduceBucketConsumer); + doAssertReducedMultiBucketConsumer(reduced, reduceBucketConsumer); aggs = new ArrayList<>(aggs.subList(r, toReduceSize)); aggs.add(reduced); } @@ -415,12 +415,16 @@ protected A searchAndReduc internalAgg = (A) pipelineAggregator.reduce(internalAgg, context); } } - InternalAggregationTestCase.assertMultiBucketConsumer(internalAgg, reduceBucketConsumer); + doAssertReducedMultiBucketConsumer(internalAgg, reduceBucketConsumer); return internalAgg; } } + protected void doAssertReducedMultiBucketConsumer(Aggregation agg, MultiBucketConsumerService.MultiBucketConsumer bucketConsumer) { + InternalAggregationTestCase.assertMultiBucketConsumer(agg, bucketConsumer); + } + private static class ShardSearcher extends IndexSearcher { private final List ctx; From 2f3a9598749e9d71a4653b788d537eadda4cfb82 Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Tue, 22 Jan 2019 16:04:02 -0500 Subject: [PATCH 02/25] Make bloom filter less bad (cleanup legacy cruft) --- .../common/util/BloomFilter.java | 194 ++++++++---------- 1 file changed, 84 insertions(+), 110 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/common/util/BloomFilter.java b/server/src/main/java/org/elasticsearch/common/util/BloomFilter.java index 64e24abb78c4b..1cb68ced8a899 100644 --- a/server/src/main/java/org/elasticsearch/common/util/BloomFilter.java +++ b/server/src/main/java/org/elasticsearch/common/util/BloomFilter.java @@ -25,7 +25,6 @@ import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Writeable; -import org.elasticsearch.common.lease.Releasable; import java.io.IOException; import java.util.Arrays; @@ -34,25 +33,26 @@ /** * A bloom filter. Inspired by Guava bloom filter implementation though with some optimizations. 
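 * The filter is Writeable, and two filters can be combined with merge(), so filters
 * built independently (e.g. one per shard) can be unioned together during reduce.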
 */
-public class BloomFilter implements Writeable, Releasable {
-
-    // Some numbers:
-    // 10k =0.001: 140.4kb , 10 Hashes
-    // 10k =0.01 : 93.6kb , 6 Hashes
-    // 100k=0.01 : 936.0kb , 6 Hashes
-    // 100k=0.03 : 712.7kb , 5 Hashes
-    // 500k=0.01 : 4.5mb , 6 Hashes
-    // 500k=0.03 : 3.4mb , 5 Hashes
-    // 500k=0.05 : 2.9mb , 4 Hashes
-    // 1m=0.01 : 9.1mb , 6 Hashes
-    // 1m=0.03 : 6.9mb , 5 Hashes
-    // 1m=0.05 : 5.9mb , 4 Hashes
-    // 5m=0.01 : 45.7mb , 6 Hashes
-    // 5m=0.03 : 34.8mb , 5 Hashes
-    // 5m=0.05 : 29.7mb , 4 Hashes
-    // 50m=0.01 : 457.0mb , 6 Hashes
-    // 50m=0.03 : 297.3mb , 4 Hashes
-    // 50m=0.10 : 228.5mb , 3 Hashes
+public class BloomFilter implements Writeable {
+
+    // Some anecdotal sizing numbers:
+    // expected insertions, false positive probability, bloom size, num hashes
+    // 10k, 0.001, 140.4kb, 10 Hashes
+    // 10k, 0.01, 93.6kb, 6 Hashes
+    // 100k, 0.01, 936.0kb, 6 Hashes
+    // 100k, 0.03, 712.7kb, 5 Hashes
+    // 500k, 0.01, 4.5mb, 6 Hashes
+    // 500k, 0.03, 3.4mb, 5 Hashes
+    // 500k, 0.05, 2.9mb, 4 Hashes
+    // 1m, 0.01, 9.1mb, 6 Hashes
+    // 1m, 0.03, 6.9mb, 5 Hashes
+    // 1m, 0.05, 5.9mb, 4 Hashes
+    // 5m, 0.01, 45.7mb, 6 Hashes
+    // 5m, 0.03, 34.8mb, 5 Hashes
+    // 5m, 0.05, 29.7mb, 4 Hashes
+    // 50m, 0.01, 457.0mb, 6 Hashes
+    // 50m, 0.03, 348.0mb, 5 Hashes
+    // 50m, 0.10, 228.5mb, 3 Hashes

     /**
      * The bit set of the BloomFilter (not necessarily power of 2!)
@@ -64,10 +64,8 @@ public class BloomFilter implements Writeable, Releasable {
      */
     private final int numHashFunctions;

-    private final Hashing hashing = Hashing.V1;
-
     /**
-     * Creates a bloom filter based on the with the expected number
+     * Creates a bloom filter with the expected number
      * of insertions and expected false positive probability.
      *
      * @param expectedInsertions the number of expected insertions to the constructed
@@ -92,7 +90,7 @@ public BloomFilter(int expectedInsertions, double fpp, int numHashFunctions) {
         /*
          * TODO(user): Put a warning in the javadoc about tiny fpp values,
          * since the resulting size is proportional to -log(p), but there is not
-         * much of a point after all, e.g. optimalM(1000, 0.0000000000000001) = 76680
+         * much of a point after all, e.g. optimalNumOfBits(1000, 0.0000000000000001) = 76680
          * which is less than 10kb. Who cares!
*/ long numBits = optimalNumOfBits(expectedInsertions, fpp); @@ -111,13 +109,13 @@ public BloomFilter(int expectedInsertions, double fpp, int numHashFunctions) { } public BloomFilter(StreamInput in) throws IOException { - int numLongs = in.readVInt(); - long[] data = new long[numLongs]; - for (int i = 0; i < numLongs; i++) { - data[i] = in.readLong(); - } + this.bits = new BitArray(in); this.numHashFunctions = in.readVInt(); - this.bits = new BitArray(data); + } + + public void writeTo(StreamOutput out) throws IOException { + bits.writeTo(out); + out.writeVInt(numHashFunctions); } public void merge(BloomFilter other) { @@ -125,11 +123,11 @@ public void merge(BloomFilter other) { } public boolean put(BytesRef value) { - return hashing.put(value, numHashFunctions, bits); + return Hashing.put(value, numHashFunctions, bits); } public boolean put(byte[] value) { - return hashing.put(value, 0, value.length, numHashFunctions, bits); + return Hashing.put(value, 0, value.length, numHashFunctions, bits); } public boolean put(long value) { @@ -137,11 +135,11 @@ public boolean put(long value) { } public boolean mightContain(BytesRef value) { - return hashing.mightContain(value, numHashFunctions, bits); + return Hashing.mightContain(value, numHashFunctions, bits); } - public boolean mightContain(byte[] value) { - return hashing.mightContain(value, 0, value.length, numHashFunctions, bits); + private boolean mightContain(byte[] value) { + return Hashing.mightContain(value, 0, value.length, numHashFunctions, bits); } public boolean mightContain(long value) { @@ -172,22 +170,9 @@ public boolean equals(Object other) { final BloomFilter that = (BloomFilter) other; return Objects.equals(this.bits, that.bits) - && Objects.equals(this.hashing, that.hashing) && Objects.equals(this.numHashFunctions, that.numHashFunctions); } - public void writeTo(StreamOutput out) throws IOException { - out.writeVInt(bits.data.length); - for (long l : bits.data) { - out.writeLong(l); - } - out.writeVInt(numHashFunctions); - } - - @Override - public void close() { - - } /* * Cheat sheet: @@ -234,13 +219,19 @@ private static long optimalNumOfBits(long n, double p) { } // Note: We use this instead of java.util.BitSet because we need access to the long[] data field - static final class BitArray { - final long[] data; - final long bitSize; - long bitCount; + static final class BitArray implements Writeable { + private final long[] data; + private final long bitSize; + private long bitCount; BitArray(long bits) { - this(new long[size(bits)]); + this.data = new long[size(bits)]; + long bitCount = 0; + for (long value : data) { + bitCount += Long.bitCount(value); + } + this.bitCount = bitCount; + this.bitSize = data.length * Long.SIZE; } private static int size(long bits) { @@ -249,15 +240,17 @@ private static int size(long bits) { return Math.toIntExact(remainder == 0 ? 
quotient : 1 + quotient); } - // Used by serialization - BitArray(long[] data) { - this.data = data; - long bitCount = 0; - for (long value : data) { - bitCount += Long.bitCount(value); - } - this.bitCount = bitCount; - this.bitSize = data.length * Long.SIZE; + BitArray(StreamInput in) throws IOException { + this.data = in.readVLongArray(); + this.bitSize = in.readVLong(); + this.bitCount = in.readVLong(); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeVLongArray(data); + out.writeVLong(bitSize); + out.writeVLong(bitCount); } /** @@ -290,10 +283,6 @@ long bitCount() { return bitCount; } - BitArray copy() { - return new BitArray(data.clone()); - } - /** * Combines the two BitArrays using bitwise OR. */ @@ -319,63 +308,48 @@ public int hashCode() { return Arrays.hashCode(data); } - public long ramBytesUsed() { return Long.BYTES * data.length + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + 16; } } - enum Hashing { - V1() { - @Override - protected boolean put(byte[] bytes, int offset, int length, int numHashFunctions, BitArray bits) { - long bitSize = bits.bitSize(); - MurmurHash3.Hash128 hash128 = MurmurHash3.hash128(bytes, offset, length, 0, new MurmurHash3.Hash128()); - - boolean bitsChanged = false; - long combinedHash = hash128.h1; - for (int i = 0; i < numHashFunctions; i++) { - // Make the combined hash positive and indexable - bitsChanged |= bits.set((combinedHash & Long.MAX_VALUE) % bitSize); - combinedHash += hash128.h2; - } - return bitsChanged; - } + private static class Hashing { - @Override - protected boolean mightContain(byte[] bytes, int offset, int length, int numHashFunctions, BitArray bits) { - long bitSize = bits.bitSize(); - MurmurHash3.Hash128 hash128 = MurmurHash3.hash128(bytes, offset, length, 0, new MurmurHash3.Hash128()); - - long combinedHash = hash128.h1; - for (int i = 0; i < numHashFunctions; i++) { - // Make the combined hash positive and indexable - if (!bits.get((combinedHash & Long.MAX_VALUE) % bitSize)) { - return false; - } - combinedHash += hash128.h2; - } - return true; - } - - @Override - protected int type() { - return 1; - } - }; - - protected boolean put(BytesRef value, int numHashFunctions, BitArray bits) { + static boolean put(BytesRef value, int numHashFunctions, BitArray bits) { return put(value.bytes, value.offset, value.length, numHashFunctions, bits); } - protected abstract boolean put(byte[] bytes, int offset, int length, int numHashFunctions, BitArray bits); + static boolean put(byte[] bytes, int offset, int length, int numHashFunctions, BitArray bits) { + long bitSize = bits.bitSize(); + MurmurHash3.Hash128 hash128 = MurmurHash3.hash128(bytes, offset, length, 0, new MurmurHash3.Hash128()); - protected boolean mightContain(BytesRef value, int numHashFunctions, BitArray bits) { + boolean bitsChanged = false; + long combinedHash = hash128.h1; + for (int i = 0; i < numHashFunctions; i++) { + // Make the combined hash positive and indexable + bitsChanged |= bits.set((combinedHash & Long.MAX_VALUE) % bitSize); + combinedHash += hash128.h2; + } + return bitsChanged; + } + + static boolean mightContain(BytesRef value, int numHashFunctions, BitArray bits) { return mightContain(value.bytes, value.offset, value.length, numHashFunctions, bits); } - protected abstract boolean mightContain(byte[] bytes, int offset, int length, int numHashFunctions, BitArray bits); + static boolean mightContain(byte[] bytes, int offset, int length, int numHashFunctions, BitArray bits) { + long bitSize = bits.bitSize(); + 
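+        // Same Kirsch-Mitzenmacher double-hashing scheme as put() above: the k probe
+        // positions are derived as (h1 + i*h2) mod bitSize from one 128-bit hash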
MurmurHash3.Hash128 hash128 = MurmurHash3.hash128(bytes, offset, length, 0, new MurmurHash3.Hash128()); - protected abstract int type(); + long combinedHash = hash128.h1; + for (int i = 0; i < numHashFunctions; i++) { + // Make the combined hash positive and indexable + if (!bits.get((combinedHash & Long.MAX_VALUE) % bitSize)) { + return false; + } + combinedHash += hash128.h2; + } + return true; + } } } From ce15588b4e520b856af6ee816a83e8f53148702d Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Tue, 22 Jan 2019 17:14:14 -0500 Subject: [PATCH 03/25] review cleanup --- .../bucket/rare-terms-aggregation.asciidoc | 8 -- .../common/util/BloomFilter.java | 15 ++- .../elasticsearch/search/SearchModule.java | 10 +- .../terms/AbstractRareTermsAggregator.java | 6 +- .../terms/AbstractStringTermsAggregator.java | 2 +- .../bucket/terms/DoubleRareTerms.java | 115 ------------------ .../terms/DoubleRareTermsAggregator.java | 77 ------------ .../bucket/terms/InternalMappedRareTerms.java | 23 ++-- .../bucket/terms/LongRareTerms.java | 29 +---- .../bucket/terms/LongRareTermsAggregator.java | 32 +++-- .../terms/RareTermsAggregatorFactory.java | 8 +- .../bucket/terms/StringRareTerms.java | 5 + .../terms/StringRareTermsAggregator.java | 30 +++-- .../terms/RareTermsAggregatorTests.java | 7 +- 14 files changed, 92 insertions(+), 275 deletions(-) delete mode 100644 server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleRareTerms.java delete mode 100644 server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleRareTermsAggregator.java diff --git a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc index a0a64fef8fe87..6c178863e8f91 100644 --- a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc @@ -285,11 +285,3 @@ GET /_search // CONSOLE <1> Documents without a value in the `tags` field will fall into the same bucket as documents that have the value `N/A`. - - -==== Mixing field types - -WARNING: When aggregating on multiple indices the type of the aggregated field may not be the same in all indices. -Some types are compatible with each other (`integer` and `long` or `float` and `double`) but when the types are a mix -of decimal and non-decimal number the terms aggregation will promote the non-decimal numbers to decimal numbers. -This can result in a loss of precision in the bucket values. 
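An aside on the sizing table in BloomFilter's comments: the figures are reproducible with the usual Guava-style formulas, assuming `optimalNumOfBits` and `optimalNumOfHashFunctions` compute `m = -n * ln(p) / (ln 2)^2` bits and `k = round((m / n) * ln 2)` hashes (with `m / n` as integer division), and reading the table's "kb"/"mb" figures as multiples of 1024 bits. A minimal standalone sketch under those assumptions (the class name is hypothetical, not part of this patch):

import java.util.Locale;

public class BloomSizingSketch {

    // m = -n * ln(p) / (ln 2)^2 : optimal bit count for n insertions at false positive rate p
    static long optimalNumOfBits(long n, double p) {
        return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
    }

    // k = round((m / n) * ln 2) : integer division of m by n reproduces the table's hash counts
    static int optimalNumOfHashFunctions(long n, long m) {
        return Math.max(1, (int) Math.round((m / n) * Math.log(2)));
    }

    public static void main(String[] args) {
        double[][] cases = {{10_000, 0.001}, {10_000, 0.01}, {100_000, 0.01}, {100_000, 0.03}};
        for (double[] c : cases) {
            long n = (long) c[0];
            double p = c[1];
            long m = optimalNumOfBits(n, p);
            // prints: 140.4kb/10, 93.6kb/6, 936.0kb/6, 712.7kb/5 -- matching the table rows
            System.out.printf(Locale.ROOT, "%d @ %s : %.1fkb , %d Hashes%n",
                n, p, m / 1024.0, optimalNumOfHashFunctions(n, m));
        }
    }
}

Since m scales with n times -ln(p), tightening the false positive rate is comparatively cheap: going from p=0.01 to p=0.001 adds about 50% more bits, while doubling the expected insertions doubles the size outright.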
diff --git a/server/src/main/java/org/elasticsearch/common/util/BloomFilter.java b/server/src/main/java/org/elasticsearch/common/util/BloomFilter.java index 1cb68ced8a899..25bb76c4ec772 100644 --- a/server/src/main/java/org/elasticsearch/common/util/BloomFilter.java +++ b/server/src/main/java/org/elasticsearch/common/util/BloomFilter.java @@ -83,7 +83,7 @@ public BloomFilter(int expectedInsertions, double fpp) { * @param fpp the desired false positive probability (must be positive and less than 1.0) * @param numHashFunctions the number of hash functions to use (must be less than or equal to 255) */ - public BloomFilter(int expectedInsertions, double fpp, int numHashFunctions) { + private BloomFilter(int expectedInsertions, double fpp, int numHashFunctions) { if (expectedInsertions == 0) { expectedInsertions = 1; } @@ -108,6 +108,15 @@ public BloomFilter(int expectedInsertions, double fpp, int numHashFunctions) { this.numHashFunctions = numHashFunctions; } + public static BloomFilter EmptyBloomFilter(long numBits, int numHashFunctions) { + return new BloomFilter(numBits, numHashFunctions); + } + + private BloomFilter(long numBits, int numHashFunctions) { + this.bits = new BitArray(numBits); + this.numHashFunctions = numHashFunctions; + } + public BloomFilter(StreamInput in) throws IOException { this.bits = new BitArray(in); this.numHashFunctions = in.readVInt(); @@ -150,6 +159,10 @@ public int getNumHashFunctions() { return this.numHashFunctions; } + public long getNumBits() { + return bits.bitSize(); + } + public long getSizeInBytes() { return bits.ramBytesUsed(); } diff --git a/server/src/main/java/org/elasticsearch/search/SearchModule.java b/server/src/main/java/org/elasticsearch/search/SearchModule.java index 91c22bd401117..73fd827eac529 100644 --- a/server/src/main/java/org/elasticsearch/search/SearchModule.java +++ b/server/src/main/java/org/elasticsearch/search/SearchModule.java @@ -146,7 +146,6 @@ import org.elasticsearch.search.aggregations.bucket.significant.heuristics.ScriptHeuristic; import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic; import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristicParser; -import org.elasticsearch.search.aggregations.bucket.terms.DoubleRareTerms; import org.elasticsearch.search.aggregations.bucket.terms.DoubleTerms; import org.elasticsearch.search.aggregations.bucket.terms.LongRareTerms; import org.elasticsearch.search.aggregations.bucket.terms.LongTerms; @@ -400,11 +399,10 @@ private void registerAggregations(List plugins) { .addResultReader(LongTerms.NAME, LongTerms::new) .addResultReader(DoubleTerms.NAME, DoubleTerms::new)); registerAggregation(new AggregationSpec(RareTermsAggregationBuilder.NAME, RareTermsAggregationBuilder::new, - RareTermsAggregationBuilder::parse) - .addResultReader(StringRareTerms.NAME, StringRareTerms::new) - .addResultReader(UnmappedRareTerms.NAME, UnmappedRareTerms::new) - .addResultReader(LongRareTerms.NAME, LongRareTerms::new) - .addResultReader(DoubleRareTerms.NAME, DoubleRareTerms::new)); + RareTermsAggregationBuilder::parse) + .addResultReader(StringRareTerms.NAME, StringRareTerms::new) + .addResultReader(UnmappedRareTerms.NAME, UnmappedRareTerms::new) + .addResultReader(LongRareTerms.NAME, LongRareTerms::new)); registerAggregation(new AggregationSpec(SignificantTermsAggregationBuilder.NAME, SignificantTermsAggregationBuilder::new, SignificantTermsAggregationBuilder.getParser(significanceHeuristicParserRegistry)) 
.addResultReader(SignificantStringTerms.NAME, SignificantStringTerms::new) diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java index fb92bf63c904e..8c37bb1c0b82e 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java @@ -35,7 +35,7 @@ import java.util.List; import java.util.Map; -public abstract class AbstractRareTermsAggregator +public abstract class AbstractRareTermsAggregator extends DeferableBucketAggregator { // TODO review question: What to set this at? @@ -96,7 +96,7 @@ public DeferringBucketCollector getDeferringCollector() { protected void doPostCollection() { // Make sure we do one final GC to clean up any deleted ords // that may be lingering (but still below GC threshold) - gcDeletedEntries(); + gcDeletedEntries(null); } private String subAggsNeedScore() { @@ -118,5 +118,5 @@ private String descendsFromNestedAggregator(Aggregator parent) { return null; } - protected abstract void gcDeletedEntries(); + protected abstract void gcDeletedEntries(Long numDeleted); } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractStringTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractStringTermsAggregator.java index 78bdd0b8c7155..edbf2aef25fec 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractStringTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractStringTermsAggregator.java @@ -33,7 +33,7 @@ import static java.util.Collections.emptyList; -public abstract class AbstractStringTermsAggregator extends TermsAggregator { +abstract class AbstractStringTermsAggregator extends TermsAggregator { protected final boolean showTermDocCountError; diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleRareTerms.java deleted file mode 100644 index d3f964246c4af..0000000000000 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleRareTerms.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.elasticsearch.search.aggregations.bucket.terms; - -import org.apache.lucene.util.NumericUtils; -import org.elasticsearch.common.io.stream.StreamInput; -import org.elasticsearch.common.util.BloomFilter; -import org.elasticsearch.search.DocValueFormat; -import org.elasticsearch.search.aggregations.BucketOrder; -import org.elasticsearch.search.aggregations.InternalAggregation; -import org.elasticsearch.search.aggregations.InternalAggregations; -import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; - -/** - * Result of the RareTerms aggregation when the field is some kind of decimal number like a float, double, or distance. - */ -public class DoubleRareTerms extends InternalMappedRareTerms { - public static final String NAME = "drareterms"; - - DoubleRareTerms(String name, BucketOrder order, List pipelineAggregators, - Map metaData, DocValueFormat format, - List buckets, long maxDocCount, BloomFilter bloom) { - super(name, order, pipelineAggregators, metaData, format, buckets, maxDocCount, bloom); - } - - /** - * Read from a stream. - */ - public DoubleRareTerms(StreamInput in) throws IOException { - super(in, DoubleTerms.Bucket::new); - } - - @Override - public String getWriteableName() { - return NAME; - } - - @Override - public DoubleRareTerms create(List buckets) { - return new DoubleRareTerms(name, order, pipelineAggregators(), metaData, format, buckets, maxDocCount, bloom); - } - - @Override - public DoubleTerms.Bucket createBucket(InternalAggregations aggregations, DoubleTerms.Bucket prototype) { - return new DoubleTerms.Bucket((double)prototype.getKey(), prototype.docCount, aggregations, - prototype.showDocCountError, prototype.docCountError, prototype.format); - } - - @Override - protected DoubleRareTerms create(String name, List buckets, long docCountError, long otherDocCount) { - return new DoubleRareTerms(name, order, pipelineAggregators(), getMetaData(), format, - buckets, maxDocCount, bloom); - } - - @Override - protected DoubleTerms.Bucket[] createBucketsArray(int size) { - return new DoubleTerms.Bucket[size]; - } - - @Override - public InternalAggregation doReduce(List aggregations, ReduceContext reduceContext) { - boolean promoteToDouble = false; - for (InternalAggregation agg : aggregations) { - if (agg instanceof LongRareTerms && ((LongRareTerms) agg).format == DocValueFormat.RAW) { - /* - * this agg mixes longs and doubles, we must promote longs to doubles to make the internal aggs - * compatible - */ - promoteToDouble = true; - break; - } - } - if (promoteToDouble == false) { - return super.doReduce(aggregations, reduceContext); - } - List newAggs = new ArrayList<>(aggregations.size()); - for (InternalAggregation agg : aggregations) { - if (agg instanceof LongRareTerms) { - DoubleRareTerms dTerms = LongRareTerms.convertLongRareTermsToDouble((LongRareTerms) agg, format); - newAggs.add(dTerms); - } else if (agg instanceof DoubleRareTerms) { - newAggs.add(agg); - } else { - throw new IllegalStateException("Encountered a non-RareTerms numeric agg when reducing RareTerms."); - } - } - return newAggs.get(0).doReduce(newAggs, reduceContext); - } - - @Override - public boolean containsTerm(BloomFilter bloom, DoubleTerms.Bucket bucket) { - return bloom.mightContain(NumericUtils.doubleToSortableLong((double) bucket.getKey())); - } -} diff --git 
a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleRareTermsAggregator.java deleted file mode 100644 index f1c431c26d39c..0000000000000 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleRareTermsAggregator.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.elasticsearch.search.aggregations.bucket.terms; - -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.SortedNumericDocValues; -import org.apache.lucene.util.NumericUtils; -import org.elasticsearch.index.fielddata.FieldData; -import org.elasticsearch.search.DocValueFormat; -import org.elasticsearch.search.aggregations.Aggregator; -import org.elasticsearch.search.aggregations.AggregatorFactories; -import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; -import org.elasticsearch.search.aggregations.support.ValuesSource; -import org.elasticsearch.search.aggregations.support.ValuesSource.Numeric; -import org.elasticsearch.search.internal.SearchContext; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -public class DoubleRareTermsAggregator extends LongRareTermsAggregator { - - DoubleRareTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Numeric valuesSource, - DocValueFormat format, SearchContext aggregationContext, Aggregator parent, - IncludeExclude.LongFilter longFilter, int maxDocCount, List pipelineAggregators, - Map metaData) throws IOException { - super(name, factories, valuesSource, format, aggregationContext, parent, - longFilter, maxDocCount, pipelineAggregators, metaData); - } - - @Override - protected SortedNumericDocValues getValues(Numeric valuesSource, LeafReaderContext ctx) throws IOException { - return FieldData.toSortableLongBits(valuesSource.doubleValues(ctx)); - } - - @Override - public DoubleRareTerms buildAggregation(long owningBucketOrdinal) throws IOException { - final LongRareTerms terms = (LongRareTerms) super.buildAggregation(owningBucketOrdinal); - return convertToDouble(terms); - } - - @Override - public DoubleRareTerms buildEmptyAggregation() { - final LongRareTerms terms = (LongRareTerms) super.buildEmptyAggregation(); - return convertToDouble(terms); - } - - private static DoubleRareTerms convertToDouble(LongRareTerms terms) { - List buckets = terms.buckets.stream().map(DoubleRareTermsAggregator::convertToDouble) - .collect(Collectors.toList()); - return new DoubleRareTerms(terms.getName(), terms.order, terms.pipelineAggregators(), - terms.getMetaData(), terms.format, buckets, terms.getMaxDocCount(), terms.getBloom()); - } - 
-    private static DoubleTerms.Bucket convertToDouble(LongTerms.Bucket bucket) {
-        double value = NumericUtils.sortableLongToDouble(bucket.term);
-        return new DoubleTerms.Bucket(value, bucket.docCount, bucket.aggregations, bucket.showDocCountError, bucket.docCountError,
-            bucket.format);
-    }
-}
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java
index e42f4d0fda391..743170d0ce6a1 100644
--- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java
+++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java
@@ -80,7 +80,6 @@ public InternalAggregation doReduce(List aggregations, Redu
         BloomFilter bloomFilter = null;

         for (InternalAggregation aggregation : aggregations) {
-
             // Unmapped rare terms don't have a bloom filter so we'll skip all this work
             // and save some type casting headaches later.
             if (aggregation.isMapped() == false) {
@@ -107,26 +106,30 @@ public InternalAggregation doReduce(List aggregations, Redu
             }

             if (bloomFilter == null) {
-                bloomFilter = ((InternalMappedRareTerms)aggregation).bloom;
+                bloomFilter = BloomFilter.EmptyBloomFilter(((InternalMappedRareTerms)aggregation).getBloom().getNumBits(),
+                    ((InternalMappedRareTerms)aggregation).getBloom().getNumHashFunctions());
             } else {
-                bloomFilter.merge(((InternalMappedRareTerms)aggregation).bloom);
+                bloomFilter.merge(((InternalMappedRareTerms)aggregation).getBloom());
             }
         }

-        // Always return all results, so just proactively size the array to num buckets
-        final int size = buckets.size();
-        final List<B> rare = new ArrayList<>(size);
+        final List<B> rare = new ArrayList<>();
         for (List<B> sameTermBuckets : buckets.values()) {
             final B b = sameTermBuckets.get(0).reduce(sameTermBuckets, reduceContext);
-            // Only prune if this is the final reduction, otherwise we may remove a term that shows
-            // up in a later incremental reduction and looks "rare" even though it isn't.
-            if (reduceContext.isFinalReduce() == false || (b.getDocCount() <= maxDocCount && containsTerm(bloom, b) == false)) {
+            if ((b.getDocCount() <= maxDocCount && containsTerm(bloomFilter, b) == false)) {
                 rare.add(b);
+                reduceContext.consumeBucketsAndMaybeBreak(1);
+            } else if (b.getDocCount() > maxDocCount) {
+                // this term has gone over threshold while merging, so add it to the bloom.
+ // Note this may happen during incremental reductions too + addToBloom(bloomFilter, b); } } CollectionUtil.introSort(rare, order.comparator(null)); return create(name, rare, 0, 0); } - public abstract boolean containsTerm(BloomFilter bloom, B b); + public abstract boolean containsTerm(BloomFilter bloom, B bucket); + + public abstract void addToBloom(BloomFilter bloom, B bucket); } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java index ac576c3e4b294..30393e92668e1 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java @@ -79,36 +79,13 @@ protected LongTerms.Bucket[] createBucketsArray(int size) { return new LongTerms.Bucket[size]; } - @Override - public InternalAggregation doReduce(List aggregations, ReduceContext reduceContext) { - for (InternalAggregation agg : aggregations) { - if (agg instanceof DoubleRareTerms) { - return agg.doReduce(aggregations, reduceContext); - } - } - return super.doReduce(aggregations, reduceContext); - } - @Override public boolean containsTerm(BloomFilter bloom, LongTerms.Bucket bucket) { return bloom.mightContain((long) bucket.getKey()); } - /** - * Converts a {@link LongRareTerms} into a {@link DoubleRareTerms}, returning the - * value of the specified long terms as doubles. - */ - static DoubleRareTerms convertLongRareTermsToDouble(LongRareTerms longTerms, DocValueFormat decimalFormat) { - List buckets = longTerms.getBuckets(); - List newBuckets = new ArrayList<>(); - for (Terms.Bucket bucket : buckets) { - newBuckets.add(new DoubleTerms.Bucket(bucket.getKeyAsNumber().doubleValue(), - bucket.getDocCount(), (InternalAggregations) bucket.getAggregations(), longTerms.showTermDocCountError, - longTerms.showTermDocCountError ? 
bucket.getDocCountError() : 0, decimalFormat));
-        }
-        return new DoubleRareTerms(longTerms.getName(), longTerms.order,
-            longTerms.pipelineAggregators(),
-            longTerms.metaData, longTerms.format,
-            newBuckets, longTerms.getMaxDocCount(), longTerms.getBloom());
+    @Override
+    public void addToBloom(BloomFilter bloom, LongTerms.Bucket bucket) {
+        bloom.put((long) bucket.getKey());
     }
 }
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java
index 5be6d6044cef8..5c0dd289b9e58 100644
--- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java
+++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java
@@ -96,8 +96,8 @@ public void collect(int docId, long owningBucketOrdinal) throws IOException {
                             map.put(val, 1L);
                             long bucketOrdinal = bucketOrds.add(val);
                             if (bucketOrdinal < 0) { // already seen
-                                bucketOrdinal = - 1 - bucketOrdinal;
-                                collectExistingBucket(subCollectors, docId, bucketOrdinal);
+                                throw new IllegalStateException("Term count is zero, but an ordinal for this " +
+                                    "term has already been recorded");
                             } else {
                                 collectBucket(subCollectors, docId, bucketOrdinal);
                             }
@@ -108,6 +108,14 @@ public void collect(int docId, long owningBucketOrdinal) throws IOException {
                             // TODO if we only need maxDocCount==1, we could specialize
                             // and use a bitset instead of a counter scheme
                             map.put(val, termCount + 1);
+                            long bucketOrdinal = bucketOrds.add(val);
+                            if (bucketOrdinal < 0) {
+                                bucketOrdinal = - 1 - bucketOrdinal;
+                                collectExistingBucket(subCollectors, docId, bucketOrdinal);
+                            } else {
+                                throw new IllegalStateException("Term has been seen before, but we have not recorded " +
+                                    "an ordinal yet.");
+                            }
                         } else {
                             // Otherwise we've breached the threshold, remove from
                             // the map and add to the bloom filter
@@ -116,7 +124,8 @@ public void collect(int docId, long owningBucketOrdinal) throws IOException {
                             numDeleted += 1;

                             if (numDeleted > GC_THRESHOLD) {
-                                gcDeletedEntries();
+                                gcDeletedEntries(numDeleted);
+                                numDeleted = 0;
                             }
                         }
                     }
@@ -130,13 +139,12 @@ public void collect(int docId, long owningBucketOrdinal) throws IOException {
         };
     }

-    protected void gcDeletedEntries() {
-        boolean hasDeletedEntry = false;
+    protected void gcDeletedEntries(Long numDeleted) {
+        long deletionCount = 0;
         LongHash newBucketOrds = new LongHash(1, context.bigArrays());
         try (LongHash oldBucketOrds = bucketOrds) {
             long[] mergeMap = new long[(int) oldBucketOrds.size()];
-
             for (int i = 0; i < oldBucketOrds.size(); i++) {
                 long oldKey = oldBucketOrds.get(i);
                 long newBucketOrd = -1;
@@ -146,13 +154,19 @@ protected void gcDeletedEntries() {
                     newBucketOrd = newBucketOrds.add(oldKey);
                 } else {
                     // Make a note when one of the ords has been deleted
-                    hasDeletedEntry = true;
+                    deletionCount += 1;
                 }
                 mergeMap[i] = newBucketOrd;
             }
+
+            if (numDeleted != null && deletionCount != numDeleted) {
+                throw new IllegalStateException("Expected to prune [" + numDeleted + "] terms, but [" + deletionCount +
+                    "] were removed instead");
+            }
+
             // Only merge/delete the ordinals if we have actually deleted one,
             // to save on some redundant work
-            if (hasDeletedEntry) {
+            if (deletionCount > 0) {
                 mergeBuckets(mergeMap, newBucketOrds.size());
                 if (deferringCollector != null) {
                     deferringCollector.mergeBuckets(mergeMap);
@@ -199,6 +213,6 @@ public InternalAggregation buildEmptyAggregation() {

     @Override
     public void doClose() {
-
Releasables.close(bloom, bucketOrds); + Releasables.close(bucketOrds); } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorFactory.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorFactory.java index 02dfaabe2420d..a1918150b49bc 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorFactory.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorFactory.java @@ -93,11 +93,7 @@ protected Aggregator doCreateInternal(ValuesSource valuesSource, Aggregator pare if (valuesSource instanceof ValuesSource.Numeric) { IncludeExclude.LongFilter longFilter = null; if (((ValuesSource.Numeric) valuesSource).isFloatingPoint()) { - if (includeExclude != null) { - longFilter = includeExclude.convertToDoubleFilter(); - } - return new DoubleRareTermsAggregator(name, factories, (ValuesSource.Numeric) valuesSource, - config.format(), context, parent, longFilter, maxDocCount, pipelineAggregators, metaData); + throw new AggregationExecutionException("RareTerms aggregation does not support floating point fields."); } if (includeExclude != null) { longFilter = includeExclude.convertToLongFilter(config.format()); @@ -106,7 +102,7 @@ protected Aggregator doCreateInternal(ValuesSource valuesSource, Aggregator pare context, parent, longFilter, maxDocCount, pipelineAggregators, metaData); } - throw new AggregationExecutionException("terms aggregation cannot be applied to field [" + config.fieldContext().field() + throw new AggregationExecutionException("RareTerms aggregation cannot be applied to field [" + config.fieldContext().field() + "]. It can only be applied to numeric or string fields."); } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java index 0acf2d786efe0..9ab2fbdba9370 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java @@ -77,4 +77,9 @@ protected StringTerms.Bucket[] createBucketsArray(int size) { public boolean containsTerm(BloomFilter bloom, StringTerms.Bucket bucket) { return bloom.mightContain(bucket.termBytes); } + + @Override + public void addToBloom(BloomFilter bloom, StringTerms.Bucket bucket) { + bloom.put(bucket.termBytes); + } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java index 7967d53900f11..9fcd91996dec1 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java @@ -105,8 +105,8 @@ public void collect(int docId, long bucket) throws IOException { map.put(BytesRef.deepCopyOf(bytes), 1L); long bucketOrdinal = bucketOrds.add(bytes); if (bucketOrdinal < 0) { // already seen - bucketOrdinal = - 1 - bucketOrdinal; - collectExistingBucket(subCollectors, docId, bucketOrdinal); + throw new IllegalStateException("Term count is zero, but an ordinal for this " + + "term has already been recorded"); } else { collectBucket(subCollectors, docId, bucketOrdinal); } @@ -115,6 +115,14 
@@ public void collect(int docId, long bucket) throws IOException { // so just increment its counter if (valueCount < maxDocCount) { map.put(bytes, valueCount + 1); + long bucketOrdinal = bucketOrds.add(bytes); + if (bucketOrdinal < 0) { + bucketOrdinal = - 1 - bucketOrdinal; + collectExistingBucket(subCollectors, docId, bucketOrdinal); + } else { + throw new IllegalStateException("Term has been seen before, but we have not recorded " + + "an ordinal yet."); + } } else { // Otherwise we've breached the threshold, remove from // the map and add to the bloom filter @@ -123,7 +131,8 @@ public void collect(int docId, long bucket) throws IOException { numDeleted += 1; if (numDeleted > GC_THRESHOLD) { - gcDeletedEntries(); + gcDeletedEntries(numDeleted); + numDeleted = 0; } } } @@ -135,8 +144,8 @@ public void collect(int docId, long bucket) throws IOException { }; } - protected void gcDeletedEntries() { - boolean hasDeletedEntry = false; + protected void gcDeletedEntries(Long numDeleted) { + long deletionCount = 0; BytesRefHash newBucketOrds = new BytesRefHash(1, context.bigArrays()); try (BytesRefHash oldBucketOrds = bucketOrds) { @@ -151,13 +160,18 @@ protected void gcDeletedEntries() { newBucketOrd = newBucketOrds.add(oldKey); } else { // Make a note when one of the ords has been deleted - hasDeletedEntry = true; + deletionCount += 1; } mergeMap[i] = newBucketOrd; } + if (numDeleted != null && deletionCount != numDeleted) { + throw new IllegalStateException("Expected to prune [" + numDeleted + "] terms, but [" + deletionCount + + "] were removed instead"); + } + // Only merge/delete the ordinals if we have actually deleted one, // to save on some redundant work - if (hasDeletedEntry) { + if (deletionCount > 0) { mergeBuckets(mergeMap, newBucketOrds.size()); if (deferringCollector != null) { deferringCollector.mergeBuckets(mergeMap); @@ -207,7 +221,7 @@ public InternalAggregation buildEmptyAggregation() { @Override public void doClose() { - Releasables.close(bloom, bucketOrds); + Releasables.close(bucketOrds); } } diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java index e5656a47a8441..ab6ac83a7c56a 100644 --- a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java +++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java @@ -52,6 +52,7 @@ import org.elasticsearch.indices.breaker.NoneCircuitBreakerService; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.aggregations.Aggregation; +import org.elasticsearch.search.aggregations.AggregationExecutionException; import org.elasticsearch.search.aggregations.Aggregations; import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.AggregatorTestCase; @@ -407,11 +408,7 @@ public void testMixLongAndDouble() throws Exception { InternalAggregation.ReduceContext ctx = new InternalAggregation.ReduceContext(new MockBigArrays(new MockPageCacheRecycler(Settings.EMPTY), new NoneCircuitBreakerService()), null, true); - InternalAggregation mergedAggs = aggs.get(0).doReduce(aggs, ctx); - assertTrue(mergedAggs instanceof DoubleRareTerms); - List buckets = ((DoubleRareTerms) mergedAggs).getBuckets(); - assertThat(buckets.size(), equalTo(1)); - assertThat(buckets.get(0).getKeyAsString(), equalTo("1.0")); + AggregationExecutionException e =
expectThrows(AggregationExecutionException.class, () -> aggs.get(0).doReduce(aggs, ctx)); } From a0c56a2d677f17f252bcd754efb6b209ab20db45 Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Tue, 22 Jan 2019 17:26:13 -0500 Subject: [PATCH 04/25] Add bloom and map to CB --- .../bucket/terms/AbstractRareTermsAggregator.java | 5 +++++ .../aggregations/bucket/terms/LongRareTermsAggregator.java | 3 +++ .../aggregations/bucket/terms/StringRareTermsAggregator.java | 3 +++ 3 files changed, 11 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java index 8c37bb1c0b82e..c36d3c8b28022 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java @@ -34,6 +34,7 @@ import java.io.IOException; import java.util.List; import java.util.Map; +import java.util.function.Consumer; public abstract class AbstractRareTermsAggregator extends DeferableBucketAggregator { @@ -51,6 +52,8 @@ public abstract class AbstractRareTermsAggregator circuitBreakerConsumer; + AbstractRareTermsAggregator(String name, AggregatorFactories factories, SearchContext context, Aggregator parent, List pipelineAggregators, @@ -60,10 +63,12 @@ public abstract class AbstractRareTermsAggregator GC_THRESHOLD) { gcDeletedEntries(numDeleted); diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java index 9fcd91996dec1..927c6a5791920 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java @@ -103,6 +103,8 @@ public void collect(int docId, long bucket) throws IOException { if (valueCount == 0) { // Brand new term, save into map map.put(BytesRef.deepCopyOf(bytes), 1L); + circuitBreakerConsumer.accept(bytes.length + 8L); // size of term + 8 for counter + long bucketOrdinal = bucketOrds.add(bytes); if (bucketOrdinal < 0) { // already seen throw new IllegalStateException("Term count is zero, but an ordinal for this " + @@ -129,6 +131,7 @@ public void collect(int docId, long bucket) throws IOException { map.remove(bytes); bloom.put(bytes); numDeleted += 1; + circuitBreakerConsumer.accept(-(bytes.length + 8L)); // size of term + 8 for counter if (numDeleted > GC_THRESHOLD) { gcDeletedEntries(numDeleted); From 79faa4b72f11ae1e470c09c191518cfa9be19d7d Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Thu, 24 Jan 2019 17:33:23 -0500 Subject: [PATCH 05/25] Refactor bloom to track exact set, add tests --- .../common/hash/MurmurHash3.java | 20 ++ ...BloomFilter.java => ExactBloomFilter.java} | 266 ++++++++++++------ .../terms/AbstractRareTermsAggregator.java | 6 +- .../bucket/terms/InternalMappedRareTerms.java | 22 +- .../bucket/terms/LongRareTerms.java | 10 +- .../bucket/terms/StringRareTerms.java | 8 +- .../common/util/ExactBloomFilterTests.java | 90 ++++++ .../terms/RareTermsAggregatorTests.java | 2 +- 8 files changed, 311 insertions(+), 113 deletions(-) rename server/src/main/java/org/elasticsearch/common/util/{BloomFilter.java => ExactBloomFilter.java} (57%) create mode 100644 
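Reviewer note: the circuit-breaker wiring in this patch charges the request breaker for every map entry when a term is first seen and refunds it when the term is evicted to the bloom filter, so the tracked bytes follow the map's actual footprint. A sketch of that accounting scheme; the consumer's generic type is elided by the diff formatting above, so a `LongConsumer` stands in, and all names are invented for illustration:

[source,java]
--------------------------------------------------
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.function.LongConsumer;

class BreakerAccountingSketch {
    private final Map<String, Long> termCounts = new HashMap<>();
    private final LongConsumer breaker; // positive = reserve bytes, negative = release

    BreakerAccountingSketch(LongConsumer breaker) {
        this.breaker = breaker;
    }

    void firstSighting(String term) {
        termCounts.put(term, 1L);
        // size of the term's bytes plus 8 for the counter, as in the patch
        breaker.accept(term.getBytes(StandardCharsets.UTF_8).length + 8L);
    }

    void evictToBloom(String term) {
        termCounts.remove(term);
        breaker.accept(-(term.getBytes(StandardCharsets.UTF_8).length + 8L));
    }

    public static void main(String[] args) {
        BreakerAccountingSketch acct =
            new BreakerAccountingSketch(bytes -> System.out.println("breaker delta: " + bytes));
        acct.firstSighting("swing"); // breaker delta: 13
        acct.evictToBloom("swing");  // breaker delta: -13
    }
}
--------------------------------------------------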
server/src/test/java/org/elasticsearch/common/util/ExactBloomFilterTests.java diff --git a/server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java b/server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java index a52f0e8acc4ae..2c028dd514049 100644 --- a/server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java +++ b/server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java @@ -21,6 +21,8 @@ import org.elasticsearch.common.util.ByteUtils; +import java.util.Objects; + /** * MurmurHash3 hashing functions. @@ -36,6 +38,24 @@ public static class Hash128 { public long h1; /** higher 64 bits part **/ public long h2; + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + if (other == null || getClass() != other.getClass()) { + return false; + } + Hash128 that = (Hash128) other; + return Objects.equals(this.h1, that.h1) + && Objects.equals(this.h2, that.h2); + } + + @Override + public int hashCode() { + return Objects.hash(h1, h2); + } } private static long C1 = 0x87c37b91114253d5L; diff --git a/server/src/main/java/org/elasticsearch/common/util/BloomFilter.java b/server/src/main/java/org/elasticsearch/common/util/ExactBloomFilter.java similarity index 57% rename from server/src/main/java/org/elasticsearch/common/util/BloomFilter.java rename to server/src/main/java/org/elasticsearch/common/util/ExactBloomFilter.java index 25bb76c4ec772..c25243169fdd3 100644 --- a/server/src/main/java/org/elasticsearch/common/util/BloomFilter.java +++ b/server/src/main/java/org/elasticsearch/common/util/ExactBloomFilter.java @@ -28,12 +28,15 @@ import java.io.IOException; import java.util.Arrays; +import java.util.HashSet; import java.util.Objects; +import java.util.Set; /** - * A bloom filter. Inspired by Guava bloom filter implementation though with some optimizations. + * A bloom filter which keeps an exact set of values until a threshold is reached, then the values + * are replayed into a traditional bloom filter for approximate tracking */ -public class BloomFilter implements Writeable { +public class ExactBloomFilter implements Writeable { // Some anecdotal sizing numbers: // expected insertions, false positive probability, bloom size, num hashes @@ -55,9 +58,10 @@ public class BloomFilter implements Writeable { // 50m, 0.10, 228.5mb, 3 Hashes /** - * The bit set of the BloomFilter (not necessarily power of 2!) + * The bit set of the ExactBloomFilter (not necessarily power of 2!) */ - private final BitArray bits; + BitArray bits; + Set hashedValues = new HashSet<>(); /** * Number of hashes per element @@ -65,111 +69,232 @@ public class BloomFilter implements Writeable { private final int numHashFunctions; /** - * Creates a bloom filter with the expected number - * of insertions and expected false positive probability. - * - * @param expectedInsertions the number of expected insertions to the constructed - * @param fpp the desired false positive probability (must be positive and less than 1.0) + * The number of bits in the bloom */ - public BloomFilter(int expectedInsertions, double fpp) { - this(expectedInsertions, fpp, -1); - } + private long numBits; + + /** + * The threshold (in bytes) before we convert the exact set into an approximate bloom filter + */ + private final long threshold; + + /** + * True if we are still tracking with a Set + */ + private boolean setMode = true; /** - * Creates a bloom filter based on the expected number of insertions, expected false positive probability, - * and number of hash functions. 
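Reviewer note: the constructor below sizes the filter from the classic bloom formulas, m = -n ln(p) / (ln 2)^2 bits and k = (m / n) ln 2 hash functions. The bit-count formula appears verbatim in `optimalNumOfBits` later in this patch; the exact rounding of the hash count is not shown in these hunks, so the version here uses the standard rounding and should be treated as an assumption:

[source,java]
--------------------------------------------------
class BloomSizingSketch {
    static long optimalNumOfBits(long n, double p) {
        if (p == 0) {
            p = Double.MIN_VALUE; // guard assumed; avoids log(0)
        }
        return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
    }

    static int optimalNumOfHashFunctions(long n, long m) {
        return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
    }

    public static void main(String[] args) {
        long m = optimalNumOfBits(1_000_000, 0.03);
        int k = optimalNumOfHashFunctions(1_000_000, m);
        // ~7.3m bits (~0.87mb) and 5 hashes for 1m insertions at 0.03 fpp
        System.out.println(m + " bits, " + k + " hashes");
    }
}
--------------------------------------------------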
+ * Creates a bloom filter with the expected number + * of insertions and expected false positive probability. * * @param expectedInsertions the number of expected insertions to the constructed * @param fpp the desired false positive probability (must be positive and less than 1.0) - * @param numHashFunctions the number of hash functions to use (must be less than or equal to 255) + * @param threshold number of bytes to record exactly before converting to Bloom filter */ - private BloomFilter(int expectedInsertions, double fpp, int numHashFunctions) { + public ExactBloomFilter(int expectedInsertions, double fpp, long threshold) { + if (threshold <= 0) { + throw new IllegalArgumentException("BloomFilter threshold must be a positive number"); + } + if (expectedInsertions == 0) { expectedInsertions = 1; } + this.threshold = threshold; /* * TODO(user): Put a warning in the javadoc about tiny fpp values, * since the resulting size is proportional to -log(p), but there is not * much of a point after all, e.g. optimalNumOfBits(1000, 0.0000000000000001) = 76680 * which is less than 10kb. Who cares! */ - long numBits = optimalNumOfBits(expectedInsertions, fpp); + this.numBits = optimalNumOfBits(expectedInsertions, fpp); // calculate the optimal number of hash functions - if (numHashFunctions == -1) { - numHashFunctions = optimalNumOfHashFunctions(expectedInsertions, numBits); - } - + this.numHashFunctions = optimalNumOfHashFunctions(expectedInsertions, numBits); if (numHashFunctions > 255) { throw new IllegalArgumentException("BloomFilters with more than 255 hash functions are not allowed."); } - - this.bits = new BitArray(numBits); - this.numHashFunctions = numHashFunctions; } - public static BloomFilter EmptyBloomFilter(long numBits, int numHashFunctions) { - return new BloomFilter(numBits, numHashFunctions); - } - - private BloomFilter(long numBits, int numHashFunctions) { - this.bits = new BitArray(numBits); - this.numHashFunctions = numHashFunctions; + /** + * Copy constructor. The new Bloom will be an identical copy of the provided bloom + */ + public ExactBloomFilter(ExactBloomFilter otherBloom) { + this.numHashFunctions = otherBloom.getNumHashFunctions(); + this.threshold = otherBloom.getThreshold(); + this.numBits = otherBloom.getNumBits(); + this.setMode = otherBloom.setMode; + this.hashedValues = new HashSet<>(otherBloom.hashedValues); + if (otherBloom.bits != null) { + this.bits = new BitArray(otherBloom.numBits); + this.bits.putAll(otherBloom.bits); + } } - public BloomFilter(StreamInput in) throws IOException { - this.bits = new BitArray(in); + public ExactBloomFilter(StreamInput in) throws IOException { + this.setMode = in.readBoolean(); + if (setMode) { + this.hashedValues = in.readSet(in1 -> { + MurmurHash3.Hash128 hash = new MurmurHash3.Hash128(); + hash.h1 = in1.readLong(); + hash.h2 = in1.readLong(); + return hash; + }); + } else { + this.bits = new BitArray(in); + } this.numHashFunctions = in.readVInt(); + this.threshold = in.readVLong(); + this.numBits = in.readVLong(); } public void writeTo(StreamOutput out) throws IOException { - bits.writeTo(out); + out.writeBoolean(setMode); + if (setMode) { + out.writeCollection(hashedValues, (out1, hash) -> { + out1.writeLong(hash.h1); + out1.writeLong(hash.h2); + }); + } else { + bits.writeTo(out); + } out.writeVInt(numHashFunctions); + out.writeVLong(threshold); + out.writeVLong(numBits); } - public void merge(BloomFilter other) { - this.bits.putAll(other.bits); + /** + * Merge `other` bloom filter into this bloom.
After merging, this bloom's state will + * be the union of the two. During the merging process, the internal Set may be upgraded + * to a Bloom if it goes over threshold + */ + public void merge(ExactBloomFilter other) { + assert this.numBits == other.numBits; + if (setMode && other.setMode) { + // Both in sets, merge collections then see if we need to convert to bloom + hashedValues.addAll(other.hashedValues); + checkAndConvertToBloom(); + } else if (setMode && other.setMode == false) { + // Other is in bloom mode, so we convert our set to a bloom then merge + convertToBloom(); + this.bits.putAll(other.bits); + } else if (setMode == false && other.setMode) { + // we're in bloom mode, so convert other's set and merge + other.convertToBloom(); + this.bits.putAll(other.bits); + } else { + this.bits.putAll(other.bits); + } } public boolean put(BytesRef value) { - return Hashing.put(value, numHashFunctions, bits); + MurmurHash3.Hash128 hash = MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, new MurmurHash3.Hash128()); + return put(hash); } public boolean put(byte[] value) { - return Hashing.put(value, 0, value.length, numHashFunctions, bits); + MurmurHash3.Hash128 hash = MurmurHash3.hash128(value, 0, value.length, 0, new MurmurHash3.Hash128()); + return put(hash); } public boolean put(long value) { return put(Numbers.longToBytes(value)); } + private boolean put(MurmurHash3.Hash128 hash) { + if (setMode) { + boolean newItem = hashedValues.add(hash); + checkAndConvertToBloom(); + return newItem; + } else { + return putBloom(hash); + } + } + + private boolean putBloom(MurmurHash3.Hash128 hash128) { + long bitSize = bits.bitSize(); + boolean bitsChanged = false; + long combinedHash = hash128.h1; + for (int i = 0; i < numHashFunctions; i++) { + // Make the combined hash positive and indexable + bitsChanged |= bits.set((combinedHash & Long.MAX_VALUE) % bitSize); + combinedHash += hash128.h2; + } + return bitsChanged; + } + public boolean mightContain(BytesRef value) { - return Hashing.mightContain(value, numHashFunctions, bits); + return mightContain(value.bytes, value.offset, value.length); } - private boolean mightContain(byte[] value) { - return Hashing.mightContain(value, 0, value.length, numHashFunctions, bits); + public boolean mightContain(byte[] value) { + return mightContain(value, 0, value.length); } public boolean mightContain(long value) { return mightContain(Numbers.longToBytes(value)); } - public int getNumHashFunctions() { + private boolean mightContain(byte[] bytes, int offset, int length) { + MurmurHash3.Hash128 hash128 = MurmurHash3.hash128(bytes, offset, length, 0, new MurmurHash3.Hash128()); + + if (setMode) { + return hashedValues.contains(hash128); + } else { + long bitSize = bits.bitSize(); + long combinedHash = hash128.h1; + for (int i = 0; i < numHashFunctions; i++) { + // Make the combined hash positive and indexable + if (!bits.get((combinedHash & Long.MAX_VALUE) % bitSize)) { + return false; + } + combinedHash += hash128.h2; + } + return true; + } + } + + private int getNumHashFunctions() { return this.numHashFunctions; } - public long getNumBits() { - return bits.bitSize(); + private long getNumBits() { + return numBits; } + public long getThreshold() { + return threshold; + } + + /** + * Get the approximate size of this datastructure. Approximate because only the Set occupants + * are tracked, not the overhead of the Set itself. 
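Reviewer note: `putBloom` and the bloom-mode branch of `mightContain` above use double hashing in the Kirsch and Mitzenmacher style. The i-th probe index is h1 + i*h2, masked non-negative and reduced modulo the bit count, so one 128-bit murmur hash yields all k probes. A standalone restatement of that index computation (class and method names invented):

[source,java]
--------------------------------------------------
class DoubleHashingSketch {
    static long[] probeIndices(long h1, long h2, int numHashFunctions, long bitSize) {
        long[] indices = new long[numHashFunctions];
        long combined = h1;
        for (int i = 0; i < numHashFunctions; i++) {
            // mask the sign bit away, then reduce into the bit array's range
            indices[i] = (combined & Long.MAX_VALUE) % bitSize;
            combined += h2;
        }
        return indices;
    }

    public static void main(String[] args) {
        long[] probes = probeIndices(0x9E3779B97F4A7C15L, 0xC2B2AE3D27D4EB4FL, 5, 7_298_441L);
        for (long p : probes) {
            System.out.println(p); // five bit positions from one 128-bit hash
        }
    }
}
--------------------------------------------------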
+ */ public long getSizeInBytes() { - return bits.ramBytesUsed(); + long bytes = (hashedValues.size() * 16) + 8 + 4 + 1; + if (bits != null) { + bytes += bits.ramBytesUsed(); + } + return bytes; + } + + private void checkAndConvertToBloom() { + if (hashedValues.size() * 16 > threshold) { + convertToBloom(); + } + } + + private void convertToBloom() { + bits = new BitArray(numBits); + setMode = false; + for (MurmurHash3.Hash128 hash : hashedValues) { + putBloom(hash); + } + hashedValues.clear(); } @Override public int hashCode() { - return bits.hashCode() + numHashFunctions; + return Objects.hash(numHashFunctions, hashedValues, bits, setMode, threshold, numBits); } @Override @@ -181,12 +306,17 @@ public boolean equals(Object other) { return false; } - final BloomFilter that = (BloomFilter) other; + final ExactBloomFilter that = (ExactBloomFilter) other; return Objects.equals(this.bits, that.bits) - && Objects.equals(this.numHashFunctions, that.numHashFunctions); + && Objects.equals(this.numHashFunctions, that.numHashFunctions) + && Objects.equals(this.threshold, that.threshold) + && Objects.equals(this.setMode, that.setMode) + && Objects.equals(this.hashedValues, that.hashedValues) + && Objects.equals(this.numBits, that.numBits); } + /* * Cheat sheet: * @@ -231,7 +361,6 @@ private static long optimalNumOfBits(long n, double p) { return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2))); } - // Note: We use this instead of java.util.BitSet because we need access to the long[] data field static final class BitArray implements Writeable { private final long[] data; private final long bitSize; @@ -326,43 +455,4 @@ public long ramBytesUsed() { } } - private static class Hashing { - - static boolean put(BytesRef value, int numHashFunctions, BitArray bits) { - return put(value.bytes, value.offset, value.length, numHashFunctions, bits); - } - - static boolean put(byte[] bytes, int offset, int length, int numHashFunctions, BitArray bits) { - long bitSize = bits.bitSize(); - MurmurHash3.Hash128 hash128 = MurmurHash3.hash128(bytes, offset, length, 0, new MurmurHash3.Hash128()); - - boolean bitsChanged = false; - long combinedHash = hash128.h1; - for (int i = 0; i < numHashFunctions; i++) { - // Make the combined hash positive and indexable - bitsChanged |= bits.set((combinedHash & Long.MAX_VALUE) % bitSize); - combinedHash += hash128.h2; - } - return bitsChanged; - } - - static boolean mightContain(BytesRef value, int numHashFunctions, BitArray bits) { - return mightContain(value.bytes, value.offset, value.length, numHashFunctions, bits); - } - - static boolean mightContain(byte[] bytes, int offset, int length, int numHashFunctions, BitArray bits) { - long bitSize = bits.bitSize(); - MurmurHash3.Hash128 hash128 = MurmurHash3.hash128(bytes, offset, length, 0, new MurmurHash3.Hash128()); - - long combinedHash = hash128.h1; - for (int i = 0; i < numHashFunctions; i++) { - // Make the combined hash positive and indexable - if (!bits.get((combinedHash & Long.MAX_VALUE) % bitSize)) { - return false; - } - combinedHash += hash128.h2; - } - return true; - } - } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java index c36d3c8b28022..5a95b606c0a39 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java +++ 
b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java @@ -19,7 +19,7 @@ package org.elasticsearch.search.aggregations.bucket.terms; -import org.elasticsearch.common.util.BloomFilter; +import org.elasticsearch.common.util.ExactBloomFilter; import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.AggregatorFactories; @@ -47,7 +47,7 @@ public abstract class AbstractRareTermsAggregator, B e extends InternalMappedTerms { final long maxDocCount; - final BloomFilter bloom; + final ExactBloomFilter bloom; InternalMappedRareTerms(String name, BucketOrder order, List pipelineAggregators, Map metaData, DocValueFormat format, - List buckets, long maxDocCount, BloomFilter bloom) { + List buckets, long maxDocCount, ExactBloomFilter bloom) { // TODO is there a way to determine sum_other_doc_count and doc_count_error_upper_bound equivalents for rare based on bloom? super(name, order, 0, 1, pipelineAggregators, metaData, format, 0, false, 0, buckets, 0); this.maxDocCount = maxDocCount; @@ -53,7 +53,7 @@ public long getMaxDocCount() { return maxDocCount; } - BloomFilter getBloom() { + ExactBloomFilter getBloom() { return bloom; } @@ -63,7 +63,7 @@ BloomFilter getBloom() { InternalMappedRareTerms(StreamInput in, Bucket.Reader bucketReader) throws IOException { super(in, bucketReader); maxDocCount = in.readLong(); - bloom = new BloomFilter(in); + bloom = new ExactBloomFilter(in); } @Override @@ -77,7 +77,7 @@ protected void writeTermTypeInfoTo(StreamOutput out) throws IOException { public InternalAggregation doReduce(List aggregations, ReduceContext reduceContext) { Map> buckets = new HashMap<>(); InternalTerms referenceTerms = null; - BloomFilter bloomFilter = null; + ExactBloomFilter bloomFilter = null; for (InternalAggregation aggregation : aggregations) { // Unmapped rare terms don't have a bloom filter so we'll skip all this work @@ -105,11 +105,11 @@ public InternalAggregation doReduce(List aggregations, Redu bucketList.add(bucket); } + ExactBloomFilter otherBloom = ((InternalMappedRareTerms)aggregation).getBloom(); if (bloomFilter == null) { - bloomFilter = BloomFilter.EmptyBloomFilter(((InternalMappedRareTerms)aggregation).getBloom().getNumBits(), - ((InternalMappedRareTerms)aggregation).getBloom().getNumHashFunctions()); + bloomFilter = new ExactBloomFilter(otherBloom); } else { - bloomFilter.merge(((InternalMappedRareTerms)aggregation).getBloom()); + bloomFilter.merge(otherBloom); } } @@ -129,7 +129,7 @@ public InternalAggregation doReduce(List aggregations, Redu return create(name, rare, 0, 0); } - public abstract boolean containsTerm(BloomFilter bloom, B bucket); + public abstract boolean containsTerm(ExactBloomFilter bloom, B bucket); - public abstract void addToBloom(BloomFilter bloom, B bucket); + public abstract void addToBloom(ExactBloomFilter bloom, B bucket); } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java index 30393e92668e1..de2415e898c41 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java @@ -20,15 +20,13 @@ import org.elasticsearch.common.io.stream.StreamInput; -import org.elasticsearch.common.util.BloomFilter; +import 
org.elasticsearch.common.util.ExactBloomFilter; import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.aggregations.BucketOrder; -import org.elasticsearch.search.aggregations.InternalAggregation; import org.elasticsearch.search.aggregations.InternalAggregations; import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; import java.io.IOException; -import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -40,7 +38,7 @@ public class LongRareTerms extends InternalMappedRareTerms pipelineAggregators, Map metaData, DocValueFormat format, - List buckets, long maxDocCount, BloomFilter bloom) { + List buckets, long maxDocCount, ExactBloomFilter bloom) { super(name, order, pipelineAggregators, metaData, format, buckets, maxDocCount, bloom); } @@ -80,12 +78,12 @@ protected LongTerms.Bucket[] createBucketsArray(int size) { } @Override - public boolean containsTerm(BloomFilter bloom, LongTerms.Bucket bucket) { + public boolean containsTerm(ExactBloomFilter bloom, LongTerms.Bucket bucket) { return bloom.mightContain((long) bucket.getKey()); } @Override - public void addToBloom(BloomFilter bloom, LongTerms.Bucket bucket) { + public void addToBloom(ExactBloomFilter bloom, LongTerms.Bucket bucket) { bloom.put((long) bucket.getKey()); } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java index 9ab2fbdba9370..024c7fe575dac 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java @@ -19,7 +19,7 @@ package org.elasticsearch.search.aggregations.bucket.terms; import org.elasticsearch.common.io.stream.StreamInput; -import org.elasticsearch.common.util.BloomFilter; +import org.elasticsearch.common.util.ExactBloomFilter; import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.aggregations.BucketOrder; import org.elasticsearch.search.aggregations.InternalAggregations; @@ -35,7 +35,7 @@ public class StringRareTerms extends InternalMappedRareTerms pipelineAggregators, Map metaData, DocValueFormat format, - List buckets, long maxDocCount, BloomFilter bloom) { + List buckets, long maxDocCount, ExactBloomFilter bloom) { super(name, order, pipelineAggregators, metaData, format, buckets, maxDocCount, bloom); } @@ -74,12 +74,12 @@ protected StringTerms.Bucket[] createBucketsArray(int size) { } @Override - public boolean containsTerm(BloomFilter bloom, StringTerms.Bucket bucket) { + public boolean containsTerm(ExactBloomFilter bloom, StringTerms.Bucket bucket) { return bloom.mightContain(bucket.termBytes); } @Override - public void addToBloom(BloomFilter bloom, StringTerms.Bucket bucket) { + public void addToBloom(ExactBloomFilter bloom, StringTerms.Bucket bucket) { bloom.put(bucket.termBytes); } } diff --git a/server/src/test/java/org/elasticsearch/common/util/ExactBloomFilterTests.java b/server/src/test/java/org/elasticsearch/common/util/ExactBloomFilterTests.java new file mode 100644 index 0000000000000..7405da6b0a648 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/common/util/ExactBloomFilterTests.java @@ -0,0 +1,90 @@ +package org.elasticsearch.common.util; + +import org.elasticsearch.common.Numbers; +import org.elasticsearch.common.hash.MurmurHash3; +import org.elasticsearch.common.io.stream.Writeable; +import 
org.elasticsearch.test.AbstractWireSerializingTestCase; + +import java.util.HashSet; +import java.util.Set; + +import static org.hamcrest.Matchers.empty; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.greaterThan; + +public class ExactBloomFilterTests extends AbstractWireSerializingTestCase<ExactBloomFilter> { + + @Override + protected ExactBloomFilter createTestInstance() { + ExactBloomFilter bloom = new ExactBloomFilter(randomIntBetween(1, 100000000), + ((float)randomIntBetween(1, 50)) / 100.0, randomLongBetween(1, Long.MAX_VALUE)); + + int num = randomIntBetween(0, 10); + for (int i = 0; i < num; i++) { + bloom.put(randomLong()); + } + + return bloom; + } + + @Override + protected Writeable.Reader<ExactBloomFilter> instanceReader() { + return ExactBloomFilter::new; + } + + @Override + protected ExactBloomFilter mutateInstance(ExactBloomFilter instance) { + ExactBloomFilter newInstance = new ExactBloomFilter(instance); + int num = randomIntBetween(1, 10); + for (int i = 0; i < num; i++) { + newInstance.put(randomLong()); + } + return newInstance; + } + + public void testExact() { + long threshold = randomLongBetween(1000, 10000); + ExactBloomFilter bloom = new ExactBloomFilter(1000000, 0.03, threshold); + + int size = 0; + Set<Long> values = new HashSet<>(); + Set<MurmurHash3.Hash128> hashed = new HashSet<>(values.size()); + while (size < threshold - 100) { + long value = randomLong(); + bloom.put(value); + boolean newValue = values.add(value); + if (newValue) { + byte[] bytes = Numbers.longToBytes(value); + MurmurHash3.Hash128 hash128 = MurmurHash3.hash128(bytes, 0, bytes.length, 0, new MurmurHash3.Hash128()); + hashed.add(hash128); + + size += 16; + } + } + assertThat(bloom.hashedValues.size(), equalTo(hashed.size())); + assertThat(bloom.hashedValues, equalTo(hashed)); + + for (Long value : values) { + assertThat(bloom.mightContain(value), equalTo(true)); + } + } + + public void testConvert() { + long threshold = randomLongBetween(1000, 10000); + ExactBloomFilter bloom = new ExactBloomFilter(1000000, 0.03, threshold); + + int size = 0; + Set<Long> values = new HashSet<>(); + while (size < threshold + 100) { + long value = randomLong(); + bloom.put(value); + boolean newValue = values.add(value); + if (newValue) { + size += 16; + } + } + assertThat(bloom.hashedValues, empty()); + assertThat(bloom.bits.bitSize(), greaterThan(0L)); + } + +} diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java index ab6ac83a7c56a..9003e4dbf5496 100644 --- a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java +++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java @@ -540,7 +540,7 @@ public void testWithNestedScoringAggregations() throws IOException { assertThat(terms.getBuckets().size(), equalTo(2)); for (MultiBucketsAggregation.Bucket bucket : terms.getBuckets()) { InternalTopHits topHits = bucket.getAggregations().get("top_hits"); - assertThat(topHits.getHits().totalHits, equalTo(1L)); + assertThat(topHits.getHits().getTotalHits().value, equalTo(1L)); assertThat(topHits.getHits().getMaxScore(), equalTo(Float.NaN)); } } From 2819a3d4a4c8a84655b65af93d1d1fa524bd557d Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Thu, 31 Jan 2019 13:11:48 -0500 Subject: [PATCH 06/25] Tweak CB --- .../bucket/terms/AbstractRareTermsAggregator.java | 2 -- .../aggregations/bucket/terms/LongRareTermsAggregator.java | 4 ++--
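Reviewer note: for anyone skimming the tests above, this is roughly how the hybrid filter behaves end to end. Below the byte threshold every membership answer is exact; once the set of 16-byte hashes outgrows the threshold it is replayed into the bit array and answers become probabilistic (occasional false positives, never false negatives). A usage sketch against the API added in this patch:

[source,java]
--------------------------------------------------
import org.elasticsearch.common.util.ExactBloomFilter;

public class ExactBloomFilterUsage {
    public static void main(String[] args) {
        // 160-byte threshold: ten 16-byte hashes fit, the 11th distinct
        // value pushes the set over and triggers conversion to bits
        ExactBloomFilter filter = new ExactBloomFilter(1_000_000, 0.03, 160);
        for (long i = 0; i < 11; i++) {
            filter.put(i);
        }
        System.out.println(filter.mightContain(5L));    // always true
        System.out.println(filter.mightContain(9999L)); // almost always false,
                                                        // true only on a false positive
    }
}
--------------------------------------------------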
.../aggregations/bucket/terms/StringRareTermsAggregator.java | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java index 5a95b606c0a39..4ab6932a0e6de 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java @@ -52,7 +52,6 @@ public abstract class AbstractRareTermsAggregator circuitBreakerConsumer; AbstractRareTermsAggregator(String name, AggregatorFactories factories, SearchContext context, @@ -68,7 +67,6 @@ public abstract class AbstractRareTermsAggregator GC_THRESHOLD) { gcDeletedEntries(numDeleted); diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java index 927c6a5791920..b46b811f5de27 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java @@ -103,7 +103,7 @@ public void collect(int docId, long bucket) throws IOException { if (valueCount == 0) { // Brand new term, save into map map.put(BytesRef.deepCopyOf(bytes), 1L); - circuitBreakerConsumer.accept(bytes.length + 8L); // size of term + 8 for counter + addRequestCircuitBreakerBytes(bytes.length + 8L); // size of term + 8 for counter long bucketOrdinal = bucketOrds.add(bytes); if (bucketOrdinal < 0) { // already seen @@ -131,7 +131,7 @@ public void collect(int docId, long bucket) throws IOException { map.remove(bytes); bloom.put(bytes); numDeleted += 1; - circuitBreakerConsumer.accept(-(bytes.length + 8L)); // size of term + 8 for counter + addRequestCircuitBreakerBytes(-(bytes.length + 8L)); // size of term + 8 for counter if (numDeleted > GC_THRESHOLD) { gcDeletedEntries(numDeleted); From 56297a516e3d06a709335cc230c16d0cd086c5b9 Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Thu, 28 Feb 2019 13:39:07 -0500 Subject: [PATCH 07/25] Decouple from terms agg --- .../search.aggregation/280_rare_terms.yml | 166 ++------------ .../terms/AbstractRareTermsAggregator.java | 19 +- .../bucket/terms/InternalMappedRareTerms.java | 66 ++++-- .../bucket/terms/InternalRareTerms.java | 202 ++++++++++++++++++ .../bucket/terms/LongRareTerms.java | 97 +++++++-- .../bucket/terms/LongRareTermsAggregator.java | 22 +- .../aggregations/bucket/terms/RareTerms.java | 48 +++++ .../bucket/terms/StringRareTerms.java | 101 +++++++-- .../terms/StringRareTermsAggregator.java | 30 +-- .../bucket/terms/UnmappedRareTerms.java | 54 ++--- .../common/util/ExactBloomFilterTests.java | 18 ++ .../terms/RareTermsAggregatorTests.java | 186 +++++----------- 12 files changed, 604 insertions(+), 405 deletions(-) create mode 100644 server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java create mode 100644 server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTerms.java diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml index b0450013a37ae..e0eb30e3582ab 
100644 --- a/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml @@ -6,34 +6,20 @@ setup: settings: number_of_replicas: 0 mappings: - test: - properties: - str: - type: keyword - ip: - type: ip - boolean: - type: boolean - integer: - type: long - double: - type: double - number: - type: long - date: - type: date + properties: + str: + type: keyword + ip: + type: ip + boolean: + type: boolean + integer: + type: long + number: + type: long + date: + type: date - - do: - indices.create: - index: test_2 - body: - settings: - number_of_replicas: 0 - mappings: - test: - properties: - number: - type: double - do: cluster.health: @@ -44,21 +30,18 @@ setup: - do: index: index: test_1 - type: test id: 1 body: { "str" : "abc" } - do: index: index: test_1 - type: test id: 2 body: { "str": "abc" } - do: index: index: test_1 - type: test id: 3 body: { "str": "bcd" } @@ -80,21 +63,18 @@ setup: - do: index: index: test_1 - type: test id: 1 body: { "ip": "::1" } - do: index: index: test_1 - type: test id: 2 body: { "ip": "127.0.0.1" } - do: index: index: test_1 - type: test id: 3 body: { "ip": "::1" } @@ -141,21 +121,18 @@ setup: - do: index: index: test_1 - type: test id: 1 body: { "boolean": true } - do: index: index: test_1 - type: test id: 2 body: { "boolean": false } - do: index: index: test_1 - type: test id: 3 body: { "boolean": true } @@ -177,21 +154,18 @@ setup: - do: index: index: test_1 - type: test id: 1 body: { "integer": 1234 } - do: index: index: test_1 - type: test id: 2 body: { "integer": 5678 } - do: index: index: test_1 - type: test id: 3 body: { "integer": 1234 } @@ -210,63 +184,23 @@ setup: - is_false: aggregations.integer_terms.buckets.0.key_as_string - match: { aggregations.integer_terms.buckets.0.doc_count: 1 } - ---- -"Double test": - - do: - index: - index: test_1 - type: test - id: 1 - body: { "double": 1234.5 } - - - do: - index: - index: test_1 - type: test - id: 2 - body: { "double": 5678.5 } - - - do: - index: - index: test_1 - type: test - id: 3 - body: { "double": 1234.5 } - - - do: - indices.refresh: {} - - - do: - search: - body: { "size" : 0, "aggs" : { "double_terms" : { "rare_terms" : { "field" : "double" } } } } - - - match: { hits.total: 3 } - - length: { aggregations.double_terms.buckets: 1 } - - match: { aggregations.double_terms.buckets.0.key: 5678.5 } - - is_false: aggregations.double_terms.buckets.0.key_as_string - - match: { aggregations.double_terms.buckets.0.doc_count: 1 } - --- "Date test": - do: index: index: test_1 - type: test id: 1 body: { "date": "2016-05-03" } - do: index: index: test_1 - type: test id: 2 body: { "date": "2014-09-01" } - do: index: index: test_1 - type: test id: 3 body: { "date": "2016-05-03" } @@ -306,7 +240,6 @@ setup: - do: index: index: test_1 - type: test id: 1 body: {} @@ -326,7 +259,6 @@ setup: - do: index: index: test_1 - type: test id: 1 body: {} @@ -346,7 +278,6 @@ setup: - do: index: index: test_1 - type: test id: 1 body: {} @@ -366,7 +297,6 @@ setup: - do: index: index: test_1 - type: test id: 1 body: {} @@ -380,74 +310,4 @@ setup: - match: { hits.total: 1 } - length: { aggregations.long_terms.buckets: 0 } ---- -"Unmapped doubles": - - - do: - index: - index: test_1 - type: test - id: 1 - body: {} - - - do: - indices.refresh: {} - - - do: - search: - body: { "size" : 0, "aggs" : { "double_terms" : { "rare_terms" : { "field" : "unmapped_double" } } } } - - - match: { hits.total: 1 } - - 
length: { aggregations.double_terms.buckets: 0 } - ---- -"Mixing longs and doubles": - - - do: - index: - index: test_1 - type: test - id: 1 - body: {"number": 100} - - - do: - index: - index: test_1 - type: test - id: 2 - body: {"number": 10} - - - do: - index: - index: test_2 - type: test - id: 3 - body: {"number": 100.0} - - - do: - index: - index: test_2 - type: test - id: 1 - body: {"number": 10.0} - - - do: - index: - index: test_2 - type: test - id: 2 - body: {"number": 14.6} - - - do: - indices.refresh: {} - - - do: - search: - body: { "size" : 0, "aggs" : { "number_terms" : { "rare_terms" : { "field" : "number" } } } } - - - match: { hits.total: 5 } - - - length: { aggregations.number_terms.buckets: 1 } - - match: { aggregations.number_terms.buckets.0.key: 14.6 } - - match: { aggregations.number_terms.buckets.0.doc_count: 1 } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java index 4ab6932a0e6de..def09c3168ba1 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java @@ -23,6 +23,8 @@ import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.BucketOrder; +import org.elasticsearch.search.aggregations.LeafBucketCollector; import org.elasticsearch.search.aggregations.bucket.DeferableBucketAggregator; import org.elasticsearch.search.aggregations.bucket.DeferringBucketCollector; import org.elasticsearch.search.aggregations.bucket.MergingBucketsDeferringCollector; @@ -34,25 +36,28 @@ import java.io.IOException; import java.util.List; import java.util.Map; -import java.util.function.Consumer; public abstract class AbstractRareTermsAggregator extends DeferableBucketAggregator { - // TODO review question: What to set this at? 
/** Sets the number of "removed" values to accumulate before we purge ords via the MergingBucketCollector's mergeBuckets() method */ - final long GC_THRESHOLD = 10; + static final long GC_THRESHOLD = 1000; + static final BucketOrder ORDER = BucketOrder.compound(BucketOrder.count(true), BucketOrder.key(true)); // sort by count ascending - MergingBucketsDeferringCollector deferringCollector; - protected final ExactBloomFilter bloom; protected final long maxDocCount; protected final DocValueFormat format; protected final T valuesSource; protected final U includeExclude; + // Counter used during collection to track map entries that need GC'ing + long numDeleted = 0; + + MergingBucketsDeferringCollector deferringCollector; + LeafBucketCollector subCollectors; + final ExactBloomFilter bloom; AbstractRareTermsAggregator(String name, AggregatorFactories factories, SearchContext context, Aggregator parent, List pipelineAggregators, @@ -99,7 +104,7 @@ public DeferringBucketCollector getDeferringCollector() { protected void doPostCollection() { // Make sure we do one final GC to clean up any deleted ords // that may be lingering (but still below GC threshold) - gcDeletedEntries(null); + gcDeletedEntries(-1); } private String subAggsNeedScore() { @@ -121,5 +126,5 @@ private String descendsFromNestedAggregator(Aggregator parent) { return null; } - protected abstract void gcDeletedEntries(Long numDeleted); + abstract void gcDeletedEntries(long numDeleted); } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java index 79a453011ca57..d3e1b888e00e3 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java @@ -22,6 +22,7 @@ import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.util.ExactBloomFilter; +import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.aggregations.AggregationExecutionException; import org.elasticsearch.search.aggregations.BucketOrder; @@ -33,19 +34,25 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Objects; +import java.util.function.Function; +import java.util.stream.Collectors; -public abstract class InternalMappedRareTerms, B extends InternalTerms.Bucket> - extends InternalMappedTerms { +public abstract class InternalMappedRareTerms, B extends InternalRareTerms.Bucket> + extends InternalRareTerms { + + protected DocValueFormat format; + protected List buckets; + protected Map bucketMap; - final long maxDocCount; final ExactBloomFilter bloom; InternalMappedRareTerms(String name, BucketOrder order, List pipelineAggregators, Map metaData, DocValueFormat format, List buckets, long maxDocCount, ExactBloomFilter bloom) { - // TODO is there a way to determine sum_other_doc_count and doc_count_error_upper_bound equivalents for rare based on bloom? 
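Reviewer note: the compound ORDER introduced above sorts buckets by doc count ascending, so the rarest terms come first, with the term key ascending as a deterministic tie-breaker. A tiny model of that comparator (record and names invented for illustration):

[source,java]
--------------------------------------------------
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

class RareOrderSketch {
    record Bucket(String key, long docCount) {}

    public static void main(String[] args) {
        List<Bucket> buckets = new ArrayList<>(List.of(
            new Bucket("swing", 1), new Bucket("jazz", 2), new Bucket("bebop", 1)));
        buckets.sort(Comparator.comparingLong(Bucket::docCount)
            .thenComparing(Bucket::key));
        // [bebop x1, swing x1, jazz x2]: count ascending, then key ascending
        System.out.println(buckets);
    }
}
--------------------------------------------------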
- super(name, order, 0, 1, pipelineAggregators, metaData, format, 0, false, 0, buckets, 0); - this.maxDocCount = maxDocCount; + super(name, order, maxDocCount, pipelineAggregators, metaData); + this.format = format; + this.buckets = buckets; this.bloom = bloom; } @@ -61,22 +68,23 @@ ExactBloomFilter getBloom() { * Read from a stream. */ InternalMappedRareTerms(StreamInput in, Bucket.Reader bucketReader) throws IOException { - super(in, bucketReader); - maxDocCount = in.readLong(); + super(in); + format = in.readNamedWriteable(DocValueFormat.class); + buckets = in.readList(stream -> bucketReader.read(stream, format)); bloom = new ExactBloomFilter(in); } @Override protected void writeTermTypeInfoTo(StreamOutput out) throws IOException { - super.writeTermTypeInfoTo(out); - out.writeLong(maxDocCount); + out.writeNamedWriteable(format); + out.writeList(buckets); bloom.writeTo(out); } @Override public InternalAggregation doReduce(List aggregations, ReduceContext reduceContext) { Map> buckets = new HashMap<>(); - InternalTerms referenceTerms = null; + InternalRareTerms referenceTerms = null; ExactBloomFilter bloomFilter = null; for (InternalAggregation aggregation : aggregations) { @@ -87,7 +95,7 @@ public InternalAggregation doReduce(List aggregations, Redu } @SuppressWarnings("unchecked") - InternalTerms terms = (InternalTerms) aggregation; + InternalRareTerms terms = (InternalRareTerms) aggregation; if (referenceTerms == null && aggregation.getClass().equals(UnmappedRareTerms.class) == false) { referenceTerms = terms; } @@ -126,10 +134,42 @@ public InternalAggregation doReduce(List aggregations, Redu } } CollectionUtil.introSort(rare, order.comparator(null)); - return create(name, rare, 0, 0); + return createWithBloom(name, rare, bloomFilter); } public abstract boolean containsTerm(ExactBloomFilter bloom, B bucket); public abstract void addToBloom(ExactBloomFilter bloom, B bucket); + + @Override + public List getBuckets() { + return buckets; + } + + @Override + public B getBucketByKey(String term) { + if (bucketMap == null) { + bucketMap = buckets.stream().collect(Collectors.toMap(InternalRareTerms.Bucket::getKeyAsString, Function.identity())); + } + return bucketMap.get(term); + } + + @Override + protected boolean doEquals(Object obj) { + InternalMappedRareTerms that = (InternalMappedRareTerms) obj; + return super.doEquals(obj) + && Objects.equals(buckets, that.buckets) + && Objects.equals(format, that.format) + && Objects.equals(bloom, that.bloom); + } + + @Override + protected int doHashCode() { + return Objects.hash(super.doHashCode(), buckets, format, bloom); + } + + @Override + public final XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { + return doXContentCommon(builder, params, buckets); + } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java new file mode 100644 index 0000000000000..5a56ea2202340 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java @@ -0,0 +1,202 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.util.ExactBloomFilter; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.DocValueFormat; +import org.elasticsearch.search.aggregations.Aggregations; +import org.elasticsearch.search.aggregations.BucketOrder; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.InternalAggregations; +import org.elasticsearch.search.aggregations.InternalMultiBucketAggregation; +import org.elasticsearch.search.aggregations.InternalOrder; +import org.elasticsearch.search.aggregations.KeyComparable; +import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +public abstract class InternalRareTerms, B extends InternalRareTerms.Bucket> + extends InternalMultiBucketAggregation implements RareTerms { + + public abstract static class Bucket> extends InternalMultiBucketAggregation.InternalBucket + implements RareTerms.Bucket, KeyComparable { + /** + * Reads a bucket. Should be a constructor reference. + */ + @FunctionalInterface + public interface Reader> { + B read(StreamInput in, DocValueFormat format) throws IOException; + } + + long bucketOrd; + + protected long docCount; + protected InternalAggregations aggregations; + protected final DocValueFormat format; + + protected Bucket(long docCount, InternalAggregations aggregations, DocValueFormat formatter) { + this.format = formatter; + this.docCount = docCount; + this.aggregations = aggregations; + } + + /** + * Read from a stream. 
+ */ + protected Bucket(StreamInput in, DocValueFormat formatter) throws IOException { + this.format = formatter; + docCount = in.readVLong(); + aggregations = InternalAggregations.readAggregations(in); + } + + @Override + public final void writeTo(StreamOutput out) throws IOException { + out.writeVLong(getDocCount()); + aggregations.writeTo(out); + writeTermTo(out); + } + + protected abstract void writeTermTo(StreamOutput out) throws IOException; + + @Override + public long getDocCount() { + return docCount; + } + + @Override + public Aggregations getAggregations() { + return aggregations; + } + + abstract B newBucket(long docCount, InternalAggregations aggs); + + public B reduce(List buckets, ReduceContext context) { + long docCount = 0; + List aggregationsList = new ArrayList<>(buckets.size()); + for (B bucket : buckets) { + docCount += bucket.docCount; + aggregationsList.add(bucket.aggregations); + } + InternalAggregations aggs = InternalAggregations.reduce(aggregationsList, context); + return newBucket(docCount, aggs); + } + + @Override + public final XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(); + keyToXContent(builder); + builder.field(CommonFields.DOC_COUNT.getPreferredName(), getDocCount()); + aggregations.toXContentInternal(builder, params); + builder.endObject(); + return builder; + } + + protected abstract XContentBuilder keyToXContent(XContentBuilder builder) throws IOException; + + @Override + public boolean equals(Object obj) { + if (obj == null || getClass() != obj.getClass()) { + return false; + } + Bucket that = (Bucket) obj; + return Objects.equals(docCount, that.docCount) + && Objects.equals(aggregations, that.aggregations); + } + + @Override + public int hashCode() { + return Objects.hash(getClass(), docCount, aggregations); + } + } + + protected final BucketOrder order; + protected final long maxDocCount; + + protected InternalRareTerms(String name, BucketOrder order, long maxDocCount, + List pipelineAggregators, Map metaData) { + super(name, pipelineAggregators, metaData); + this.order = order; + this.maxDocCount = maxDocCount; + } + + /** + * Read from a stream. + */ + protected InternalRareTerms(StreamInput in) throws IOException { + super(in); + order = InternalOrder.Streams.readOrder(in); + maxDocCount = in.readVLong(); + } + + @Override + protected final void doWriteTo(StreamOutput out) throws IOException { + order.writeTo(out); + out.writeVLong(maxDocCount); + writeTermTypeInfoTo(out); + } + + protected abstract void writeTermTypeInfoTo(StreamOutput out) throws IOException; + + @Override + public abstract List getBuckets(); + + @Override + public abstract B getBucketByKey(String term); + + @Override + public InternalAggregation doReduce(List aggregations, ReduceContext reduceContext) { + throw new UnsupportedOperationException(); + } + + protected abstract A createWithBloom(String name, List buckets, ExactBloomFilter bloomFilter); + + /** + * Create an array to hold some buckets. Used in collecting the results. 
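Reviewer note: putting the two reduce layers together: `Bucket.reduce` above sums a term's doc counts across shard responses, while `InternalMappedRareTerms#doReduce` merges the shard blooms and keeps only terms that are still rare overall. The exact keep/drop condition is spread across earlier hunks, so the model below paraphrases it rather than quoting it (a term survives if its summed count stays within `max_doc_count` and no shard already marked it common):

[source,java]
--------------------------------------------------
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

class RareReduceSketch {
    static Map<String, Long> reduce(List<Map<String, Long>> shardCounts,
                                    Set<String> mergedBloom, long maxDocCount) {
        Map<String, Long> totals = new HashMap<>();
        // sum per-term doc counts across shards, as Bucket.reduce does
        shardCounts.forEach(shard ->
            shard.forEach((term, count) -> totals.merge(term, count, Long::sum)));
        // drop terms that are no longer rare once all shards are combined
        totals.entrySet().removeIf(e ->
            e.getValue() > maxDocCount || mergedBloom.contains(e.getKey()));
        return totals;
    }

    public static void main(String[] args) {
        List<Map<String, Long>> shards = List.of(
            Map.of("swing", 1L, "rock", 1L), Map.of("rock", 1L));
        // {swing=1}: rock sums to 2 and is dropped for max_doc_count=1
        System.out.println(reduce(shards, Set.of(), 1));
    }
}
--------------------------------------------------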
+ */ + protected abstract B[] createBucketsArray(int size); + + @Override + protected boolean doEquals(Object obj) { + InternalRareTerms that = (InternalRareTerms) obj; + return Objects.equals(maxDocCount, that.maxDocCount) + && Objects.equals(order, that.order); + } + + @Override + protected int doHashCode() { + return Objects.hash(maxDocCount, order); + } + + protected static XContentBuilder doXContentCommon(XContentBuilder builder, Params params, + List buckets) throws IOException { + builder.startArray(CommonFields.BUCKETS.getPreferredName()); + for (Bucket bucket : buckets) { + bucket.toXContent(builder, params); + } + builder.endArray(); + return builder; + } +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java index de2415e898c41..65c7a5f425328 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java @@ -20,7 +20,9 @@ import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.util.ExactBloomFilter; +import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.aggregations.BucketOrder; import org.elasticsearch.search.aggregations.InternalAggregations; @@ -29,16 +31,83 @@ import java.io.IOException; import java.util.List; import java.util.Map; +import java.util.Objects; /** * Result of the RareTerms aggregation when the field is some kind of whole number like a integer, long, or a date. */ -public class LongRareTerms extends InternalMappedRareTerms { +public class LongRareTerms extends InternalMappedRareTerms { public static final String NAME = "lrareterms"; + public static class Bucket extends InternalRareTerms.Bucket { + long term; + + public Bucket(long term, long docCount, InternalAggregations aggregations, DocValueFormat format) { + super(docCount, aggregations, format); + this.term = term; + } + + /** + * Read from a stream. 
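Reviewer note: `doXContentCommon` above renders each rare bucket as a key/doc_count pair inside a `buckets` array, mirroring the terms agg response shape minus the error fields. A hypothetical response fragment (aggregation name and value invented for illustration):

[source,js]
--------------------------------------------------
{
  "aggregations": {
    "genres": {
      "buckets": [
        { "key": "swing", "doc_count": 1 }
      ]
    }
  }
}
--------------------------------------------------
// NOTCONSOLE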
+ */ + public Bucket(StreamInput in, DocValueFormat format) throws IOException { + super(in, format); + term = in.readLong(); + } + + @Override + protected void writeTermTo(StreamOutput out) throws IOException { + out.writeLong(term); + } + + @Override + public String getKeyAsString() { + return format.format(term).toString(); + } + + @Override + public Object getKey() { + return term; + } + + @Override + public Number getKeyAsNumber() { + return term; + } + + @Override + public int compareKey(Bucket other) { + return Long.compare(term, other.term); + } + + @Override + Bucket newBucket(long docCount, InternalAggregations aggs) { + return new Bucket(term, docCount, aggs, format); + } + + @Override + protected final XContentBuilder keyToXContent(XContentBuilder builder) throws IOException { + builder.field(CommonFields.KEY.getPreferredName(), term); + if (format != DocValueFormat.RAW) { + builder.field(CommonFields.KEY_AS_STRING.getPreferredName(), format.format(term).toString()); + } + return builder; + } + + @Override + public boolean equals(Object obj) { + return super.equals(obj) && Objects.equals(term, ((Bucket) obj).term); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), term); + } + } + LongRareTerms(String name, BucketOrder order, List pipelineAggregators, Map metaData, DocValueFormat format, - List buckets, long maxDocCount, ExactBloomFilter bloom) { + List buckets, long maxDocCount, ExactBloomFilter bloom) { super(name, order, pipelineAggregators, metaData, format, buckets, maxDocCount, bloom); } @@ -46,7 +115,7 @@ public class LongRareTerms extends InternalMappedRareTerms buckets) { - return new LongRareTerms(name, order, pipelineAggregators(), metaData, format, - buckets, maxDocCount, bloom); + public LongRareTerms create(List buckets) { + return new LongRareTerms(name, order, pipelineAggregators(), metaData, format, buckets, maxDocCount, bloom); } @Override - public LongTerms.Bucket createBucket(InternalAggregations aggregations, LongTerms.Bucket prototype) { - return new LongTerms.Bucket(prototype.term, prototype.getDocCount(), aggregations, prototype.showDocCountError, - prototype.docCountError, prototype.format); + public LongRareTerms.Bucket createBucket(InternalAggregations aggregations, LongRareTerms.Bucket prototype) { + return new LongRareTerms.Bucket(prototype.term, prototype.getDocCount(), aggregations, prototype.format); } @Override - protected LongRareTerms create(String name, List buckets, long docCountError, long otherDocCount) { + protected LongRareTerms createWithBloom(String name, List buckets, ExactBloomFilter bloomFilter) { return new LongRareTerms(name, order, pipelineAggregators(), getMetaData(), format, - buckets, maxDocCount, bloom); + buckets, maxDocCount, bloomFilter); } @Override - protected LongTerms.Bucket[] createBucketsArray(int size) { - return new LongTerms.Bucket[size]; + protected LongRareTerms.Bucket[] createBucketsArray(int size) { + return new LongRareTerms.Bucket[size]; } @Override - public boolean containsTerm(ExactBloomFilter bloom, LongTerms.Bucket bucket) { + public boolean containsTerm(ExactBloomFilter bloom, LongRareTerms.Bucket bucket) { return bloom.mightContain((long) bucket.getKey()); } @Override - public void addToBloom(ExactBloomFilter bloom, LongTerms.Bucket bucket) { + public void addToBloom(ExactBloomFilter bloom, LongRareTerms.Bucket bucket) { bloom.put((long) bucket.getKey()); } } diff --git 
a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java index be803b715c867..264a1cd0444f4 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java @@ -28,7 +28,6 @@ import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.AggregatorFactories; -import org.elasticsearch.search.aggregations.BucketOrder; import org.elasticsearch.search.aggregations.InternalAggregation; import org.elasticsearch.search.aggregations.LeafBucketCollector; import org.elasticsearch.search.aggregations.LeafBucketCollectorBase; @@ -48,13 +47,11 @@ */ public class LongRareTermsAggregator extends AbstractRareTermsAggregator { - static final BucketOrder ORDER = BucketOrder.compound(BucketOrder.count(true), BucketOrder.key(true)); // sort by count ascending - //TODO review question: is LongLong map ok? protected LongLongHashMap map; protected LongHash bucketOrds; - private LeafBucketCollector subCollectors; + private static final long MAP_SLOT_SIZE = Long.BYTES * 2; LongRareTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Numeric valuesSource, DocValueFormat format, SearchContext aggregationContext, Aggregator parent, IncludeExclude.LongFilter longFilter, @@ -77,7 +74,7 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, subCollectors = sub; } return new LeafBucketCollectorBase(sub, values) { - private long numDeleted = 0; + @Override public void collect(int docId, long owningBucketOrdinal) throws IOException { @@ -94,7 +91,7 @@ public void collect(int docId, long owningBucketOrdinal) throws IOException { if (termCount == 0) { // Brand new term, save into map map.put(val, 1L); - addRequestCircuitBreakerBytes(16L);// 8 bytes for key, 8 for value + addRequestCircuitBreakerBytes(MAP_SLOT_SIZE); // 8 bytes for key, 8 for value long bucketOrdinal = bucketOrds.add(val); if (bucketOrdinal < 0) { // already seen @@ -123,8 +120,8 @@ public void collect(int docId, long owningBucketOrdinal) throws IOException { // the map and add to the bloom filter map.remove(val); bloom.put(val); + addRequestCircuitBreakerBytes(-MAP_SLOT_SIZE); // 8 bytes for key, 8 for value numDeleted += 1; - addRequestCircuitBreakerBytes(-16L); // 8 bytes for key, 8 for value if (numDeleted > GC_THRESHOLD) { gcDeletedEntries(numDeleted); @@ -142,7 +139,7 @@ public void collect(int docId, long owningBucketOrdinal) throws IOException { }; } - protected void gcDeletedEntries(Long numDeleted) { + protected void gcDeletedEntries(long numDeleted) { long deletionCount = 0; LongHash newBucketOrds = new LongHash(1, context.bigArrays()); try (LongHash oldBucketOrds = bucketOrds) { @@ -162,7 +159,7 @@ protected void gcDeletedEntries(Long numDeleted) { mergeMap[i] = newBucketOrd; } - if (numDeleted != null && deletionCount != numDeleted) { + if (numDeleted != -1 && deletionCount != numDeleted) { throw new IllegalStateException("Expected to prune [" + numDeleted + "] terms, but [" + deletionCount + "] were removed instead"); } @@ -182,13 +179,13 @@ protected void gcDeletedEntries(Long numDeleted) { @Override public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOException { assert owningBucketOrdinal == 0; - List
buckets = new ArrayList<>(map.size()); + List buckets = new ArrayList<>(map.size()); for (LongLongCursor cursor : map) { // The collection managed pruning unwanted terms, so any // terms that made it this far are "rare" and we want buckets long bucketOrdinal = bucketOrds.find(cursor.key); - LongTerms.Bucket bucket = new LongTerms.Bucket(0, 0, null, false, 0, format); + LongRareTerms.Bucket bucket = new LongRareTerms.Bucket(0, 0, null, format); bucket.term = cursor.key; bucket.docCount = cursor.value; bucket.bucketOrd = bucketOrdinal; @@ -200,9 +197,8 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOE runDeferredCollections(buckets.stream().mapToLong(b -> b.bucketOrd).toArray()); // Finalize the buckets - for (LongTerms.Bucket bucket : buckets) { + for (LongRareTerms.Bucket bucket : buckets) { bucket.aggregations = bucketAggregations(bucket.bucketOrd); - bucket.docCountError = 0; } CollectionUtil.introSort(buckets, ORDER.comparator(this)); diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTerms.java new file mode 100644 index 0000000000000..2248514783264 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTerms.java @@ -0,0 +1,48 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.bucket.terms; + +import org.elasticsearch.search.aggregations.bucket.MultiBucketsAggregation; + +import java.util.List; + + +public interface RareTerms extends MultiBucketsAggregation { + + /** + * A bucket that is associated with a single term + */ + interface Bucket extends MultiBucketsAggregation.Bucket { + + Number getKeyAsNumber(); + } + + /** + * Return the sorted list of the buckets in this terms aggregation. + */ + @Override + List getBuckets(); + + /** + * Get the bucket for the given term, or null if there is no such bucket. 
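+     *
+     * @param term the term to look up, as a string
+     * @return the matching bucket, or {@code null} if this aggregation contains no bucket for the term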
+ */ + Bucket getBucketByKey(String term); + +} + diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java index 024c7fe575dac..3a0f03446700f 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java @@ -18,8 +18,11 @@ */ package org.elasticsearch.search.aggregations.bucket.terms; +import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.util.ExactBloomFilter; +import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.aggregations.BucketOrder; import org.elasticsearch.search.aggregations.InternalAggregations; @@ -28,14 +31,85 @@ import java.io.IOException; import java.util.List; import java.util.Map; +import java.util.Objects; - -public class StringRareTerms extends InternalMappedRareTerms { +public class StringRareTerms extends InternalMappedRareTerms { public static final String NAME = "srareterms"; + public static class Bucket extends InternalRareTerms.Bucket { + BytesRef termBytes; + + public Bucket(BytesRef term, long docCount, InternalAggregations aggregations, DocValueFormat format) { + super(docCount, aggregations, format); + this.termBytes = term; + } + + /** + * Read from a stream. + */ + public Bucket(StreamInput in, DocValueFormat format) throws IOException { + super(in, format); + termBytes = in.readBytesRef(); + } + + @Override + protected void writeTermTo(StreamOutput out) throws IOException { + out.writeBytesRef(termBytes); + } + + @Override + public Object getKey() { + return getKeyAsString(); + } + + // this method is needed for scripted numeric aggs + @Override + public Number getKeyAsNumber() { + /* + * If the term is a long greater than 2^52 then parsing as a double would lose accuracy. Therefore, we first parse as a long and + * if this fails then we attempt to parse the term as a double. 
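+         * For example, the term "9007199254740993" (2^53 + 1) parses exactly as a long, but parsing it as a double would round it to 9007199254740992.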
+ */ + try { + return Long.parseLong(termBytes.utf8ToString()); + } catch (final NumberFormatException ignored) { + return Double.parseDouble(termBytes.utf8ToString()); + } + } + + @Override + public String getKeyAsString() { + return format.format(termBytes).toString(); + } + + @Override + public int compareKey(Bucket other) { + return termBytes.compareTo(other.termBytes); + } + + @Override + Bucket newBucket(long docCount, InternalAggregations aggs) { + return new Bucket(termBytes, docCount, aggs, format); + } + + @Override + protected final XContentBuilder keyToXContent(XContentBuilder builder) throws IOException { + return builder.field(CommonFields.KEY.getPreferredName(), getKeyAsString()); + } + + @Override + public boolean equals(Object obj) { + return super.equals(obj) && Objects.equals(termBytes, ((Bucket) obj).termBytes); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), termBytes); + } + } + public StringRareTerms(String name, BucketOrder order, List pipelineAggregators, Map metaData, DocValueFormat format, - List buckets, long maxDocCount, ExactBloomFilter bloom) { + List buckets, long maxDocCount, ExactBloomFilter bloom) { super(name, order, pipelineAggregators, metaData, format, buckets, maxDocCount, bloom); } @@ -43,7 +117,7 @@ public StringRareTerms(String name, BucketOrder order, List * Read from a stream. */ public StringRareTerms(StreamInput in) throws IOException { - super(in, StringTerms.Bucket::new); + super(in, StringRareTerms.Bucket::new); } @Override @@ -52,34 +126,33 @@ public String getWriteableName() { } @Override - public StringRareTerms create(List buckets) { + public StringRareTerms create(List buckets) { return new StringRareTerms(name, order, pipelineAggregators(), metaData, format, buckets, maxDocCount, bloom); } @Override - public StringTerms.Bucket createBucket(InternalAggregations aggregations, StringTerms.Bucket prototype) { - return new StringTerms.Bucket(prototype.termBytes, prototype.getDocCount(), aggregations, false, - prototype.docCountError, prototype.format); + public StringRareTerms.Bucket createBucket(InternalAggregations aggregations, StringRareTerms.Bucket prototype) { + return new StringRareTerms.Bucket(prototype.termBytes, prototype.getDocCount(), aggregations, prototype.format); } @Override - protected StringRareTerms create(String name, List buckets, long docCountError, long otherDocCount) { + protected StringRareTerms createWithBloom(String name, List buckets, ExactBloomFilter bloomFilter) { return new StringRareTerms(name, order, pipelineAggregators(), metaData, format, - buckets, maxDocCount, bloom); + buckets, maxDocCount, bloomFilter); } @Override - protected StringTerms.Bucket[] createBucketsArray(int size) { - return new StringTerms.Bucket[size]; + protected StringRareTerms.Bucket[] createBucketsArray(int size) { + return new StringRareTerms.Bucket[size]; } @Override - public boolean containsTerm(ExactBloomFilter bloom, StringTerms.Bucket bucket) { + public boolean containsTerm(ExactBloomFilter bloom, StringRareTerms.Bucket bucket) { return bloom.mightContain(bucket.termBytes); } @Override - public void addToBloom(ExactBloomFilter bloom, StringTerms.Bucket bucket) { + public void addToBloom(ExactBloomFilter bloom, StringRareTerms.Bucket bucket) { bloom.put(bucket.termBytes); } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java 
index b46b811f5de27..6e135db335fc5 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java @@ -48,17 +48,10 @@ * An aggregator that finds "rare" string values (e.g. terms agg that orders ascending) */ public class StringRareTermsAggregator extends AbstractRareTermsAggregator { - // TODO review question: is there equivalent to LongObjectPagedHashMap like used in LongRareTerms? protected ObjectLongHashMap map; protected BytesRefHash bucketOrds; - private LeafBucketCollector subCollectors; - // TODO review question: What to set this at? - /** - Sets the number of "removed" values to accumulate before we purge ords - via the MergingBucketCollector's mergeBuckets() method - */ - private final long GC_THRESHOLD = 10; + private static final long MAP_VALUE_SIZE = Long.BYTES; StringRareTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes valuesSource, DocValueFormat format, IncludeExclude.StringFilter stringFilter, @@ -78,7 +71,6 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, } return new LeafBucketCollectorBase(sub, values) { final BytesRefBuilder previous = new BytesRefBuilder(); - private long numDeleted = 0; @Override public void collect(int docId, long bucket) throws IOException { @@ -103,7 +95,7 @@ public void collect(int docId, long bucket) throws IOException { if (valueCount == 0) { // Brand new term, save into map map.put(BytesRef.deepCopyOf(bytes), 1L); - addRequestCircuitBreakerBytes(bytes.length + 8L); // size of term + 8 for counter + addRequestCircuitBreakerBytes(bytes.length + MAP_VALUE_SIZE); // size of term + 8 for counter long bucketOrdinal = bucketOrds.add(bytes); if (bucketOrdinal < 0) { // already seen @@ -131,7 +123,7 @@ public void collect(int docId, long bucket) throws IOException { map.remove(bytes); bloom.put(bytes); numDeleted += 1; - addRequestCircuitBreakerBytes(-(bytes.length + 8L)); // size of term + 8 for counter + addRequestCircuitBreakerBytes(-(bytes.length + MAP_VALUE_SIZE)); // size of term + 8 for counter if (numDeleted > GC_THRESHOLD) { gcDeletedEntries(numDeleted); @@ -147,7 +139,7 @@ public void collect(int docId, long bucket) throws IOException { }; } - protected void gcDeletedEntries(Long numDeleted) { + protected void gcDeletedEntries(long numDeleted) { long deletionCount = 0; BytesRefHash newBucketOrds = new BytesRefHash(1, context.bigArrays()); try (BytesRefHash oldBucketOrds = bucketOrds) { @@ -167,7 +159,7 @@ protected void gcDeletedEntries(Long numDeleted) { } mergeMap[i] = newBucketOrd; } - if (numDeleted != null && deletionCount != numDeleted) { + if (numDeleted != -1 && deletionCount != numDeleted) { throw new IllegalStateException("Expected to prune [" + numDeleted + "] terms, but [" + deletionCount + "] were removed instead"); } @@ -188,10 +180,10 @@ protected void gcDeletedEntries(Long numDeleted) { public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOException { assert owningBucketOrdinal == 0; - List buckets = new ArrayList<>(map.size()); + List buckets = new ArrayList<>(map.size()); for (ObjectLongCursor cursor : map) { - StringTerms.Bucket bucket = new StringTerms.Bucket(new BytesRef(), 0, null, false, 0, format); + StringRareTerms.Bucket bucket = new StringRareTerms.Bucket(new BytesRef(), 0, null, format); // The collection managed pruning unwanted terms, so any // terms that made it this far are "rare"
and we want buckets @@ -207,14 +199,12 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOE runDeferredCollections(buckets.stream().mapToLong(b -> b.bucketOrd).toArray()); // Finalize the buckets - for (StringTerms.Bucket bucket : buckets) { + for (StringRareTerms.Bucket bucket : buckets) { bucket.aggregations = bucketAggregations(bucket.bucketOrd); - bucket.docCountError = 0; } - CollectionUtil.introSort(buckets, LongRareTermsAggregator.ORDER.comparator(this)); - return new StringRareTerms(name, LongRareTermsAggregator.ORDER, pipelineAggregators(), metaData(), - format, buckets, maxDocCount, bloom); + CollectionUtil.introSort(buckets, ORDER.comparator(this)); + return new StringRareTerms(name, ORDER, pipelineAggregators(), metaData(), format, buckets, maxDocCount, bloom); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedRareTerms.java index 9c82f2dc09ded..ddb89285b63b2 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedRareTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedRareTerms.java @@ -20,7 +20,9 @@ import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.util.ExactBloomFilter; import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.aggregations.InternalAggregation; import org.elasticsearch.search.aggregations.InternalAggregations; import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; @@ -35,12 +37,17 @@ /** * Result of the RareTerms aggregation when the field is unmapped. 
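+ * Always produces an empty list of buckets; it exists so that shards without a mapping for the field can still contribute a well-formed (empty) result to the reduce phase.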
*/ -public class UnmappedRareTerms extends InternalTerms { +public class UnmappedRareTerms extends InternalRareTerms { public static final String NAME = "umrareterms"; - UnmappedRareTerms(String name, List pipelineAggregators, - Map metaData) { - super(name, LongRareTermsAggregator.ORDER, 0, 0, pipelineAggregators, metaData); + protected abstract static class Bucket extends InternalRareTerms.Bucket { + private Bucket(long docCount, InternalAggregations aggregations, DocValueFormat formatter) { + super(docCount, aggregations, formatter); + } + } + + UnmappedRareTerms(String name, List pipelineAggregators, Map metaData) { + super(name, LongRareTermsAggregator.ORDER, 0, pipelineAggregators, metaData); } /** @@ -66,18 +73,18 @@ public String getType() { } @Override - public UnmappedRareTerms create(List buckets) { + public UnmappedRareTerms create(List buckets) { return new UnmappedRareTerms(name, pipelineAggregators(), metaData); } @Override - public UnmappedTerms.Bucket createBucket(InternalAggregations aggregations, UnmappedTerms.Bucket prototype) { - throw new UnsupportedOperationException("not supported for UnmappedTerms"); + public UnmappedRareTerms.Bucket createBucket(InternalAggregations aggregations, UnmappedRareTerms.Bucket prototype) { + throw new UnsupportedOperationException("not supported for UnmappedRareTerms"); } @Override - protected UnmappedRareTerms create(String name, List buckets, long docCountError, long otherDocCount) { - throw new UnsupportedOperationException("not supported for UnmappedTerms"); + protected UnmappedRareTerms createWithBloom(String name, List buckets, ExactBloomFilter bloomFilter) { + throw new UnsupportedOperationException("not supported for UnmappedRareTerms"); } @Override @@ -92,40 +99,21 @@ public boolean isMapped() { @Override public final XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { - return doXContentCommon(builder, params, 0, 0, Collections.emptyList()); - } - - @Override - protected void setDocCountError(long docCountError) { - } - - @Override - protected int getShardSize() { - return 0; - } - - @Override - public long getDocCountError() { - return 0; - } - - @Override - public long getSumOfOtherDocCounts() { - return 0; + return doXContentCommon(builder, params, Collections.emptyList()); } @Override - public List getBuckets() { + public List getBuckets() { return emptyList(); } @Override - public UnmappedTerms.Bucket getBucketByKey(String term) { + public UnmappedRareTerms.Bucket getBucketByKey(String term) { return null; } @Override - protected UnmappedTerms.Bucket[] createBucketsArray(int size) { - return new UnmappedTerms.Bucket[size]; + protected UnmappedRareTerms.Bucket[] createBucketsArray(int size) { + return new UnmappedRareTerms.Bucket[size]; } } diff --git a/server/src/test/java/org/elasticsearch/common/util/ExactBloomFilterTests.java b/server/src/test/java/org/elasticsearch/common/util/ExactBloomFilterTests.java index 7405da6b0a648..d841b4e9b69ce 100644 --- a/server/src/test/java/org/elasticsearch/common/util/ExactBloomFilterTests.java +++ b/server/src/test/java/org/elasticsearch/common/util/ExactBloomFilterTests.java @@ -1,3 +1,21 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. 
Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ package org.elasticsearch.common.util; import org.elasticsearch.common.Numbers; diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java index 9003e4dbf5496..009aa03320ce8 100644 --- a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java +++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java @@ -33,14 +33,12 @@ import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TotalHits; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.NumericUtils; import org.elasticsearch.Version; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.common.util.MockBigArrays; -import org.elasticsearch.common.util.MockPageCacheRecycler; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.mapper.IdFieldMapper; import org.elasticsearch.index.mapper.KeywordFieldMapper; @@ -49,10 +47,8 @@ import org.elasticsearch.index.mapper.SeqNoFieldMapper; import org.elasticsearch.index.mapper.TypeFieldMapper; import org.elasticsearch.index.mapper.Uid; -import org.elasticsearch.indices.breaker.NoneCircuitBreakerService; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.aggregations.Aggregation; -import org.elasticsearch.search.aggregations.AggregationExecutionException; import org.elasticsearch.search.aggregations.Aggregations; import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.AggregatorTestCase; @@ -87,7 +83,6 @@ public class RareTermsAggregatorTests extends AggregatorTestCase { private static final String LONG_FIELD = "numeric"; private static final String KEYWORD_FIELD = "keyword"; - private static final String DOUBLE_FIELD = "double"; private static final List dataset; static { @@ -109,10 +104,6 @@ public void testMatchNoDocs() throws IOException { aggregation -> aggregation.field(LONG_FIELD).maxDocCount(1), agg -> assertEquals(0, agg.getBuckets().size()), ValueType.NUMERIC ); - testBothCases(new MatchNoDocsQuery(), dataset, - aggregation -> aggregation.field(DOUBLE_FIELD).maxDocCount(1), - agg -> assertEquals(0, agg.getBuckets().size()), ValueType.DOUBLE - ); } public void testMatchAllDocs() throws IOException { @@ -122,7 +113,7 @@ public void testMatchAllDocs() throws IOException { aggregation -> aggregation.field(LONG_FIELD).maxDocCount(1), agg -> { assertEquals(1, agg.getBuckets().size()); - LongTerms.Bucket bucket = (LongTerms.Bucket) agg.getBuckets().get(0); + LongRareTerms.Bucket bucket = (LongRareTerms.Bucket) 
agg.getBuckets().get(0); assertThat(bucket.getKey(), equalTo(1L)); assertThat(bucket.getDocCount(), equalTo(1L)); }, ValueType.NUMERIC @@ -131,19 +122,42 @@ public void testMatchAllDocs() throws IOException { aggregation -> aggregation.field(KEYWORD_FIELD).maxDocCount(1), agg -> { assertEquals(1, agg.getBuckets().size()); - StringTerms.Bucket bucket = (StringTerms.Bucket) agg.getBuckets().get(0); + StringRareTerms.Bucket bucket = (StringRareTerms.Bucket) agg.getBuckets().get(0); assertThat(bucket.getKeyAsString(), equalTo("1")); assertThat(bucket.getDocCount(), equalTo(1L)); }, ValueType.STRING ); - testBothCases(query, dataset, - aggregation -> aggregation.field(DOUBLE_FIELD).maxDocCount(1), + } + + public void testManyDocsOneRare() throws IOException { + Query query = new MatchAllDocsQuery(); + + List d = new ArrayList<>(500); + for (int i = 1; i < 500; i++) { + d.add((long) i); + d.add((long) i); + } + + // The one rare term + d.add(0L); + + testSearchAndReduceCase(query, d, + aggregation -> aggregation.field(LONG_FIELD).maxDocCount(1), agg -> { assertEquals(1, agg.getBuckets().size()); - DoubleTerms.Bucket bucket = (DoubleTerms.Bucket) agg.getBuckets().get(0); - assertThat(bucket.getKey(), equalTo(1.0)); + LongRareTerms.Bucket bucket = (LongRareTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKey(), equalTo(0L)); assertThat(bucket.getDocCount(), equalTo(1L)); - }, ValueType.DOUBLE + }, ValueType.NUMERIC + ); + testSearchAndReduceCase(query, d, + aggregation -> aggregation.field(KEYWORD_FIELD).maxDocCount(1), + agg -> { + assertEquals(1, agg.getBuckets().size()); + StringRareTerms.Bucket bucket = (StringRareTerms.Bucket) agg.getBuckets().get(0); + assertThat(bucket.getKeyAsString(), equalTo("0")); + assertThat(bucket.getDocCount(), equalTo(1L)); + }, ValueType.STRING ); } @@ -156,7 +170,7 @@ public void testIncludeExclude() throws IOException { .includeExclude(new IncludeExclude(new long[]{2}, new long[]{})), agg -> { assertEquals(1, agg.getBuckets().size()); - LongTerms.Bucket bucket = (LongTerms.Bucket) agg.getBuckets().get(0); + LongRareTerms.Bucket bucket = (LongRareTerms.Bucket) agg.getBuckets().get(0); assertThat(bucket.getKey(), equalTo(2L)); assertThat(bucket.getDocCount(), equalTo(2L)); }, ValueType.NUMERIC @@ -167,22 +181,11 @@ public void testIncludeExclude() throws IOException { .includeExclude(new IncludeExclude(new String[]{"2"}, new String[]{})), agg -> { assertEquals(1, agg.getBuckets().size()); - StringTerms.Bucket bucket = (StringTerms.Bucket) agg.getBuckets().get(0); + StringRareTerms.Bucket bucket = (StringRareTerms.Bucket) agg.getBuckets().get(0); assertThat(bucket.getKeyAsString(), equalTo("2")); assertThat(bucket.getDocCount(), equalTo(2L)); }, ValueType.STRING ); - testBothCases(query, dataset, - aggregation -> aggregation.field(DOUBLE_FIELD) - .maxDocCount(2) // bump to 2 since we're only including "2" - .includeExclude(new IncludeExclude(new double[]{2.0}, new double[]{})), - agg -> { - assertEquals(1, agg.getBuckets().size()); - DoubleTerms.Bucket bucket = (DoubleTerms.Bucket) agg.getBuckets().get(0); - assertThat(bucket.getKey(), equalTo(2.0)); - assertThat(bucket.getDocCount(), equalTo(2L)); - }, ValueType.DOUBLE - ); } public void testEmbeddedMaxAgg() throws IOException { @@ -194,7 +197,7 @@ public void testEmbeddedMaxAgg() throws IOException { }, agg -> { assertEquals(1, agg.getBuckets().size()); - LongTerms.Bucket bucket = (LongTerms.Bucket) agg.getBuckets().get(0); + LongRareTerms.Bucket bucket = (LongRareTerms.Bucket) 
agg.getBuckets().get(0); assertThat(bucket.getKey(), equalTo(1L)); assertThat(bucket.getDocCount(), equalTo(1L)); @@ -210,7 +213,7 @@ public void testEmbeddedMaxAgg() throws IOException { }, agg -> { assertEquals(1, agg.getBuckets().size()); - StringTerms.Bucket bucket = (StringTerms.Bucket) agg.getBuckets().get(0); + StringRareTerms.Bucket bucket = (StringRareTerms.Bucket) agg.getBuckets().get(0); assertThat(bucket.getKey(), equalTo("1")); assertThat(bucket.getDocCount(), equalTo(1L)); @@ -220,22 +223,6 @@ public void testEmbeddedMaxAgg() throws IOException { assertThat(((Max)(children.asList().get(0))).getValue(), equalTo(1.0)); }, ValueType.STRING ); - testBothCases(query, dataset, aggregation -> { - MaxAggregationBuilder max = new MaxAggregationBuilder("the_max").field(LONG_FIELD); - aggregation.field(DOUBLE_FIELD).maxDocCount(1).subAggregation(max); - }, - agg -> { - assertEquals(1, agg.getBuckets().size()); - DoubleTerms.Bucket bucket = (DoubleTerms.Bucket) agg.getBuckets().get(0); - assertThat(bucket.getKey(), equalTo(1.0)); - assertThat(bucket.getDocCount(), equalTo(1L)); - - Aggregations children = bucket.getAggregations(); - assertThat(children.asList().size(), equalTo(1)); - assertThat(children.asList().get(0).getName(), equalTo("the_max")); - assertThat(((Max)(children.asList().get(0))).getValue(), equalTo(1.0)); - }, ValueType.DOUBLE - ); } public void testEmpty() throws IOException { @@ -249,10 +236,6 @@ public void testEmpty() throws IOException { aggregation -> aggregation.field(KEYWORD_FIELD).maxDocCount(1), agg -> assertEquals(0, agg.getBuckets().size()), ValueType.STRING ); - testSearchCase(query, Collections.emptyList(), - aggregation -> aggregation.field(DOUBLE_FIELD).maxDocCount(1), - agg -> assertEquals(0, agg.getBuckets().size()), ValueType.DOUBLE - ); // Note: the search and reduce test will generate no segments (due to no docs) // and so will return a null agg because the aggs aren't run/reduced @@ -264,10 +247,6 @@ public void testEmpty() throws IOException { aggregation -> aggregation.field(KEYWORD_FIELD).maxDocCount(1), Assert::assertNull, ValueType.STRING ); - testSearchAndReduceCase(query, Collections.emptyList(), - aggregation -> aggregation.field(DOUBLE_FIELD).maxDocCount(1), - Assert::assertNull, ValueType.DOUBLE - ); } public void testUnmapped() throws Exception { @@ -276,7 +255,6 @@ public void testUnmapped() throws Exception { Document document = new Document(); document.add(new SortedDocValuesField("string", new BytesRef("a"))); document.add(new NumericDocValuesField("long", 0L)); - document.add(new NumericDocValuesField("double", Double.doubleToRawLongBits(0L))); indexWriter.addDocument(document); MappedFieldType fieldType1 = new KeywordFieldMapper.KeywordFieldType(); fieldType1.setName("another_string"); @@ -286,21 +264,19 @@ public void testUnmapped() throws Exception { fieldType2.setName("another_long"); fieldType2.setHasDocValues(true); - MappedFieldType fieldType3 = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.DOUBLE); - fieldType3.setName("another_double"); - fieldType3.setHasDocValues(true); + try (IndexReader indexReader = maybeWrapReaderEs(indexWriter.getReader())) { IndexSearcher indexSearcher = newIndexSearcher(indexReader); - ValueType[] valueTypes = new ValueType[]{ValueType.STRING, ValueType.LONG, ValueType.DOUBLE}; - String[] fieldNames = new String[]{"string", "long", "double"}; + ValueType[] valueTypes = new ValueType[]{ValueType.STRING, ValueType.LONG}; + String[] fieldNames = new String[]{"string", "long"}; for 
(int i = 0; i < fieldNames.length; i++) { RareTermsAggregationBuilder aggregationBuilder = new RareTermsAggregationBuilder("_name", valueTypes[i]) .field(fieldNames[i]); - Aggregator aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType1, fieldType2, fieldType3); + Aggregator aggregator = createAggregator(aggregationBuilder, indexSearcher, fieldType1, fieldType2); aggregator.preCollection(); indexSearcher.search(new MatchAllDocsQuery(), aggregator); aggregator.postCollection(); - Terms result = (Terms) aggregator.buildAggregation(0L); + RareTerms result = (RareTerms) aggregator.buildAggregation(0L); assertEquals("_name", result.getName()); assertEquals(0, result.getBuckets().size()); } @@ -318,7 +294,7 @@ public void testNestedTerms() throws IOException { }, agg -> { assertEquals(1, agg.getBuckets().size()); - LongTerms.Bucket bucket = (LongTerms.Bucket) agg.getBuckets().get(0); + LongRareTerms.Bucket bucket = (LongRareTerms.Bucket) agg.getBuckets().get(0); assertThat(bucket.getKey(), equalTo(1L)); assertThat(bucket.getDocCount(), equalTo(1L)); @@ -336,7 +312,7 @@ public void testNestedTerms() throws IOException { }, agg -> { assertEquals(1, agg.getBuckets().size()); - StringTerms.Bucket bucket = (StringTerms.Bucket) agg.getBuckets().get(0); + StringRareTerms.Bucket bucket = (StringRareTerms.Bucket) agg.getBuckets().get(0); assertThat(bucket.getKey(), equalTo("1")); assertThat(bucket.getDocCount(), equalTo(1L)); @@ -347,71 +323,8 @@ public void testNestedTerms() throws IOException { assertThat(((Terms)(children.asList().get(0))).getBuckets().get(0).getKeyAsString(), equalTo("1")); }, ValueType.STRING ); - testBothCases(query, dataset, aggregation -> { - TermsAggregationBuilder terms = new TermsAggregationBuilder("the_terms", ValueType.STRING).field(KEYWORD_FIELD); - aggregation.field(DOUBLE_FIELD).maxDocCount(1).subAggregation(terms); - }, - agg -> { - assertEquals(1, agg.getBuckets().size()); - DoubleTerms.Bucket bucket = (DoubleTerms.Bucket) agg.getBuckets().get(0); - assertThat(bucket.getKey(), equalTo(1.0)); - assertThat(bucket.getDocCount(), equalTo(1L)); - - Aggregations children = bucket.getAggregations(); - assertThat(children.asList().size(), equalTo(1)); - assertThat(children.asList().get(0).getName(), equalTo("the_terms")); - assertThat(((Terms)(children.asList().get(0))).getBuckets().size(), equalTo(1)); - assertThat(((Terms)(children.asList().get(0))).getBuckets().get(0).getKeyAsString(), equalTo("1")); - }, ValueType.DOUBLE - ); } - public void testMixLongAndDouble() throws Exception { - RareTermsAggregationBuilder aggregationBuilder = new RareTermsAggregationBuilder("_name", ValueType.LONG) - .field(LONG_FIELD) - .maxDocCount(2); - List aggs = new ArrayList<>(); - - try (Directory directory = newDirectory()) { - try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { - Document document = new Document(); - document.add(new NumericDocValuesField(LONG_FIELD, 1L)); - indexWriter.addDocument(document); - - try (IndexReader indexReader = maybeWrapReaderEs(indexWriter.getReader())) { - IndexSearcher indexSearcher = newIndexSearcher(indexReader); - MappedFieldType fieldType = - new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG); - fieldType.setName(LONG_FIELD); - fieldType.setHasDocValues(true); - aggs.add(buildInternalAggregation(aggregationBuilder, fieldType, indexSearcher)); - - } - } - } - try (Directory directory = newDirectory()) { - try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), 
directory)) { - Document document = new Document(); - document.add(new SortedNumericDocValuesField(LONG_FIELD, NumericUtils.doubleToSortableLong(1.0d))); - indexWriter.addDocument(document); - - try (IndexReader indexReader = maybeWrapReaderEs(indexWriter.getReader())) { - IndexSearcher indexSearcher = newIndexSearcher(indexReader); - MappedFieldType fieldType = - new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.DOUBLE); - fieldType.setName(LONG_FIELD); - fieldType.setHasDocValues(true); - aggs.add(buildInternalAggregation(aggregationBuilder, fieldType, indexSearcher)); - } - } - } - InternalAggregation.ReduceContext ctx = - new InternalAggregation.ReduceContext(new MockBigArrays(new MockPageCacheRecycler(Settings.EMPTY), - new NoneCircuitBreakerService()), null, true); - AggregationExecutionException e = expectThrows(AggregationExecutionException.class, () -> aggs.get(0).doReduce(aggs, ctx)); - } - - public void testGlobalAggregationWithScore() throws IOException { try (Directory directory = newDirectory()) { try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { @@ -538,10 +451,14 @@ public void testWithNestedScoringAggregations() throws IOException { new DocValuesFieldExistsQuery(PRIMARY_TERM_NAME), nested, fieldType); InternalMultiBucketAggregation terms = result.getAggregations().get("terms"); assertThat(terms.getBuckets().size(), equalTo(2)); + long counter = 1; for (MultiBucketsAggregation.Bucket bucket : terms.getBuckets()) { InternalTopHits topHits = bucket.getAggregations().get("top_hits"); - assertThat(topHits.getHits(), equalTo(1L)); + TotalHits hits = topHits.getHits().getTotalHits(); + assertNotNull(hits); + assertThat(hits.value, equalTo(counter)); assertThat(topHits.getHits().getMaxScore(), equalTo(Float.NaN)); + counter += 1; } } } @@ -629,7 +546,6 @@ private void executeTestCase(boolean reduced, Query query, List dataset, document.add(new SortedNumericDocValuesField(LONG_FIELD, value)); document.add(new LongPoint(LONG_FIELD, value)); document.add(new SortedSetDocValuesField(KEYWORD_FIELD, new BytesRef(Long.toString(value)))); - document.add(new SortedNumericDocValuesField(DOUBLE_FIELD, Double.doubleToRawLongBits((double) value))); indexWriter.addDocument(document); document.clear(); } @@ -651,15 +567,11 @@ private void executeTestCase(boolean reduced, Query query, List dataset, longFieldType.setName(LONG_FIELD); longFieldType.setHasDocValues(true); - MappedFieldType doubleFieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.DOUBLE); - doubleFieldType.setName(DOUBLE_FIELD); - doubleFieldType.setHasDocValues(true); - InternalMappedRareTerms rareTerms; if (reduced) { - rareTerms = searchAndReduce(indexSearcher, query, aggregationBuilder, keywordFieldType, longFieldType, doubleFieldType); + rareTerms = searchAndReduce(indexSearcher, query, aggregationBuilder, keywordFieldType, longFieldType); } else { - rareTerms = search(indexSearcher, query, aggregationBuilder, keywordFieldType, longFieldType, doubleFieldType); + rareTerms = search(indexSearcher, query, aggregationBuilder, keywordFieldType, longFieldType); } verify.accept(rareTerms); } From db20294e3750f764503bc1ea05b5f483c0b93ce8 Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Thu, 28 Feb 2019 16:48:21 -0500 Subject: [PATCH 08/25] Test tweaks, comments --- .../search.aggregation/280_rare_terms.yml | 26 +++++++++---------- .../terms/AbstractRareTermsAggregator.java | 8 ++++++ .../bucket/terms/LongRareTermsAggregator.java | 2 +- 
.../terms/StringRareTermsAggregator.java | 1 + 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml index e0eb30e3582ab..eac3287b2ab78 100644 --- a/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml @@ -52,7 +52,7 @@ setup: search: body: { "size" : 0, "aggs" : { "str_terms" : { "rare_terms" : { "field" : "str", "max_doc_count" : 1 } } } } - - match: { hits.total: 3 } + - match: { hits.total.value: 3 } - length: { aggregations.str_terms.buckets: 1 } - match: { aggregations.str_terms.buckets.0.key: "bcd" } - is_false: aggregations.str_terms.buckets.0.key_as_string @@ -85,7 +85,7 @@ setup: search: body: { "size" : 0, "aggs" : { "ip_terms" : { "rare_terms" : { "field" : "ip" } } } } - - match: { hits.total: 3 } + - match: { hits.total.value: 3 } - length: { aggregations.ip_terms.buckets: 1 } - match: { aggregations.ip_terms.buckets.0.key: "127.0.0.1" } - is_false: aggregations.ip_terms.buckets.0.key_as_string @@ -95,7 +95,7 @@ setup: search: body: { "size" : 0, "aggs" : { "ip_terms" : { "rare_terms" : { "field" : "ip", "include" : [ "127.0.0.1" ] } } } } - - match: { hits.total: 3 } + - match: { hits.total.value: 3 } - length: { aggregations.ip_terms.buckets: 1 } - match: { aggregations.ip_terms.buckets.0.key: "127.0.0.1" } - is_false: aggregations.ip_terms.buckets.0.key_as_string @@ -105,7 +105,7 @@ setup: search: body: { "size" : 0, "aggs" : { "ip_terms" : { "rare_terms" : { "field" : "ip", "exclude" : [ "127.0.0.1" ] } } } } - - match: { hits.total: 3 } + - match: { hits.total.value: 3 } - length: { aggregations.ip_terms.buckets: 0 } - do: @@ -143,7 +143,7 @@ setup: search: body: { "size" : 0, "aggs" : { "boolean_terms" : { "rare_terms" : { "field" : "boolean" } } } } - - match: { hits.total: 3 } + - match: { hits.total.value: 3 } - length: { aggregations.boolean_terms.buckets: 1 } - match: { aggregations.boolean_terms.buckets.0.key: 0 } - match: { aggregations.boolean_terms.buckets.0.key_as_string: "false" } @@ -176,7 +176,7 @@ setup: search: body: { "size" : 0, "aggs" : { "integer_terms" : { "rare_terms" : { "field" : "integer" } } } } - - match: { hits.total: 3 } + - match: { hits.total.value: 3 } - length: { aggregations.integer_terms.buckets: 1 } @@ -211,7 +211,7 @@ setup: search: body: { "size" : 0, "aggs" : { "date_terms" : { "rare_terms" : { "field" : "date" } } } } - - match: { hits.total: 3 } + - match: { hits.total.value: 3 } - length: { aggregations.date_terms.buckets: 1 } - match: { aggregations.date_terms.buckets.0.key: 1409529600000 } @@ -222,7 +222,7 @@ setup: search: body: { "size" : 0, "aggs" : { "date_terms" : { "rare_terms" : { "field" : "date", "include" : [ "2014-09-01" ] } } } } - - match: { hits.total: 3 } + - match: { hits.total.value: 3 } - length: { aggregations.date_terms.buckets: 1 } - match: { aggregations.date_terms.buckets.0.key_as_string: "2014-09-01T00:00:00.000Z" } - match: { aggregations.date_terms.buckets.0.doc_count: 1 } @@ -231,7 +231,7 @@ setup: search: body: { "size" : 0, "aggs" : { "date_terms" : { "rare_terms" : { "field" : "date", "exclude" : [ "2014-09-01" ] } } } } - - match: { hits.total: 3 } + - match: { hits.total.value: 3 } - length: { aggregations.date_terms.buckets: 0 } --- @@ -250,7 +250,7 @@ setup: search: body: { 
"size" : 0, "aggs" : { "string_terms" : { "rare_terms" : { "field" : "unmapped_string"} } } } - - match: { hits.total: 1 } + - match: { hits.total.value: 1 } - length: { aggregations.string_terms.buckets: 0 } --- @@ -269,7 +269,7 @@ setup: search: body: { "size" : 0, "aggs" : { "boolean_terms" : { "rare_terms" : { "field" : "unmapped_boolean" } } } } - - match: { hits.total: 1 } + - match: { hits.total.value: 1 } - length: { aggregations.boolean_terms.buckets: 0 } --- @@ -288,7 +288,7 @@ setup: search: body: { "size" : 0, "aggs" : { "date_terms" : { "rare_terms" : { "field" : "unmapped_date"} } } } - - match: { hits.total: 1 } + - match: { hits.total.value: 1 } - length: { aggregations.date_terms.buckets: 0 } --- @@ -307,7 +307,7 @@ setup: search: body: { "size" : 0, "aggs" : { "long_terms" : { "rare_terms" : { "field" : "unmapped_long", "value_type" : "long" } } } } - - match: { hits.total: 1 } + - match: { hits.total.value: 1 } - length: { aggregations.long_terms.buckets: 0 } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java index def09c3168ba1..69439d32a05b1 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java @@ -126,5 +126,13 @@ private String descendsFromNestedAggregator(Aggregator parent) { return null; } + /** + * Remove entries from the ordinal map which are no longer tracked in the active key's map. + * Will internally call the merge function of {@link MergingBucketsDeferringCollector}, so this + * should be called sparingly for performance reasons + * + * @param numDeleted the number of keys that are expected to be pruned during GC. + * Used to help verify correct functioning of GC + */ abstract void gcDeletedEntries(long numDeleted); } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java index 264a1cd0444f4..a622d539d29d2 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java @@ -47,10 +47,10 @@ */ public class LongRareTermsAggregator extends AbstractRareTermsAggregator { - //TODO review question: is LongLong map ok? 
protected LongLongHashMap map; protected LongHash bucketOrds; + // Size of a key:value pair in the active map, used for CB accounting private static final long MAP_SLOT_SIZE = Long.BYTES * 2; LongRareTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Numeric valuesSource, DocValueFormat format, diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java index 6e135db335fc5..e52c0fb123c78 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java @@ -51,6 +51,7 @@ public class StringRareTermsAggregator extends AbstractRareTermsAggregator map; protected BytesRefHash bucketOrds; + // Size of values in active map, used for CB accounting private static final long MAP_VALUE_SIZE = Long.BYTES; StringRareTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes valuesSource, From 0acd9ddd2c939d9281f43c76f1aa431c5e66233b Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Fri, 1 Mar 2019 12:05:30 -0500 Subject: [PATCH 09/25] Fix doc tests, add back final on InternalMappedTerms#writeTermTypeInfoTo --- .../bucket/rare-terms-aggregation.asciidoc | 20 +++++++------------ .../bucket/terms/InternalMappedTerms.java | 2 +- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc index 6c178863e8f91..214fb4102327f 100644 --- a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc @@ -14,20 +14,18 @@ aggregation PUT /products { "mappings": { - "product": { - "properties": { - "genre": { - "type": "keyword" - }, - "product": { - "type": "keyword" - } + "properties": { + "genre": { + "type": "keyword" + }, + "product": { + "type": "keyword" } } } } -POST /products/product/_bulk?refresh +POST /products/_doc/_bulk?refresh {"index":{"_id":0}} {"genre": "rock", "product": "Product A"} {"index":{"_id":1}} @@ -109,8 +107,6 @@ Response: ... "aggregations" : { "genres" : { - "doc_count_error_upper_bound": 0, - "sum_other_doc_count": 0, "buckets" : [ { "key" : "swing", @@ -151,8 +147,6 @@ This now shows the "jazz" term which has a `doc_count` of 2: ...
"aggregations" : { "genres" : { - "doc_count_error_upper_bound": 0, - "sum_other_doc_count": 0, "buckets" : [ { "key" : "swing", diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedTerms.java index 5622b2fa29230..547c9d0a80ec6 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedTerms.java @@ -72,7 +72,7 @@ protected InternalMappedTerms(StreamInput in, Bucket.Reader bucketReader) thr } @Override - protected void writeTermTypeInfoTo(StreamOutput out) throws IOException { + protected final void writeTermTypeInfoTo(StreamOutput out) throws IOException { out.writeZLong(docCountError); out.writeNamedWriteable(format); writeSize(shardSize, out); From f4e0146cbbb69808a891b4c67c17902bcae588e8 Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Wed, 13 Mar 2019 14:59:46 -0400 Subject: [PATCH 10/25] Rework rare_terms to use CuckooFilter Implements a scaling cuckoo filter instead of bloom filter --- .../common/util/CuckooFilter.java | 474 ++++++++++++++++++ .../common/util/ExactBloomFilter.java | 458 ----------------- .../util/SetBackedScalingCuckooFilter.java | 361 +++++++++++++ .../terms/AbstractRareTermsAggregator.java | 21 +- .../bucket/terms/InternalMappedRareTerms.java | 49 +- .../bucket/terms/InternalRareTerms.java | 4 +- .../bucket/terms/LongRareTerms.java | 22 +- .../bucket/terms/LongRareTermsAggregator.java | 15 +- .../terms/RareTermsAggregationBuilder.java | 34 +- .../terms/RareTermsAggregatorFactory.java | 15 +- .../bucket/terms/StringRareTerms.java | 23 +- .../terms/StringRareTermsAggregator.java | 14 +- .../bucket/terms/UnmappedRareTerms.java | 4 +- .../common/util/CuckooFilterTests.java | 132 +++++ .../common/util/ExactBloomFilterTests.java | 108 ---- .../SetBackedScalingCuckooFilterTests.java | 188 +++++++ .../terms/RareTermsAggregatorTests.java | 74 +++ .../org/elasticsearch/test/ESTestCase.java | 3 +- ...SessionFactoryLoadBalancingTests.java.orig | 402 +++++++++++++++ 19 files changed, 1754 insertions(+), 647 deletions(-) create mode 100644 server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java delete mode 100644 server/src/main/java/org/elasticsearch/common/util/ExactBloomFilter.java create mode 100644 server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java create mode 100644 server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java delete mode 100644 server/src/test/java/org/elasticsearch/common/util/ExactBloomFilterTests.java create mode 100644 server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java create mode 100644 x-pack/plugin/security/src/test/java/org/elasticsearch/xpack/security/authc/ldap/support/SessionFactoryLoadBalancingTests.java.orig diff --git a/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java b/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java new file mode 100644 index 0000000000000..9b9cf524668b6 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java @@ -0,0 +1,474 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. 
Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.common.util; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.packed.PackedInts; +import org.elasticsearch.common.hash.MurmurHash3; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Writeable; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Iterator; +import java.util.Objects; +import java.util.Random; + +/** + * An approximate set membership data structure. + * + * CuckooFilters are similar to Bloom Filters in usage; values are inserted, and the filter + * can be asked if it has seen a particular value before. Because the structure is approximate, + * it can return false positives (says it has seen an item when it has not). False negatives + * are not possible though; if the structure says it _has not_ seen an item, that can be + * trusted. + * + * The filter can "saturate", at which point the map is fully loaded and will refuse to accept + * any new insertions. + * + * NOTE: this version does not support deletions, and as such does not save duplicate + * fingerprints (e.g. when inserting, if the fingerprint is already present in the + * candidate buckets, it is not inserted). By not saving duplicates, the CuckooFilter + * loses the ability to delete values. + */ +public class CuckooFilter implements Writeable { + + private static final double LN_2 = Math.log(2); + private static final int MAX_EVICTIONS = 500; + static final int EMPTY = 0; + + private final PackedInts.Mutable data; + private final int numBuckets; + private final int bitsPerEntry; + private final int fingerprintMask; + private final int entriesPerBucket; + private final Random rng; + private int count; + private int evictedFingerprint = EMPTY; + + /** + * @param capacity The number of expected inserts. The filter can hold more than this value; it is just an estimate + * @param fpp The desired false positive rate. Smaller values will reduce the + * false positives at the expense of a larger size + * @param rng A random number generator, used with the cuckoo hashing process + */ + CuckooFilter(long capacity, double fpp, Random rng) { + this.rng = rng; + this.entriesPerBucket = entriesPerBucket(fpp); + double loadFactor = getLoadFactor(entriesPerBucket); + this.bitsPerEntry = bitsPerEntry(fpp, entriesPerBucket); + this.numBuckets = getNumBuckets(capacity, loadFactor, entriesPerBucket); + + // This shouldn't happen, but as a sanity check (use long math so the check itself cannot overflow) + if ((long) numBuckets * entriesPerBucket > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Attempted to create [" + (long) numBuckets * entriesPerBucket + + "] entries which is > Integer.MAX_VALUE"); + } + this.data = PackedInts.getMutable(numBuckets * entriesPerBucket, bitsPerEntry, PackedInts.COMPACT); + + // puts the bits at the right side of the mask, e.g.
+ + CuckooFilter(StreamInput in, Random rng) throws IOException { + this.numBuckets = in.readVInt(); + this.bitsPerEntry = in.readVInt(); + this.entriesPerBucket = in.readVInt(); + this.count = in.readVInt(); + this.evictedFingerprint = in.readVInt(); + this.rng = rng; + + this.fingerprintMask = (0x80000000 >> (bitsPerEntry - 1)) >>> (Integer.SIZE - bitsPerEntry); + + data = (PackedInts.Mutable) PackedInts.getReader(new DataInput() { + @Override + public byte readByte() throws IOException { + return in.readByte(); + } + + @Override + public void readBytes(byte[] b, int offset, int len) throws IOException { + in.readBytes(b, offset, len); + } + }); + } + + CuckooFilter(CuckooFilter other) { + this.numBuckets = other.numBuckets; + this.bitsPerEntry = other.bitsPerEntry; + this.entriesPerBucket = other.entriesPerBucket; + this.count = other.count; + this.evictedFingerprint = other.evictedFingerprint; + this.rng = other.rng; + this.fingerprintMask = other.fingerprintMask; + + // This shouldn't happen, but as a sanity check (widened to long, as in the primary constructor) + if ((long) numBuckets * entriesPerBucket > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Attempted to create [" + (long) numBuckets * entriesPerBucket + + "] entries which is > Integer.MAX_VALUE"); + } + // TODO this is probably super slow, but just used for testing atm + this.data = PackedInts.getMutable(numBuckets * entriesPerBucket, bitsPerEntry, PackedInts.COMPACT); + for (int i = 0; i < other.data.size(); i++) { + data.set(i, other.data.get(i)); + } + } + + /** + * Get the number of unique items that are being tracked + */ + public int getCount() { + return count; + } + + /** + * Returns an iterator that returns the long[] representation of each bucket. The value + * inside each long will be a fingerprint (or 0L, representing empty). + * + * Expert-level API + */ + Iterator<long[]> getBuckets() { + return new Iterator<long[]>() { + int current = 0; + + @Override + public boolean hasNext() { + return current < numBuckets; + } + + @Override + public long[] next() { + long[] values = new long[entriesPerBucket]; + int offset = getOffset(current, 0); + data.get(offset, values, 0, entriesPerBucket); + current += 1; + return values; + } + }; + } + + /** + * Returns true if the set might contain the provided value, false otherwise. False values are + * 100% accurate, while true values may be a false-positive. + */ + boolean mightContain(MurmurHash3.Hash128 hash) { + int bucket = hashToIndex((int) hash.h1); + int fingerprint = fingerprint((int) hash.h2); + + return mightContainFingerprint(bucket, fingerprint); + }
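The lookup above only ever touches two buckets. Two buckets suffice because of the reversible xor in `alternateIndex` (shown further down): with a power-of-two bucket count, either bucket plus the fingerprint recovers the other bucket. A standalone sketch of that round trip, with illustrative constants; the production code routes through `hashToIndex`, which behaves like the mask below for non-negative values:

[source,java]
--------------------------------------------------
// partial-key cuckoo hashing: (bucket, fingerprint) -> alternate bucket, and back
int numBuckets = 1 << 20;                 // getNumBuckets() always returns a power of two
int mask = numBuckets - 1;
int bucket = 12345;                       // primary bucket of some value
int fingerprint = 0x9;                    // its stored fingerprint

int mixed = fingerprint * 0x5bd1e995;     // MurmurHash2 mixing constant, as in alternateIndex()
int alternate = (bucket ^ mixed) & mask;
int roundTrip = (alternate ^ mixed) & mask;
assert roundTrip == bucket;               // xor is its own inverse, so a displaced fingerprint
                                          // can always find its other home without re-hashing
                                          // the original value
--------------------------------------------------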
+ /** + * Returns true if the bucket or its alternate bucket contains the fingerprint. + * + * Expert-level API, use {@link CuckooFilter#mightContain(MurmurHash3.Hash128)} to check if + * a value is in the filter. + */ + boolean mightContainFingerprint(int bucket, int fingerprint) { + int alternateBucket = alternateIndex(bucket, fingerprint); + + // check all entries for both buckets and the evicted slot + return hasFingerprint(bucket, fingerprint) || hasFingerprint(alternateBucket, fingerprint) || evictedFingerprint == fingerprint; + } + + /** + * Returns true if any of the entries in the bucket contain the fingerprint + */ + private boolean hasFingerprint(int bucket, long fingerprint) { + long[] values = new long[entriesPerBucket]; + int offset = getOffset(bucket, 0); + data.get(offset, values, 0, entriesPerBucket); + + return Arrays.stream(values).anyMatch(value -> value == fingerprint); + } + + /** + * Adds the hash to the bucket or alternate bucket. Returns true if the insertion was + * successful, false if the filter is saturated. + */ + boolean add(MurmurHash3.Hash128 hash) { + // can only use 64 of the 128 bits unfortunately (32 for the bucket index, 32 for the + // fingerprint), simplest to just truncate h1 and h2 appropriately + int bucket = hashToIndex((int) hash.h1); + int fingerprint = fingerprint((int) hash.h2); + return mergeFingerprint(bucket, fingerprint); + } + + /** + * Attempts to merge the fingerprint into the specified bucket or its alternate bucket. + * Returns true if the insertion was successful, false if the filter is saturated. + * + * Expert-level API, use {@link CuckooFilter#add(MurmurHash3.Hash128)} to insert + * values into the filter + */ + boolean mergeFingerprint(int bucket, int fingerprint) { + // If we already have an evicted fingerprint we are full, no need to try + if (evictedFingerprint != EMPTY) { + return false; + } + + int alternateBucket = alternateIndex(bucket, fingerprint); + if (tryInsert(bucket, fingerprint) || tryInsert(alternateBucket, fingerprint)) { + count += 1; + return true; + } + + for (int i = 0; i < MAX_EVICTIONS; i++) { + // overwrite a random entry in our alternate bucket + int offset = getOffset(alternateBucket, rng.nextInt(entriesPerBucket - 1)); + int oldFingerprint = (int) data.get(offset); + data.set(offset, fingerprint); + + // replace details and start again + fingerprint = oldFingerprint; + bucket = alternateBucket; + alternateBucket = alternateIndex(bucket, fingerprint); + + // Only try to insert into alternate bucket + if (tryInsert(alternateBucket, fingerprint)) { + count += 1; + return true; + } + } + + // If we get this far, we failed to insert the value after MAX_EVICTIONS rounds, + // so cache the last evicted value (so we don't lose it) and signal we failed + evictedFingerprint = fingerprint; + return false; + }
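Since `mergeFingerprint` gives up after `MAX_EVICTIONS` displacements and parks the last evicted fingerprint, saturation shows up to callers simply as `add` returning false. A hypothetical driver loop illustrating that contract (sketch only, not from the patch):

[source,java]
--------------------------------------------------
CuckooFilter filter = new CuckooFilter(1_000, 0.01, new Random(0));
Random values = new Random(1);
byte[] scratch = new byte[8];

boolean accepted = true;
while (accepted) {
    values.nextBytes(scratch);
    MurmurHash3.Hash128 hash = MurmurHash3.hash128(scratch, 0, scratch.length, 0, new MurmurHash3.Hash128());
    accepted = filter.add(hash);  // false once 500 displacements fail in a row; the last
                                  // displaced fingerprint is parked in evictedFingerprint
                                  // so no previously inserted value is forgotten
}
// a saturated filter rejects everything from then on; SetBackedScalingCuckooFilter
// (later in this patch) reacts by chaining a brand new filter instead
--------------------------------------------------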
+ + /** + * Low-level insert method. Attempts to write the fingerprint into an empty entry + * at this bucket's position. Returns true if that was successful, false if all entries + * were occupied. + * + * If the fingerprint already exists in one of the entries, it will not duplicate the + * fingerprint like the original paper. This means the filter _cannot_ support deletes, + * but is not sensitive to "overflowing" buckets with repeated inserts + */ + private boolean tryInsert(int bucket, int fingerprint) { + long[] values = new long[entriesPerBucket]; + int offset = getOffset(bucket, 0); + data.get(offset, values, 0, entriesPerBucket); + + // TODO implement semi-sorting + for (int i = 0; i < values.length; i++) { + if (values[i] == EMPTY) { + data.set(offset + i, fingerprint); + return true; + } else if (values[i] == fingerprint) { + // Already have the fingerprint, no need to save + return true; + } + } + return false; + } + + /** + * Converts a hash into a bucket index (primary or alternate). + * + * If the hash is negative, this flips the bits. The hash is then modulo numBuckets + * to get the final index. + */ + private int hashToIndex(int hash) { + // invert the bits if we're negative + if (hash < 0) { + hash = ~hash; + } + return hash % numBuckets; + } + + /** + * Calculates the alternate bucket for a given bucket:fingerprint tuple + * + * The alternate bucket is the fingerprint multiplied by a mixing constant, + * then xor'd against the bucket. This new value is modulo'd against + * the buckets via {@link CuckooFilter#hashToIndex(int)} to get the final + * index. + * + * Note that the xor makes this operation reversible as long as we have the + * fingerprint and current bucket (regardless of if that bucket was the primary + * or alternate). + */ + private int alternateIndex(int bucket, int fingerprint) { + /* + Reference impl uses murmur2 mixing constant: + https://github.com/efficient/cuckoofilter/blob/master/src/cuckoofilter.h#L78 + // NOTE(binfan): originally we use: + // index ^ HashUtil::BobHash((const void*) (&tag), 4)) & table_->INDEXMASK; + // now doing a quick-n-dirty way: + // 0x5bd1e995 is the hash constant from MurmurHash2 + return IndexHash((uint32_t)(index ^ (tag * 0x5bd1e995))); + */ + int index = bucket ^ (fingerprint * 0x5bd1e995); + return hashToIndex(index); + } + + /** + * Given the bucket and entry position, returns the absolute offset + * inside the PackedInts datastructure + */ + private int getOffset(int bucket, int position) { + return (bucket * entriesPerBucket) + position; + } + + /** + * Calculates the fingerprint for a given hash. + * + * The fingerprint is the first non-zero window of `bitsPerEntry` bits found in the hash. + * If no window is non-zero (i.e. the entire hash is zero), `(int) 1` is used + */ + private int fingerprint(int hash) { + if (hash == 0) { + // we use 0 as "empty" so if the hash actually hashes to zero... return 1 + // Some other impls will re-hash with a salt but this seems simpler + return 1; + } + + for (int i = 0; i + bitsPerEntry <= Long.SIZE; i += bitsPerEntry) { + int v = (hash >> i) & this.fingerprintMask; + if (v != 0) { + return v; + } + } + return 1; + } + + /** + * Calculate the optimal number of bits per entry + */ + private int bitsPerEntry(double fp, int numEntriesPerBucket) { + return (int) Math.round(log2((2 * numEntriesPerBucket) / fp)); + }
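Putting the sizing helpers together, here is the arithmetic for one plausible configuration (a worked example, not from the patch): capacity 1,000,000 at fpp 0.01.

[source,java]
--------------------------------------------------
// fpp = 0.01 > 0.002           -> entriesPerBucket = 2
// b = 2                        -> loadFactor = 0.84
// bitsPerEntry = round(log2((2 * 2) / 0.01)) = round(log2(400)) = round(8.64) = 9
// buckets = round((1_000_000 / 0.84) / 2) = 595_238 -> next power of two = 1_048_576
long totalBits = 1_048_576L * 2 * 9;  // 18,874,368 bits of PackedInts storage
long totalBytes = totalBits / 8;      // ~2.3 MB for one million entries at 1% fpp
--------------------------------------------------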
+ + /** + * Calculate the optimal number of entries per bucket. Will return 2, 4 or 8 + * depending on the false positive rate + */ + private int entriesPerBucket(double fpp) { + /* + Empirical constants from paper: + "the space-optimal bucket size depends on the target false positive rate ε: + when ε > 0.002, having two entries per bucket yields slightly better results + than using four entries per bucket; when ε decreases to 0.00001 < ε <= 0.002, + four entries per bucket minimizes space" + */ + + if (fpp > 0.002) { + return 2; + } else if (fpp > 0.00001 && fpp <= 0.002) { + return 4; + } + return 8; + } + + /** + * Calculates the optimal load factor for the filter, given the number of entries + * per bucket. Will return 0.84, 0.955 or 0.98 depending on b + */ + private double getLoadFactor(int b) { + if ((b == 2 || b == 4 || b == 8) == false) { + throw new IllegalArgumentException("b must be one of [2,4,8]"); + } + /* + Empirical constants from the paper: + "With k = 2 hash functions, the load factor α is 50% when bucket size b = 1 (i.e + the hash table is directly mapped), but increases to 84%, 95%, 98% respectively + using bucket size b = 2, 4, 8" + */ + if (b == 2) { + return 0.84D; + } else if (b == 4) { + return 0.955D; + } else { + return 0.98D; + } + } + + /** + * Calculates the optimal number of buckets for this filter. The xor used in the bucketing + * algorithm requires this to be a power of two, so the optimal number of buckets will + * be rounded to the next largest power of two where applicable. + * + * TODO: there are schemes to avoid powers of two, might want to investigate those + */ + private int getNumBuckets(long capacity, double loadFactor, int b) { + // Rounds up to nearest power of 2 + long buckets = Math.round((((double) capacity / loadFactor)) / (double) b); + + // Make sure it isn't larger than the largest signed power of 2 for an int + if ((1 << -Long.numberOfLeadingZeros(buckets - 1)) > (1 << (Integer.SIZE - 2))) { + throw new IllegalArgumentException("Cannot create more than [" + Integer.MAX_VALUE + "] buckets"); + } + return 1 << -Integer.numberOfLeadingZeros((int)buckets - 1); + } + + private double log2(double x) { + return Math.log(x) / LN_2; + } + + public long getSizeInBytes() { + return data.ramBytesUsed() + 24; + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeVInt(numBuckets); + out.writeVInt(bitsPerEntry); + out.writeVInt(entriesPerBucket); + out.writeVInt(count); + out.writeVInt(evictedFingerprint); + + data.save(new DataOutput() { + @Override + public void writeByte(byte b) throws IOException { + out.writeByte(b); + } + + @Override + public void writeBytes(byte[] b, int offset, int length) throws IOException { + out.writeBytes(b, offset, length); + } + }); + } + + @Override + public int hashCode() { + return Objects.hash(numBuckets, bitsPerEntry, entriesPerBucket, count, evictedFingerprint); + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + if (other == null || getClass() != other.getClass()) { + return false; + } + + final CuckooFilter that = (CuckooFilter) other; + return Objects.equals(this.numBuckets, that.numBuckets) + && Objects.equals(this.bitsPerEntry, that.bitsPerEntry) + && Objects.equals(this.entriesPerBucket, that.entriesPerBucket) + && Objects.equals(this.count, that.count) + && Objects.equals(this.evictedFingerprint, that.evictedFingerprint); + } +} diff --git a/server/src/main/java/org/elasticsearch/common/util/ExactBloomFilter.java b/server/src/main/java/org/elasticsearch/common/util/ExactBloomFilter.java
deleted file mode 100644 index c25243169fdd3..0000000000000 --- a/server/src/main/java/org/elasticsearch/common/util/ExactBloomFilter.java +++ /dev/null @@ -1,458 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.elasticsearch.common.util; - -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.RamUsageEstimator; -import org.elasticsearch.common.Numbers; -import org.elasticsearch.common.hash.MurmurHash3; -import org.elasticsearch.common.io.stream.StreamInput; -import org.elasticsearch.common.io.stream.StreamOutput; -import org.elasticsearch.common.io.stream.Writeable; - -import java.io.IOException; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Objects; -import java.util.Set; - -/** - * A bloom filter which keeps an exact set of values until a threshold is reached, then the values - * are replayed into a traditional bloom filter for approximate tracking - */ -public class ExactBloomFilter implements Writeable { - - // Some anecdotal sizing numbers: - // expected insertions, false positive probability, bloom size, num hashes - // 10k, 0.001, 140.4kb, 10 Hashes - // 10k, 0.01, 93.6kb, 6 Hashes - // 100k, 0.01, 936.0kb, 6 Hashes - // 100k, 0.03, 712.7kb, 5 Hashes - // 500k, 0.01, 4.5mb, 6 Hashes - // 500k, 0.03, 3.4mb, 5 Hashes - // 500k, 0.05, 2.9mb, 4 Hashes - // 1m, 0.01, 9.1mb, 6 Hashes - // 1m, 0.03, 6.9mb, 5 Hashes - // 1m, 0.05, 5.9mb, 4 Hashes - // 5m, 0.01, 45.7mb, 6 Hashes - // 5m, 0.03, 34.8mb, 5 Hashes - // 5m, 0.05, 29.7mb, 4 Hashes - // 50m, 0.01, 457.0mb, 6 Hashes - // 50m, 0.03, 297.3mb, 4 Hashes - // 50m, 0.10, 228.5mb, 3 Hashes - - /** - * The bit set of the ExactBloomFilter (not necessarily power of 2!) - */ - BitArray bits; - Set hashedValues = new HashSet<>(); - - /** - * Number of hashes per element - */ - private final int numHashFunctions; - - /** - * The number of bits in the bloom - */ - private long numBits; - - /** - * The threshold (in bytes) before we convert the exact set into an approximate bloom filter - */ - private final long threshold; - - /** - * True if we are still tracking with a Set - */ - private boolean setMode = true; - - /** - * Creates a bloom filter with the expected number - * of insertions and expected false positive probability. 
- * - * @param expectedInsertions the number of expected insertions to the constructed - * @param fpp the desired false positive probability (must be positive and less than 1.0) - * @param threshold number of bytes to record exactly before converting to Bloom filter - */ - public ExactBloomFilter(int expectedInsertions, double fpp, long threshold) { - if (threshold <= 0) { - throw new IllegalArgumentException("BloomFilter threshold must be a non-negative number"); - } - - if (expectedInsertions == 0) { - expectedInsertions = 1; - } - this.threshold = threshold; - /* - * TODO(user): Put a warning in the javadoc about tiny fpp values, - * since the resulting size is proportional to -log(p), but there is not - * much of a point after all, e.g. optimalNumOfBits(1000, 0.0000000000000001) = 76680 - * which is less that 10kb. Who cares! - */ - this.numBits = optimalNumOfBits(expectedInsertions, fpp); - - // calculate the optimal number of hash functions - this.numHashFunctions = optimalNumOfHashFunctions(expectedInsertions, numBits); - if (numHashFunctions > 255) { - throw new IllegalArgumentException("BloomFilters with more than 255 hash functions are not allowed."); - } - } - - /** - * Copy constructor. The new Bloom will be an identical copy of the provided bloom - */ - public ExactBloomFilter(ExactBloomFilter otherBloom) { - this.numHashFunctions = otherBloom.getNumHashFunctions(); - this.threshold = otherBloom.getThreshold(); - this.numBits = otherBloom.getNumBits(); - this.setMode = otherBloom.setMode; - this.hashedValues = new HashSet<>(otherBloom.hashedValues); - if (otherBloom.bits != null) { - this.bits = new BitArray(otherBloom.numBits); - this.bits.putAll(otherBloom.bits); - } - } - - public ExactBloomFilter(StreamInput in) throws IOException { - this.setMode = in.readBoolean(); - if (setMode) { - this.hashedValues = in.readSet(in1 -> { - MurmurHash3.Hash128 hash = new MurmurHash3.Hash128(); - hash.h1 = in1.readLong(); - hash.h2 = in1.readLong(); - return hash; - }); - } else { - this.bits = new BitArray(in); - } - this.numHashFunctions = in.readVInt(); - this.threshold = in.readVLong(); - this.numBits = in.readVLong(); - } - - public void writeTo(StreamOutput out) throws IOException { - out.writeBoolean(setMode); - if (setMode) { - out.writeCollection(hashedValues, (out1, hash) -> { - out1.writeLong(hash.h1); - out1.writeLong(hash.h2); - }); - } else { - bits.writeTo(out); - } - out.writeVInt(numHashFunctions); - out.writeVLong(threshold); - out.writeVLong(numBits); - } - - /** - * Merge `other` bloom filter into this bloom. After merging, this bloom's state will - * be the union of the two. 
During the merging process, the internal Set may be upgraded - * to a Bloom if it goes over threshold - */ - public void merge(ExactBloomFilter other) { - assert this.numBits == other.numBits; - if (setMode && other.setMode) { - // Both in sets, merge collections then see if we need to convert to bloom - hashedValues.addAll(other.hashedValues); - checkAndConvertToBloom(); - } else if (setMode && other.setMode == false) { - // Other is in bloom mode, so we convert our set to a bloom then merge - convertToBloom(); - this.bits.putAll(other.bits); - } else if (setMode == false && other.setMode) { - // we're in bloom mode, so convert other's set and merge - other.convertToBloom(); - this.bits.putAll(other.bits); - } else { - this.bits.putAll(other.bits); - } - } - - public boolean put(BytesRef value) { - MurmurHash3.Hash128 hash = MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, new MurmurHash3.Hash128()); - return put(hash); - } - - public boolean put(byte[] value) { - MurmurHash3.Hash128 hash = MurmurHash3.hash128(value, 0, value.length, 0, new MurmurHash3.Hash128()); - return put(hash); - } - - public boolean put(long value) { - return put(Numbers.longToBytes(value)); - } - - private boolean put(MurmurHash3.Hash128 hash) { - if (setMode) { - boolean newItem = hashedValues.add(hash); - checkAndConvertToBloom(); - return newItem; - } else { - return putBloom(hash); - } - } - - private boolean putBloom(MurmurHash3.Hash128 hash128) { - long bitSize = bits.bitSize(); - boolean bitsChanged = false; - long combinedHash = hash128.h1; - for (int i = 0; i < numHashFunctions; i++) { - // Make the combined hash positive and indexable - bitsChanged |= bits.set((combinedHash & Long.MAX_VALUE) % bitSize); - combinedHash += hash128.h2; - } - return bitsChanged; - } - - public boolean mightContain(BytesRef value) { - return mightContain(value.bytes, value.offset, value.length); - } - - public boolean mightContain(byte[] value) { - return mightContain(value, 0, value.length); - } - - public boolean mightContain(long value) { - return mightContain(Numbers.longToBytes(value)); - } - - private boolean mightContain(byte[] bytes, int offset, int length) { - MurmurHash3.Hash128 hash128 = MurmurHash3.hash128(bytes, offset, length, 0, new MurmurHash3.Hash128()); - - if (setMode) { - return hashedValues.contains(hash128); - } else { - long bitSize = bits.bitSize(); - long combinedHash = hash128.h1; - for (int i = 0; i < numHashFunctions; i++) { - // Make the combined hash positive and indexable - if (!bits.get((combinedHash & Long.MAX_VALUE) % bitSize)) { - return false; - } - combinedHash += hash128.h2; - } - return true; - } - } - - private int getNumHashFunctions() { - return this.numHashFunctions; - } - - private long getNumBits() { - return numBits; - } - - public long getThreshold() { - return threshold; - } - - /** - * Get the approximate size of this datastructure. Approximate because only the Set occupants - * are tracked, not the overhead of the Set itself. 
- */ - public long getSizeInBytes() { - long bytes = (hashedValues.size() * 16) + 8 + 4 + 1; - if (bits != null) { - bytes += bits.ramBytesUsed(); - } - return bytes; - } - - private void checkAndConvertToBloom() { - if (hashedValues.size() * 16 > threshold) { - convertToBloom(); - } - } - - private void convertToBloom() { - bits = new BitArray(numBits); - setMode = false; - for (MurmurHash3.Hash128 hash : hashedValues) { - putBloom(hash); - } - hashedValues.clear(); - } - - @Override - public int hashCode() { - return Objects.hash(numHashFunctions, hashedValues, bits, setMode, threshold, numBits); - } - - @Override - public boolean equals(Object other) { - if (this == other) { - return true; - } - if (other == null || getClass() != other.getClass()) { - return false; - } - - final ExactBloomFilter that = (ExactBloomFilter) other; - return Objects.equals(this.bits, that.bits) - && Objects.equals(this.numHashFunctions, that.numHashFunctions) - && Objects.equals(this.threshold, that.threshold) - && Objects.equals(this.setMode, that.setMode) - && Objects.equals(this.hashedValues, that.hashedValues) - && Objects.equals(this.numBits, that.numBits); - } - - - - /* - * Cheat sheet: - * - * m: total bits - * n: expected insertions - * b: m/n, bits per insertion - - * p: expected false positive probability - * - * 1) Optimal k = b * ln2 - * 2) p = (1 - e ^ (-kn/m))^k - * 3) For optimal k: p = 2 ^ (-k) ~= 0.6185^b - * 4) For optimal k: m = -nlnp / ((ln2) ^ 2) - */ - - /** - * Computes the optimal k (number of hashes per element inserted in Bloom filter), given the - * expected insertions and total number of bits in the Bloom filter. - *

- * See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula. - * - * @param n expected insertions (must be positive) - * @param m total number of bits in Bloom filter (must be positive) - */ - private static int optimalNumOfHashFunctions(long n, long m) { - return Math.max(1, (int) Math.round(m / n * Math.log(2))); - } - - /** - * Computes m (total bits of Bloom filter) which is expected to achieve, for the specified - * expected insertions, the required false positive probability. - *

- * See http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives for the formula. - * - * @param n expected insertions (must be positive) - * @param p false positive rate (must be 0 < p < 1) - */ - private static long optimalNumOfBits(long n, double p) { - if (p == 0) { - p = Double.MIN_VALUE; - } - return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2))); - } - - static final class BitArray implements Writeable { - private final long[] data; - private final long bitSize; - private long bitCount; - - BitArray(long bits) { - this.data = new long[size(bits)]; - long bitCount = 0; - for (long value : data) { - bitCount += Long.bitCount(value); - } - this.bitCount = bitCount; - this.bitSize = data.length * Long.SIZE; - } - - private static int size(long bits) { - long quotient = bits / 64; - long remainder = bits - quotient * 64; - return Math.toIntExact(remainder == 0 ? quotient : 1 + quotient); - } - - BitArray(StreamInput in) throws IOException { - this.data = in.readVLongArray(); - this.bitSize = in.readVLong(); - this.bitCount = in.readVLong(); - } - - @Override - public void writeTo(StreamOutput out) throws IOException { - out.writeVLongArray(data); - out.writeVLong(bitSize); - out.writeVLong(bitCount); - } - - /** - * Returns true if the bit changed value. - */ - boolean set(long index) { - if (!get(index)) { - data[(int) (index >>> 6)] |= (1L << index); - bitCount++; - return true; - } - return false; - } - - boolean get(long index) { - return (data[(int) (index >>> 6)] & (1L << index)) != 0; - } - - /** - * Number of bits - */ - long bitSize() { - return bitSize; - } - - /** - * Number of set bits (1s) - */ - long bitCount() { - return bitCount; - } - - /** - * Combines the two BitArrays using bitwise OR. - */ - void putAll(BitArray array) { - bitCount = 0; - for (int i = 0; i < data.length; i++) { - data[i] |= array.data[i]; - bitCount += Long.bitCount(data[i]); - } - } - - @Override - public boolean equals(Object o) { - if (o instanceof BitArray) { - BitArray bitArray = (BitArray) o; - return Arrays.equals(data, bitArray.data); - } - return false; - } - - @Override - public int hashCode() { - return Arrays.hashCode(data); - } - - public long ramBytesUsed() { - return Long.BYTES * data.length + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + 16; - } - } - -} diff --git a/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java b/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java new file mode 100644 index 0000000000000..8be2d9a904b6e --- /dev/null +++ b/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java @@ -0,0 +1,361 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.common.util; + +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.Numbers; +import org.elasticsearch.common.hash.MurmurHash3; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Writeable; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; +import java.util.Random; +import java.util.Set; +import java.util.function.Consumer; + +/** + * An approximate set membership datastructure that scales as more unique values are inserted. + * Can definitively say if a member does not exist (no false negatives), but may say an item exists + * when it does not (has false positives). Similar in usage to a Bloom Filter. + * + * Internally, the datastructure maintains a Set of hashes up to a specified threshold. This provides + * 100% accurate membership queries. + * + * When the threshold is breached, a list of CuckooFilters is created and used to track membership. + * These filters are approximate, similar to Bloom Filters. + * + * This datastructure scales as more values are inserted by growing the list of CuckooFilters. + * Final size is dependent on the cardinality of data inserted, and the precision specified. + */ +public class SetBackedScalingCuckooFilter implements Writeable { + + private static final int FILTER_CAPACITY = 1000000; + + // Package-private for testing + Set<MurmurHash3.Hash128> hashes; + List<CuckooFilter> filters; + + private final int threshold; + private final Random rng; + private final int capacity; + private final double fpp; + private Consumer<Long> breaker = aLong -> { + //noop + }; + private boolean isSetMode = true; + + /** + * @param threshold The number of distinct values that should be tracked + * before converting to an approximate representation + * @param rng A random number generator needed for the cuckoo hashing process + * @param fpp the false-positive rate that should be used for the cuckoo filters. + */ + public SetBackedScalingCuckooFilter(int threshold, Random rng, double fpp) { + this.hashes = new HashSet<>(threshold); + this.threshold = threshold; + this.rng = rng; + this.capacity = FILTER_CAPACITY; + this.fpp = fpp; + } + + public SetBackedScalingCuckooFilter(StreamInput in, Random rng) throws IOException { + this.threshold = in.readVInt(); + this.isSetMode = in.readBoolean(); + this.rng = rng; + this.capacity = in.readVInt(); + this.fpp = in.readDouble(); + + if (isSetMode) { + this.hashes = in.readSet(in1 -> { + MurmurHash3.Hash128 hash = new MurmurHash3.Hash128(); + hash.h1 = in1.readZLong(); + hash.h2 = in1.readZLong(); + return hash; + }); + } else { + this.filters = in.readList(in12 -> new CuckooFilter(in12, rng)); + } + } + + public SetBackedScalingCuckooFilter(SetBackedScalingCuckooFilter other) { + this.threshold = other.threshold; + this.isSetMode = other.isSetMode; + this.rng = other.rng; + this.breaker = other.breaker; + this.capacity = other.capacity; + this.fpp = other.fpp; + if (isSetMode) { + this.hashes = new HashSet<>(other.hashes); + } else { + this.filters = new ArrayList<>(other.filters); + } + }
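In other words, the class behaves like an exact set until `threshold` distinct values have been added, and only then pays for approximate filters. A usage sketch (hypothetical caller code; `Randomness.get()` is how the aggregator later in this patch seeds it):

[source,java]
--------------------------------------------------
SetBackedScalingCuckooFilter filter = new SetBackedScalingCuckooFilter(10_000, Randomness.get(), 0.01);
filter.registerBreaker(bytes -> { /* charge or release `bytes` against a circuit breaker */ });

filter.add(new BytesRef("some-term"));   // tracked exactly in the hash set for now
boolean seen = filter.mightContain(new BytesRef("some-term"));
// `seen` is guaranteed true here; false positives only become possible after the
// 10,000th distinct value converts the set into cuckoo filters
--------------------------------------------------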
+ /** + * Registers a circuit breaker with the datastructure. + * + * CuckooFilters can "saturate" and refuse to accept any new values. When this happens, + * the datastructure scales by adding a new filter. This new filter's bytes will be tracked + * in the registered breaker when configured. + */ + public void registerBreaker(Consumer<Long> breaker) { + this.breaker = breaker; + breaker.accept(getSizeInBytes()); + } + + /** + * Returns true if the set might contain the provided value, false otherwise. False values are + * 100% accurate, while true values may be a false-positive. + */ + public boolean mightContain(BytesRef value) { + return mightContain(value.bytes, value.offset, value.length); + } + + /** + * Returns true if the set might contain the provided value, false otherwise. False values are + * 100% accurate, while true values may be a false-positive. + */ + public boolean mightContain(byte[] value) { + return mightContain(value, 0, value.length); + } + + /** + * Returns true if the set might contain the provided value, false otherwise. False values are + * 100% accurate, while true values may be a false-positive. + */ + public boolean mightContain(long value) { + return mightContain(Numbers.longToBytes(value)); + } + + private boolean mightContain(byte[] bytes, int offset, int length) { + return mightContain(MurmurHash3.hash128(bytes, offset, length, 0, new MurmurHash3.Hash128())); + } + + private boolean mightContain(MurmurHash3.Hash128 hash) { + if (isSetMode) { + return hashes.contains(hash); + } + return filters.stream().anyMatch(filter -> filter.mightContain(hash)); + } + + /** + * Returns true if any of the filters contain this fingerprint at the specified bucket. + * This is an expert-level API since it is dealing with buckets and fingerprints, not raw values + * being hashed. + */ + private boolean mightContainFingerprint(int bucket, int fingerprint) { + return filters.stream().anyMatch(filter -> filter.mightContainFingerprint(bucket, fingerprint)); + } + + /** + * Adds the provided value to the set for tracking + */ + public boolean add(BytesRef value) { + MurmurHash3.Hash128 hash = MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, new MurmurHash3.Hash128()); + return add(hash); + } + + /** + * Adds the provided value to the set for tracking + */ + public boolean add(byte[] value) { + MurmurHash3.Hash128 hash = MurmurHash3.hash128(value, 0, value.length, 0, new MurmurHash3.Hash128()); + return add(hash); + } + + /** + * Adds the provided value to the set for tracking + */ + public boolean add(long value) { + return add(Numbers.longToBytes(value)); + } + + private boolean add(MurmurHash3.Hash128 hash) { + if (isSetMode) { + hashes.add(hash); + if (hashes.size() > threshold) { + convert(); + } + return true; + } + + boolean success = filters.get(filters.size() - 1).add(hash); + if (success == false) { + // filter is full, create a new one and insert there + CuckooFilter t = new CuckooFilter(capacity, fpp, rng); + t.add(hash); + filters.add(t); + breaker.accept(t.getSizeInBytes()); // make sure we account for the new filter + } + return true; + } + + /** + * If we are still holding values in a set, convert this filter into an approximate, cuckoo-backed filter. + * This will create a list of CuckooFilters, and null out the set of hashes + */ + private void convert() { + if (isSetMode) { + long oldSize = getSizeInBytes(); + + filters = new ArrayList<>(); + CuckooFilter t = new CuckooFilter(capacity, fpp, rng); + hashes.forEach(t::add); + filters.add(t); + + hashes = null; + isSetMode = false; + + breaker.accept(-oldSize); // this zeros out the overhead of the set + breaker.accept(getSizeInBytes()); // this adds back in the new overhead of the cuckoo filters + } + }
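The two-step breaker adjustment in `convert()` is worth dwelling on, because the set and the filters have very different footprints. A rough before/after sketch of the accounting, with illustrative numbers taken from the CuckooFilter sizing example earlier (real values come from `getSizeInBytes()` below):

[source,java]
--------------------------------------------------
Consumer<Long> breaker = delta -> { /* forwards to the request circuit breaker */ };

long setBytes = (10_000L * 16) + 8 + 4 + 1;  // 10k tracked hashes: ~160 KB
long filterBytes = 2_359_296L;               // one FILTER_CAPACITY (1M) filter at fpp 0.01: ~2.3 MB

breaker.accept(-setBytes);    // conversion first releases the set's bytes...
breaker.accept(filterBytes);  // ...then charges the (larger) cuckoo filters, so both the
                              // conversion and later saturation-driven growth stay accounted for
--------------------------------------------------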
+ + /** + * Get the approximate size of this datastructure. Approximate because only the Set occupants + * are tracked, not the overhead of the Set itself. + */ + public long getSizeInBytes() { + long bytes = 0; + if (hashes != null) { + bytes = (hashes.size() * 16) + 8 + 4 + 1; + } + if (filters != null) { + bytes += filters.stream().mapToLong(CuckooFilter::getSizeInBytes).sum(); + } + return bytes; + } + + + /** + * Merges `other` into this filter. After merging, this filter's state will + * be the union of the two. During the merging process, the internal Set may be upgraded + * to cuckoo filters if it goes over the threshold + */ + public void merge(SetBackedScalingCuckooFilter other) { + if (isSetMode && other.isSetMode) { + // Both in sets, merge collections then see if we need to convert to cuckoo + hashes.addAll(other.hashes); + if (hashes.size() > threshold) { + convert(); + } + } else if (isSetMode && other.isSetMode == false) { + // Other is in cuckoo mode, so we convert our set to a cuckoo then merge collections. + // We could probably get fancy and keep our side in set-mode, but simpler to just convert + convert(); + filters.addAll(other.filters); + } else if (isSetMode == false && other.isSetMode) { + // Rather than converting the other to a cuckoo first, we can just + // replay the values directly into our filter. + other.hashes.forEach(this::add); + } else { + // Both are in cuckoo mode, merge raw fingerprints + + int current = 0; + CuckooFilter currentFilter = filters.get(current); + + for (CuckooFilter filter : other.filters) { + + // The iterator returns an array of longs corresponding to the + // fingerprints for buckets at the current position + Iterator<long[]> iter = filter.getBuckets(); + int bucket = 0; + while (iter.hasNext()) { + long[] fingerprints = iter.next(); + + // We check to see if the fingerprint is present in any of the existing filters + // (in the same bucket/alternate bucket), or if the fingerprint is empty. In these cases + // we can skip the fingerprint + for (long fingerprint : fingerprints) { + if (fingerprint == CuckooFilter.EMPTY || mightContainFingerprint(bucket, (int) fingerprint)) { + continue; + } + boolean success = false; + + // If the fingerprint is new, we try to merge it into the filter at our `current` pointer. + // This might fail (e.g.
the filter is full), so we may have to try multiple times + while (success == false) { + success = currentFilter.mergeFingerprint(bucket, (int) fingerprint); + + // If we failed to insert, the current filter is full, get next one + if (success == false) { + current += 1; + + // if we're out of filters, we need to create a new one + if (current >= filters.size()) { + CuckooFilter t = new CuckooFilter(capacity, fpp, rng); + filters.add(t); + breaker.accept(t.getSizeInBytes()); // make sure we account for the new filter + } + currentFilter = filters.get(current); + } + } + } + bucket += 1; + } + } + } + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeVInt(threshold); + out.writeBoolean(isSetMode); + out.writeVInt(capacity); + out.writeDouble(fpp); + if (isSetMode) { + out.writeCollection(hashes, (out1, hash) -> { + out1.writeZLong(hash.h1); + out1.writeZLong(hash.h2); + }); + } else { + out.writeList(filters); + } + } + + @Override + public int hashCode() { + return Objects.hash(hashes, filters, threshold, isSetMode, capacity, fpp); + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + if (other == null || getClass() != other.getClass()) { + return false; + } + + final SetBackedScalingCuckooFilter that = (SetBackedScalingCuckooFilter) other; + return Objects.equals(this.hashes, that.hashes) + && Objects.equals(this.filters, that.filters) + && Objects.equals(this.threshold, that.threshold) + && Objects.equals(this.isSetMode, that.isSetMode) + && Objects.equals(this.capacity, that.capacity) + && Objects.equals(this.fpp, that.fpp); + } +} diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java index 69439d32a05b1..ea3d461852e75 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java @@ -19,7 +19,8 @@ package org.elasticsearch.search.aggregations.bucket.terms; -import org.elasticsearch.common.util.ExactBloomFilter; +import org.elasticsearch.common.Randomness; +import org.elasticsearch.common.util.SetBackedScalingCuckooFilter; import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.AggregatorFactories; @@ -44,10 +45,11 @@ public abstract class AbstractRareTermsAggregator pipelineAggregators, - Map metaData, long maxDocCount, DocValueFormat format, - T valuesSource, U includeExclude) throws IOException { + Aggregator parent, List pipelineAggregators, + Map metaData, long maxDocCount, double precision, + DocValueFormat format, T valuesSource, U includeExclude) throws IOException { super(name, factories, context, parent, pipelineAggregators, metaData); - // TODO review: should we expose the BF settings? What's a good default? 
- this.bloom = new ExactBloomFilter(1000000, 0.03, 7000); // ~7mb - this.addRequestCircuitBreakerBytes(bloom.getSizeInBytes()); + this.filter = new SetBackedScalingCuckooFilter(10000, Randomness.get(), precision); + this.filter.registerBreaker(this::addRequestCircuitBreakerBytes); + this.maxDocCount = maxDocCount; + this.precision = precision; this.format = format; this.valuesSource = valuesSource; this.includeExclude = includeExclude; diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java index d3e1b888e00e3..db84951477076 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java @@ -18,10 +18,13 @@ */ package org.elasticsearch.search.aggregations.bucket.terms; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.apache.lucene.util.CollectionUtil; +import org.elasticsearch.common.Randomness; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; -import org.elasticsearch.common.util.ExactBloomFilter; +import org.elasticsearch.common.util.SetBackedScalingCuckooFilter; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.aggregations.AggregationExecutionException; @@ -45,23 +48,25 @@ public abstract class InternalMappedRareTerms, protected List buckets; protected Map bucketMap; - final ExactBloomFilter bloom; + final SetBackedScalingCuckooFilter filter; + + protected final Logger logger = LogManager.getLogger(getClass()); InternalMappedRareTerms(String name, BucketOrder order, List pipelineAggregators, Map metaData, DocValueFormat format, - List buckets, long maxDocCount, ExactBloomFilter bloom) { + List buckets, long maxDocCount, SetBackedScalingCuckooFilter filter) { super(name, order, maxDocCount, pipelineAggregators, metaData); this.format = format; this.buckets = buckets; - this.bloom = bloom; + this.filter = filter; } public long getMaxDocCount() { return maxDocCount; } - ExactBloomFilter getBloom() { - return bloom; + SetBackedScalingCuckooFilter getFilter() { + return filter; } /** @@ -71,24 +76,24 @@ ExactBloomFilter getBloom() { super(in); format = in.readNamedWriteable(DocValueFormat.class); buckets = in.readList(stream -> bucketReader.read(stream, format)); - bloom = new ExactBloomFilter(in); + filter = new SetBackedScalingCuckooFilter(in, Randomness.get()); } @Override protected void writeTermTypeInfoTo(StreamOutput out) throws IOException { out.writeNamedWriteable(format); out.writeList(buckets); - bloom.writeTo(out); + filter.writeTo(out); } @Override public InternalAggregation doReduce(List aggregations, ReduceContext reduceContext) { Map> buckets = new HashMap<>(); InternalRareTerms referenceTerms = null; - ExactBloomFilter bloomFilter = null; + SetBackedScalingCuckooFilter filter = null; for (InternalAggregation aggregation : aggregations) { - // Unmapped rare terms don't have a bloom filter so we'll skip all this work + // Unmapped rare terms don't have a cuckoo filter so we'll skip all this work // and save some type casting headaches later. 
if (aggregation.isMapped() == false) { continue; @@ -113,33 +118,33 @@ public InternalAggregation doReduce(List aggregations, Redu bucketList.add(bucket); } - ExactBloomFilter otherBloom = ((InternalMappedRareTerms)aggregation).getBloom(); - if (bloomFilter == null) { - bloomFilter = new ExactBloomFilter(otherBloom); + SetBackedScalingCuckooFilter otherFilter = ((InternalMappedRareTerms)aggregation).getFilter(); + if (filter == null) { + filter = new SetBackedScalingCuckooFilter(otherFilter); } else { - bloomFilter.merge(otherBloom); + filter.merge(otherFilter); } } final List rare = new ArrayList<>(); for (List sameTermBuckets : buckets.values()) { final B b = sameTermBuckets.get(0).reduce(sameTermBuckets, reduceContext); - if ((b.getDocCount() <= maxDocCount && containsTerm(bloomFilter, b) == false)) { + if ((b.getDocCount() <= maxDocCount && containsTerm(filter, b) == false)) { rare.add(b); reduceContext.consumeBucketsAndMaybeBreak(1); } else if (b.getDocCount() > maxDocCount) { - // this term has gone over threshold while merging, so add it to the bloom. + // this term has gone over threshold while merging, so add it to the filter. // Note this may happen during incremental reductions too - addToBloom(bloomFilter, b); + addToFilter(filter, b); } } CollectionUtil.introSort(rare, order.comparator(null)); - return createWithBloom(name, rare, bloomFilter); + return createWithFilter(name, rare, filter); } - public abstract boolean containsTerm(ExactBloomFilter bloom, B bucket); + public abstract boolean containsTerm(SetBackedScalingCuckooFilter filter, B bucket); - public abstract void addToBloom(ExactBloomFilter bloom, B bucket); + public abstract void addToFilter(SetBackedScalingCuckooFilter filter, B bucket); @Override public List getBuckets() { @@ -160,12 +165,12 @@ protected boolean doEquals(Object obj) { return super.doEquals(obj) && Objects.equals(buckets, that.buckets) && Objects.equals(format, that.format) - && Objects.equals(bloom, that.bloom); + && Objects.equals(filter, that.filter); } @Override protected int doHashCode() { - return Objects.hash(super.doHashCode(), buckets, format, bloom); + return Objects.hash(super.doHashCode(), buckets, format, filter); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java index 5a56ea2202340..6c2b3db8b9bdc 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java @@ -20,7 +20,7 @@ import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; -import org.elasticsearch.common.util.ExactBloomFilter; +import org.elasticsearch.common.util.SetBackedScalingCuckooFilter; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.aggregations.Aggregations; @@ -171,7 +171,7 @@ public InternalAggregation doReduce(List aggregations, Redu throw new UnsupportedOperationException(); } - protected abstract A createWithBloom(String name, List buckets, ExactBloomFilter bloomFilter); + protected abstract A createWithFilter(String name, List buckets, SetBackedScalingCuckooFilter filter); /** * Create an array to hold some buckets. Used in collecting the results. 
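Stepping back, the `doReduce` logic above is the subtle half of the algorithm: a term that looked rare on every shard can prove common once shard results are combined, and a shard may already have demoted a term that another shard still reports. The per-bucket rule reduces to the following sketch (paraphrased from `doReduce`; names mirror the code, but this is not a verbatim excerpt):

[source,java]
--------------------------------------------------
// after summing the per-shard doc counts for one candidate term:
if (bucket.getDocCount() <= maxDocCount && containsTerm(filter, bucket) == false) {
    rare.add(bucket);             // still rare globally, and no shard's filter vetoes it
} else if (bucket.getDocCount() > maxDocCount) {
    addToFilter(filter, bucket);  // crossed the threshold during this reduction; recorded
                                  // so later incremental reductions also drop it
}
// terms already present in the merged filter simply fall through and are discarded:
// some shard saw them more than maxDocCount times during collection
--------------------------------------------------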
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java index 65c7a5f425328..29f84fb6030e1 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTerms.java @@ -21,7 +21,7 @@ import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; -import org.elasticsearch.common.util.ExactBloomFilter; +import org.elasticsearch.common.util.SetBackedScalingCuckooFilter; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.aggregations.BucketOrder; @@ -106,9 +106,9 @@ public int hashCode() { } LongRareTerms(String name, BucketOrder order, List pipelineAggregators, - Map metaData, DocValueFormat format, - List buckets, long maxDocCount, ExactBloomFilter bloom) { - super(name, order, pipelineAggregators, metaData, format, buckets, maxDocCount, bloom); + Map metaData, DocValueFormat format, + List buckets, long maxDocCount, SetBackedScalingCuckooFilter filter) { + super(name, order, pipelineAggregators, metaData, format, buckets, maxDocCount, filter); } /** @@ -125,7 +125,7 @@ public String getWriteableName() { @Override public LongRareTerms create(List buckets) { - return new LongRareTerms(name, order, pipelineAggregators(), metaData, format, buckets, maxDocCount, bloom); + return new LongRareTerms(name, order, pipelineAggregators(), metaData, format, buckets, maxDocCount, filter); } @Override @@ -134,9 +134,9 @@ public LongRareTerms.Bucket createBucket(InternalAggregations aggregations, Long } @Override - protected LongRareTerms createWithBloom(String name, List buckets, ExactBloomFilter bloomFilter) { + protected LongRareTerms createWithFilter(String name, List buckets, SetBackedScalingCuckooFilter filter) { return new LongRareTerms(name, order, pipelineAggregators(), getMetaData(), format, - buckets, maxDocCount, bloomFilter); + buckets, maxDocCount, filter); } @Override @@ -145,12 +145,12 @@ protected LongRareTerms.Bucket[] createBucketsArray(int size) { } @Override - public boolean containsTerm(ExactBloomFilter bloom, LongRareTerms.Bucket bucket) { - return bloom.mightContain((long) bucket.getKey()); + public boolean containsTerm(SetBackedScalingCuckooFilter filter, LongRareTerms.Bucket bucket) { + return filter.mightContain((long) bucket.getKey()); } @Override - public void addToBloom(ExactBloomFilter bloom, LongRareTerms.Bucket bucket) { - bloom.put((long) bucket.getKey()); + public void addToFilter(SetBackedScalingCuckooFilter filter, LongRareTerms.Bucket bucket) { + filter.add((long) bucket.getKey()); } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java index a622d539d29d2..2be5697423630 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java @@ -55,9 +55,10 @@ public class LongRareTermsAggregator extends AbstractRareTermsAggregator pipelineAggregators, + int maxDocCount, double precision, List pipelineAggregators, Map metaData) throws IOException { - super(name, 
factories, aggregationContext, parent, pipelineAggregators, metaData, maxDocCount, format, valuesSource, longFilter); + super(name, factories, aggregationContext, parent, pipelineAggregators, metaData, maxDocCount, precision, + format, valuesSource, longFilter); this.map = new LongLongHashMap(); this.bucketOrds = new LongHash(1, aggregationContext.bigArrays()); } @@ -86,7 +87,7 @@ public void collect(int docId, long owningBucketOrdinal) throws IOException { final long val = values.nextValue(); if (previous != val || i == 0) { if ((includeExclude == null) || (includeExclude.accept(val))) { - if (bloom.mightContain(val) == false) { + if (filter.mightContain(val) == false) { long termCount = map.get(val); if (termCount == 0) { // Brand new term, save into map @@ -117,9 +118,9 @@ public void collect(int docId, long owningBucketOrdinal) throws IOException { } } else { // Otherwise we've breached the threshold, remove from - // the map and add to the bloom filter + // the map and add to the cuckoo filter map.remove(val); - bloom.put(val); + filter.add(val); addRequestCircuitBreakerBytes(-MAP_SLOT_SIZE); // 8 bytes for key, 8 for value numDeleted += 1; @@ -202,12 +203,12 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOE } CollectionUtil.introSort(buckets, ORDER.comparator(this)); - return new LongRareTerms(name, ORDER, pipelineAggregators(), metaData(), format, buckets, maxDocCount, bloom); + return new LongRareTerms(name, ORDER, pipelineAggregators(), metaData(), format, buckets, maxDocCount, filter); } @Override public InternalAggregation buildEmptyAggregation() { - return new LongRareTerms(name, ORDER, pipelineAggregators(), metaData(), format, emptyList(), 0, bloom); + return new LongRareTerms(name, ORDER, pipelineAggregators(), metaData(), format, emptyList(), 0, filter); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java index 73ecedf363fe9..9176d4dcd8355 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java @@ -44,6 +44,7 @@ public class RareTermsAggregationBuilder extends ValuesSourceAggregationBuilder< public static final String NAME = "rare_terms"; private static final ParseField MAX_DOC_COUNT_FIELD_NAME = new ParseField("max_doc_count"); + private static final ParseField PRECISION = new ParseField("precision"); private static final int MAX_MAX_DOC_COUNT = 10; private static final ObjectParser PARSER; @@ -57,6 +58,8 @@ public class RareTermsAggregationBuilder extends ValuesSourceAggregationBuilder< PARSER.declareField((b, v) -> b.includeExclude(IncludeExclude.merge(b.includeExclude(), v)), IncludeExclude::parseExclude, IncludeExclude.EXCLUDE_FIELD, ObjectParser.ValueType.STRING_ARRAY); + + PARSER.declareDouble(RareTermsAggregationBuilder::setPrecision, PRECISION); } public static AggregationBuilder parse(String aggregationName, XContentParser parser) throws IOException { @@ -65,6 +68,7 @@ public static AggregationBuilder parse(String aggregationName, XContentParser pa private IncludeExclude includeExclude = null; private int maxDocCount = 1; + private double precision = 0.01; public RareTermsAggregationBuilder(String name, ValueType valueType) { super(name, ValuesSourceType.ANY, valueType); @@ 
-134,13 +138,35 @@ public IncludeExclude includeExclude() { return includeExclude; } + /** + * Get the current false positive rate for individual cuckoo filters. + */ + public double getPrecision() { + return precision; + } + + /** + * Sets the false-positive rate for individual cuckoo filters. Does not dictate the overall fpp rate + * since we use a "scaling" cuckoo filter which adds more filters as required, and the overall + * error rate grows differently than that of the individual filters + * + * This value does, however, affect the overall space usage of the filter. Coarser precisions provide + * more compact filters. The default is 0.01 + */ + public void setPrecision(double precision) { + if (precision < 0.00001) { + throw new IllegalArgumentException("[precision] must be at least 0.00001"); + } + this.precision = precision; + } + @Override protected ValuesSourceAggregatorFactory innerBuild(SearchContext context, ValuesSourceConfig config, AggregatorFactory parent, Builder subFactoriesBuilder) throws IOException { return new RareTermsAggregatorFactory(name, config, includeExclude, - context, parent, subFactoriesBuilder, metaData, maxDocCount); + context, parent, subFactoriesBuilder, metaData, maxDocCount, precision); } @Override @@ -149,19 +175,21 @@ protected XContentBuilder doXContentBody(XContentBuilder builder, Params params) includeExclude.toXContent(builder, params); } builder.field(MAX_DOC_COUNT_FIELD_NAME.getPreferredName(), maxDocCount); + builder.field(PRECISION.getPreferredName(), precision); return builder; } @Override protected int innerHashCode() { - return Objects.hash(includeExclude, maxDocCount); + return Objects.hash(includeExclude, maxDocCount, precision); } @Override protected boolean innerEquals(Object obj) { RareTermsAggregationBuilder other = (RareTermsAggregationBuilder) obj; return Objects.equals(includeExclude, other.includeExclude) - && Objects.equals(maxDocCount, other.maxDocCount); + && Objects.equals(maxDocCount, other.maxDocCount) + && Objects.equals(precision, other.precision); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorFactory.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorFactory.java index a1918150b49bc..ddb563e03039d 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorFactory.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorFactory.java @@ -41,15 +41,17 @@ public class RareTermsAggregatorFactory extends ValuesSourceAggregatorFactory { private final IncludeExclude includeExclude; private final int maxDocCount; + private final double precision; RareTermsAggregatorFactory(String name, ValuesSourceConfig config, IncludeExclude includeExclude, SearchContext context, AggregatorFactory parent, AggregatorFactories.Builder subFactoriesBuilder, - Map metaData, int maxDocCount) throws IOException { + Map metaData, int maxDocCount, double precision) throws IOException { super(name, config, context, parent, subFactoriesBuilder, metaData); this.includeExclude = includeExclude; this.maxDocCount = maxDocCount; + this.precision = precision; } @Override @@ -81,7 +83,7 @@ protected Aggregator doCreateInternal(ValuesSource valuesSource, Aggregator pare } return execution.create(name, factories, valuesSource, format, - includeExclude, context, parent, pipelineAggregators, metaData, maxDocCount); + includeExclude, context, parent,
pipelineAggregators, metaData, maxDocCount, precision); } if ((includeExclude != null) && (includeExclude.isRegexBased())) { @@ -99,7 +101,7 @@ protected Aggregator doCreateInternal(ValuesSource valuesSource, Aggregator pare longFilter = includeExclude.convertToLongFilter(config.format()); } return new LongRareTermsAggregator(name, factories, (ValuesSource.Numeric) valuesSource, config.format(), - context, parent, longFilter, maxDocCount, pipelineAggregators, metaData); + context, parent, longFilter, maxDocCount, precision, pipelineAggregators, metaData); } throw new AggregationExecutionException("RareTerms aggregation cannot be applied to field [" + config.fieldContext().field() @@ -114,11 +116,12 @@ public enum ExecutionMode { Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource, DocValueFormat format, IncludeExclude includeExclude, SearchContext context, Aggregator parent, - List pipelineAggregators, Map metaData, long maxDocCount) + List pipelineAggregators, + Map metaData, long maxDocCount, double precision) throws IOException { final IncludeExclude.StringFilter filter = includeExclude == null ? null : includeExclude.convertToStringFilter(format); return new StringRareTermsAggregator(name, factories, (ValuesSource.Bytes) valuesSource, format, filter, - context, parent, pipelineAggregators, metaData, maxDocCount); + context, parent, pipelineAggregators, metaData, maxDocCount, precision); } @Override @@ -147,7 +150,7 @@ abstract Aggregator create(String name, AggregatorFactories factories, ValuesSou DocValueFormat format, IncludeExclude includeExclude, SearchContext context, Aggregator parent, List pipelineAggregators, Map metaData, - long maxDocCount) + long maxDocCount, double precision) throws IOException; abstract boolean needsGlobalOrdinals(); diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java index 3a0f03446700f..3c3e19664a631 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTerms.java @@ -21,7 +21,7 @@ import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; -import org.elasticsearch.common.util.ExactBloomFilter; +import org.elasticsearch.common.util.SetBackedScalingCuckooFilter; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.aggregations.BucketOrder; @@ -107,10 +107,10 @@ public int hashCode() { } } - public StringRareTerms(String name, BucketOrder order, List pipelineAggregators, + StringRareTerms(String name, BucketOrder order, List pipelineAggregators, Map metaData, DocValueFormat format, - List buckets, long maxDocCount, ExactBloomFilter bloom) { - super(name, order, pipelineAggregators, metaData, format, buckets, maxDocCount, bloom); + List buckets, long maxDocCount, SetBackedScalingCuckooFilter filter) { + super(name, order, pipelineAggregators, metaData, format, buckets, maxDocCount, filter); } /** @@ -127,7 +127,7 @@ public String getWriteableName() { @Override public StringRareTerms create(List buckets) { - return new StringRareTerms(name, order, pipelineAggregators(), metaData, format, buckets, maxDocCount, bloom); + return new StringRareTerms(name, order, 
pipelineAggregators(), metaData, format, buckets, maxDocCount, filter); } @Override @@ -136,9 +136,10 @@ public StringRareTerms.Bucket createBucket(InternalAggregations aggregations, St } @Override - protected StringRareTerms createWithBloom(String name, List buckets, ExactBloomFilter bloomFilter) { + protected StringRareTerms createWithFilter(String name, List buckets, + SetBackedScalingCuckooFilter filter) { return new StringRareTerms(name, order, pipelineAggregators(), metaData, format, - buckets, maxDocCount, bloomFilter); + buckets, maxDocCount, filter); } @Override @@ -147,12 +148,12 @@ protected StringRareTerms.Bucket[] createBucketsArray(int size) { } @Override - public boolean containsTerm(ExactBloomFilter bloom, StringRareTerms.Bucket bucket) { - return bloom.mightContain(bucket.termBytes); + public boolean containsTerm(SetBackedScalingCuckooFilter filter, StringRareTerms.Bucket bucket) { + return filter.mightContain(bucket.termBytes); } @Override - public void addToBloom(ExactBloomFilter bloom, StringRareTerms.Bucket bucket) { - bloom.put(bucket.termBytes); + public void addToFilter(SetBackedScalingCuckooFilter filter, StringRareTerms.Bucket bucket) { + filter.add(bucket.termBytes); } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java index e52c0fb123c78..24dab7fa1612e 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java @@ -57,8 +57,8 @@ public class StringRareTermsAggregator extends AbstractRareTermsAggregator pipelineAggregators, - Map metaData, long maxDocCount) throws IOException { - super(name, factories, context, parent, pipelineAggregators, metaData, maxDocCount, format, valuesSource, stringFilter); + Map metaData, long maxDocCount, double precision) throws IOException { + super(name, factories, context, parent, pipelineAggregators, metaData, maxDocCount, precision, format, valuesSource, stringFilter); this.map = new ObjectLongHashMap<>(); this.bucketOrds = new BytesRefHash(1, context.bigArrays()); } @@ -91,7 +91,7 @@ public void collect(int docId, long bucket) throws IOException { continue; } - if (bloom.mightContain(bytes) == false) { + if (filter.mightContain(bytes) == false) { long valueCount = map.get(bytes); if (valueCount == 0) { // Brand new term, save into map @@ -120,9 +120,9 @@ public void collect(int docId, long bucket) throws IOException { } } else { // Otherwise we've breached the threshold, remove from - // the map and add to the bloom filter + // the map and add to the cuckoo filter map.remove(bytes); - bloom.put(bytes); + filter.add(bytes); numDeleted += 1; addRequestCircuitBreakerBytes(-(bytes.length + MAP_VALUE_SIZE)); // size of term + 8 for counter @@ -205,12 +205,12 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOE } CollectionUtil.introSort(buckets, ORDER.comparator(this)); - return new StringRareTerms(name, ORDER, pipelineAggregators(), metaData(), format, buckets, maxDocCount, bloom); + return new StringRareTerms(name, ORDER, pipelineAggregators(), metaData(), format, buckets, maxDocCount, filter); } @Override public InternalAggregation buildEmptyAggregation() { - return new StringRareTerms(name, LongRareTermsAggregator.ORDER, pipelineAggregators(), metaData(),
format, emptyList(), 0, bloom); + return new StringRareTerms(name, LongRareTermsAggregator.ORDER, pipelineAggregators(), metaData(), format, emptyList(), 0, filter); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedRareTerms.java index ddb89285b63b2..eff5441a1d7e7 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedRareTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedRareTerms.java @@ -20,7 +20,7 @@ import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; -import org.elasticsearch.common.util.ExactBloomFilter; +import org.elasticsearch.common.util.SetBackedScalingCuckooFilter; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.aggregations.InternalAggregation; @@ -83,7 +83,7 @@ public UnmappedRareTerms.Bucket createBucket(InternalAggregations aggregations, } @Override - protected UnmappedRareTerms createWithBloom(String name, List buckets, ExactBloomFilter bloomFilter) { + protected UnmappedRareTerms createWithFilter(String name, List buckets, SetBackedScalingCuckooFilter filter) { throw new UnsupportedOperationException("not supported for UnmappedRareTerms"); } diff --git a/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java b/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java new file mode 100644 index 0000000000000..1c0b6696b5c33 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java @@ -0,0 +1,132 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.common.util; + +import org.elasticsearch.common.Numbers; +import org.elasticsearch.common.Randomness; +import org.elasticsearch.common.hash.MurmurHash3; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.test.AbstractWireSerializingTestCase; + +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.lessThanOrEqualTo; + +public class CuckooFilterTests extends AbstractWireSerializingTestCase { + + @Override + protected CuckooFilter createTestInstance() { + CuckooFilter filter = new CuckooFilter(randomIntBetween(1, 100000), + ((float)randomIntBetween(1, 50)) / 100.0, Randomness.get()); + + int num = randomIntBetween(0, 10); + for (int i = 0; i < num; i++) { + filter.add(hash(randomLong())); + } + + return filter; + } + + @Override + protected Writeable.Reader instanceReader() { + return in -> new CuckooFilter(in, Randomness.get()); + } + + @Override + protected CuckooFilter mutateInstance(CuckooFilter instance) { + CuckooFilter newInstance = new CuckooFilter(instance); + int num = randomIntBetween(1, 10); + for (int i = 0; i < num; i++) { + newInstance.add(hash(randomLong())); + } + return newInstance; + } + + public void testExact() { + CuckooFilter filter = new CuckooFilter(10000, 0.03, Randomness.get()); + + for (int i = 0; i < 100; i++) { + filter.add(hash(i)); + } + + // Was sized sufficiently large that all of these values should be retained + for (int i = 0; i < 100; i++) { + assertThat(filter.mightContain(hash(i)), equalTo(true)); + } + } + + public void testSaturate() { + CuckooFilter filter = new CuckooFilter(10, 0.03, Randomness.get()); + int counter = 0; + boolean saturated = false; + for (int i = 0; i < 100; i++) { + logger.info("Value: " + i); + if (filter.add(hash(i)) == false) { + saturated = true; + } + counter += 1; + if (saturated) { + break; + } + } + // Unclear when it will saturate exactly, but should be before 100 given the configuration + assertTrue(saturated); + logger.info("Saturated at: " + counter); + + for (int i = 0; i < counter; i++) { + logger.info("Value: " + i); + assertThat(filter.mightContain(hash(i)), equalTo(true)); + } + } + + public void testBig() { + CuckooFilter filter = new CuckooFilter(1000000, 0.001, Randomness.get()); + + for (int i = 0; i < 10000; i++) { + filter.add(hash(i)); + } + + int correct = 0; + int incorrect = 0; + for (int i = 0; i < 10000; i++) { + if (filter.mightContain(hash(i))) { + correct += 1; + } else { + incorrect += 1; + } + } + + assertThat(correct, equalTo(10000)); + assertThat(incorrect, equalTo(0)); + + for (int i = 10000; i < 100000; i++) { + if (filter.mightContain(hash(i))) { + incorrect += 1; + } else { + correct += 1; + } + } + + double fppRate = (double) incorrect / 100000; + assertThat(fppRate, lessThanOrEqualTo(0.001)); + } + + private MurmurHash3.Hash128 hash(long i) { + return MurmurHash3.hash128(Numbers.longToBytes(i), 0, 8, 0, new MurmurHash3.Hash128()); + } +} diff --git a/server/src/test/java/org/elasticsearch/common/util/ExactBloomFilterTests.java b/server/src/test/java/org/elasticsearch/common/util/ExactBloomFilterTests.java deleted file mode 100644 index d841b4e9b69ce..0000000000000 --- a/server/src/test/java/org/elasticsearch/common/util/ExactBloomFilterTests.java +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. 
Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.elasticsearch.common.util; - -import org.elasticsearch.common.Numbers; -import org.elasticsearch.common.hash.MurmurHash3; -import org.elasticsearch.common.io.stream.Writeable; -import org.elasticsearch.test.AbstractWireSerializingTestCase; - -import java.util.HashSet; -import java.util.Set; - -import static org.hamcrest.Matchers.empty; -import static org.hamcrest.Matchers.equalTo; -import static org.hamcrest.Matchers.greaterThan; - -public class ExactBloomFilterTests extends AbstractWireSerializingTestCase { - - @Override - protected ExactBloomFilter createTestInstance() { - ExactBloomFilter bloom = new ExactBloomFilter(randomIntBetween(1, 100000000), - ((float)randomIntBetween(1, 50)) / 100.0, randomNonNegativeLong()); - - int num = randomIntBetween(0, 10); - for (int i = 0; i < num; i++) { - bloom.put(randomLong()); - } - - return bloom; - } - - @Override - protected Writeable.Reader instanceReader() { - return ExactBloomFilter::new; - } - - @Override - protected ExactBloomFilter mutateInstance(ExactBloomFilter instance) { - ExactBloomFilter newInstance = new ExactBloomFilter(instance); - int num = randomIntBetween(1, 10); - for (int i = 0; i < num; i++) { - newInstance.put(randomLong()); - } - return newInstance; - } - - public void testExact() { - long threshold = randomLongBetween(1000, 10000); - ExactBloomFilter bloom = new ExactBloomFilter(1000000, 0.03, threshold); - - int size = 0; - Set values = new HashSet<>(); - Set hashed = new HashSet<>(values.size()); - while (size < threshold - 100) { - long value = randomLong(); - bloom.put(value); - boolean newValue = values.add(value); - if (newValue) { - byte[] bytes = Numbers.longToBytes(value); - MurmurHash3.Hash128 hash128 = MurmurHash3.hash128(bytes, 0, bytes.length, 0, new MurmurHash3.Hash128()); - hashed.add(hash128); - - size += 16; - } - } - assertThat(bloom.hashedValues.size(), equalTo(hashed.size())); - assertThat(bloom.hashedValues, equalTo(hashed)); - - for (Long value : values) { - assertThat(bloom.mightContain(value), equalTo(true)); - } - } - - public void testConvert() { - long threshold = randomLongBetween(1000, 10000); - ExactBloomFilter bloom = new ExactBloomFilter(1000000, 0.03, threshold); - - int size = 0; - Set values = new HashSet<>(); - while (size < threshold + 100) { - long value = randomLong(); - bloom.put(value); - boolean newValue = values.add(value); - if (newValue) { - size += 16; - } - } - assertThat(bloom.hashedValues, empty()); - assertThat(bloom.bits.bitSize(), greaterThan(0L)); - } - -} diff --git a/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java b/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java new file mode 100644 index 0000000000000..16f2b4d5c2f4f --- /dev/null +++ b/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java @@ -0,0 +1,188 @@ +/* + * Licensed to Elasticsearch under 
one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.common.util; + +import org.elasticsearch.common.Numbers; +import org.elasticsearch.common.Randomness; +import org.elasticsearch.common.hash.MurmurHash3; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.test.AbstractWireSerializingTestCase; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.lessThanOrEqualTo; + +public class SetBackedScalingCuckooFilterTests extends AbstractWireSerializingTestCase { + + @Override + protected SetBackedScalingCuckooFilter createTestInstance() { + SetBackedScalingCuckooFilter bloom = new SetBackedScalingCuckooFilter(1000, Randomness.get(), 0.01); + + int num = randomIntBetween(0, 10); + for (int i = 0; i < num; i++) { + bloom.add(randomLong()); + } + + return bloom; + } + + @Override + protected Writeable.Reader instanceReader() { + return in -> new SetBackedScalingCuckooFilter(in, Randomness.get()); + } + + @Override + protected SetBackedScalingCuckooFilter mutateInstance(SetBackedScalingCuckooFilter instance) throws IOException { + SetBackedScalingCuckooFilter newInstance = new SetBackedScalingCuckooFilter(instance); + int num = randomIntBetween(1, 10); + for (int i = 0; i < num; i++) { + newInstance.add(randomLong()); + } + return newInstance; + } + + public void testExact() { + int threshold = randomIntBetween(1000, 10000); + SetBackedScalingCuckooFilter filter = new SetBackedScalingCuckooFilter(threshold, Randomness.get(), 0.01); + + int size = 0; + Set values = new HashSet<>(); + Set hashed = new HashSet<>(values.size()); + while (size < threshold - 100) { + long value = randomLong(); + filter.add(value); + boolean newValue = values.add(value); + if (newValue) { + byte[] bytes = Numbers.longToBytes(value); + MurmurHash3.Hash128 hash128 = MurmurHash3.hash128(bytes, 0, bytes.length, 0, new MurmurHash3.Hash128()); + hashed.add(hash128); + + size += 16; + } + } + assertThat(filter.hashes.size(), equalTo(hashed.size())); + assertThat(filter.hashes, equalTo(hashed)); + assertNull(filter.filters); + + for (Long value : values) { + assertThat(filter.mightContain(value), equalTo(true)); + } + } + + public void testConvert() { + int threshold = randomIntBetween(1000, 10000); + SetBackedScalingCuckooFilter filter = new SetBackedScalingCuckooFilter(threshold, Randomness.get(), 0.01); + + int counter = 0; + Set values = new HashSet<>(); + while (counter < threshold + 100) { + long value = randomLong(); + filter.add(value); + boolean newValue = values.add(value); + if (newValue) { + counter += 1; + } + } + assertNull(filter.hashes); + assertThat(filter.filters.size(), greaterThan(0)); + + int 
incorrect = 0; + for (Long v : values) { + if (filter.mightContain(v) == false) { + incorrect += 1; + } + } + double fppRate = (double) incorrect / values.size(); + assertThat(fppRate, lessThanOrEqualTo(0.001)); + } + + public void testMergeSmall() { + int threshold = 1000; + + // Setup the first filter + SetBackedScalingCuckooFilter filter = new SetBackedScalingCuckooFilter(threshold, Randomness.get(), 0.01); + + int counter = 0; + Set values = new HashSet<>(); + while (counter < threshold + 1) { + long value = randomLong(); + filter.add(value); + boolean newValue = values.add(value); + if (newValue) { + counter += 1; + } + } + assertNull(filter.hashes); + assertThat(filter.filters.size(), greaterThan(0)); + + int incorrect = 0; + for (Long v : values) { + if (filter.mightContain(v) == false) { + incorrect += 1; + } + } + double fppRate = (double) incorrect / values.size(); + assertThat(fppRate, lessThanOrEqualTo(0.001)); + + // Setup the second filter + SetBackedScalingCuckooFilter filter2 = new SetBackedScalingCuckooFilter(threshold, Randomness.get(), 0.01); + counter = 0; + Set values2 = new HashSet<>(); + while (counter < threshold + 1) { + long value = randomLong(); + filter2.add(value); + boolean newValue = values2.add(value); + if (newValue) { + counter += 1; + } + } + assertNull(filter2.hashes); + assertThat(filter2.filters.size(), greaterThan(0)); + + incorrect = 0; + for (Long v : values2) { + if (filter2.mightContain(v) == false) { + incorrect += 1; + } + } + fppRate = (double) incorrect / values2.size(); + assertThat(fppRate, lessThanOrEqualTo(0.001)); + + // now merge and verify the combined set + filter.merge(filter2); + incorrect = 0; + for (Long v : values) { + if (filter.mightContain(v) == false) { + incorrect += 1; + } + } + for (Long v : values2) { + if (filter.mightContain(v) == false) { + incorrect += 1; + } + } + fppRate = (double) incorrect / (values.size() + values2.size()); + assertThat(fppRate, lessThanOrEqualTo(0.001)); + + } +} diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java index 009aa03320ce8..b56421c952afc 100644 --- a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java +++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java @@ -487,6 +487,39 @@ private List generateDocsWithNested(String id, int value, int[] nested documents.add(document); return documents; + } + + public void testBenchmark() throws IOException { + Query query = new MatchAllDocsQuery(); + + int size = 3_000_000; + List d = new ArrayList<>(size); + + long[] uniqueValues = new long[]{1, 100, 1000, 10000}; + for (long v : uniqueValues) { + for (long i = 0; i < v; i++) { + d.add(i); + } + + for (long i = v; i < size; i++) { + d.add(i); + d.add(i); + + + } + + executeBench(true, query, d, + aggregation -> aggregation.field(LONG_FIELD).maxDocCount(1), + agg -> { + logger.error(v + ": " + agg.getBuckets().size()); + }, + ValueType.NUMERIC); + d.clear(); + } + + + + } private InternalAggregation buildInternalAggregation(RareTermsAggregationBuilder builder, MappedFieldType fieldType, @@ -578,6 +611,47 @@ private void executeTestCase(boolean reduced, Query query, List dataset, } } + private void executeBench(boolean reduced, Query query, List dataset, + Consumer configure, + Consumer verify, ValueType valueType) throws IOException { 
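+            // Benchmark-only harness: indexes every value in `dataset` as both doc-values
+            // and a point on LONG_FIELD, builds a RareTermsAggregationBuilder (customized
+            // via `configure`), executes it with search() or searchAndReduce() depending
+            // on `reduced`, and hands the resulting InternalMappedRareTerms to `verify`.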
+ + try (Directory directory = newDirectory()) { + try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { + Document document = new Document(); + for (Long value : dataset) { + + document.add(new SortedNumericDocValuesField(LONG_FIELD, value)); + document.add(new LongPoint(LONG_FIELD, value)); + indexWriter.addDocument(document); + document.clear(); + } + } + + logger.info("Start agg"); + try (IndexReader indexReader = DirectoryReader.open(directory)) { + IndexSearcher indexSearcher = newIndexSearcher(indexReader); + + RareTermsAggregationBuilder aggregationBuilder = new RareTermsAggregationBuilder("_name", valueType); + if (configure != null) { + configure.accept(aggregationBuilder); + } + + MappedFieldType longFieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG); + longFieldType.setName(LONG_FIELD); + longFieldType.setHasDocValues(true); + + InternalMappedRareTerms rareTerms; + if (reduced) { + rareTerms = searchAndReduce(indexSearcher, query, aggregationBuilder, longFieldType); + } else { + rareTerms = search(indexSearcher, query, aggregationBuilder, longFieldType); + } + verify.accept(rareTerms); + } + logger.info("End agg"); + } + } + @Override public void doAssertReducedMultiBucketConsumer(Aggregation agg, MultiBucketConsumerService.MultiBucketConsumer bucketConsumer) { /* diff --git a/test/framework/src/main/java/org/elasticsearch/test/ESTestCase.java b/test/framework/src/main/java/org/elasticsearch/test/ESTestCase.java index 6b36f985c210b..e3991f47221de 100644 --- a/test/framework/src/main/java/org/elasticsearch/test/ESTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/test/ESTestCase.java @@ -173,7 +173,8 @@ }) @ThreadLeakScope(Scope.SUITE) @ThreadLeakLingering(linger = 5000) // 5 sec lingering -@TimeoutSuite(millis = 20 * TimeUnits.MINUTE) +// nocommit +@TimeoutSuite(millis = 40 * TimeUnits.MINUTE) @LuceneTestCase.SuppressSysoutChecks(bugUrl = "we log a lot on purpose") // we suppress pretty much all the lucene codecs for now, except asserting // assertingcodec is the winner for a codec here: it finds bugs and gives clear exceptions. diff --git a/x-pack/plugin/security/src/test/java/org/elasticsearch/xpack/security/authc/ldap/support/SessionFactoryLoadBalancingTests.java.orig b/x-pack/plugin/security/src/test/java/org/elasticsearch/xpack/security/authc/ldap/support/SessionFactoryLoadBalancingTests.java.orig new file mode 100644 index 0000000000000..b58d8f9fb7637 --- /dev/null +++ b/x-pack/plugin/security/src/test/java/org/elasticsearch/xpack/security/authc/ldap/support/SessionFactoryLoadBalancingTests.java.orig @@ -0,0 +1,402 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ +package org.elasticsearch.xpack.security.authc.ldap.support; + +import com.unboundid.ldap.listener.InMemoryDirectoryServer; +import com.unboundid.ldap.sdk.LDAPConnection; +import com.unboundid.ldap.sdk.LDAPException; +import com.unboundid.ldap.sdk.SimpleBindRequest; +import org.apache.logging.log4j.message.ParameterizedMessage; +import org.elasticsearch.action.ActionListener; +import org.elasticsearch.common.SuppressForbidden; +import org.elasticsearch.common.network.InetAddressHelper; +import org.elasticsearch.common.settings.SecureString; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.concurrent.ThreadContext; +import org.elasticsearch.core.internal.io.IOUtils; +import org.elasticsearch.env.TestEnvironment; +import org.elasticsearch.mocksocket.MockServerSocket; +import org.elasticsearch.mocksocket.MockSocket; +import org.elasticsearch.test.junit.annotations.TestLogging; +import org.elasticsearch.threadpool.TestThreadPool; +import org.elasticsearch.threadpool.ThreadPool; +import org.elasticsearch.xpack.core.common.socket.SocketAccess; +import org.elasticsearch.xpack.core.security.authc.RealmConfig; +import org.elasticsearch.xpack.core.security.authc.ldap.support.LdapSearchScope; +import org.elasticsearch.xpack.core.ssl.SSLService; +import org.junit.After; +import org.junit.Before; + +import java.io.IOException; +<<<<<<< HEAD +======= +import java.net.ConnectException; +>>>>>>> origin/master +import java.net.Inet4Address; +import java.net.InetAddress; +import java.net.InetSocketAddress; +import java.net.NoRouteToHostException; +import java.net.Socket; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.stream.Collectors; + +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.not; + +/** + * Tests that the server sets properly load balance connections without throwing exceptions + */ +@TestLogging("org.elasticsearch.xpack.security.authc.ldap.support:DEBUG") +public class SessionFactoryLoadBalancingTests extends LdapTestCase { + + private ThreadPool threadPool; + + @Before + public void init() throws Exception { + threadPool = new TestThreadPool("SessionFactoryLoadBalancingTests thread pool"); + } + + @After + public void shutdown() { + terminate(threadPool); + } + + public void testRoundRobin() throws Exception { + TestSessionFactory testSessionFactory = createSessionFactory(LdapLoadBalancing.ROUND_ROBIN); + + final int numberOfIterations = randomIntBetween(1, 5); + for (int iteration = 0; iteration < numberOfIterations; iteration++) { + for (int i = 0; i < numberOfLdapServers; i++) { + try (LDAPConnection connection = LdapUtils.privilegedConnect(testSessionFactory.getServerSet()::getConnection)) { + assertThat(connection.getConnectedPort(), is(ldapServers[i].getListenPort())); + } + } + } + } + + public void testRoundRobinWithFailures() throws Exception { + assumeTrue("at least two ldap servers should be present for this test", ldapServers.length > 1); + logger.debug("using [{}] ldap servers, urls {}", ldapServers.length, ldapUrls()); + TestSessionFactory testSessionFactory = createSessionFactory(LdapLoadBalancing.ROUND_ROBIN); + + // create a list of ports + List ports = new ArrayList<>(numberOfLdapServers); + for 
(InMemoryDirectoryServer ldapServer : ldapServers) { + ports.add(ldapServer.getListenPort()); + } + logger.debug("list of all ports {}", ports); + + final int numberToKill = randomIntBetween(1, numberOfLdapServers - 1); + logger.debug("killing [{}] servers", numberToKill); + + // get a subset to kill + final List ldapServersToKill = randomSubsetOf(numberToKill, ldapServers); + final List ldapServersList = Arrays.asList(ldapServers); + final MockServerSocket mockServerSocket = new MockServerSocket(0, 0); + final List listenThreads = new ArrayList<>(); + final CountDownLatch latch = new CountDownLatch(ldapServersToKill.size()); + final CountDownLatch closeLatch = new CountDownLatch(1); + try { + final AtomicBoolean success = new AtomicBoolean(true); + for (InMemoryDirectoryServer ldapServerToKill : ldapServersToKill) { + final int index = ldapServersList.indexOf(ldapServerToKill); + assertThat(index, greaterThanOrEqualTo(0)); + final int port = ldapServers[index].getListenPort(); + logger.debug("shutting down server index [{}] listening on [{}]", index, port); + assertTrue(ports.remove(Integer.valueOf(port))); + ldapServers[index].shutDown(true); + + // when running multiple test jvms, there is a chance that something else could + // start listening on this port so we try to avoid this by creating a local socket + // that will be bound to the port the ldap server was running on and connecting to + // a mock server socket. + // NOTE: this is not perfect as there is a small amount of time between the shutdown + // of the ldap server and the opening of the socket + logger.debug("opening mock client sockets bound to [{}]", port); + Runnable runnable = new PortBlockingRunnable(mockServerSocket.getInetAddress(), mockServerSocket.getLocalPort(), port, + latch, closeLatch, success); + Thread thread = new Thread(runnable); + thread.start(); + listenThreads.add(thread); + + assertThat(ldapServers[index].getListenPort(), is(-1)); + } + + latch.await(); + + assumeTrue("Failed to open sockets on all addresses with the port that an LDAP server was bound to. 
Some operating systems " + + "allow binding to an address and port combination even if an application is bound to the port on a wildcard address", + success.get()); + final int numberOfIterations = randomIntBetween(1, 5); + logger.debug("list of all open ports {}", ports); + // go one iteration through and attempt a bind + for (int iteration = 0; iteration < numberOfIterations; iteration++) { + logger.debug("iteration [{}]", iteration); + for (Integer port : ports) { + logger.debug("attempting connection with expected port [{}]", port); + LDAPConnection connection = null; + try { + do { + final LDAPConnection finalConnection = + LdapUtils.privilegedConnect(testSessionFactory.getServerSet()::getConnection); + connection = finalConnection; + logger.debug("established connection with port [{}] expected port [{}]", + finalConnection.getConnectedPort(), port); + if (finalConnection.getConnectedPort() != port) { + LDAPException e = expectThrows(LDAPException.class, () -> finalConnection.bind(new SimpleBindRequest())); + assertThat(e.getMessage(), containsString("not connected")); + finalConnection.close(); + } + } while (connection.getConnectedPort() != port); + + assertThat(connection.getConnectedPort(), is(port)); + } finally { + if (connection != null) { + connection.close(); + } + } + } + } + } finally { + closeLatch.countDown(); + mockServerSocket.close(); + for (Thread t : listenThreads) { + t.join(); + } + } + } + + @SuppressForbidden(reason = "Allow opening socket for test") + private MockSocket openMockSocket(InetAddress remoteAddress, int remotePort, InetAddress localAddress, int localPort) + throws IOException { + final MockSocket socket = new MockSocket(); + socket.setReuseAddress(true); // allow binding even if the previous socket is in timed wait state. + socket.setSoLinger(true, 0); // close immediately as we are not writing anything here. + socket.bind(new InetSocketAddress(localAddress, localPort)); + SocketAccess.doPrivileged(() -> socket.connect(new InetSocketAddress(remoteAddress, remotePort))); + return socket; + } + + public void testFailover() throws Exception { + assumeTrue("at least two ldap servers should be present for this test", ldapServers.length > 1); + logger.debug("using [{}] ldap servers, urls {}", ldapServers.length, ldapUrls()); + TestSessionFactory testSessionFactory = createSessionFactory(LdapLoadBalancing.FAILOVER); + + // first test that there is no round robin stuff going on + final int firstPort = ldapServers[0].getListenPort(); + for (int i = 0; i < numberOfLdapServers; i++) { + try (LDAPConnection connection = LdapUtils.privilegedConnect(testSessionFactory.getServerSet()::getConnection)) { + assertThat(connection.getConnectedPort(), is(firstPort)); + } + } + + // we need at least one good server. Hence the upper bound is number - 2 since we need at least + // one server to use! 
+ InMemoryDirectoryServer[] allButFirstServer = Arrays.copyOfRange(ldapServers, 1, ldapServers.length); + final List ldapServersToKill; + if (ldapServers.length > 2) { + final int numberToKill = randomIntBetween(1, numberOfLdapServers - 2); + ldapServersToKill = randomSubsetOf(numberToKill, allButFirstServer); + ldapServersToKill.add(ldapServers[0]); // always kill the first one + } else { + ldapServersToKill = Collections.singletonList(ldapServers[0]); + } + final List ldapServersList = Arrays.asList(ldapServers); + final MockServerSocket mockServerSocket = new MockServerSocket(0, 0); + final List listenThreads = new ArrayList<>(); + final CountDownLatch latch = new CountDownLatch(ldapServersToKill.size()); + final CountDownLatch closeLatch = new CountDownLatch(1); + final AtomicBoolean success = new AtomicBoolean(true); + for (InMemoryDirectoryServer ldapServerToKill : ldapServersToKill) { + final int index = ldapServersList.indexOf(ldapServerToKill); + final int port = ldapServers[index].getListenPort(); + logger.debug("shutting down server index [{}] listening on [{}]", index, port); + ldapServers[index].shutDown(true); + + // when running multiple test jvms, there is a chance that something else could + // start listening on this port so we try to avoid this by creating a local socket + // that will be bound to the port the ldap server was running on and connecting to + // a mock server socket. + // NOTE: this is not perfect as there is a small amount of time between the shutdown + // of the ldap server and the opening of the socket + logger.debug("opening mock server socket listening on [{}]", port); + Runnable runnable = new PortBlockingRunnable(mockServerSocket.getInetAddress(), mockServerSocket.getLocalPort(), port, + latch, closeLatch, success); + Thread thread = new Thread(runnable); + thread.start(); + listenThreads.add(thread); + + assertThat(ldapServers[index].getListenPort(), is(-1)); + } + + try { + latch.await(); + + assumeTrue("Failed to open sockets on all addresses with the port that an LDAP server was bound to. 
Some operating systems " + + "allow binding to an address and port combination even if an application is bound to the port on a wildcard address", + success.get()); + int firstNonStoppedPort = -1; + // now we find the first that isn't stopped + for (int i = 0; i < numberOfLdapServers; i++) { + if (ldapServers[i].getListenPort() != -1) { + firstNonStoppedPort = ldapServers[i].getListenPort(); + break; + } + } + logger.debug("first non stopped port [{}]", firstNonStoppedPort); + assertThat(firstNonStoppedPort, not(-1)); + final int numberOfIterations = randomIntBetween(1, 5); + for (int iteration = 0; iteration < numberOfIterations; iteration++) { + logger.debug("attempting connection with expected port [{}] iteration [{}]", firstNonStoppedPort, iteration); + LDAPConnection connection = null; + try { + do { + final LDAPConnection finalConnection = + LdapUtils.privilegedConnect(testSessionFactory.getServerSet()::getConnection); + connection = finalConnection; + logger.debug("established connection with port [{}] expected port [{}]", + finalConnection.getConnectedPort(), firstNonStoppedPort); + if (finalConnection.getConnectedPort() != firstNonStoppedPort) { + LDAPException e = expectThrows(LDAPException.class, () -> finalConnection.bind(new SimpleBindRequest())); + assertThat(e.getMessage(), containsString("not connected")); + finalConnection.close(); + } + } while (connection.getConnectedPort() != firstNonStoppedPort); + + assertThat(connection.getConnectedPort(), is(firstNonStoppedPort)); + } finally { + if (connection != null) { + connection.close(); + } + } + } + } finally { + closeLatch.countDown(); + mockServerSocket.close(); + for (Thread t : listenThreads) { + t.join(); + } + } + } + + private TestSessionFactory createSessionFactory(LdapLoadBalancing loadBalancing) throws Exception { + String groupSearchBase = "cn=HMS Lydia,ou=crews,ou=groups,o=sevenSeas"; + String userTemplate = "cn={0},ou=people,o=sevenSeas"; + Settings settings = buildLdapSettings(ldapUrls(), new String[] { userTemplate }, groupSearchBase, + LdapSearchScope.SUB_TREE, loadBalancing); + Settings globalSettings = Settings.builder().put("path.home", createTempDir()).put(settings).build(); + RealmConfig config = new RealmConfig(REALM_IDENTIFIER, globalSettings, + TestEnvironment.newEnvironment(globalSettings), new ThreadContext(Settings.EMPTY)); + return new TestSessionFactory(config, new SSLService(Settings.EMPTY, TestEnvironment.newEnvironment(config.settings())), + threadPool); + } + + private class PortBlockingRunnable implements Runnable { + + private final InetAddress serverAddress; + private final int serverPort; + private final int portToBind; + private final CountDownLatch latch; + private final CountDownLatch closeLatch; + private final AtomicBoolean success; + + private PortBlockingRunnable(InetAddress serverAddress, int serverPort, int portToBind, CountDownLatch latch, + CountDownLatch closeLatch, AtomicBoolean success) { + this.serverAddress = serverAddress; + this.serverPort = serverPort; + this.portToBind = portToBind; + this.latch = latch; + this.closeLatch = closeLatch; + this.success = success; + } + + @Override + public void run() { + final List openedSockets = new ArrayList<>(); + final List blacklistedAddress = new ArrayList<>(); + try { + final boolean allSocketsOpened = awaitBusy(() -> { + try { + InetAddress[] allAddresses = InetAddressHelper.getAllAddresses(); + if (serverAddress instanceof Inet4Address) { + allAddresses = InetAddressHelper.filterIPV4(allAddresses); + } else { + allAddresses = 
InetAddressHelper.filterIPV6(allAddresses); + } + final List inetAddressesToBind = Arrays.stream(allAddresses) + .filter(addr -> openedSockets.stream().noneMatch(s -> addr.equals(s.getLocalAddress()))) + .filter(addr -> addr instanceof Inet4Address) + .filter(addr -> blacklistedAddress.contains(addr) == false) + .collect(Collectors.toList()); + for (InetAddress localAddress : inetAddressesToBind) { + try { + final Socket socket = openMockSocket(serverAddress, serverPort, localAddress, portToBind); + openedSockets.add(socket); + logger.debug("opened socket [{}]", socket); + } catch (NoRouteToHostException e) { + logger.debug(new ParameterizedMessage("blacklisting address [{}] due to:", localAddress), e); + blacklistedAddress.add(localAddress); + } catch (ConnectException e) { + logger.debug(new ParameterizedMessage("blacklisting address [{}] due to:", localAddress), e); + blacklistedAddress.add(localAddress); + } + } + if (openedSockets.size() == 0) { + logger.debug("Could not open any sockets from the available addresses"); + return false; + } + return true; + } catch (IOException e) { + logger.debug(new ParameterizedMessage("caught exception while opening socket on [{}]", portToBind), e); + return false; + } + }); + + if (allSocketsOpened) { + latch.countDown(); + } else { + success.set(false); + IOUtils.closeWhileHandlingException(openedSockets); + openedSockets.clear(); + latch.countDown(); + return; + } + } catch (InterruptedException e) { + logger.debug(new ParameterizedMessage("interrupted while trying to open sockets on [{}]", portToBind), e); + Thread.currentThread().interrupt(); + } + + try { + closeLatch.await(); + } catch (InterruptedException e) { + logger.debug("caught exception while waiting for close latch", e); + Thread.currentThread().interrupt(); + } finally { + logger.debug("closing sockets on [{}]", portToBind); + IOUtils.closeWhileHandlingException(openedSockets); + } + } + } + + static class TestSessionFactory extends SessionFactory { + + protected TestSessionFactory(RealmConfig config, SSLService sslService, ThreadPool threadPool) { + super(config, sslService, threadPool); + } + + @Override + public void session(String user, SecureString password, ActionListener listener) { + listener.onResponse(null); + } + } +} From b5d7f92740e68813dd24aae1853e4bc4cb707645 Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Fri, 29 Mar 2019 17:34:41 -0400 Subject: [PATCH 11/25] Update documentation --- .../bucket/rare-terms-aggregation.asciidoc | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc index 214fb4102327f..32ce8d25d647e 100644 --- a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc @@ -75,6 +75,8 @@ A `rare_terms` aggregation looks like this in isolation: |Parameter Name |Description |Required |Default Value |`field` |The field we wish to find rare terms in |Required | |`max_doc_count` |The maximum number of documents a term should appear in. |Optional |`1` +|`precision` |The precision of the internal CuckooFilters. Smaller precision leads to +better approximation, but higher memory usage. 
Cannot be smaller than `0.00001` |Optional |`0.01` |`include` |Terms that should be included in the aggregation|Optional | |`exclude` |Terms that should be excluded from the aggregation|Optional | |`missing` |The value that should be used if a document does not have the field being aggregated|Optional | @@ -187,19 +189,27 @@ Instead, the Rare Terms aggregation uses a different approximate algorithm: 1. Values are placed in a map the first time they are seen. 2. Each additional occurrence of the term increments a counter in the map -3. If the counter > the `max_doc_count` threshold, the term is removed from the map and placed in a bloom filter -4. The bloom filter is consulted on each term. If the value is inside the bloom, it is known to be above the +3. If the counter > the `max_doc_count` threshold, the term is removed from the map and placed in a CuckooFilter +4. The CuckooFilteris consulted on each term. If the value is inside the filter, it is known to be above the threshold already and skipped. -After execution, the map of values is the map of "rare" terms under the `max_doc_count` threshold. This map and bloom -filter is then merged with all other shards. If there are terms that are greater than the threshold (or appear in -a different shard's bloom filter) the term is removed from the merged list. The final map of values is returned +After execution, the map of values is the map of "rare" terms under the `max_doc_count` threshold. This map and CuckooFilter +are then merged with all other shards. If there are terms that are greater than the threshold (or appear in +a different shard's CuckooFilter) the term is removed from the merged list. The final map of values is returned to the user as the "rare" terms. -Bloom filters have the possibility of returning false positives (they can say a value exists in their collection when -it does not actually). Since the Bloom filter is being used to see if a term is over threshold, this means a false positive -from the bloom filter will mistakenly say a value is common when it is not (and thus exclude it from it final list of buckets). +CuckooFilter have the possibility of returning false positives (they can say a value exists in their collection when +it does not actually). Since the CuckooFilter is being used to see if a term is over threshold, this means a false positive +from the CuckooFilter will mistakenly say a value is common when it is not (and thus exclude it from its final list of buckets). +==== Precision + +Although the internal CuckooFilter is approximate in nature, the false-positive rate can be controlled with a +`precision` parameter. This allows the user to trade more runtime memory for more accurate results. + +The default precision is `0.01`, and the smallest (i.e. most accurate and largest memory overhead) is `0.00001`.
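To make steps 1-4 concrete, here is a minimal, self-contained sketch of the collection loop. A plain `HashSet` stands in for the CuckooFilter (so the sketch is exact rather than approximate), and every name in it is illustrative only, not part of the Elasticsearch API:

[source,java]
--------------------------------------------------
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

class RareTermsSketch {
    static Set<String> rareTerms(List<String> terms, long maxDocCount) {
        Map<String, Long> counts = new HashMap<>();   // candidate "rare" terms (step 1)
        Set<String> overThreshold = new HashSet<>();  // stand-in for the CuckooFilter
        for (String term : terms) {
            if (overThreshold.contains(term)) {
                continue;                             // step 4: already known to be common
            }
            long count = counts.merge(term, 1L, Long::sum);  // steps 1-2
            if (count > maxDocCount) {                // step 3: threshold breached
                counts.remove(term);
                overThreshold.add(term);
            }
        }
        return counts.keySet();                       // whatever survives is "rare"
    }
}
--------------------------------------------------
// NOTCONSOLE

In the real aggregation the stand-in set is a `SetBackedScalingCuckooFilter`, whose per-filter false-positive rate is what the `precision` parameter above controls: a membership check can wrongly report a term as over-threshold, which is exactly the error mode described in this section.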
+ +TODO charts here ==== Filtering Values From d9717abff1243cf4d30b20f29ddc4002b83e4006 Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Tue, 2 Apr 2019 10:27:12 -0400 Subject: [PATCH 12/25] Remove benchmark, temp was not supposed to be committed --- .../terms/RareTermsAggregatorTests.java | 73 ------------------- 1 file changed, 73 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java index b56421c952afc..8f42b48b1d33e 100644 --- a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java +++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java @@ -489,38 +489,6 @@ private List generateDocsWithNested(String id, int value, int[] nested return documents; } - public void testBenchmark() throws IOException { - Query query = new MatchAllDocsQuery(); - - int size = 3_000_000; - List d = new ArrayList<>(size); - - long[] uniqueValues = new long[]{1, 100, 1000, 10000}; - for (long v : uniqueValues) { - for (long i = 0; i < v; i++) { - d.add(i); - } - - for (long i = v; i < size; i++) { - d.add(i); - d.add(i); - - - } - - executeBench(true, query, d, - aggregation -> aggregation.field(LONG_FIELD).maxDocCount(1), - agg -> { - logger.error(v + ": " + agg.getBuckets().size()); - }, - ValueType.NUMERIC); - d.clear(); - } - - - - - } private InternalAggregation buildInternalAggregation(RareTermsAggregationBuilder builder, MappedFieldType fieldType, IndexSearcher searcher) throws IOException { @@ -611,47 +579,6 @@ private void executeTestCase(boolean reduced, Query query, List dataset, } } - private void executeBench(boolean reduced, Query query, List dataset, - Consumer configure, - Consumer verify, ValueType valueType) throws IOException { - - try (Directory directory = newDirectory()) { - try (RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory)) { - Document document = new Document(); - for (Long value : dataset) { - - document.add(new SortedNumericDocValuesField(LONG_FIELD, value)); - document.add(new LongPoint(LONG_FIELD, value)); - indexWriter.addDocument(document); - document.clear(); - } - } - - logger.info("Start agg"); - try (IndexReader indexReader = DirectoryReader.open(directory)) { - IndexSearcher indexSearcher = newIndexSearcher(indexReader); - - RareTermsAggregationBuilder aggregationBuilder = new RareTermsAggregationBuilder("_name", valueType); - if (configure != null) { - configure.accept(aggregationBuilder); - } - - MappedFieldType longFieldType = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG); - longFieldType.setName(LONG_FIELD); - longFieldType.setHasDocValues(true); - - InternalMappedRareTerms rareTerms; - if (reduced) { - rareTerms = searchAndReduce(indexSearcher, query, aggregationBuilder, longFieldType); - } else { - rareTerms = search(indexSearcher, query, aggregationBuilder, longFieldType); - } - verify.accept(rareTerms); - } - logger.info("End agg"); - } - } - @Override public void doAssertReducedMultiBucketConsumer(Aggregation agg, MultiBucketConsumerService.MultiBucketConsumer bucketConsumer) { /* From d95d8048fb96d80f699e4f3e255b99116298a375 Mon Sep 17 00:00:00 2001 From: Colin Goodheart-Smithe Date: Thu, 4 Apr 2019 14:42:27 -0400 Subject: [PATCH 13/25] Apply suggestions from code review Co-Authored-By: polyfractal --- 
.../aggregations/bucket/rare-terms-aggregation.asciidoc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc index 32ce8d25d647e..d40b636393bbf 100644 --- a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc @@ -169,7 +169,7 @@ This now shows the "jazz" term which has a `doc_count` of 2": ==== Maximum document count The `max_doc_count` parameter is used to control the upper bound of document counts that a term can have. There -is not a size limitation on the `rare_terms` agg like `terms` agg has. This means that _all_ terms +is not a size limitation on the `rare_terms` agg like `terms` agg has. This means that terms which match the `max_doc_count` criteria will be returned. The aggregation functions in this manner to avoid the order-by-ascending issues that afflict the `terms` aggregation. @@ -190,7 +190,7 @@ Instead, the Rare Terms aggregation uses a different approximate algorithm: 1. Values are placed in a map the first time they are seen. 2. Each additional occurrence of the term increments a counter in the map 3. If the counter > the `max_doc_count` threshold, the term is removed from the map and placed in a CuckooFilter -4. The CuckooFilteris consulted on each term. If the value is inside the filter, it is known to be above the +4. The CuckooFilter is consulted on each term. If the value is inside the filter, it is known to be above the threshold already and skipped. After execution, the map of values is the map of "rare" terms under the `max_doc_count` threshold. This map and CuckooFilter From d9998d59d6c368f9ed4f57a099aa091f1ebac147 Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Fri, 5 Apr 2019 12:26:10 -0400 Subject: [PATCH 14/25] Address review comments - Docs tweaks - Comments - Remove map used by each aggregator and rely on ords instead, which allowed refactoring part of the logic to the abstract class --- .../bucket/rare-terms-aggregation.asciidoc | 20 +++- .../common/util/CuckooFilter.java | 7 ++ .../terms/AbstractRareTermsAggregator.java | 69 +++++++++++++- .../bucket/terms/LongRareTermsAggregator.java | 95 +++++++------------ .../terms/StringRareTermsAggregator.java | 95 +++++++------------ 5 files changed, 154 insertions(+), 132 deletions(-) diff --git a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc index d40b636393bbf..7680f7b856bb1 100644 --- a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc @@ -189,7 +189,8 @@ Instead, the Rare Terms aggregation uses a different approximate algorithm: 1. Values are placed in a map the first time they are seen. 2. Each additional occurrence of the term increments a counter in the map -3. If the counter > the `max_doc_count` threshold, the term is removed from the map and placed in a CuckooFilter +3. If the counter > the `max_doc_count` threshold, the term is removed from the map and placed in a +https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf[CuckooFilter] 4. The CuckooFilter is consulted on each term. If the value is inside the filter, it is known to be above the threshold already and skipped. After execution, the map of values is the map of "rare" terms under the `max_doc_count` threshold. This map and CuckooFilter
If there are terms that are greater than a different shard's CuckooFilter) the term is removed from the merged list. The final map of values is returned to the user as the "rare" terms. -CuckooFilter have the possibility of returning false positives (they can say a value exists in their collection when +CuckooFilters have the possibility of returning false positives (they can say a value exists in their collection when it does not actually). Since the CuckooFilter is being used to see if a term is over threshold, this means a false positive from the CuckooFilter will mistakenly say a value is common when it is not (and thus exclude it from it final list of buckets). +CuckooFilters are described in more detail in the paper: + +https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf[Fan, Bin, et al. "Cuckoo filter: Practically better than bloom."] +Proceedings of the 10th ACM International on Conference on emerging Networking Experiments and Technologies. ACM, 2014. + ==== Precision Although the internal CuckooFilter is approximate in nature, the false-positive rate can be controlled with a @@ -289,3 +295,13 @@ GET /_search // CONSOLE <1> Documents without a value in the `tags` field will fall into the same bucket as documents that have the value `N/A`. + +==== Nested, RareTerms, and scoring sub-aggregations + +The RareTerms aggregation has to operate in `breadth_first` mode, since it needs to prune terms as doc count thresholds +are breached. This requirement means the RareTerms aggregation is incompatible with certain combinations of aggregations +that require `depth_first`. In particular, scoring sub-aggregations that are inside a `nested` force the entire aggregation tree to run +in `depth_first` mode. This will throw an exception since RareTerms is unable to process `depth_first`. + +As a concrete example, if `rare_terms` aggregation is the child of a `nested` aggregation, and one of the child aggregations of `rare_terms` +needs document scores (like a `top_hits` aggregation), this will throw an exception. \ No newline at end of file diff --git a/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java b/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java index 9b9cf524668b6..18d4c1151d530 100644 --- a/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java +++ b/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java @@ -48,6 +48,13 @@ * fingerprints (e.g. when inserting, if the fingerprint is already present in the * candidate buckets, it is not inserted). By not saving duplicates, the CuckooFilter * loses the ability to delete values. + * + * Based on the paper: + * + * Fan, Bin, et al. "Cuckoo filter: Practically better than bloom." + * Proceedings of the 10th ACM International on Conference on emerging Networking Experiments and Technologies. ACM, 2014. 
+ * + * https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf */ public class CuckooFilter implements Writeable { diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java index ea3d461852e75..109525219e3ca 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java @@ -19,7 +19,6 @@ package org.elasticsearch.search.aggregations.bucket.terms; -import org.elasticsearch.common.Randomness; import org.elasticsearch.common.util.SetBackedScalingCuckooFilter; import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.aggregations.Aggregator; @@ -37,15 +36,16 @@ import java.io.IOException; import java.util.List; import java.util.Map; +import java.util.Random; -public abstract class AbstractRareTermsAggregator +public abstract class AbstractRareTermsAggregator extends DeferableBucketAggregator { /** Sets the number of "removed" values to accumulate before we purge ords via the MergingBucketCollector's mergeBuckets() method */ - static final long GC_THRESHOLD = 1000000; + private static final long GC_THRESHOLD = 1000000; static final BucketOrder ORDER = BucketOrder.compound(BucketOrder.count(true), BucketOrder.key(true)); // sort by count ascending protected final long maxDocCount; @@ -55,7 +55,7 @@ public abstract class AbstractRareTermsAggregator= 0; + collectBucket(subCollectors, docId, ord); + + } else { + // we've seen this value before, see if it is below threshold + long termCount = bucketDocCount(bucketOrdinal); + if (termCount < maxDocCount) { + // TODO if we only need maxDocCount==1, we could specialize + // and use a bitset instead of a counter scheme + + collectExistingBucket(subCollectors, docId, bucketOrdinal); + + } else { + // Otherwise we've breached the threshold, add to the cuckoo filter + addValueToFilter(val); + numDeleted += 1; + + // This is a bit hacky, but we need to collect the value once more to + // make sure the doc_count is over threshold (used later when gc'ing) + collectExistingBucket(subCollectors, docId, bucketOrdinal); + + if (numDeleted > GC_THRESHOLD) { + gcDeletedEntries(numDeleted); + numDeleted = 0; + } + } + } + } + } + /** * Remove entries from the ordinal map which are no longer tracked in the active key's map. * Will internally call the merge function of {@link MergingBucketsDeferringCollector}, so this @@ -138,4 +176,25 @@ private String descendsFromNestedAggregator(Aggregator parent) { * Used to help verify correct functioning of GC */ abstract void gcDeletedEntries(long numDeleted); + + /** + * Returns true if the aggregator's approximate filter contains the value, false otherwise + */ + abstract boolean filterMightContain(V value); + + /** + * Returns the bucket ordinal associated with the value, -1 if the value was not found + */ + abstract long findOrdinal(V value); + + /** + * Add's the value to the ordinal map. Return the newly allocated id if it wasn't in the ordinal map yet, + * or -1-id if it was already present + */ + abstract long addValueToOrds(V value); + + /** + * Adds the value to the aggregator's approximate filter. 
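+     *
+     * Taken together, these hooks drive {@code doCollect} roughly as follows: a value the
+     * approximate filter already contains is skipped outright; an unseen value is registered
+     * through {@code addValueToOrds} and collected as a new bucket; a previously seen value
+     * whose doc count is still under {@code maxDocCount} is collected into its existing
+     * bucket; and a value that breaches the threshold is handed to {@code addValueToFilter},
+     * with its now-dead bucket reclaimed later by {@code gcDeletedEntries}.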
+ */ + abstract void addValueToFilter(V value); } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java index 2be5697423630..da2260c03408b 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java @@ -18,8 +18,6 @@ */ package org.elasticsearch.search.aggregations.bucket.terms; -import com.carrotsearch.hppc.LongLongHashMap; -import com.carrotsearch.hppc.cursors.LongLongCursor; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.util.CollectionUtil; @@ -45,21 +43,16 @@ /** * An aggregator that finds "rare" string values (e.g. terms agg that orders ascending) */ -public class LongRareTermsAggregator extends AbstractRareTermsAggregator { +public class LongRareTermsAggregator extends AbstractRareTermsAggregator { - protected LongLongHashMap map; protected LongHash bucketOrds; - // Size of a key:value pair in the active map, used for CB accounting - private static final long MAP_SLOT_SIZE = Long.BYTES * 2; - LongRareTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Numeric valuesSource, DocValueFormat format, SearchContext aggregationContext, Aggregator parent, IncludeExclude.LongFilter longFilter, int maxDocCount, double precision, List pipelineAggregators, Map metaData) throws IOException { super(name, factories, aggregationContext, parent, pipelineAggregators, metaData, maxDocCount, precision, format, valuesSource, longFilter); - this.map = new LongLongHashMap(); this.bucketOrds = new LongHash(1, aggregationContext.bigArrays()); } @@ -76,61 +69,16 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, } return new LeafBucketCollectorBase(sub, values) { - @Override public void collect(int docId, long owningBucketOrdinal) throws IOException { if (values.advanceExact(docId)) { final int valuesCount = values.docValueCount(); - long previous = Long.MAX_VALUE; for (int i = 0; i < valuesCount; ++i) { final long val = values.nextValue(); if (previous != val || i == 0) { if ((includeExclude == null) || (includeExclude.accept(val))) { - if (filter.mightContain(val) == false) { - long termCount = map.get(val); - if (termCount == 0) { - // Brand new term, save into map - map.put(val, 1L); - addRequestCircuitBreakerBytes(MAP_SLOT_SIZE);// 8 bytes for key, 8 for value - - long bucketOrdinal = bucketOrds.add(val); - if (bucketOrdinal < 0) { // already seen - throw new IllegalStateException("Term count is zero, but an ordinal for this " + - "term has already been recorded"); - } else { - collectBucket(subCollectors, docId, bucketOrdinal); - } - } else { - // We've seen this term before, but less than the threshold - // so just increment its counter - if (termCount < maxDocCount) { - // TODO if we only need maxDocCount==1, we could specialize - // and use a bitset instead of a counter scheme - map.put(val, termCount + 1); - long bucketOrdinal = bucketOrds.add(val); - if (bucketOrdinal < 0) { - bucketOrdinal = - 1 - bucketOrdinal; - collectExistingBucket(subCollectors, docId, bucketOrdinal); - } else { - throw new IllegalStateException("Term has seen before, but we have not recorded " + - "an ordinal yet."); - } - } else { - // Otherwise we've breached the threshold, remove 
from - // the map and add to the cuckoo filter - map.remove(val); - filter.add(val); - addRequestCircuitBreakerBytes(-MAP_SLOT_SIZE); // 8 bytes for key, 8 for value - numDeleted += 1; - - if (numDeleted > GC_THRESHOLD) { - gcDeletedEntries(numDeleted); - numDeleted = 0; - } - } - } - } + doCollect(val, docId); } previous = val; } @@ -140,6 +88,26 @@ public void collect(int docId, long owningBucketOrdinal) throws IOException { }; } + @Override + boolean filterMightContain(Long value) { + return filter.mightContain(value); + } + + @Override + long findOrdinal(Long value) { + return bucketOrds.find(value); + } + + @Override + long addValueToOrds(Long value) { + return bucketOrds.add(value); + } + + @Override + void addValueToFilter(Long value) { + filter.add(value); + } + protected void gcDeletedEntries(long numDeleted) { long deletionCount = 0; LongHash newBucketOrds = new LongHash(1, context.bigArrays()); @@ -150,13 +118,15 @@ protected void gcDeletedEntries(long numDeleted) { long oldKey = oldBucketOrds.get(i); long newBucketOrd = -1; - // if the key still exists in our map, reinsert into the new ords - if (map.containsKey(oldKey)) { + long docCount = bucketDocCount(i); + // if the key is below threshold, reinsert into the new ords + if (docCount <= maxDocCount) { newBucketOrd = newBucketOrds.add(oldKey); } else { // Make a note when one of the ords has been deleted deletionCount += 1; } + mergeMap[i] = newBucketOrd; } @@ -180,16 +150,15 @@ protected void gcDeletedEntries(long numDeleted) { @Override public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOException { assert owningBucketOrdinal == 0; - List buckets = new ArrayList<>(map.size()); + List buckets = new ArrayList<>(); - for (LongLongCursor cursor : map) { - // The collection managed pruning unwanted terms, so any + for (long i = 0; i < bucketOrds.size(); i++) { + // The agg managed pruning unwanted terms at runtime, so any // terms that made it this far are "rare" and we want buckets - long bucketOrdinal = bucketOrds.find(cursor.key); LongRareTerms.Bucket bucket = new LongRareTerms.Bucket(0, 0, null, format); - bucket.term = cursor.key; - bucket.docCount = cursor.value; - bucket.bucketOrd = bucketOrdinal; + bucket.term = bucketOrds.get(i); + bucket.docCount = bucketDocCount(i); + bucket.bucketOrd = i; buckets.add(bucket); consumeBucketsAndMaybeBreak(1); diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java index 24dab7fa1612e..b7691bc4630ab 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java @@ -18,8 +18,6 @@ */ package org.elasticsearch.search.aggregations.bucket.terms; -import com.carrotsearch.hppc.ObjectLongHashMap; -import com.carrotsearch.hppc.cursors.ObjectLongCursor; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; @@ -47,19 +45,14 @@ /** * An aggregator that finds "rare" string values (e.g. 
terms agg that orders ascending) */ -public class StringRareTermsAggregator extends AbstractRareTermsAggregator { - protected ObjectLongHashMap map; +public class StringRareTermsAggregator extends AbstractRareTermsAggregator { protected BytesRefHash bucketOrds; - // Size of values in active map, used for CB accounting - private static final long MAP_VALUE_SIZE = Long.BYTES; - StringRareTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes valuesSource, DocValueFormat format, IncludeExclude.StringFilter stringFilter, SearchContext context, Aggregator parent, List pipelineAggregators, Map metaData, long maxDocCount, double precision) throws IOException { super(name, factories, context, parent, pipelineAggregators, metaData, maxDocCount, precision, format, valuesSource, stringFilter); - this.map = new ObjectLongHashMap<>(); this.bucketOrds = new BytesRefHash(1, context.bigArrays()); } @@ -91,48 +84,7 @@ public void collect(int docId, long bucket) throws IOException { continue; } - if (filter.mightContain(bytes) == false) { - long valueCount = map.get(bytes); - if (valueCount == 0) { - // Brand new term, save into map - map.put(BytesRef.deepCopyOf(bytes), 1L); - addRequestCircuitBreakerBytes(bytes.length + MAP_VALUE_SIZE); // size of term + 8 for counter - - long bucketOrdinal = bucketOrds.add(bytes); - if (bucketOrdinal < 0) { // already seen - throw new IllegalStateException("Term count is zero, but an ordinal for this " + - "term has already been recorded"); - } else { - collectBucket(subCollectors, docId, bucketOrdinal); - } - } else { - // We've seen this term before, but less than the threshold - // so just increment its counter - if (valueCount < maxDocCount) { - map.put(bytes, valueCount + 1); - long bucketOrdinal = bucketOrds.add(bytes); - if (bucketOrdinal < 0) { - bucketOrdinal = - 1 - bucketOrdinal; - collectExistingBucket(subCollectors, docId, bucketOrdinal); - } else { - throw new IllegalStateException("Term has seen before, but we have not recorded " + - "an ordinal yet."); - } - } else { - // Otherwise we've breached the threshold, remove from - // the map and add to the cuckoo filter - map.remove(bytes); - filter.add(bytes); - numDeleted += 1; - addRequestCircuitBreakerBytes(-(bytes.length + MAP_VALUE_SIZE)); // size of term + 8 for counter - - if (numDeleted > GC_THRESHOLD) { - gcDeletedEntries(numDeleted); - numDeleted = 0; - } - } - } - } + doCollect(bytes, docId); previous.copyBytes(bytes); } } @@ -140,6 +92,26 @@ public void collect(int docId, long bucket) throws IOException { }; } + @Override + boolean filterMightContain(BytesRef value) { + return filter.mightContain(value); + } + + @Override + long findOrdinal(BytesRef value) { + return bucketOrds.find(value); + } + + @Override + long addValueToOrds(BytesRef value) { + return bucketOrds.add(value); + } + + @Override + void addValueToFilter(BytesRef value) { + filter.add(value); + } + protected void gcDeletedEntries(long numDeleted) { long deletionCount = 0; BytesRefHash newBucketOrds = new BytesRefHash(1, context.bigArrays()); @@ -150,9 +122,9 @@ protected void gcDeletedEntries(long numDeleted) { for (int i = 0; i < oldBucketOrds.size(); i++) { BytesRef oldKey = oldBucketOrds.get(i, scratch); long newBucketOrd = -1; - - // if the key still exists in our map, reinsert into the new ords - if (map.containsKey(oldKey)) { + long docCount = bucketDocCount(i); + // if the key is below threshold, reinsert into the new ords + if (docCount <= maxDocCount) { newBucketOrd = newBucketOrds.add(oldKey); } else 
{ // Make a note when one of the ords has been deleted @@ -181,17 +153,16 @@ protected void gcDeletedEntries(long numDeleted) { public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOException { assert owningBucketOrdinal == 0; - List buckets = new ArrayList<>(map.size()); - - for (ObjectLongCursor cursor : map) { - StringRareTerms.Bucket bucket = new StringRareTerms.Bucket(new BytesRef(), 0, null, format); + List buckets = new ArrayList<>(); - // The collection managed pruning unwanted terms, so any + for (long i = 0; i < bucketOrds.size(); i++) { + // The agg managed pruning unwanted terms at runtime, so any // terms that made it this far are "rare" and we want buckets - long bucketOrdinal = bucketOrds.find(cursor.key); - bucket.termBytes = BytesRef.deepCopyOf(cursor.key); - bucket.docCount = cursor.value; - bucket.bucketOrd = bucketOrdinal; + StringRareTerms.Bucket bucket = new StringRareTerms.Bucket(new BytesRef(), 0, null, format); + bucketOrds.get(i, bucket.termBytes ); + bucket.termBytes = BytesRef.deepCopyOf(bucket.termBytes); + bucket.docCount = bucketDocCount(i); + bucket.bucketOrd = i; buckets.add(bucket); consumeBucketsAndMaybeBreak(1); From b82617509f67cc8ecf9a9c6386d6dbff1222f272 Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Fri, 12 Apr 2019 15:51:32 -0400 Subject: [PATCH 15/25] Address review comments --- .../common/util/CuckooFilter.java | 97 +++++---- .../util/SetBackedScalingCuckooFilter.java | 200 +++++++++++------- .../SetBackedScalingCuckooFilterTests.java | 46 ++++ 3 files changed, 213 insertions(+), 130 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java b/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java index 18d4c1151d530..3c40043a8adfa 100644 --- a/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java +++ b/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java @@ -41,13 +41,16 @@ * are not possible though; if the structure says it _has not_ seen an item, that can be * trusted. * - * The filter can "saturate" at which point the map is fully loaded and will refuse to accept + * The filter can "saturate" at which point the map has hit it's configured load factor (or near enough + * that a large number of evictions are not able to find a free slot) and will refuse to accept * any new insertions. * * NOTE: this version does not support deletions, and as such does not save duplicate * fingerprints (e.g. when inserting, if the fingerprint is already present in the * candidate buckets, it is not inserted). By not saving duplicates, the CuckooFilter - * loses the ability to delete values. + * loses the ability to delete values. 
But by not allowing deletions, we can save space + (no need to waste slots on duplicate fingerprints), and we do not need to worry + about inserts "overflowing" a bucket because the same item is inserted repeatedly * * Based on the paper: * @@ -84,7 +87,6 @@ public class CuckooFilter implements Writeable { this.bitsPerEntry = bitsPerEntry(fpp, entriesPerBucket); this.numBuckets = getNumBuckets(capacity, loadFactor, entriesPerBucket); - // This shouldn't happen, but as a sanity check if (numBuckets * entriesPerBucket > Integer.MAX_VALUE) { throw new IllegalArgumentException("Attempted to create [" + numBuckets * entriesPerBucket + "] entries which is > Integer.MAX_VALUE"); @@ -95,6 +97,27 @@ public class CuckooFilter implements Writeable { this.fingerprintMask = (0x80000000 >> (bitsPerEntry - 1)) >>> (Integer.SIZE - bitsPerEntry); } + CuckooFilter(CuckooFilter other) { + this.numBuckets = other.numBuckets; + this.bitsPerEntry = other.bitsPerEntry; + this.entriesPerBucket = other.entriesPerBucket; + this.count = other.count; + this.evictedFingerprint = other.evictedFingerprint; + this.rng = other.rng; + this.fingerprintMask = other.fingerprintMask; + + // This shouldn't happen, but as a sanity check + if (numBuckets * entriesPerBucket > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Attempted to create [" + numBuckets * entriesPerBucket + + "] entries which is > Integer.MAX_VALUE"); + } + // TODO this is probably super slow, but just used for testing atm + this.data = PackedInts.getMutable(numBuckets * entriesPerBucket, bitsPerEntry, PackedInts.COMPACT); + for (int i = 0; i < other.data.size(); i++) { + data.set(i, other.data.get(i)); + } + } + CuckooFilter(StreamInput in, Random rng) throws IOException { this.numBuckets = in.readVInt(); this.bitsPerEntry = in.readVInt(); @@ -118,25 +141,25 @@ public void readBytes(byte[] b, int offset, int len) throws IOException { }); } - CuckooFilter(CuckooFilter other) { - this.numBuckets = other.numBuckets; - this.bitsPerEntry = other.bitsPerEntry; - this.entriesPerBucket = other.entriesPerBucket; - this.count = other.count; - this.evictedFingerprint = other.evictedFingerprint; - this.rng = other.rng; - this.fingerprintMask = other.fingerprintMask; + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeVInt(numBuckets); + out.writeVInt(bitsPerEntry); + out.writeVInt(entriesPerBucket); + out.writeVInt(count); + out.writeVInt(evictedFingerprint); - // This shouldn't happen, but as a sanity check - if (numBuckets * entriesPerBucket > Integer.MAX_VALUE) { - throw new IllegalArgumentException("Attempted to create [" + numBuckets * entriesPerBucket - + "] entries which is > Integer.MAX_VALUE"); - } - // TODO this is probably super slow, but just used for testing atm - this.data = PackedInts.getMutable(numBuckets * entriesPerBucket, bitsPerEntry, PackedInts.COMPACT); - for (int i = 0; i < other.data.size(); i++) { - data.set(i, other.data.get(i)); - } + data.save(new DataOutput() { + @Override + public void writeByte(byte b) throws IOException { + out.writeByte(b); + } + + @Override + public void writeBytes(byte[] b, int offset, int length) throws IOException { + out.writeBytes(b, offset, length); + } + }); } /** @@ -362,8 +385,8 @@ private int fingerprint(int hash) { /** * Calculate the optimal number of bits per entry */ - private int bitsPerEntry(double fp, int numEntriesPerBucket) { - return (int) Math.round(log2((2 * numEntriesPerBucket) / fp)); + private int bitsPerEntry(double fpp, int
numEntriesPerBucket) { + return (int) Math.round(log2((2 * numEntriesPerBucket) / fpp)); } /** @@ -418,13 +441,9 @@ private double getLoadFactor(int b) { * TODO: there are schemes to avoid powers of two, might want to investigate those */ private int getNumBuckets(long capacity, double loadFactor, int b) { - // Rounds up to nearest power of 2 long buckets = Math.round((((double) capacity / loadFactor)) / (double) b); - // Make sure it isn't larger than the largest signed power of 2 for an int - if ((1 << -Long.numberOfLeadingZeros(buckets - 1)) > (1 << (Integer.SIZE - 2))) { - throw new IllegalArgumentException("Cannot create more than [" + Integer.MAX_VALUE + "] buckets"); - } + // Rounds up to nearest power of 2 return 1 << -Integer.numberOfLeadingZeros((int)buckets - 1); } @@ -433,30 +452,10 @@ private double log2(double x) { } public long getSizeInBytes() { + // (numBuckets, bitsPerEntry, fingerprintMask, entriesPerBucket, count, evictedFingerprint) * 4b == 24b return data.ramBytesUsed() + 24; } - @Override - public void writeTo(StreamOutput out) throws IOException { - out.writeVInt(numBuckets); - out.writeVInt(bitsPerEntry); - out.writeVInt(entriesPerBucket); - out.writeVInt(count); - out.writeVInt(evictedFingerprint); - - data.save(new DataOutput() { - @Override - public void writeByte(byte b) throws IOException { - out.writeByte(b); - } - - @Override - public void writeBytes(byte[] b, int offset, int length) throws IOException { - out.writeBytes(b, offset, length); - } - }); - } - @Override public int hashCode() { return Objects.hash(numBuckets, bitsPerEntry, entriesPerBucket, count, evictedFingerprint); diff --git a/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java b/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java index 8be2d9a904b6e..4a4d26de20c25 100644 --- a/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java +++ b/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java @@ -52,10 +52,24 @@ */ public class SetBackedScalingCuckooFilter implements Writeable { + /** + * This is the estimated insertion capacity for each individual internal CuckooFilter. + */ private static final int FILTER_CAPACITY = 1000000; - // Package-private for testing + /** + * This set is used to track the insertions before we convert over to an approximate + * filter. This gives us 100% accuracy for small cardinalities. This will be null + * if isSetMode = false; + * + * package-private for testing + */ Set hashes; + + /** + * This list holds our approximate filters, after we have migrated out of a set. + * This will be null if isSetMode = true; + */ List filters; private final int threshold; @@ -65,6 +79,8 @@ public class SetBackedScalingCuckooFilter implements Writeable { private Consumer breaker = aLong -> { //noop }; + + // True if we are tracking inserts with a set, false otherwise private boolean isSetMode = true; /** @@ -74,6 +90,18 @@ public class SetBackedScalingCuckooFilter implements Writeable { * @param fpp the false-positive rate that should be used for the cuckoo filters. */ public SetBackedScalingCuckooFilter(int threshold, Random rng, double fpp) { + if (threshold <= 0) { + throw new IllegalArgumentException("[threshold] must be a positive integer"); + } + + // We have to ensure that, in the worst case, two full sets can be converted into + // one cuckoo filter without overflowing. 
This keeps merging logic simpler + if (threshold * 2 > FILTER_CAPACITY) { + throw new IllegalArgumentException("[threshold] must be smaller than [" + (FILTER_CAPACITY / 2) + "]"); + } + if (fpp < 0) { + throw new IllegalArgumentException("[fpp] must be a positive double"); + } this.hashes = new HashSet<>(threshold); this.threshold = threshold; this.rng = rng; @@ -81,6 +109,20 @@ public SetBackedScalingCuckooFilter(int threshold, Random rng, double fpp) { this.fpp = fpp; } + public SetBackedScalingCuckooFilter(SetBackedScalingCuckooFilter other) { + this.threshold = other.threshold; + this.isSetMode = other.isSetMode; + this.rng = other.rng; + this.breaker = other.breaker; + this.capacity = other.capacity; + this.fpp = other.fpp; + if (isSetMode) { + this.hashes = new HashSet<>(other.hashes); + } else { + this.filters = new ArrayList<>(other.filters); + } + } + public SetBackedScalingCuckooFilter(StreamInput in, Random rng) throws IOException { this.threshold = in.readVInt(); this.isSetMode = in.readBoolean(); @@ -100,20 +142,23 @@ public SetBackedScalingCuckooFilter(StreamInput in, Random rng) throws IOExcepti } } - public SetBackedScalingCuckooFilter(SetBackedScalingCuckooFilter other) { - this.threshold = other.threshold; - this.isSetMode = other.isSetMode; - this.rng = other.rng; - this.breaker = other.breaker; - this.capacity = other.capacity; - this.fpp = other.fpp; + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeVInt(threshold); + out.writeBoolean(isSetMode); + out.writeVInt(capacity); + out.writeDouble(fpp); if (isSetMode) { - this.hashes = new HashSet<>(other.hashes); + out.writeCollection(hashes, (out1, hash) -> { + out1.writeZLong(hash.h1); + out1.writeZLong(hash.h2); + }); } else { - this.filters = new ArrayList<>(other.filters); + out.writeList(filters); } } + /** * Registers a circuit breaker with the datastructure. * @@ -122,7 +167,7 @@ public SetBackedScalingCuckooFilter(SetBackedScalingCuckooFilter other) { * in the registered breaker when configured. 
*/ public void registerBreaker(Consumer breaker) { - this.breaker = breaker; + this.breaker = Objects.requireNonNull(breaker, "Circuit Breaker Consumer cannot be null"); breaker.accept(getSizeInBytes()); } @@ -173,33 +218,31 @@ private boolean mightContainFingerprint(int bucket, int fingerprint) { /** * Add's the provided value to the set for tracking */ - public boolean add(BytesRef value) { + public void add(BytesRef value) { MurmurHash3.Hash128 hash = MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, new MurmurHash3.Hash128()); - return add(hash); + add(hash); } /** * Add's the provided value to the set for tracking */ - public boolean add(byte[] value) { + public void add(byte[] value) { MurmurHash3.Hash128 hash = MurmurHash3.hash128(value, 0, value.length, 0, new MurmurHash3.Hash128()); - return add(hash); + add(hash); } /** * Add's the provided value to the set for tracking */ - public boolean add(long value) { - return add(Numbers.longToBytes(value)); + public void add(long value) { + add(Numbers.longToBytes(value)); } - private boolean add(MurmurHash3.Hash128 hash) { + private void add(MurmurHash3.Hash128 hash) { if (isSetMode) { hashes.add(hash); - if (hashes.size() > threshold) { - convert(); - } - return true; + maybeConvert(); + return; } boolean success = filters.get(filters.size() - 1).add(hash); @@ -210,28 +253,36 @@ private boolean add(MurmurHash3.Hash128 hash) { filters.add(t); breaker.accept(t.getSizeInBytes()); // make sure we account for the new filter } - return true; + } + + private void maybeConvert() { + if (isSetMode && hashes.size() > threshold) { + convert(); + } } /** * If we still holding values in a set, convert this filter into an approximate, cuckoo-backed filter. * This will create a list of CuckooFilters, and null out the set of hashes */ - private void convert() { - if (isSetMode) { - long oldSize = getSizeInBytes(); + void convert() { + if (isSetMode == false) { + throw new IllegalStateException("Cannot convert SetBackedScalingCuckooFilter to approximate " + + "when it has already been converted."); + } + long oldSize = getSizeInBytes(); - filters = new ArrayList<>(); - CuckooFilter t = new CuckooFilter(capacity, fpp, rng); - hashes.forEach(t::add); - filters.add(t); + filters = new ArrayList<>(); + CuckooFilter t = new CuckooFilter(capacity, fpp, rng); + hashes.forEach(t::add); + filters.add(t); - hashes = null; - isSetMode = false; + hashes = null; + isSetMode = false; + + breaker.accept(-oldSize); // this zeros out the overhead of the set + breaker.accept(getSizeInBytes()); // this adds back in the new overhead of the cuckoo filters - breaker.accept(-oldSize); // this zeros out the overhead of the set - breaker.accept(getSizeInBytes()); // this adds back in the new overhead of the cuckoo filters - } } /** @@ -239,9 +290,9 @@ private void convert() { * are tracked, not the overhead of the Set itself. 
*/ public long getSizeInBytes() { - long bytes = 0; + long bytes = 13; // fpp (double), threshold (int), isSetMode (boolean) if (hashes != null) { - bytes = (hashes.size() * 16) + 8 + 4 + 1; + bytes = (hashes.size() * 16); } if (filters != null) { bytes += filters.stream().mapToLong(CuckooFilter::getSizeInBytes).sum(); @@ -256,32 +307,45 @@ public long getSizeInBytes() { * to a cuckoo if it goes over threshold */ public void merge(SetBackedScalingCuckooFilter other) { + // Some basic sanity checks to make sure we can merge + if (this.threshold != other.threshold) { + throw new IllegalStateException("Cannot merge other CuckooFilter because thresholds do not match: [" + + this.threshold + "] vs [" + other.threshold + "]"); + } + if (this.capacity != other.capacity) { + throw new IllegalStateException("Cannot merge other CuckooFilter because capacities do not match: [" + + this.capacity + "] vs [" + other.capacity + "]"); + } + if (this.fpp != other.fpp) { + throw new IllegalStateException("Cannot merge other CuckooFilter because precisions do not match: [" + + this.fpp + "] vs [" + other.fpp + "]"); + } + if (isSetMode && other.isSetMode) { // Both in sets, merge collections then see if we need to convert to cuckoo hashes.addAll(other.hashes); - if (hashes.size() > threshold) { - convert(); - } + maybeConvert(); } else if (isSetMode && other.isSetMode == false) { - // Other is in cuckoo mode, so we convert our set to a cuckoo then merge collections. - // We could probably get fancy and keep our side in set-mode, but simpler to just convert + // Other is in cuckoo mode, so we convert our set to a cuckoo, then + // call the merge function again. Since both are now in set-mode + // this will fall through to the last conditional and do a cuckoo-cuckoo merge convert(); - filters.addAll(other.filters); + merge(other); } else if (isSetMode == false && other.isSetMode) { // Rather than converting the other to a cuckoo first, we can just // replay the values directly into our filter. other.hashes.forEach(this::add); + maybeConvert(); } else { // Both are in cuckoo mode, merge raw fingerprints - int current = 0; - CuckooFilter currentFilter = filters.get(current); + CuckooFilter currentFilter = filters.get(filters.size() - 1); - for (CuckooFilter filter : other.filters) { + for (CuckooFilter otherFilter : other.filters) { // The iterator returns an array of longs corresponding to the // fingerprints for buckets at the current position - Iterator iter = filter.getBuckets(); + Iterator iter = otherFilter.getBuckets(); int bucket = 0; while (iter.hasNext()) { long[] fingerprints = iter.next(); @@ -293,25 +357,14 @@ public void merge(SetBackedScalingCuckooFilter other) { if (fingerprint == CuckooFilter.EMPTY || mightContainFingerprint(bucket, (int) fingerprint)) { continue; } - boolean success = false; - - // If the fingerprint is new, we try to merge it into the filter at our `current` pointer. - // This might fail (e.g. 
the filter is full), so we may have to try multiple times - while (success == false) { - success = currentFilter.mergeFingerprint(bucket, (int) fingerprint); - - // If we failed to insert, the current filter is full, get next one - if (success == false) { - current += 1; - - // if we're out of filters, we need to create a new one - if (current >= filters.size()) { - CuckooFilter t = new CuckooFilter(capacity, fpp, rng); - filters.add(t); - breaker.accept(t.getSizeInBytes()); // make sure we account for the new filter - } - currentFilter = filters.get(current); - } + // Try to insert into the last filter in our list + if (currentFilter.mergeFingerprint(bucket, (int) fingerprint) == false) { + // if we failed, the filter is now saturated and we need to create a new one + CuckooFilter t = new CuckooFilter(capacity, fpp, rng); + filters.add(t); + breaker.accept(t.getSizeInBytes()); // make sure we account for the new filter + + currentFilter = filters.get(filters.size() - 1); } } bucket += 1; @@ -320,21 +373,6 @@ public void merge(SetBackedScalingCuckooFilter other) { } } - @Override - public void writeTo(StreamOutput out) throws IOException { - out.writeVInt(threshold); - out.writeBoolean(isSetMode); - out.writeVInt(capacity); - out.writeDouble(fpp); - if (isSetMode) { - out.writeCollection(hashes, (out1, hash) -> { - out1.writeZLong(hash.h1); - out1.writeZLong(hash.h2); - }); - } else { - out.writeList(filters); - } - } @Override public int hashCode() { diff --git a/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java b/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java index 16f2b4d5c2f4f..886e1b8dca659 100644 --- a/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java +++ b/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java @@ -26,6 +26,7 @@ import java.io.IOException; import java.util.HashSet; +import java.util.Random; import java.util.Set; import static org.hamcrest.Matchers.equalTo; @@ -116,6 +117,27 @@ public void testConvert() { assertThat(fppRate, lessThanOrEqualTo(0.001)); } + public void testConvertTwice() { + int threshold = randomIntBetween(1000, 10000); + SetBackedScalingCuckooFilter filter = new SetBackedScalingCuckooFilter(threshold, Randomness.get(), 0.01); + + int counter = 0; + Set values = new HashSet<>(); + while (counter < threshold + 100) { + long value = randomLong(); + filter.add(value); + boolean newValue = values.add(value); + if (newValue) { + counter += 1; + } + } + assertNull(filter.hashes); + assertThat(filter.filters.size(), greaterThan(0)); + IllegalStateException e = expectThrows(IllegalStateException.class, filter::convert); + assertThat(e.getMessage(), equalTo("Cannot convert SetBackedScalingCuckooFilter to approximate " + + "when it has already been converted.")); + } + public void testMergeSmall() { int threshold = 1000; @@ -183,6 +205,30 @@ public void testMergeSmall() { } fppRate = (double) incorrect / (values.size() + values2.size()); assertThat(fppRate, lessThanOrEqualTo(0.001)); + } + + public void testMergeIncompatible() { + SetBackedScalingCuckooFilter filter1 = new SetBackedScalingCuckooFilter(100, Randomness.get(), 0.01); + SetBackedScalingCuckooFilter filter2 = new SetBackedScalingCuckooFilter(1000, Randomness.get(), 0.01); + IllegalStateException e = expectThrows(IllegalStateException.class, () -> filter1.merge(filter2)); + assertThat(e.getMessage(), equalTo("Cannot merge other CuckooFilter 
because thresholds do not match: [100] vs [1000]")); + + SetBackedScalingCuckooFilter filter3 = new SetBackedScalingCuckooFilter(100, Randomness.get(), 0.001); + e = expectThrows(IllegalStateException.class, () -> filter1.merge(filter3)); + assertThat(e.getMessage(), equalTo("Cannot merge other CuckooFilter because precisions do not match: [0.01] vs [0.001]")); + } + + public void testBadParameters() { + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> new SetBackedScalingCuckooFilter(-1, Randomness.get(), 0.11)); + assertThat(e.getMessage(), equalTo("[threshold] must be a positive integer")); + + e = expectThrows(IllegalArgumentException.class, + () -> new SetBackedScalingCuckooFilter(1000000, Randomness.get(), 0.11)); + assertThat(e.getMessage(), equalTo("[threshold] must be smaller than [500000]")); + e = expectThrows(IllegalArgumentException.class, + () -> new SetBackedScalingCuckooFilter(100, Randomness.get(), -1.0)); + assertThat(e.getMessage(), equalTo("[fpp] must be a positive double")); } } From 57f4f678844a17324ef3c2c4266e876cefdc0469 Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Tue, 28 May 2019 13:22:39 -0400 Subject: [PATCH 16/25] Performance improvements - Restructures the algorithm so that the sketch is only built at the very end. This means it collects all the counts/ordinals like the terms agg (in deferred mode), then prunes any counts over-threshold and adds them to the sketch. This means there is more runtime overhead, but drastically less computation. It still maintains sub-linear size when serializing to coordinating node. Comparable performance to terms agg. - Removes java streams in hot code, extra allocations really kill performance - Adds a lighter 64bit murmur hash for longs. Still uses 128bit algo for BytesRef - Stores just the needed 64 bits of the hash, instead of the full 128bit Murmur3 hash. 
- Exposes a number of expert APIs so the Scaling filter implementation can calculate hashes/buckets once for all the lookups --- .../common/util/CuckooFilter.java | 97 +++++++++++++------ .../util/SetBackedScalingCuckooFilter.java | 87 +++++++++-------- .../MergingBucketsDeferringCollector.java | 2 +- .../terms/AbstractRareTermsAggregator.java | 83 ++-------------- .../bucket/terms/LongRareTermsAggregator.java | 54 ++++------- .../terms/StringRareTermsAggregator.java | 53 ++++------ .../common/util/CuckooFilterTests.java | 12 ++- .../terms/RareTermsAggregatorTests.java | 2 - 8 files changed, 172 insertions(+), 218 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java b/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java index 3c40043a8adfa..0b37fd92d27f7 100644 --- a/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java +++ b/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java @@ -21,13 +21,11 @@ import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.packed.PackedInts; -import org.elasticsearch.common.hash.MurmurHash3; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Writeable; import java.io.IOException; -import java.util.Arrays; import java.util.Iterator; import java.util.Objects; import java.util.Random; @@ -52,6 +50,10 @@ * (do not need to waste slots on duplicate fingerprints), and we do not need to worry * about inserts "overflowing" a bucket because the same item has been repeated repeatedly * + * NOTE: this CuckooFilter exposes a number of Expert APIs which assume the caller has + * intimate knowledge about how the algorithm works. It is recommended to avoid these + * APIs, or better yet, use {@link SetBackedScalingCuckooFilter} instead. + * * Based on the paper: * * Fan, Bin, et al. "Cuckoo filter: Practically better than bloom." 
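To make the partial-key cuckoo scheme concrete before the diff continues, here is a minimal, self-contained sketch of the bucket/alternate-bucket indexing, assuming a power-of-two bucket count as the patch's `getNumBuckets` produces. This is an illustration, not code from the patch: the class name `CuckooIndexSketch` and the sample values are hypothetical, though the helpers mirror the shape of the patch's `hashToIndex`/`alternateIndex`.

[source,java]
----
/**
 * Sketch only (not part of this patch) of partial-key cuckoo indexing.
 * Assumes numBuckets is a power of two.
 */
public class CuckooIndexSketch {

    // Mask the hash down to a bucket index (valid for power-of-two bucket counts).
    static int hashToIndex(int hash, int numBuckets) {
        return hash & (numBuckets - 1);
    }

    // xor with the mixed fingerprint maps primary -> alternate and back again,
    // so a displaced entry can find its other bucket without re-hashing the key.
    static int alternateIndex(int bucket, int fingerprint, int numBuckets) {
        return hashToIndex(bucket ^ (fingerprint * 0x5bd1e995), numBuckets);
    }

    public static void main(String[] args) {
        int numBuckets = 1 << 16;
        int fingerprint = 0x2a; // stand-in for the fingerprint bits of a hash

        int primary = hashToIndex("some-term".hashCode(), numBuckets);
        int alternate = alternateIndex(primary, fingerprint, numBuckets);

        // Reversibility: the alternate of the alternate is the primary bucket.
        if (alternateIndex(alternate, fingerprint, numBuckets) != primary) {
            throw new AssertionError("xor round-trip failed");
        }
        System.out.println("primary=" + primary + ", alternate=" + alternate);
    }
}
----

That reversibility is what makes eviction possible: an evicted fingerprint can always compute its other candidate bucket from the bucket it currently occupies, with no access to the original hash.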
@@ -87,7 +89,7 @@ public class CuckooFilter implements Writeable { this.bitsPerEntry = bitsPerEntry(fpp, entriesPerBucket); this.numBuckets = getNumBuckets(capacity, loadFactor, entriesPerBucket); - if (numBuckets * entriesPerBucket > Integer.MAX_VALUE) { + if ((long) numBuckets * (long) entriesPerBucket > Integer.MAX_VALUE) { throw new IllegalArgumentException("Attempted to create [" + numBuckets * entriesPerBucket + "] entries which is > Integer.MAX_VALUE"); } @@ -97,6 +99,9 @@ public class CuckooFilter implements Writeable { this.fingerprintMask = (0x80000000 >> (bitsPerEntry - 1)) >>> (Integer.SIZE - bitsPerEntry); } + /** + * This ctor is likely slow and should only be used for testing + */ CuckooFilter(CuckooFilter other) { this.numBuckets = other.numBuckets; this.bitsPerEntry = other.bitsPerEntry; @@ -107,7 +112,7 @@ public class CuckooFilter implements Writeable { this.fingerprintMask = other.fingerprintMask; // This shouldn't happen, but as a sanity check - if (numBuckets * entriesPerBucket > Integer.MAX_VALUE) { + if ((long) numBuckets * (long) entriesPerBucket > Integer.MAX_VALUE) { throw new IllegalArgumentException("Attempted to create [" + numBuckets * entriesPerBucket + "] entries which is > Integer.MAX_VALUE"); } @@ -169,6 +174,24 @@ public int getCount() { return count; } + /** + * Returns the number of buckets that has been chosen based + * on the initial configuration + * + * Expert-level API + */ + int getNumBuckets() { + return numBuckets; + } + + int getBitsPerEntry() { + return bitsPerEntry; + } + + int getFingerprintMask() { + return fingerprintMask; + } + /** * Returns an iterator that returns the long[] representation of each bucket. The value * inside each long will be a fingerprint (or 0L, representing empty). @@ -199,21 +222,21 @@ public long[] next() { * Returns true if the set might contain the provided value, false otherwise. False values are * 100% accurate, while true values may be a false-positive. */ - boolean mightContain(MurmurHash3.Hash128 hash) { - int bucket = hashToIndex((int) hash.h1); - int fingerprint = fingerprint((int) hash.h2); + boolean mightContain(long hash) { + int bucket = hashToIndex((int) hash, numBuckets); + int fingerprint = fingerprint((int) (hash >>> 32), bitsPerEntry, fingerprintMask); + int alternateIndex = alternateIndex(bucket, fingerprint, numBuckets); - return mightContainFingerprint(bucket, fingerprint); + return mightContainFingerprint(bucket, fingerprint, alternateIndex); } /** * Returns true if the bucket or it's alternate bucket contains the fingerprint. * - * Expert-level API, use {@link CuckooFilter#mightContain(MurmurHash3.Hash128)} to check if + * Expert-level API, use {@link CuckooFilter#mightContain(long)} to check if * a value is in the filter. 
*/ - boolean mightContainFingerprint(int bucket, int fingerprint) { - int alternateBucket = alternateIndex(bucket, fingerprint); + boolean mightContainFingerprint(int bucket, int fingerprint, int alternateBucket) { // check all entries for both buckets and the evicted slot return hasFingerprint(bucket, fingerprint) || hasFingerprint(alternateBucket, fingerprint) || evictedFingerprint == fingerprint; @@ -227,18 +250,23 @@ private boolean hasFingerprint(int bucket, long fingerprint) { int offset = getOffset(bucket, 0); data.get(offset, values, 0, entriesPerBucket); - return Arrays.stream(values).anyMatch(value -> value == fingerprint); + for (int i = 0; i < entriesPerBucket; i++) { + if (values[i] == fingerprint) { + return true; + } + } + return false; } /** * Add's the hash to the bucket or alternate bucket. Returns true if the insertion was * successful, false if the filter is saturated. */ - boolean add(MurmurHash3.Hash128 hash) { + boolean add(long hash) { // can only use 64 of 128 bytes unfortunately (32 for each bucket), simplest // to just truncate h1 and h2 appropriately - int bucket = hashToIndex((int) hash.h1); - int fingerprint = fingerprint((int) hash.h2); + int bucket = hashToIndex((int) hash, numBuckets); + int fingerprint = fingerprint((int) (hash >>> 32), bitsPerEntry, fingerprintMask); return mergeFingerprint(bucket, fingerprint); } @@ -246,7 +274,7 @@ boolean add(MurmurHash3.Hash128 hash) { * Attempts to merge the fingerprint into the specified bucket or it's alternate bucket. * Returns true if the insertion was successful, false if the filter is saturated. * - * Expert-level API, use {@link CuckooFilter#add(MurmurHash3.Hash128)} to insert + * Expert-level API, use {@link CuckooFilter#add(long)} to insert * values into the filter */ boolean mergeFingerprint(int bucket, int fingerprint) { @@ -255,7 +283,7 @@ boolean mergeFingerprint(int bucket, int fingerprint) { return false; } - int alternateBucket = alternateIndex(bucket, fingerprint); + int alternateBucket = alternateIndex(bucket, fingerprint, numBuckets); if (tryInsert(bucket, fingerprint) || tryInsert(alternateBucket, fingerprint)) { count += 1; return true; @@ -270,7 +298,7 @@ boolean mergeFingerprint(int bucket, int fingerprint) { // replace details and start again fingerprint = oldFingerprint; bucket = alternateBucket; - alternateBucket = alternateIndex(bucket, fingerprint); + alternateBucket = alternateIndex(bucket, fingerprint, numBuckets); // Only try to insert into alternate bucket if (tryInsert(alternateBucket, fingerprint)) { @@ -317,13 +345,11 @@ private boolean tryInsert(int bucket, int fingerprint) { * * If the hash is negative, this flips the bits. The hash is then modulo numBuckets * to get the final index. + * + * Expert-level API */ - private int hashToIndex(int hash) { - // invert the bits if we're negative - if (hash < 0) { - hash = ~hash; - } - return hash % numBuckets; + static int hashToIndex(int hash, int numBuckets) { + return hash & (numBuckets - 1); } /** @@ -331,14 +357,16 @@ private int hashToIndex(int hash) { * * The alternate bucket is the fingerprint multiplied by a mixing constant, * then xor'd against the bucket. This new value is modulo'd against - * the buckets via {@link CuckooFilter#hashToIndex(int)} to get the final + * the buckets via {@link CuckooFilter#hashToIndex(int, int)} to get the final * index. * * Note that the xor makes this operation reversible as long as we have the * fingerprint and current bucket (regardless of if that bucket was the primary * or alternate). 
+ * + * Expert-level API */ - private int alternateIndex(int bucket, int fingerprint) { + static int alternateIndex(int bucket, int fingerprint, int numBuckets) { /* Reference impl uses murmur2 mixing constant: https://github.com/efficient/cuckoofilter/blob/master/src/cuckoofilter.h#L78 @@ -349,7 +377,7 @@ private int alternateIndex(int bucket, int fingerprint) { return IndexHash((uint32_t)(index ^ (tag * 0x5bd1e995))); */ int index = bucket ^ (fingerprint * 0x5bd1e995); - return hashToIndex(index); + return hashToIndex(index, numBuckets); } /** @@ -365,8 +393,10 @@ private int getOffset(int bucket, int position) { * * The fingerprint is simply the first `bitsPerEntry` number of bits that are non-zero. * If the entire hash is zero, `(int) 1` is used + * + * Expert-level API */ - private int fingerprint(int hash) { + static int fingerprint(int hash, int bitsPerEntry, int fingerprintMask) { if (hash == 0) { // we use 0 as "empty" so if the hash actually hashes to zero... return 1 // Some other impls will re-hash with a salt but this seems simpler @@ -374,7 +404,7 @@ private int fingerprint(int hash) { } for (int i = 0; i + bitsPerEntry <= Long.SIZE; i += bitsPerEntry) { - int v = (hash >> i) & this.fingerprintMask; + int v = (hash >> i) & fingerprintMask; if (v != 0) { return v; } @@ -477,4 +507,13 @@ public boolean equals(Object other) { && Objects.equals(this.count, that.count) && Objects.equals(this.evictedFingerprint, that.evictedFingerprint); } + + static long murmur64(long h) { + h ^= h >>> 33; + h *= 0xff51afd7ed558ccdL; + h ^= h >>> 33; + h *= 0xc4ceb9fe1a85ec53L; + h ^= h >>> 33; + return h; + } } diff --git a/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java b/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java index 4a4d26de20c25..bdddead67cf87 100644 --- a/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java +++ b/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java @@ -64,7 +64,7 @@ public class SetBackedScalingCuckooFilter implements Writeable { * * package-private for testing */ - Set hashes; + Set hashes; /** * This list holds our approximate filters, after we have migrated out of a set. 
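The two-phase design those comments describe (an exact set first, then a growable list of approximate filters) can be sketched independently of these classes. Below is a simplified, hypothetical illustration: `ApproxFilter` and `ScalingApproxSetSketch` are stand-in names, not the patch's API, and serialization, hashing, and circuit-breaker accounting are deliberately omitted.

[source,java]
----
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.function.Supplier;

/**
 * Minimal sketch of the set-then-filters design: exact below a threshold,
 * approximate (with append-on-saturation growth) above it.
 */
public class ScalingApproxSetSketch {
    interface ApproxFilter {
        boolean add(long hash);          // false when the filter is saturated
        boolean mightContain(long hash);
    }

    private final int threshold;
    private final Supplier<ApproxFilter> factory;
    private Set<Long> hashes = new HashSet<>(); // exact phase, 100% accurate
    private List<ApproxFilter> filters;          // approximate phase, null until converted

    ScalingApproxSetSketch(int threshold, Supplier<ApproxFilter> factory) {
        this.threshold = threshold;
        this.factory = factory;
    }

    void add(long hash) {
        if (filters == null) {
            hashes.add(hash);
            if (hashes.size() > threshold) { // convert once, replaying the exact hashes
                filters = new ArrayList<>();
                filters.add(factory.get());
                for (long h : hashes) {
                    addToFilters(h);
                }
                hashes = null;
            }
            return;
        }
        addToFilters(hash);
    }

    private void addToFilters(long hash) {
        if (filters.get(filters.size() - 1).add(hash) == false) {
            // last filter saturated: grow by appending a fresh one
            ApproxFilter next = factory.get();
            next.add(hash);
            filters.add(next);
        }
    }

    boolean mightContain(long hash) {
        if (filters == null) {
            return hashes.contains(hash); // exact while in set mode
        }
        for (ApproxFilter f : filters) {  // any filter in the list may hold it
            if (f.mightContain(hash)) {
                return true;
            }
        }
        return false;
    }
}
----

Holding an exact set below the threshold is what gives small cardinalities 100% accuracy; the approximate filters, and their false positives, only come into play once the tracked values outgrow the set.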
@@ -80,6 +80,12 @@ public class SetBackedScalingCuckooFilter implements Writeable { //noop }; + // cached here for performance reasons + private int numBuckets = 0; + private int bitsPerEntry = 0; + private int fingerprintMask = 0; + private MurmurHash3.Hash128 scratchHash = new MurmurHash3.Hash128(); + // True if we are tracking inserts with a set, false otherwise private boolean isSetMode = true; @@ -120,6 +126,9 @@ public SetBackedScalingCuckooFilter(SetBackedScalingCuckooFilter other) { this.hashes = new HashSet<>(other.hashes); } else { this.filters = new ArrayList<>(other.filters); + this.numBuckets = filters.get(0).getNumBuckets(); + this.fingerprintMask = filters.get(0).getFingerprintMask(); + this.bitsPerEntry = filters.get(0).getBitsPerEntry(); } } @@ -131,14 +140,12 @@ public SetBackedScalingCuckooFilter(StreamInput in, Random rng) throws IOExcepti this.fpp = in.readDouble(); if (isSetMode) { - this.hashes = in.readSet(in1 -> { - MurmurHash3.Hash128 hash = new MurmurHash3.Hash128(); - hash.h1 = in1.readZLong(); - hash.h2 = in1.readZLong(); - return hash; - }); + this.hashes = in.readSet(StreamInput::readZLong); } else { this.filters = in.readList(in12 -> new CuckooFilter(in12, rng)); + this.numBuckets = filters.get(0).getNumBuckets(); + this.fingerprintMask = filters.get(0).getFingerprintMask(); + this.bitsPerEntry = filters.get(0).getBitsPerEntry(); } } @@ -149,16 +156,12 @@ public void writeTo(StreamOutput out) throws IOException { out.writeVInt(capacity); out.writeDouble(fpp); if (isSetMode) { - out.writeCollection(hashes, (out1, hash) -> { - out1.writeZLong(hash.h1); - out1.writeZLong(hash.h2); - }); + out.writeCollection(hashes, StreamOutput::writeZLong); } else { out.writeList(filters); } } - /** * Registers a circuit breaker with the datastructure. * @@ -176,34 +179,39 @@ public void registerBreaker(Consumer breaker) { * 100% accurate, while true values may be a false-positive. */ public boolean mightContain(BytesRef value) { - return mightContain(value.bytes, value.offset, value.length); + MurmurHash3.Hash128 hash = MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, scratchHash); + return mightContainHash(hash.h1); } /** * Returns true if the set might contain the provided value, false otherwise. False values are * 100% accurate, while true values may be a false-positive. */ - public boolean mightContain(byte[] value) { - return mightContain(value, 0, value.length); + public boolean mightContain(long value) { + long hash = CuckooFilter.murmur64(value); + return mightContainHash(hash); } /** * Returns true if the set might contain the provided value, false otherwise. False values are * 100% accurate, while true values may be a false-positive. 
*/ - public boolean mightContain(long value) { - return mightContain(Numbers.longToBytes(value)); - } - - private boolean mightContain(byte[] bytes, int offset, int length) { - return mightContain(MurmurHash3.hash128(bytes, offset, length, 0, new MurmurHash3.Hash128())); - } - - private boolean mightContain(MurmurHash3.Hash128 hash) { + private boolean mightContainHash(long hash) { if (isSetMode) { return hashes.contains(hash); } - return filters.stream().anyMatch(filter -> filter.mightContain(hash)); + + // We calculate these once up front for all the filters and use the expert API + int bucket = CuckooFilter.hashToIndex((int) hash, numBuckets); + int fingerprint = CuckooFilter.fingerprint((int) (hash >> 32), bitsPerEntry, fingerprintMask); + int alternateIndex = CuckooFilter.alternateIndex(bucket, fingerprint, numBuckets); + + for (CuckooFilter filter : filters) { + if (filter.mightContainFingerprint(bucket, fingerprint, alternateIndex)) { + return true; + } + } + return false; } /** @@ -212,33 +220,31 @@ private boolean mightContain(MurmurHash3.Hash128 hash) { * being hashed. */ private boolean mightContainFingerprint(int bucket, int fingerprint) { - return filters.stream().anyMatch(filter -> filter.mightContainFingerprint(bucket, fingerprint)); + int alternateIndex = CuckooFilter.alternateIndex(bucket, fingerprint, numBuckets); + for (CuckooFilter filter : filters) { + if (filter.mightContainFingerprint(bucket, fingerprint, alternateIndex)) { + return true; + } + } + return false; } /** * Add's the provided value to the set for tracking */ public void add(BytesRef value) { - MurmurHash3.Hash128 hash = MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, new MurmurHash3.Hash128()); - add(hash); - } - - /** - * Add's the provided value to the set for tracking - */ - public void add(byte[] value) { - MurmurHash3.Hash128 hash = MurmurHash3.hash128(value, 0, value.length, 0, new MurmurHash3.Hash128()); - add(hash); + MurmurHash3.Hash128 hash = MurmurHash3.hash128(value.bytes, value.offset, value.length, 0, scratchHash); + addHash(hash.h1); } /** * Add's the provided value to the set for tracking */ public void add(long value) { - add(Numbers.longToBytes(value)); + addHash(CuckooFilter.murmur64(value)); } - private void add(MurmurHash3.Hash128 hash) { + private void addHash(long hash) { if (isSetMode) { hashes.add(hash); maybeConvert(); @@ -274,6 +280,11 @@ void convert() { filters = new ArrayList<>(); CuckooFilter t = new CuckooFilter(capacity, fpp, rng); + // Cache the chosen numBuckets for later use + numBuckets = t.getNumBuckets(); + fingerprintMask = t.getFingerprintMask(); + bitsPerEntry = t.getBitsPerEntry(); + hashes.forEach(t::add); filters.add(t); diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java index 91ba01e311e22..5d63f5eb873b6 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java @@ -156,7 +156,7 @@ public void mergeBuckets(long[] mergeMap) { // if there are buckets that have been collected in the current segment // we need to update the bucket ordinals there too - if (buckets.size() > 0) { + if (buckets != null && buckets.size() > 0) { PackedLongValues currentBuckets = buckets.build(); PackedLongValues.Builder newBuckets = 
PackedLongValues.packedBuilder(PackedInts.DEFAULT); PackedLongValues.Builder newDocDeltas = PackedLongValues.packedBuilder(PackedInts.DEFAULT); diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java index 109525219e3ca..f6fe23a8a7e35 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java @@ -38,14 +38,8 @@ import java.util.Map; import java.util.Random; -public abstract class AbstractRareTermsAggregator - extends DeferableBucketAggregator { +public abstract class AbstractRareTermsAggregator extends DeferableBucketAggregator { - /** - Sets the number of "removed" values to accumulate before we purge ords - via the MergingBucketCollector's mergeBuckets() method - */ - private static final long GC_THRESHOLD = 1000000; static final BucketOrder ORDER = BucketOrder.compound(BucketOrder.count(true), BucketOrder.key(true)); // sort by count ascending protected final long maxDocCount; @@ -54,9 +48,6 @@ public abstract class AbstractRareTermsAggregator= 0; - collectBucket(subCollectors, docId, ord); - - } else { - // we've seen this value before, see if it is below threshold - long termCount = bucketDocCount(bucketOrdinal); - if (termCount < maxDocCount) { - // TODO if we only need maxDocCount==1, we could specialize - // and use a bitset instead of a counter scheme - - collectExistingBucket(subCollectors, docId, bucketOrdinal); - - } else { - // Otherwise we've breached the threshold, add to the cuckoo filter - addValueToFilter(val); - numDeleted += 1; - - // This is a bit hacky, but we need to collect the value once more to - // make sure the doc_count is over threshold (used later when gc'ing) - collectExistingBucket(subCollectors, docId, bucketOrdinal); - - if (numDeleted > GC_THRESHOLD) { - gcDeletedEntries(numDeleted); - numDeleted = 0; - } - } - } + long bucketOrdinal = addValueToOrds(val); + + if (bucketOrdinal < 0) { // already seen + bucketOrdinal = -1 - bucketOrdinal; + collectExistingBucket(subCollectors, docId, bucketOrdinal); + } else { + collectBucket(subCollectors, docId, bucketOrdinal); } } - /** - * Remove entries from the ordinal map which are no longer tracked in the active key's map. - * Will internally call the merge function of {@link MergingBucketsDeferringCollector}, so this - * should be called sparingly for performance reasons - * - * @param numDeleted the number of keys that are expected to be pruned during GC. - * Used to help verify correct functioning of GC - */ - abstract void gcDeletedEntries(long numDeleted); - - /** - * Returns true if the aggregator's approximate filter contains the value, false otherwise - */ - abstract boolean filterMightContain(V value); - - /** - * Returns the bucket ordinal associated with the value, -1 if the value was not found - */ - abstract long findOrdinal(V value); - /** * Add's the value to the ordinal map. Return the newly allocated id if it wasn't in the ordinal map yet, * or -1-id if it was already present */ abstract long addValueToOrds(V value); - - /** - * Adds the value to the aggregator's approximate filter. 
- */ - abstract void addValueToFilter(V value); } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java index da2260c03408b..b1d294fefdcf6 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongRareTermsAggregator.java @@ -88,29 +88,23 @@ public void collect(int docId, long owningBucketOrdinal) throws IOException { }; } - @Override - boolean filterMightContain(Long value) { - return filter.mightContain(value); - } - - @Override - long findOrdinal(Long value) { - return bucketOrds.find(value); - } - @Override long addValueToOrds(Long value) { return bucketOrds.add(value); } - @Override - void addValueToFilter(Long value) { - filter.add(value); - } - - protected void gcDeletedEntries(long numDeleted) { + /** + * Merges the ordinals to a minimal set, populates the CuckooFilter and + * generates a final set of buckets. + * + * If a term is below the maxDocCount, it is turned into a Bucket. Otherwise, + * the term is added to the filter, and pruned from the ordinal map. If + * necessary the ordinal map is merged down to a minimal set to remove deletions + */ + private List buildSketch() { long deletionCount = 0; LongHash newBucketOrds = new LongHash(1, context.bigArrays()); + List buckets = new ArrayList<>(); try (LongHash oldBucketOrds = bucketOrds) { long[] mergeMap = new long[(int) oldBucketOrds.size()]; @@ -122,19 +116,19 @@ protected void gcDeletedEntries(long numDeleted) { // if the key is below threshold, reinsert into the new ords if (docCount <= maxDocCount) { newBucketOrd = newBucketOrds.add(oldKey); + LongRareTerms.Bucket bucket = new LongRareTerms.Bucket(oldKey, docCount, null, format); + bucket.bucketOrd = newBucketOrd; + buckets.add(bucket); + + consumeBucketsAndMaybeBreak(1); } else { // Make a note when one of the ords has been deleted deletionCount += 1; + filter.add(oldKey); } - mergeMap[i] = newBucketOrd; } - if (numDeleted != -1 && deletionCount != numDeleted) { - throw new IllegalStateException("Expected to prune [" + numDeleted + "] terms, but [" + numDeleted - + "] were removed instead"); - } - // Only merge/delete the ordinals if we have actually deleted one, // to save on some redundant work if (deletionCount > 0) { @@ -145,25 +139,13 @@ protected void gcDeletedEntries(long numDeleted) { } } bucketOrds = newBucketOrds; + return buckets; } @Override public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOException { assert owningBucketOrdinal == 0; - List buckets = new ArrayList<>(); - - for (long i = 0; i < bucketOrds.size(); i++) { - // The agg managed pruning unwanted terms at runtime, so any - // terms that made it this far are "rare" and we want buckets - LongRareTerms.Bucket bucket = new LongRareTerms.Bucket(0, 0, null, format); - bucket.term = bucketOrds.get(i); - bucket.docCount = bucketDocCount(i); - bucket.bucketOrd = i; - buckets.add(bucket); - - consumeBucketsAndMaybeBreak(1); - } - + List buckets = buildSketch(); runDeferredCollections(buckets.stream().mapToLong(b -> b.bucketOrd).toArray()); // Finalize the buckets diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java index 
b7691bc4630ab..0c200e96b242c 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java @@ -92,29 +92,23 @@ public void collect(int docId, long bucket) throws IOException { }; } - @Override - boolean filterMightContain(BytesRef value) { - return filter.mightContain(value); - } - - @Override - long findOrdinal(BytesRef value) { - return bucketOrds.find(value); - } - @Override long addValueToOrds(BytesRef value) { return bucketOrds.add(value); } - @Override - void addValueToFilter(BytesRef value) { - filter.add(value); - } - - protected void gcDeletedEntries(long numDeleted) { + /** + * Merges the ordinals to a minimal set, populates the CuckooFilter and + * generates a final set of buckets. + * + * If a term is below the maxDocCount, it is turned into a Bucket. Otherwise, + * the term is added to the filter, and pruned from the ordinal map. If + * necessary the ordinal map is merged down to a minimal set to remove deletions + */ + private List buildSketch() { long deletionCount = 0; BytesRefHash newBucketOrds = new BytesRefHash(1, context.bigArrays()); + List buckets = new ArrayList<>(); try (BytesRefHash oldBucketOrds = bucketOrds) { long[] mergeMap = new long[(int) oldBucketOrds.size()]; @@ -126,16 +120,18 @@ protected void gcDeletedEntries(long numDeleted) { // if the key is below threshold, reinsert into the new ords if (docCount <= maxDocCount) { newBucketOrd = newBucketOrds.add(oldKey); + StringRareTerms.Bucket bucket = new StringRareTerms.Bucket(BytesRef.deepCopyOf(oldKey), docCount, null, format); + bucket.bucketOrd = newBucketOrd; + buckets.add(bucket); + + consumeBucketsAndMaybeBreak(1); } else { // Make a note when one of the ords has been deleted deletionCount += 1; + filter.add(oldKey); } mergeMap[i] = newBucketOrd; } - if (numDeleted != -1 && deletionCount != numDeleted) { - throw new IllegalStateException("Expected to prune [" + numDeleted + "] terms, but [" + numDeleted - + "] were removed instead"); - } // Only merge/delete the ordinals if we have actually deleted one, // to save on some redundant work @@ -147,27 +143,14 @@ protected void gcDeletedEntries(long numDeleted) { } } bucketOrds = newBucketOrds; + return buckets; } @Override public InternalAggregation buildAggregation(long owningBucketOrdinal) throws IOException { assert owningBucketOrdinal == 0; - List buckets = new ArrayList<>(); - - for (long i = 0; i < bucketOrds.size(); i++) { - // The agg managed pruning unwanted terms at runtime, so any - // terms that made it this far are "rare" and we want buckets - StringRareTerms.Bucket bucket = new StringRareTerms.Bucket(new BytesRef(), 0, null, format); - bucketOrds.get(i, bucket.termBytes ); - bucket.termBytes = BytesRef.deepCopyOf(bucket.termBytes); - bucket.docCount = bucketDocCount(i); - bucket.bucketOrd = i; - buckets.add(bucket); - - consumeBucketsAndMaybeBreak(1); - } - + List buckets = buildSketch(); runDeferredCollections(buckets.stream().mapToLong(b -> b.bucketOrd).toArray()); // Finalize the buckets diff --git a/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java b/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java index 1c0b6696b5c33..b467a5fee5603 100644 --- a/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java +++ b/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java @@ -94,6 +94,10 @@ public void 
diff --git a/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java b/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java
index 1c0b6696b5c33..b467a5fee5603 100644
--- a/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java
+++ b/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java
@@ -94,6 +94,10 @@ public void testSaturate() {
         }
     }
 
+    public void testHash() {
+        CuckooFilter.hashToIndex(-10, 32);
+    }
+
     public void testBig() {
         CuckooFilter filter = new CuckooFilter(1000000, 0.001, Randomness.get());
 
@@ -126,7 +130,11 @@ public void testBig() {
         assertThat(fppRate, lessThanOrEqualTo(0.001));
     }
 
-    private MurmurHash3.Hash128 hash(long i) {
-        return MurmurHash3.hash128(Numbers.longToBytes(i), 0, 8, 0, new MurmurHash3.Hash128());
+    //private MurmurHash3.Hash128 hash(long i) {
+    //    return MurmurHash3.hash128(Numbers.longToBytes(i), 0, 8, 0, new MurmurHash3.Hash128());
+    //}
+
+    private long hash(long i) {
+        return CuckooFilter.murmur64(i);
     }
 }
diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java
index 8f42b48b1d33e..a0d48b7ab778f 100644
--- a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java
+++ b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregatorTests.java
@@ -597,6 +597,4 @@ public void doAssertReducedMultiBucketConsumer(Aggregation agg, MultiBucketConsu
          * buckets we should have left after each reduction.
          */
     }
-
-
 }

From a75076ff07aed188620ba67838edfa55f6fb75a4 Mon Sep 17 00:00:00 2001
From: Zachary Tong
Date: Tue, 18 Jun 2019 10:38:30 -0400
Subject: [PATCH 17/25] Update defaults, add charts to docs

---
 .../bucket/rare-terms-aggregation.asciidoc  |  44 ++++++++++++++--
 .../images/rare_terms/accuracy_0001.png     | Bin 0 -> 25315 bytes
 .../images/rare_terms/accuracy_001.png      | Bin 0 -> 27086 bytes
 .../images/rare_terms/accuracy_01.png       | Bin 0 -> 25075 bytes
 docs/reference/images/rare_terms/memory.png | Bin 0 -> 20465 bytes
 .../terms/RareTermsAggregationBuilder.java  |   4 +-
 6 files changed, 41 insertions(+), 7 deletions(-)
 create mode 100644 docs/reference/images/rare_terms/accuracy_0001.png
 create mode 100644 docs/reference/images/rare_terms/accuracy_001.png
 create mode 100644 docs/reference/images/rare_terms/accuracy_01.png
 create mode 100644 docs/reference/images/rare_terms/memory.png

diff --git a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc
index 7680f7b856bb1..88378f64eb572 100644
--- a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc
+++ b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc
@@ -174,7 +174,7 @@ which match the `max_doc_count` criteria will be returned. The aggregation func
 the order-by-ascending issues that afflict the `terms` aggregation. This does, however, mean that a large
 number of results can be returned if chosen incorrectly.
-To limit the danger of this setting, the maximum `max_doc_count` is 10.
+To limit the danger of this setting, the maximum `max_doc_count` is 100.
 
 [[search-aggregations-bucket-rare-terms-aggregation-approximate-counts]]
 ==== Document counts are approximate
@@ -200,8 +200,10 @@ a different shard's CuckooFilter) the term is removed from the merged list. The
 to the user as the "rare" terms.
 
 CuckooFilters have the possibility of returning false positives (they can say a value exists in their collection when
-it does not actually). Since the CuckooFilter is being used to see if a term is over threshold, this means a false positive
+it actually does not). Since the CuckooFilter is being used to see if a term is over threshold, this means a false positive
 from the CuckooFilter will mistakenly say a value is common when it is not (and thus exclude it from its final list of buckets).
+Practically, this means the aggregation exhibits false-negative behavior since the filter is being used "in reverse"
+of how people generally think of approximate set membership sketches.
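The "in reverse" behavior is easy to demonstrate with a toy stand-in. In the hypothetical sketch below, an exact `Set` plays the role of a CuckooFilter that produces a false positive for `swing`: the only user-visible consequence is a missing bucket, never a spurious one.

[source,java]
--------------------------------------------------
import java.util.HashSet;
import java.util.Set;

/**
 * A false positive from the membership sketch ("seen this term before")
 * silently drops a genuinely rare term, so the user-visible error is a
 * missing bucket (a false negative), never an extra bucket.
 */
public class FilterInversionDemo {
    public static void main(String[] args) {
        // Exact set standing in for a CuckooFilter of "common" terms;
        // pretend it returns a false positive for "swing".
        Set<String> commonFilter = Set.of("rock", "jazz", "swing");
        Set<String> rareBuckets = new HashSet<>();

        for (String term : new String[] {"rock", "swing", "polka"}) {
            if (commonFilter.contains(term)) {
                continue; // filter says "common": the term is never tracked again
            }
            rareBuckets.add(term);
        }
        // "swing" was actually rare, but the false positive hid it:
        System.out.println(rareBuckets); // [polka]
    }
}
--------------------------------------------------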
 
 CuckooFilters are described in more detail in the paper:
@@ -210,12 +212,44 @@ Proceedings of the 10th ACM International on Conference on emerging Networking E
 
 ==== Precision
 
-Although the internal CuckooFilter is approximate in nature, the false-positive rate can be controlled with a
+Although the internal CuckooFilter is approximate in nature, the false-negative rate can be controlled with a
 `precision` parameter. This allows the user to trade more runtime memory for more accurate results.
 
-The default precision is `0.01`, and the smallest (e.g. most accurate and largest memory overhead) is `0.00001`.
+The default precision is `0.001`, and the smallest (e.g. most accurate and largest memory overhead) is `0.00001`.
+Below are some charts which demonstrate how the accuracy of the aggregation is affected by precision and number
+of distinct terms.
 
-TODO charts here
+The X-axis shows the number of distinct values the aggregation has seen, and the Y-axis shows the percent error.
+Each line series represents one "rarity" condition (ranging from one rare item to 100,000 rare items). For example,
+the orange "10" line means ten of the values were "rare" (`doc_count == 1`), out of 1-20m distinct values (where the
+rest of the values had `doc_count > 1`).
+
+This first chart shows precision `0.01`:
+
+image:images/rare_terms/accuracy_01.png[]
+
+And precision `0.001` (the default):
+
+image:images/rare_terms/accuracy_001.png[]
+
+And finally precision `0.0001`:
+
+image:images/rare_terms/accuracy_0001.png[]
+
+The default precision of `0.001` maintains an accuracy of < 2.5% for the tested conditions, and accuracy slowly
+degrades in a controlled, linear fashion as the number of distinct values increases.
+
+The default precision of `0.001` has a memory profile of `1.748 * n` bytes, where `n` is the number
+of distinct values the aggregation has seen (it can also be roughly eyeballed, e.g. 20 million unique values is about
+30mb of memory). The memory usage is linear to the number of distinct values regardless of which precision is chosen,
+the precision only affects the slope of the memory profile as seen in this chart:
+
+image:images/rare_terms/memory.png[]
+
+For comparison, an equivalent terms aggregation at 20 million buckets would be roughly
+`20m * 69b == ~1.38gb` (with 69 bytes being a very optimistic estimate of an empty bucket cost, far lower than what
+the circuit breaker accounts for). So although the `rare_terms` agg is relatively heavy, it is still orders of
+magnitude smaller than the equivalent terms aggregation.
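As a back-of-envelope check of those figures (treating the ~1.75 bytes-per-distinct-value slope implied by the "20 million unique values is about 30mb" eyeball number as an assumption):

[source,java]
--------------------------------------------------
/**
 * Sanity check of the memory comparison quoted above; the per-value
 * constants are assumptions taken from the surrounding text.
 */
public class MemoryEstimate {
    public static void main(String[] args) {
        long n = 20_000_000L;              // distinct values seen by the agg
        double rareTermsBytes = 1.748 * n; // default precision (0.001) slope
        double termsAggBytes = 69.0 * n;   // optimistic empty-bucket cost of a terms agg

        System.out.printf("rare_terms: ~%.0f MB%n", rareTermsBytes / 1e6); // ~35 MB
        System.out.printf("terms:      ~%.2f GB%n", termsAggBytes / 1e9);  // ~1.38 GB
    }
}
--------------------------------------------------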
 
 ==== Filtering Values
diff --git a/docs/reference/images/rare_terms/accuracy_0001.png b/docs/reference/images/rare_terms/accuracy_0001.png
new file mode 100644
index 0000000000000000000000000000000000000000..0c13a3938cde2e2ca119f098aabb1ee8c4d1ce8c
Binary files /dev/null and b/docs/reference/images/rare_terms/accuracy_0001.png differ
diff --git a/docs/reference/images/rare_terms/accuracy_001.png b/docs/reference/images/rare_terms/accuracy_001.png
new file mode 100644
Binary files /dev/null and b/docs/reference/images/rare_terms/accuracy_001.png differ
diff --git a/docs/reference/images/rare_terms/accuracy_01.png b/docs/reference/images/rare_terms/accuracy_01.png
new file mode 100644
index 0000000000000000000000000000000000000000..7182b7d3c537e2b376a27d9c68a6117dd2d3ed3d
Binary files /dev/null and b/docs/reference/images/rare_terms/accuracy_01.png differ
diff --git a/docs/reference/images/rare_terms/memory.png b/docs/reference/images/rare_terms/memory.png
new file mode 100644
index 0000000000000000000000000000000000000000..e0de5c2163913181dba940847b04564fea2f9b8e
Binary files /dev/null and b/docs/reference/images/rare_terms/memory.png differ
zIV|7YI&0Zj2te$!eQuxd)ht?k5qj?*W!+VFrEER?AEI422-A^kD(@g2bS*t$7mU#) zQ*BHJsfKlEeD?$O>^QMx(7A1i@!+|e0J#c>hgS>a%CkcwRE`*Y)m!=lLA1Zrys%(9 z-rv|RCuFC)BDckZaepF(Nw@mcTVzXwa_m$RBERJ-P3nJO|KSm}1F>!NYuHFJTAr~} z?djhaS=o>C%x)>tB$|g%Ak5++vgd?mCAO7bjSQ^DaP%n)=LFOc+_Ks7KS|$x(tVc} zi|Yl_pI!|L)Pw1me+GMJk}{J)r)T4v^EsUH8S{eI&^kGBhYAUG5#W1q>2)=YxgMTM zuPIKJhzd9^wrh63Zhv*6DVqwRl}q_LWBiF0^So_U0Rfy!v@dRo@y6AW=D zX(cfO|HP#i1C}+BKMU_gzL#yJg$^{f5sdgwKJiw&u^le_;pp#mjVu?+i=e3@7~0cb zOvstXXoUYIhu1&I@}c~*#)c|Ai8}bAlu=Ls{&W%AI|Ng&aC=iXhdmTwcK=6t$2?QG zy(DOQ%8QBi7#g-53JH6Vu3MC$7p^mC(J&r-2k-{U*N-fvAkH37YXmtXnupspBzOsO z8Xx95y_Rn3mAv00b$u(uKD148-rgh`0c^4R`Tk*#&5oe2clj!1JCtHO(6Ea=*Y1;h zn!JC_q7oqy3R?+UF>E?AB7*Lq7lJ7=KZZez0a^;7|G-Xq`EfhH=Jr}@0mR8}3lS+1 z4QZk_80PED31JGgTazH6mK?hDG<+Z}-4m_np!wcO+Jk-@C3JPS892pI;dM75T z&ka;b+^XHh1zK<`(X#Q4wOL(o(Y=u`Wl?rzM};XeAgEWSGIM($QPaU>KR9h$5`Bq11^hU6gA=y; zt9r#7%JS^b0D}@sP%!h5>oImugOKyHxJ!QKJ z)MLN{#{DDZRkuF}-1jjU5{$GshFP9y`KaG7J)KoBlN_dTzw4p<`eYg*SQ-lG%oi?4 zGzLq>b)xbiqnF$SGe@#O2UdyAmlJuBgw=#0hE!;7-e(K1SH8YS>txl^%h(5v=Xbmg z18KLU8NQ(~J12Z?I=3e71(Gu5gUYM)TG2kB1R3-~a+~F`?a0uNl7EwPy#q3FVF?ui zfIyV*!i(KVNBg^5mxM)wthGHaS4!il1HeI zv3@VSX%E>)^nJEbNn^Rw9eUvjgjYQWv9H91 z4Tvd2Q)s*?ZM2uov!Acomk{pwkoyMvCxSDqtv;oLki0VF$6J2J_gmyiP4aYx!a6|8 zq1rHLf4@{!Uu?^(7mG2C&y4+8kBW@9-~x7wig<3(I*qEkMmB4b_vh-A1=3aNvdP9x}BCHkjwe%yi6`%*tJmqqzZ`e@o zczN=~i9!&Ndf<8;=Bh|>MpP-*p6icM5~3r~G~e7m_HON-do?Jlu;eD}z87JZjTEbp z&)v2cxKbKBdVL!A5k2O46O zW!$~b2Nk51KDvOzrHPxt4bzir?V17t2PgL{t%(X# zOhLVPH@LM@;;^cINGXCOhMff|tO(2y0!g{qMOQi3AcMT(y0u{Cs`b$?x_n%lQzMI+ z5~*i;oAKm8_|(;(08??a^=dd&`0Dp{!sgwe=KtZ-PI)FaLg%6Xl^{&|uS%siJnj>3 zOXrU+jyIJ+>sMY3V5L{WuKVZ3SE;LLfMeY9_a{^@#?Ez1&sC0W8C`M#-_e63@^l(8 zl8(W`N8&WfcZdHnoQl@XgY;qnCWL;L*9)|*bhdwI9!{T$j=w+G7} zSdxD}X3DZ|OXct&Jfx!Og?}&Qyg~2jF1VpkUr^2F_K(FJTSEJ>e0E zev%RcDarO^)k1?XYTaqAs|J&dofyo9Iei-mIeP6c=xBUL<(u2_E3a=JQ4Y>(Ee1U1 z)D@|Jx^&+WCG%;K@CG$pt4f&nukTT2&=6wT=HfhN&-dk91Xj3;03JB~gb%UK2cE=* zp&QNO7BY~c^!6@y;>3Wec;JLO(V;bZT!=7 zAH#}-Jlz-9f4^}VaO-imMlkG0cXQ|+W;v8nnc?>QC`Aj~Xz#bzX8jrZnOQLWdrZ^C zyAAg17iZV%HpASCttSr^D;RDsff#>;OC4taoPX>eO0I~)v(7Up%$${D^V53qY|4%+ zJZl$TSL6KWJKtYxyBoEUel^8A0s#Ap4xCy2xg143#b0=nMY)`S*G{srM~@L;3s5=1 z-hv{%Ys27{5ekEwh7VB5^-=5k{D6Y*M{kKEuYs?Tn0_vpz&d&*1g;xVMW}q34AYd$ zgAoIG;%m~br$Fv&1z$PP`QATcI{k2o&`f&%02EeKQl9-?_F%mHXR`aA2>Td%N+YUe z004c~X|YNi;BGwgaMA<0uTbT_++P&40~H681>OFFiludI!B3fZXDRg*`=+Avqlfv+ z<=Rm}Qlgwaydlo=GM{aNlQ!YeBT8F{JG5xyewqUv0n2B!G)~r|7X=L90CYf(6|>ha zjD7R6H_E&!A}PgI_`VYZ;8<-38seVmAU3E0u;fSa&fjr+Z>{qn=6eAnYKZ`>KonG! 
zsM;a?`fumbacRZur9m*gr`C5N91$%`i4A`M59dr@^HOA0r=AkwW8L2+S$3+e^bHwU zPN5`#?btxBS z(2S7~L05`70?~|Uhjq{~4tg#IdHw!;U{=|HDYy#9jwDQ>4I)y82d^8X=$p_=XLS~* z%V?HE8p$@U=!Cp?$Yb<{@(h@w@=!-e&_xs_zxc4ukL)W3L!JTCEoj0V_$>ZnD8oy# zXWZ`m8)Wr7+jr#5f(cv|c*-mzjh&;=wq3OJCnf2r5p*i(wJWIS>1$n&3 zQAmKdlIz^vgzOt-t-z4~@|JR$o$=^{sVJ`<2eSP+YOUWkB-_RXcEwzXB8m zs0W<;>@A9WN*qm($3V!x%*ygf%-}{=BVBLv$K2L9{5RWqG2|a?rzlH;Q11#5;MwzT z>!dkH;G|0Ayw=T^vMt0g=_(4x+f0Q3B98r1eZ^mYJ{$?%^Frhov$^FV_9RV23lOk= zC$K@KWG9c$0c?WyeaWg}bn(hVf?ROTz+&D4e%5P`Zxq;uS|Ov$cXK&bzW92)d$CF0 z5k*WM>I{2K05Um~i6s?vI#Gdpkg@xb4UU;rotUl&;Mle%>X;C7ci4b9OV%@)HX8R=D3)WvXq+mH%^INqmC&@x4c|d(o zL*};vdDf-IXzgTL%gbYz`wKsK%$C)OmQNgZF%^0sAk2CCTc;H#5l_yoBb@W%Y^!pWml8Q|_dXoPNv%QED(OT( z0*mY$L$~>*FI^aG)+xvxF7mwIt8eLb&ffWcS9dI&){lYfRZAC00$s#f`jbx6@AUX; z!x|2%1>J~H-adyiVfjOOG$T}c_z+dnm%$T@CI&b3e~%#dFSAqX<#5v^df!$)lm^m3 z#{jx6o@8AeYU_9`{(WWIbi0?$R_k?xIj&VqHt3w}QuC#IL#jeCuukibK$D7?t6rrF zPXo2V`RlRB?#~BI-?VhKh$o1BN}F>38|*mxE($|E59JVVe8cR-f)o1YifgS87QK;) zI|C-l)>Df><9DY-noM=_TLh?u^PaoQH#x%gBNydORt~!UPECkGR$sOtDRh58HpJS7 zqLx4C=#r}d>#ht3p6q?Hhs6#Wu_#YB(?@Za_8|J=6CYL|$^0qam{S0urFBH+P`o+B zw7_GbSsI9aN`K)*8@h({_ct=& z@Y5~Fo;$52LaU2r;fxJ53p?c*it`|$BN;|x!Sp?pc*@i{#g8CM+W(bzU7`*d|H6Y# zI-B-jys^4Oc6}l#bLq9orH-1etm|f|85&lp-t;C+WCvdRqfHm%s9_bFV3Mh?AChpE z_>!m?m<6X4?N$L=flzb{x-8@482v(GdRm-eO5GACID%a|-jkOmb%`XUa#$9$WjEeb z*G=d;O|&Wzdy4`qG)oM!uDxh22E8%8)huy7TojlO{L|(fA4(uW{xA>hJo*>rans84 z(<{!NGMjwWfusa@#(S@)dz=NHz3H08UI1BnTYKuIpYWUOg0!1mdd}NE8QtUI= zuc0*#`otyx77jq9l^U_6)b0liSb+8u&L5}`nD3AgFTw+18E}`K5l)1qha%Kc@}D3` z(B7z{u2lrO5J+6uE^kz%Ks~cLPH0o?03u(0{KhE1bMplnizE&Nx1d)72?$}+vT)Gv ziaBc;oEe@*=Z3QHK*UtkVZR=s=4125TlF0YK9{H#l41l19QBe9JOh`wRJ@SIK~PbP zftIuddo#+WSbv5xy!LF;qrMnTYPP3#lA52Y?)1Nt+Nf1=G^L(J-_@}^nD`1o@5HFP zJ1ypfp_Y(GTm~S&2>QzPsZ1iIm6HsYp%pFx990WN1M?JCa9JYUz`uLG%-06cPq)^E z({DH!Mbts@8&gVaJ(fhFX0YY8++jd?7N&C(I1A(7wAucxT$fLX(<1o9PZt?4$R40R z%1rTOe}WCGc{oi*g}pxS<;dr|@i7kNU!nVnUAo7R21|iY8aO#UQza@r;G$?%fwig^ z&h`M#3c-lMinD%r71$iY#z*r-M*SB|2y>`y@#n!f9j)c@+v_}s`Qp>&POtAuRtAs; zy9BJk>|5*wd{GYWQ|O^fGq4$=2|^PyzyI5#ah8kBp=TcLVHSZh1+iV6n0S~Bv=$L< zAA!9#1Xyneh|77vZTvcrk7w5wkWjWOgH(xu^Scx@uQt5S-2Vl2>v|7(E~&@sJ0b)iH5}=l7rm`Og)FZo%UsRmlF{ju1e>i6FuM* zH{e%*zbW$Laj)lb8{H@U>$TZT>*bT!BrRC?FG9cGeChh*c;_jK>mQe7`J(>=mO|Ne zbiYB^$vwyiP!c}i8JX!|us^@L;@pWvT{rTcp;-V3kgIplEeRI+K(p~N?YoAIW<1|5 zz~+x)spF=Um?zp8N{klV0|f)P2cTSKITN!w(;>dyf?N2JvJ?9!??8vOql5>u%r@ zIXFM`(SEWu1>N>w^fwpM$BwGiaiCTw{)7Vl$l(#os7u~E^)=Fg?3m~(rL2)_Pd2*d zT;^Vth4@%fj7Hb*#sm22=`GrpGwTD{&;uZp{rb|R^PheVfFrXHCKBRiPV))zc$|`? 
z)Vn$OnJ#tio+qNCr-2*xcyem!3Fe%F1Sb8N0ys@#%EzsN?XG~%{M|P{oguiLp@E0F z63~9`k{@FcXr|xLP5~;*J79Wy0Czre;KZ?xeOgcZMCX}%_snhtcZv>+YZXAVAHipi zP|?K}*u0#ecj7ka1I}w>krj#i(1Zv165V-Y54ew_gtOj}gZ^w47r)QUZ#o{T(8xc$ zC_^(rsYjzdK=J5~&opov3jW{f+xOekxDDlFGJNu5>r1k=EVvsWq>TBgMb(+896)da zhYJy${0k;6B-9x+u((4q6gVGP*lU6k$$DC9GMS~XMHVnQnuAoh8_M8NBJZbH86Bo& z8rlPl8|<~h?}ueH^nOVeQDJ~aEF-Xme{>J9`RqR|qU)JSQbLob19$1I1k z?5%w9a0VnJ7@D!cHo z=cczi&kVxy9+3FYTFD1an%G@>x;HOY+NJj&CKLorD9=9ceCHl79OqyCm|>ySEsvEM z!73ln{UmO_M`EO~84L%{HDArSVGVk~19|uw#&@4_K4)u8E9_4D>FJHTm_fRMrdk2K zRp8Ld24S!>O*6ifd_cs|9N1vzAyp~Z{Q~K?pr^o+C4pJvPs#}uUr((`?A%L?D4iEh z$5p85Z{~zFxe~x8oZoWd38pCEeDdXTrBLSUPe3BkVOd8ao_EUr1sz^$UGP_cUPLuM zEwPSS>u-3d$ZScphAgkJ%z&i)^o8ffwSf!S2dS9h_4Mk5SzCX;U#>=@fk z*vsefy83_{WFKG)4)Z49^wz4kbdNt)YhYUq6#+U4VRnGMC)^OUry3p~{5%1`q2m&c z61Vs8I}knqxg~Jk@vd?1nRL<_@QSKks(SMQ)(%n|g7{WOl(M6GPrVo=tCpg}(jGh@ zZoq@Q?l=tAxIkoUCH&D$l|&oq{%-W|&vrxDIy{Mf&X<5{#u(g1lLar4)GJ@_yr3OD zhQ=$i1{%D7?l$EcAM0C^g5YT?Fe3EhPmVJ{ZlAtTb)G&mbEixCG{e zP!D&>9H+1-e-{~(wK+&-a4?J?uI*o6n2Sa}vI{~#nSMIHHFziFy5Laq{zv{U`v_5k zO%FpH$BRJQSED@+hv|n=$!hT_-KOwAIwZ(HYVF!!iOwT(;Rxof+8H+U{h8CpJYvrz zK>{0uEDLccMahb*e2>l{8tr_;J72Q}US?X9h(a*Sa5i?dr;*iMa`oKje?Q*k^S<9D z*8w0QxD$skhb;Vkxm(%egwo&pO$jl`p*ALvv-(qK_lwUrF)GS|(_&?bRq_#$R+SjK%NjxO7|^Lo&+-{cDO z8x}Q>xz>KjcthqSN;Dy%ig1?y7h7bx^?4DHAs1NC+X>4!pTY*Rg#$x)>Tk67ekE#b z11By(cgODGhfZ-Zar|#U5G>S?HtF#MRzEHy&<+O$EG*lYJU40jw=XhPGv0wja53#K z{yV7es}5B?Zd;H!Tea;fBcf>_qBxxzo&AllDG!nT*x_Q?!q{2BUprM*Pa(0iv|F99 zC}!RIi4ci@_wHT)xR$b5kJdx;1}hkSq#eUK~cI1FHc`6eI(Fj+oB@1(hQhDD_*7UOlS%Gl^6}bWY z+C!&6S8nZ^G9$o;P9Qy0`B9MR-l3QczAziem#d)VCd7l4>j+2yJ>hdf()A?Pi*xV&Wh8cqpGDl0Xr9v$UjMQbb0sAIqa_uG zc!gxZ;uMB{#ppW~8iDjbjUf4IHQ#QltUUDFd@ncue-C!(KYK6#!vhI20@T#hvS3r_ zpCPy(52R|@CgKV0?yL@yVaK8G$=fh|ZSAC|#LuB?9NBZ16z3vF%(f0=Qmp>+@S>aF ziGaTU60i1_U%$Sy+a08CBIDi)m$_o$+{#vmn$4Aj7&clj_Ht*UjpdtrPu1gO&QJU@ zC!kxmsncPDeoaLmK!5CklpO6OrS~G=;!xlZ28i2(>zyU25a__WF=bFg2Pb$-|8n~F zC~Q=`ZwgA>G6Kn3a@hTnx8o5-5wlrO97|$up0Tx=Ft28 zA(j1DLj>9rk=j7HNr;IQUsuMEZ-Ef++<1Px8>sm#*hcOQrhvjoB!IKQ^!C2W@Y;Pn@ENshi znH#tuc`^xHy=I_8p` zg3ddumczFFrFrD{mw|lBhQ%!bm{i4ULcD$B6N_f)<435p&%Fzadln7MN)K8C|1$lb zKqnf^$VAll4gy(e(`R&#eTjVHXq%=yc6$&I5^1}aW?e&Jh}`@Sac_1D&n+BA^b+>OGACmaowBYJ^X|Md!E4q;#Wt9ig`7kL7;QOAvVt8T)Tw_G&UNPcO_Utb@JrNxM>fW^sb2@&yVX8CR7IiocZv2vT zXs89khA2mGC~z=B<6+|oQ}@Y~*_okRe%!y+on|LeKAh?yA!Cwd*Y|@grvEm=koga!q7i?5T|FE_6vYhi2vsL52lO z`z!tO;twE}8h5>tY%sFp2#xN!yhG{^jW)1+Ri9U zZhX7-uyQIR^O1w0(&Ux$)Wl~}7tx-iPB9y+?KXfvC@kE|6dlJ3fol=(i^to^D-2z` z4wD+!4&m*|&!Bu{flX!BW>`C^H75F7kdLnZTRfx_cy2>Mll*qS zK2K}mb9~v*Q?7n-2d5j)8szYh?WF^wuR!2MKor8+`e^cMJ+rd@<9RYDx%mu>?Kqsy z;@-3Zz)gr@7SMkkur81>7Uhv=lU(cz5Q~mP^Iq&_@XP}^a zia<`7O2`97cc|XhCCV`*Y}`2+QOqB1QrKD4!Sidalw7a_rMHFVP@u( zH+auK<_)h<$M(hw&*|PC1d!N)KC6cpc1bv*s2LT;q44Y{bp!ch{0FqyA>8-t524g| zDLF7OkO=ijWN|RBZr}QZWa4vq)2J4K2?j4z|f^7cvg}_su{ZjPu!YBi#W1${W-HPuK?0s2hxiGpi#Z zCFlbhIK^$uQQHX^ySij^oGVv)ZC6nG6eLZP-qdSf<2<%D)=)bry7|R^u$lno-W%)E zV6{PMT(AJ7~0kWgiY#HpFBFxpt zZ*N$V(e8SMq1GU7PdatF(g0U6|O$q7WpIFcb?^vw(7yAH>OMLU7QIJ$zn@{;0 zIwc}l>+(_kh^^WZ!4v|c^Pbl(MGOUt}W(0xbb~frJ^%J?+p0yBkpXk+K=I zu9-*er|S-mm}n9!mRc{5tZc)s3Bsny&48+Gulw*6mmbt*ohokPLcpTb4@qug<^h6P zg&2CzkInH}DTwc3@>#uvKBL0|IN(BM9=A_HZ(J#a^f^VeRbETjM^|L?y=~k^p$t6y z=oFvr>{>K9+6q1fb z52UWVK_4Q8s;A(cngEciVyVC@3N1wUM9nI(pIpLbBs{eFlrdiIi zBqYBgNcu99b3ahgpF8_22e4aJX2gQt_lwgxOTEz&~C$z0}*05|r&0?}^kZp{~Y;o=0>bcJu zLV2i7iAKH>zLIR2&j^+^8)GL4w4EpdB#+B)Pu8{rx}CHA*SK}~4cp4vRSjq=rSu|Z zdu?;EzFcD28ix`pwZjym4JF}mUNK_UFDtTJgxJLJvk!!k^q&e~lp!oh>L=*Fo+qzI zCw5hPc^oij#cj+NcNiY2N>yqoWEWm3V|`la(2z9c3Q&6IAz=1L=!(PmHJugt+V8O8 
znxn;o_D{-%q|#@E`mQv+_2ZeMv!4*JBBk8T@=ATgUj8cLB7V?|5uud_Tli{1OKLaY zTkYs;1J42}{iFzq7_9-HI)@_}!D;&#^}j5tJqWz`JfkbZ#c2AS7-izik?7V`?TCe= z9u6(NaSEK1+7Z%0yC-FBr+T3P8>JH1sPsv8j~lbNVYLqp0DZ*gpNG`0PsA z{&f7M6WPC8se?L>XP2}V$_0s~d#%rPd*#&f3Yq30S?R$hD*J=%(p`O1r_x1wnbOfT zQ3`2-j|P7!?vJ}0cq=IrY@GB;xmJ30wMlLw_UhMzPR$oAdZVTuSd+VJ_qjKki*=mj rpw2O;7T8(XuK%lr`oDn)ZQE3A0&+)klAX0tB%!LPp^$U&=A-`u_Why_ literal 0 HcmV?d00001 diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java index 9176d4dcd8355..95dff8603e3cb 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java @@ -46,7 +46,7 @@ public class RareTermsAggregationBuilder extends ValuesSourceAggregationBuilder< private static final ParseField MAX_DOC_COUNT_FIELD_NAME = new ParseField("max_doc_count"); private static final ParseField PRECISION = new ParseField("precision"); - private static final int MAX_MAX_DOC_COUNT = 10; + private static final int MAX_MAX_DOC_COUNT = 100; private static final ObjectParser PARSER; static { PARSER = new ObjectParser<>(RareTermsAggregationBuilder.NAME); @@ -68,7 +68,7 @@ public static AggregationBuilder parse(String aggregationName, XContentParser pa private IncludeExclude includeExclude = null; private int maxDocCount = 1; - private double precision = 0.01; + private double precision = 0.001; public RareTermsAggregationBuilder(String name, ValueType valueType) { super(name, ValuesSourceType.ANY, valueType); From d5a356e03166a4274546ebb5ba82f6d19f010afa Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Tue, 18 Jun 2019 13:05:00 -0400 Subject: [PATCH 18/25] Compile errors, checkstyle --- .../main/java/org/elasticsearch/common/util/CuckooFilter.java | 4 ++-- .../common/util/SetBackedScalingCuckooFilter.java | 1 - .../bucket/terms/AbstractRareTermsAggregator.java | 3 ++- .../search/aggregations/bucket/terms/InternalRareTerms.java | 2 +- .../java/org/elasticsearch/common/util/CuckooFilterTests.java | 2 -- .../common/util/SetBackedScalingCuckooFilterTests.java | 1 - .../src/main/java/org/elasticsearch/test/ESTestCase.java | 3 +-- 7 files changed, 6 insertions(+), 10 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java b/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java index 0b37fd92d27f7..76c470ff1f938 100644 --- a/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java +++ b/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java @@ -450,8 +450,8 @@ private double getLoadFactor(int b) { } /* Empirical constants from the paper: - "With k = 2 hash functions, the load factor α is 50% when bucket size b = 1 (i.e - the hash table is directly mapped), bu tincreases to 84%, 95%, 98% respectively + "With k = 2 hash functions, the load factor α is 50% when bucket size b = 1 (i.e + the hash table is directly mapped), but increases to 84%, 95%, 98% respectively using bucket size b = 2, 4, 8" */ if (b == 2) { diff --git a/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java b/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java index bdddead67cf87..e01f930a2ba3d 100644 --- 
a/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java +++ b/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java @@ -20,7 +20,6 @@ package org.elasticsearch.common.util; import org.apache.lucene.util.BytesRef; -import org.elasticsearch.common.Numbers; import org.elasticsearch.common.hash.MurmurHash3; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java index f6fe23a8a7e35..d618f989109b7 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java @@ -38,7 +38,8 @@ import java.util.Map; import java.util.Random; -public abstract class AbstractRareTermsAggregator extends DeferableBucketAggregator { +public abstract class AbstractRareTermsAggregator extends DeferableBucketAggregator { static final BucketOrder ORDER = BucketOrder.compound(BucketOrder.count(true), BucketOrder.key(true)); // sort by count ascending diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java index 6c2b3db8b9bdc..5b725be75612b 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java @@ -69,7 +69,7 @@ protected Bucket(long docCount, InternalAggregations aggregations, DocValueForma protected Bucket(StreamInput in, DocValueFormat formatter) throws IOException { this.format = formatter; docCount = in.readVLong(); - aggregations = InternalAggregations.readAggregations(in); + aggregations = new InternalAggregations(in); } @Override diff --git a/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java b/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java index b467a5fee5603..426d9829ec043 100644 --- a/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java +++ b/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java @@ -18,9 +18,7 @@ */ package org.elasticsearch.common.util; -import org.elasticsearch.common.Numbers; import org.elasticsearch.common.Randomness; -import org.elasticsearch.common.hash.MurmurHash3; import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.test.AbstractWireSerializingTestCase; diff --git a/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java b/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java index 886e1b8dca659..ea7672a768f1a 100644 --- a/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java +++ b/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java @@ -26,7 +26,6 @@ import java.io.IOException; import java.util.HashSet; -import java.util.Random; import java.util.Set; import static org.hamcrest.Matchers.equalTo; diff --git a/test/framework/src/main/java/org/elasticsearch/test/ESTestCase.java 
b/test/framework/src/main/java/org/elasticsearch/test/ESTestCase.java
index 1190326a5e91a..66a27d10780d4 100644
--- a/test/framework/src/main/java/org/elasticsearch/test/ESTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/test/ESTestCase.java
@@ -175,8 +175,7 @@
 })
 @ThreadLeakScope(Scope.SUITE)
 @ThreadLeakLingering(linger = 5000) // 5 sec lingering
-// nocommit
-@TimeoutSuite(millis = 40 * TimeUnits.MINUTE)
+@TimeoutSuite(millis = 20 * TimeUnits.MINUTE)
 @LuceneTestCase.SuppressSysoutChecks(bugUrl = "we log a lot on purpose")
 // we suppress pretty much all the lucene codecs for now, except asserting
 // assertingcodec is the winner for a codec here: it finds bugs and gives clear exceptions.

From feb844ac721bcb2b169c6f72b56c37cf31634572 Mon Sep 17 00:00:00 2001
From: Zachary Tong
Date: Tue, 18 Jun 2019 13:11:54 -0400
Subject: [PATCH 19/25] Add blurb to docs about max_buckets

---
 .../bucket/rare-terms-aggregation.asciidoc | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc
index 88378f64eb572..d88be3ac292d3 100644
--- a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc
+++ b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc
@@ -176,6 +176,22 @@ the order-by-ascending issues that afflict the `terms` aggregation.
 This does, however, mean that a large number of results can be returned if chosen incorrectly.
 To limit the danger of this setting, the maximum `max_doc_count` is 100.

+[[search-aggregations-bucket-rare-terms-aggregation-max-buckets]]
+==== `search.max_buckets`
+
+The Rare Terms aggregation is more liable to trip the `search.max_buckets` soft limit than other aggregations due
+to how it works. The `search.max_buckets` soft limit is evaluated on a per-shard basis while the aggregation is collecting
+results. It is possible for a term to be "rare" on a shard but become "not rare" once all the shard results are
+merged together. This means that individual shards tend to collect more buckets than are truly rare, because
+they only have their own local view. This list is ultimately pruned to the correct, smaller list of rare
+terms on the coordinating node... but a shard may have already tripped the `search.max_buckets` soft limit and aborted
+the request.
+
+When aggregating on fields that have potentially many "rare" terms, you may need to increase the `search.max_buckets` soft
+limit. Alternatively, you might need to find a way to filter the results to return fewer rare values (smaller time
+span, filter by category, etc.), or re-evaluate your definition of "rare" (e.g. if something
+appears 100,000 times, is it truly "rare"?).
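+
+For illustration only, the soft limit can be raised dynamically via the cluster settings API. The `20000`
+shown here is an arbitrary example value, not a recommendation:
+
+[source,js]
+--------------------------------------------------
+PUT /_cluster/settings
+{
+    "transient": {
+        "search.max_buckets": 20000
+    }
+}
+--------------------------------------------------
+// NOTCONSOLE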
+ [[search-aggregations-bucket-rare-terms-aggregation-approximate-counts]] ==== Document counts are approximate From 56fcdab7fcad0eb23b1b36bb2e6c446dfb1df900 Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Tue, 18 Jun 2019 15:28:26 -0400 Subject: [PATCH 20/25] Fix test --- .../common/util/SetBackedScalingCuckooFilterTests.java | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java b/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java index ea7672a768f1a..11ba2879f4862 100644 --- a/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java +++ b/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java @@ -18,9 +18,7 @@ */ package org.elasticsearch.common.util; -import org.elasticsearch.common.Numbers; import org.elasticsearch.common.Randomness; -import org.elasticsearch.common.hash.MurmurHash3; import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.test.AbstractWireSerializingTestCase; @@ -67,15 +65,14 @@ public void testExact() { int size = 0; Set values = new HashSet<>(); - Set hashed = new HashSet<>(values.size()); + Set hashed = new HashSet<>(values.size()); while (size < threshold - 100) { long value = randomLong(); filter.add(value); boolean newValue = values.add(value); if (newValue) { - byte[] bytes = Numbers.longToBytes(value); - MurmurHash3.Hash128 hash128 = MurmurHash3.hash128(bytes, 0, bytes.length, 0, new MurmurHash3.Hash128()); - hashed.add(hash128); + Long hash = CuckooFilter.murmur64(value); + hashed.add(hash); size += 16; } From f7647b351be570da5cf6ed2c684f0b974322ffb2 Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Tue, 18 Jun 2019 15:38:45 -0400 Subject: [PATCH 21/25] Update comments --- .../elasticsearch/common/util/CuckooFilter.java | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java b/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java index 76c470ff1f938..62c0c2744e310 100644 --- a/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java +++ b/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java @@ -184,10 +184,22 @@ int getNumBuckets() { return numBuckets; } + /** + * Returns the number of bits used per entry + * + * Expert-level API + */ int getBitsPerEntry() { return bitsPerEntry; } + /** + * Returns the cached fingerprint mask. This is simply a mask for the + * first bitsPerEntry bits, used by {@link CuckooFilter#fingerprint(int, int, int)} + * to generate the fingerprint of a hash + * + * Expert-level API + */ int getFingerprintMask() { return fingerprintMask; } @@ -199,7 +211,7 @@ int getFingerprintMask() { * Expert-level API */ Iterator getBuckets() { - return new Iterator() { + return new Iterator<>() { int current = 0; @Override @@ -263,8 +275,7 @@ private boolean hasFingerprint(int bucket, long fingerprint) { * successful, false if the filter is saturated. 
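 *
 * A rough usage sketch (illustrative only: obtaining a properly sized filter instance is elided,
 * and the hash is assumed to come from this class's murmur64 helper):
 *
 *   long hash = CuckooFilter.murmur64(value);
 *   if (filter.add(hash) == false) {
 *       // saturated; the caller must react, e.g. by scaling up as SetBackedScalingCuckooFilter does
 *   }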
*/ boolean add(long hash) { - // can only use 64 of 128 bytes unfortunately (32 for each bucket), simplest - // to just truncate h1 and h2 appropriately + // Each bucket needs 32 bits, so we truncate for the first bucket and shift/truncate for second int bucket = hashToIndex((int) hash, numBuckets); int fingerprint = fingerprint((int) (hash >>> 32), bitsPerEntry, fingerprintMask); return mergeFingerprint(bucket, fingerprint); From 5feb762ca7aef928b9979ef9ee1edd433b05a078 Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Tue, 25 Jun 2019 11:51:31 -0400 Subject: [PATCH 22/25] Address review comments, merge conflicts --- .../bucket/rare-terms-aggregation.asciidoc | 2 +- .../elasticsearch/common/hash/MurmurHash3.java | 18 ++++++++++++++++++ .../common/util/CuckooFilter.java | 13 ++----------- .../util/SetBackedScalingCuckooFilter.java | 5 ++--- .../MergingBucketsDeferringCollector.java | 10 +++++----- .../terms/AbstractRareTermsAggregator.java | 2 +- .../bucket/terms/InternalMappedRareTerms.java | 12 +++++++----- .../bucket/terms/InternalRareTerms.java | 9 ++++++--- .../terms/RareTermsAggregationBuilder.java | 9 ++++++--- .../common/util/CuckooFilterTests.java | 7 ++----- .../SetBackedScalingCuckooFilterTests.java | 3 ++- 11 files changed, 52 insertions(+), 38 deletions(-) diff --git a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc index d88be3ac292d3..e2537b61aefda 100644 --- a/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/rare-terms-aggregation.asciidoc @@ -177,7 +177,7 @@ This does, however, mean that a large number of results can be returned if chos To limit the danger of this setting, the maximum `max_doc_count` is 100. [[search-aggregations-bucket-rare-terms-aggregation-max-buckets]] -==== `search.max_buckets` +==== Max Bucket Limit The Rare Terms aggregation is more liable to trip the `search.max_buckets` soft limit than other aggregations due to how it works. The `max_bucket` soft-limit is evaluated on a per-shard basis while the aggregation is collecting diff --git a/server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java b/server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java index 2c028dd514049..a9232e06657ad 100644 --- a/server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java +++ b/server/src/main/java/org/elasticsearch/common/hash/MurmurHash3.java @@ -180,4 +180,22 @@ public static Hash128 hash128(byte[] key, int offset, int length, long seed, Has return hash; } + /** + * A 64-bit variant which accepts a long to hash, and returns the 64bit long hash. 
+     * This is useful if the input is already in long (or smaller) format and you don't
+     * need the full 128b width and flexibility of
+     * {@link MurmurHash3#hash128(byte[], int, int, long, Hash128)}
+     *
+     * Given the limited nature of this variant, it should be faster than the 128b version
+     * when you only need 64b of hash (many fewer instructions)
+     */
+    public static long murmur64(long h) {
+        h ^= h >>> 33;
+        h *= 0xff51afd7ed558ccdL;
+        h ^= h >>> 33;
+        h *= 0xc4ceb9fe1a85ec53L;
+        h ^= h >>> 33;
+        return h;
+    }
+
 }
diff --git a/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java b/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java
index 62c0c2744e310..54099735fba47 100644
--- a/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java
+++ b/server/src/main/java/org/elasticsearch/common/util/CuckooFilter.java
@@ -51,8 +51,8 @@
 * about inserts "overflowing" a bucket because the same item has been repeated repeatedly
 *
 * NOTE: this CuckooFilter exposes a number of Expert APIs which assume the caller has
- * intimate knowledge about how the algorithm works. It is recommended to avoid these
- * APIs, or better yet, use {@link SetBackedScalingCuckooFilter} instead.
+ * intimate knowledge about how the algorithm works. It is recommended to use
+ * {@link SetBackedScalingCuckooFilter} instead.
 *
 * Based on the paper:
 *
@@ -518,13 +518,4 @@ public boolean equals(Object other) {
             && Objects.equals(this.count, that.count)
             && Objects.equals(this.evictedFingerprint, that.evictedFingerprint);
     }
-
-    static long murmur64(long h) {
-        h ^= h >>> 33;
-        h *= 0xff51afd7ed558ccdL;
-        h ^= h >>> 33;
-        h *= 0xc4ceb9fe1a85ec53L;
-        h ^= h >>> 33;
-        return h;
-    }
 }
diff --git a/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java b/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java
index e01f930a2ba3d..095416e5d9aa5 100644
--- a/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java
+++ b/server/src/main/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilter.java
@@ -187,7 +187,7 @@ public boolean mightContain(BytesRef value) {
      * 100% accurate, while true values may be a false-positive.
      */
     public boolean mightContain(long value) {
-        long hash = CuckooFilter.murmur64(value);
+        long hash = MurmurHash3.murmur64(value);
         return mightContainHash(hash);
     }

@@ -240,7 +240,7 @@ public void add(BytesRef value) {
      * Add's the provided value to the set for tracking
      */
     public void add(long value) {
-        addHash(CuckooFilter.murmur64(value));
+        addHash(MurmurHash3.murmur64(value));
     }

     private void addHash(long hash) {
@@ -345,7 +345,6 @@ public void merge(SetBackedScalingCuckooFilter other) {
         // Rather than converting the other to a cuckoo first, we can just
         // replay the values directly into our filter.
other.hashes.forEach(this::add); - maybeConvert(); } else { // Both are in cuckoo mode, merge raw fingerprints diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java index a32cce6651341..b95a1a35f47a9 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java @@ -84,13 +84,13 @@ public void mergeBuckets(long[] mergeMap) { // if there are buckets that have been collected in the current segment // we need to update the bucket ordinals there too - if (buckets != null && buckets.size() > 0) { - PackedLongValues currentBuckets = buckets.build(); + if (bucketsBuilder != null && bucketsBuilder.size() > 0) { + PackedLongValues currentBuckets = bucketsBuilder.build(); PackedLongValues.Builder newBuckets = PackedLongValues.packedBuilder(PackedInts.DEFAULT); PackedLongValues.Builder newDocDeltas = PackedLongValues.packedBuilder(PackedInts.DEFAULT); // The current segment's deltas aren't built yet, so build to a temp object - PackedLongValues currentDeltas = docDeltas.build(); + PackedLongValues currentDeltas = newDocDeltas.build(); PackedLongValues.Iterator docDeltasItr = currentDeltas.iterator(); long lastGoodDelta = 0; @@ -113,8 +113,8 @@ public void mergeBuckets(long[] mergeMap) { lastGoodDelta += delta; } } - docDeltas = newDocDeltas; - buckets = newBuckets; + docDeltasBuilder = newDocDeltas; + bucketsBuilder = newBuckets; } } } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java index d618f989109b7..2bbe3c01988df 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractRareTermsAggregator.java @@ -92,7 +92,7 @@ protected boolean shouldDefer(Aggregator aggregator) { @Override public DeferringBucketCollector getDeferringCollector() { - deferringCollector = new MergingBucketsDeferringCollector(context); + deferringCollector = new MergingBucketsDeferringCollector(context, descendsFromGlobalAggregator(parent())); return deferringCollector; } diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java index db84951477076..d774d09fa1862 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalMappedRareTerms.java @@ -160,17 +160,19 @@ public B getBucketByKey(String term) { } @Override - protected boolean doEquals(Object obj) { + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null || getClass() != obj.getClass()) return false; + if (super.equals(obj) == false) return false; InternalMappedRareTerms that = (InternalMappedRareTerms) obj; - return super.doEquals(obj) - && Objects.equals(buckets, that.buckets) + return Objects.equals(buckets, that.buckets) && Objects.equals(format, that.format) && Objects.equals(filter, that.filter); } 
@Override - protected int doHashCode() { - return Objects.hash(super.doHashCode(), buckets, format, filter); + public int hashCode() { + return Objects.hash(super.hashCode(), buckets, format, filter); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java index 5b725be75612b..dd1a0c19200cf 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalRareTerms.java @@ -179,15 +179,18 @@ public InternalAggregation doReduce(List aggregations, Redu protected abstract B[] createBucketsArray(int size); @Override - protected boolean doEquals(Object obj) { + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null || getClass() != obj.getClass()) return false; + if (super.equals(obj) == false) return false; InternalRareTerms that = (InternalRareTerms) obj; return Objects.equals(maxDocCount, that.maxDocCount) && Objects.equals(order, that.order); } @Override - protected int doHashCode() { - return Objects.hash(maxDocCount, order); + public int hashCode() { + return Objects.hash(super.hashCode(), maxDocCount, order); } protected static XContentBuilder doXContentCommon(XContentBuilder builder, Params params, diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java index 95dff8603e3cb..5772cfa9708d5 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/RareTermsAggregationBuilder.java @@ -180,12 +180,15 @@ protected XContentBuilder doXContentBody(XContentBuilder builder, Params params) } @Override - protected int innerHashCode() { - return Objects.hash(includeExclude, maxDocCount, precision); + public int hashCode() { + return Objects.hash(super.hashCode(), includeExclude, maxDocCount, precision); } @Override - protected boolean innerEquals(Object obj) { + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null || getClass() != obj.getClass()) return false; + if (super.equals(obj) == false) return false; RareTermsAggregationBuilder other = (RareTermsAggregationBuilder) obj; return Objects.equals(includeExclude, other.includeExclude) && Objects.equals(maxDocCount, other.maxDocCount) diff --git a/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java b/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java index 426d9829ec043..47e9081d815b6 100644 --- a/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java +++ b/server/src/test/java/org/elasticsearch/common/util/CuckooFilterTests.java @@ -19,6 +19,7 @@ package org.elasticsearch.common.util; import org.elasticsearch.common.Randomness; +import org.elasticsearch.common.hash.MurmurHash3; import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.test.AbstractWireSerializingTestCase; @@ -128,11 +129,7 @@ public void testBig() { assertThat(fppRate, lessThanOrEqualTo(0.001)); } - //private MurmurHash3.Hash128 hash(long i) { - // return MurmurHash3.hash128(Numbers.longToBytes(i), 0, 8, 0, new MurmurHash3.Hash128()); - //} - private 
long hash(long i) { - return CuckooFilter.murmur64(i); + return MurmurHash3.murmur64(i); } } diff --git a/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java b/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java index 11ba2879f4862..20ffaa00998a1 100644 --- a/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java +++ b/server/src/test/java/org/elasticsearch/common/util/SetBackedScalingCuckooFilterTests.java @@ -19,6 +19,7 @@ package org.elasticsearch.common.util; import org.elasticsearch.common.Randomness; +import org.elasticsearch.common.hash.MurmurHash3; import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.test.AbstractWireSerializingTestCase; @@ -71,7 +72,7 @@ public void testExact() { filter.add(value); boolean newValue = values.add(value); if (newValue) { - Long hash = CuckooFilter.murmur64(value); + Long hash = MurmurHash3.murmur64(value); hashed.add(hash); size += 16; From 8115862fb79a26094c9805cb4f1e6a2bd11151e2 Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Wed, 26 Jun 2019 10:33:04 -0400 Subject: [PATCH 23/25] Skip segments when there are no buckets after merging --- .../bucket/MergingBucketsDeferringCollector.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java index b95a1a35f47a9..bff5015846951 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/MergingBucketsDeferringCollector.java @@ -78,7 +78,11 @@ public void mergeBuckets(long[] mergeMap) { lastGoodDelta += delta; } } - newEntries.add(new Entry(sourceEntry.context, newDocDeltas.build(), newBuckets.build())); + // Only create an entry if this segment has buckets after merging + if (newBuckets.size() > 0) { + assert newDocDeltas.size() > 0 : "docDeltas was empty but we had buckets"; + newEntries.add(new Entry(sourceEntry.context, newDocDeltas.build(), newBuckets.build())); + } } entries = newEntries; @@ -90,7 +94,7 @@ public void mergeBuckets(long[] mergeMap) { PackedLongValues.Builder newDocDeltas = PackedLongValues.packedBuilder(PackedInts.DEFAULT); // The current segment's deltas aren't built yet, so build to a temp object - PackedLongValues currentDeltas = newDocDeltas.build(); + PackedLongValues currentDeltas = docDeltasBuilder.build(); PackedLongValues.Iterator docDeltasItr = currentDeltas.iterator(); long lastGoodDelta = 0; From b027d4a228c3758e624b9721fc0cb6c26fa9178c Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Wed, 26 Jun 2019 16:04:22 -0400 Subject: [PATCH 24/25] Add version skip to yaml tests --- .../rest-api-spec/test/search.aggregation/280_rare_terms.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml index eac3287b2ab78..73c46bc963e12 100644 --- a/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/280_rare_terms.yml @@ -1,4 +1,7 @@ setup: + - skip: + version: " - 8.0.0" # TODO change this after backport + 
reason: RareTerms added in 7.3.0 - do: indices.create: index: test_1 From 916f194ceb05f3bf8858c129ffc35f223fc009ce Mon Sep 17 00:00:00 2001 From: Zachary Tong Date: Fri, 28 Jun 2019 09:56:32 -0400 Subject: [PATCH 25/25] Remove merge conflict .orig file --- ...SessionFactoryLoadBalancingTests.java.orig | 402 ------------------ 1 file changed, 402 deletions(-) delete mode 100644 x-pack/plugin/security/src/test/java/org/elasticsearch/xpack/security/authc/ldap/support/SessionFactoryLoadBalancingTests.java.orig diff --git a/x-pack/plugin/security/src/test/java/org/elasticsearch/xpack/security/authc/ldap/support/SessionFactoryLoadBalancingTests.java.orig b/x-pack/plugin/security/src/test/java/org/elasticsearch/xpack/security/authc/ldap/support/SessionFactoryLoadBalancingTests.java.orig deleted file mode 100644 index b58d8f9fb7637..0000000000000 --- a/x-pack/plugin/security/src/test/java/org/elasticsearch/xpack/security/authc/ldap/support/SessionFactoryLoadBalancingTests.java.orig +++ /dev/null @@ -1,402 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License; - * you may not use this file except in compliance with the Elastic License. - */ -package org.elasticsearch.xpack.security.authc.ldap.support; - -import com.unboundid.ldap.listener.InMemoryDirectoryServer; -import com.unboundid.ldap.sdk.LDAPConnection; -import com.unboundid.ldap.sdk.LDAPException; -import com.unboundid.ldap.sdk.SimpleBindRequest; -import org.apache.logging.log4j.message.ParameterizedMessage; -import org.elasticsearch.action.ActionListener; -import org.elasticsearch.common.SuppressForbidden; -import org.elasticsearch.common.network.InetAddressHelper; -import org.elasticsearch.common.settings.SecureString; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.common.util.concurrent.ThreadContext; -import org.elasticsearch.core.internal.io.IOUtils; -import org.elasticsearch.env.TestEnvironment; -import org.elasticsearch.mocksocket.MockServerSocket; -import org.elasticsearch.mocksocket.MockSocket; -import org.elasticsearch.test.junit.annotations.TestLogging; -import org.elasticsearch.threadpool.TestThreadPool; -import org.elasticsearch.threadpool.ThreadPool; -import org.elasticsearch.xpack.core.common.socket.SocketAccess; -import org.elasticsearch.xpack.core.security.authc.RealmConfig; -import org.elasticsearch.xpack.core.security.authc.ldap.support.LdapSearchScope; -import org.elasticsearch.xpack.core.ssl.SSLService; -import org.junit.After; -import org.junit.Before; - -import java.io.IOException; -<<<<<<< HEAD -======= -import java.net.ConnectException; ->>>>>>> origin/master -import java.net.Inet4Address; -import java.net.InetAddress; -import java.net.InetSocketAddress; -import java.net.NoRouteToHostException; -import java.net.Socket; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.stream.Collectors; - -import static org.hamcrest.Matchers.containsString; -import static org.hamcrest.Matchers.greaterThanOrEqualTo; -import static org.hamcrest.Matchers.is; -import static org.hamcrest.Matchers.not; - -/** - * Tests that the server sets properly load balance connections without throwing exceptions - */ -@TestLogging("org.elasticsearch.xpack.security.authc.ldap.support:DEBUG") -public class 
SessionFactoryLoadBalancingTests extends LdapTestCase { - - private ThreadPool threadPool; - - @Before - public void init() throws Exception { - threadPool = new TestThreadPool("SessionFactoryLoadBalancingTests thread pool"); - } - - @After - public void shutdown() { - terminate(threadPool); - } - - public void testRoundRobin() throws Exception { - TestSessionFactory testSessionFactory = createSessionFactory(LdapLoadBalancing.ROUND_ROBIN); - - final int numberOfIterations = randomIntBetween(1, 5); - for (int iteration = 0; iteration < numberOfIterations; iteration++) { - for (int i = 0; i < numberOfLdapServers; i++) { - try (LDAPConnection connection = LdapUtils.privilegedConnect(testSessionFactory.getServerSet()::getConnection)) { - assertThat(connection.getConnectedPort(), is(ldapServers[i].getListenPort())); - } - } - } - } - - public void testRoundRobinWithFailures() throws Exception { - assumeTrue("at least two ldap servers should be present for this test", ldapServers.length > 1); - logger.debug("using [{}] ldap servers, urls {}", ldapServers.length, ldapUrls()); - TestSessionFactory testSessionFactory = createSessionFactory(LdapLoadBalancing.ROUND_ROBIN); - - // create a list of ports - List ports = new ArrayList<>(numberOfLdapServers); - for (InMemoryDirectoryServer ldapServer : ldapServers) { - ports.add(ldapServer.getListenPort()); - } - logger.debug("list of all ports {}", ports); - - final int numberToKill = randomIntBetween(1, numberOfLdapServers - 1); - logger.debug("killing [{}] servers", numberToKill); - - // get a subset to kill - final List ldapServersToKill = randomSubsetOf(numberToKill, ldapServers); - final List ldapServersList = Arrays.asList(ldapServers); - final MockServerSocket mockServerSocket = new MockServerSocket(0, 0); - final List listenThreads = new ArrayList<>(); - final CountDownLatch latch = new CountDownLatch(ldapServersToKill.size()); - final CountDownLatch closeLatch = new CountDownLatch(1); - try { - final AtomicBoolean success = new AtomicBoolean(true); - for (InMemoryDirectoryServer ldapServerToKill : ldapServersToKill) { - final int index = ldapServersList.indexOf(ldapServerToKill); - assertThat(index, greaterThanOrEqualTo(0)); - final int port = ldapServers[index].getListenPort(); - logger.debug("shutting down server index [{}] listening on [{}]", index, port); - assertTrue(ports.remove(Integer.valueOf(port))); - ldapServers[index].shutDown(true); - - // when running multiple test jvms, there is a chance that something else could - // start listening on this port so we try to avoid this by creating a local socket - // that will be bound to the port the ldap server was running on and connecting to - // a mock server socket. - // NOTE: this is not perfect as there is a small amount of time between the shutdown - // of the ldap server and the opening of the socket - logger.debug("opening mock client sockets bound to [{}]", port); - Runnable runnable = new PortBlockingRunnable(mockServerSocket.getInetAddress(), mockServerSocket.getLocalPort(), port, - latch, closeLatch, success); - Thread thread = new Thread(runnable); - thread.start(); - listenThreads.add(thread); - - assertThat(ldapServers[index].getListenPort(), is(-1)); - } - - latch.await(); - - assumeTrue("Failed to open sockets on all addresses with the port that an LDAP server was bound to. 
Some operating systems " + - "allow binding to an address and port combination even if an application is bound to the port on a wildcard address", - success.get()); - final int numberOfIterations = randomIntBetween(1, 5); - logger.debug("list of all open ports {}", ports); - // go one iteration through and attempt a bind - for (int iteration = 0; iteration < numberOfIterations; iteration++) { - logger.debug("iteration [{}]", iteration); - for (Integer port : ports) { - logger.debug("attempting connection with expected port [{}]", port); - LDAPConnection connection = null; - try { - do { - final LDAPConnection finalConnection = - LdapUtils.privilegedConnect(testSessionFactory.getServerSet()::getConnection); - connection = finalConnection; - logger.debug("established connection with port [{}] expected port [{}]", - finalConnection.getConnectedPort(), port); - if (finalConnection.getConnectedPort() != port) { - LDAPException e = expectThrows(LDAPException.class, () -> finalConnection.bind(new SimpleBindRequest())); - assertThat(e.getMessage(), containsString("not connected")); - finalConnection.close(); - } - } while (connection.getConnectedPort() != port); - - assertThat(connection.getConnectedPort(), is(port)); - } finally { - if (connection != null) { - connection.close(); - } - } - } - } - } finally { - closeLatch.countDown(); - mockServerSocket.close(); - for (Thread t : listenThreads) { - t.join(); - } - } - } - - @SuppressForbidden(reason = "Allow opening socket for test") - private MockSocket openMockSocket(InetAddress remoteAddress, int remotePort, InetAddress localAddress, int localPort) - throws IOException { - final MockSocket socket = new MockSocket(); - socket.setReuseAddress(true); // allow binding even if the previous socket is in timed wait state. - socket.setSoLinger(true, 0); // close immediately as we are not writing anything here. - socket.bind(new InetSocketAddress(localAddress, localPort)); - SocketAccess.doPrivileged(() -> socket.connect(new InetSocketAddress(remoteAddress, remotePort))); - return socket; - } - - public void testFailover() throws Exception { - assumeTrue("at least two ldap servers should be present for this test", ldapServers.length > 1); - logger.debug("using [{}] ldap servers, urls {}", ldapServers.length, ldapUrls()); - TestSessionFactory testSessionFactory = createSessionFactory(LdapLoadBalancing.FAILOVER); - - // first test that there is no round robin stuff going on - final int firstPort = ldapServers[0].getListenPort(); - for (int i = 0; i < numberOfLdapServers; i++) { - try (LDAPConnection connection = LdapUtils.privilegedConnect(testSessionFactory.getServerSet()::getConnection)) { - assertThat(connection.getConnectedPort(), is(firstPort)); - } - } - - // we need at least one good server. Hence the upper bound is number - 2 since we need at least - // one server to use! 
- InMemoryDirectoryServer[] allButFirstServer = Arrays.copyOfRange(ldapServers, 1, ldapServers.length); - final List ldapServersToKill; - if (ldapServers.length > 2) { - final int numberToKill = randomIntBetween(1, numberOfLdapServers - 2); - ldapServersToKill = randomSubsetOf(numberToKill, allButFirstServer); - ldapServersToKill.add(ldapServers[0]); // always kill the first one - } else { - ldapServersToKill = Collections.singletonList(ldapServers[0]); - } - final List ldapServersList = Arrays.asList(ldapServers); - final MockServerSocket mockServerSocket = new MockServerSocket(0, 0); - final List listenThreads = new ArrayList<>(); - final CountDownLatch latch = new CountDownLatch(ldapServersToKill.size()); - final CountDownLatch closeLatch = new CountDownLatch(1); - final AtomicBoolean success = new AtomicBoolean(true); - for (InMemoryDirectoryServer ldapServerToKill : ldapServersToKill) { - final int index = ldapServersList.indexOf(ldapServerToKill); - final int port = ldapServers[index].getListenPort(); - logger.debug("shutting down server index [{}] listening on [{}]", index, port); - ldapServers[index].shutDown(true); - - // when running multiple test jvms, there is a chance that something else could - // start listening on this port so we try to avoid this by creating a local socket - // that will be bound to the port the ldap server was running on and connecting to - // a mock server socket. - // NOTE: this is not perfect as there is a small amount of time between the shutdown - // of the ldap server and the opening of the socket - logger.debug("opening mock server socket listening on [{}]", port); - Runnable runnable = new PortBlockingRunnable(mockServerSocket.getInetAddress(), mockServerSocket.getLocalPort(), port, - latch, closeLatch, success); - Thread thread = new Thread(runnable); - thread.start(); - listenThreads.add(thread); - - assertThat(ldapServers[index].getListenPort(), is(-1)); - } - - try { - latch.await(); - - assumeTrue("Failed to open sockets on all addresses with the port that an LDAP server was bound to. 
Some operating systems " + - "allow binding to an address and port combination even if an application is bound to the port on a wildcard address", - success.get()); - int firstNonStoppedPort = -1; - // now we find the first that isn't stopped - for (int i = 0; i < numberOfLdapServers; i++) { - if (ldapServers[i].getListenPort() != -1) { - firstNonStoppedPort = ldapServers[i].getListenPort(); - break; - } - } - logger.debug("first non stopped port [{}]", firstNonStoppedPort); - assertThat(firstNonStoppedPort, not(-1)); - final int numberOfIterations = randomIntBetween(1, 5); - for (int iteration = 0; iteration < numberOfIterations; iteration++) { - logger.debug("attempting connection with expected port [{}] iteration [{}]", firstNonStoppedPort, iteration); - LDAPConnection connection = null; - try { - do { - final LDAPConnection finalConnection = - LdapUtils.privilegedConnect(testSessionFactory.getServerSet()::getConnection); - connection = finalConnection; - logger.debug("established connection with port [{}] expected port [{}]", - finalConnection.getConnectedPort(), firstNonStoppedPort); - if (finalConnection.getConnectedPort() != firstNonStoppedPort) { - LDAPException e = expectThrows(LDAPException.class, () -> finalConnection.bind(new SimpleBindRequest())); - assertThat(e.getMessage(), containsString("not connected")); - finalConnection.close(); - } - } while (connection.getConnectedPort() != firstNonStoppedPort); - - assertThat(connection.getConnectedPort(), is(firstNonStoppedPort)); - } finally { - if (connection != null) { - connection.close(); - } - } - } - } finally { - closeLatch.countDown(); - mockServerSocket.close(); - for (Thread t : listenThreads) { - t.join(); - } - } - } - - private TestSessionFactory createSessionFactory(LdapLoadBalancing loadBalancing) throws Exception { - String groupSearchBase = "cn=HMS Lydia,ou=crews,ou=groups,o=sevenSeas"; - String userTemplate = "cn={0},ou=people,o=sevenSeas"; - Settings settings = buildLdapSettings(ldapUrls(), new String[] { userTemplate }, groupSearchBase, - LdapSearchScope.SUB_TREE, loadBalancing); - Settings globalSettings = Settings.builder().put("path.home", createTempDir()).put(settings).build(); - RealmConfig config = new RealmConfig(REALM_IDENTIFIER, globalSettings, - TestEnvironment.newEnvironment(globalSettings), new ThreadContext(Settings.EMPTY)); - return new TestSessionFactory(config, new SSLService(Settings.EMPTY, TestEnvironment.newEnvironment(config.settings())), - threadPool); - } - - private class PortBlockingRunnable implements Runnable { - - private final InetAddress serverAddress; - private final int serverPort; - private final int portToBind; - private final CountDownLatch latch; - private final CountDownLatch closeLatch; - private final AtomicBoolean success; - - private PortBlockingRunnable(InetAddress serverAddress, int serverPort, int portToBind, CountDownLatch latch, - CountDownLatch closeLatch, AtomicBoolean success) { - this.serverAddress = serverAddress; - this.serverPort = serverPort; - this.portToBind = portToBind; - this.latch = latch; - this.closeLatch = closeLatch; - this.success = success; - } - - @Override - public void run() { - final List openedSockets = new ArrayList<>(); - final List blacklistedAddress = new ArrayList<>(); - try { - final boolean allSocketsOpened = awaitBusy(() -> { - try { - InetAddress[] allAddresses = InetAddressHelper.getAllAddresses(); - if (serverAddress instanceof Inet4Address) { - allAddresses = InetAddressHelper.filterIPV4(allAddresses); - } else { - allAddresses = 
InetAddressHelper.filterIPV6(allAddresses); - } - final List inetAddressesToBind = Arrays.stream(allAddresses) - .filter(addr -> openedSockets.stream().noneMatch(s -> addr.equals(s.getLocalAddress()))) - .filter(addr -> addr instanceof Inet4Address) - .filter(addr -> blacklistedAddress.contains(addr) == false) - .collect(Collectors.toList()); - for (InetAddress localAddress : inetAddressesToBind) { - try { - final Socket socket = openMockSocket(serverAddress, serverPort, localAddress, portToBind); - openedSockets.add(socket); - logger.debug("opened socket [{}]", socket); - } catch (NoRouteToHostException e) { - logger.debug(new ParameterizedMessage("blacklisting address [{}] due to:", localAddress), e); - blacklistedAddress.add(localAddress); - } catch (ConnectException e) { - logger.debug(new ParameterizedMessage("blacklisting address [{}] due to:", localAddress), e); - blacklistedAddress.add(localAddress); - } - } - if (openedSockets.size() == 0) { - logger.debug("Could not open any sockets from the available addresses"); - return false; - } - return true; - } catch (IOException e) { - logger.debug(new ParameterizedMessage("caught exception while opening socket on [{}]", portToBind), e); - return false; - } - }); - - if (allSocketsOpened) { - latch.countDown(); - } else { - success.set(false); - IOUtils.closeWhileHandlingException(openedSockets); - openedSockets.clear(); - latch.countDown(); - return; - } - } catch (InterruptedException e) { - logger.debug(new ParameterizedMessage("interrupted while trying to open sockets on [{}]", portToBind), e); - Thread.currentThread().interrupt(); - } - - try { - closeLatch.await(); - } catch (InterruptedException e) { - logger.debug("caught exception while waiting for close latch", e); - Thread.currentThread().interrupt(); - } finally { - logger.debug("closing sockets on [{}]", portToBind); - IOUtils.closeWhileHandlingException(openedSockets); - } - } - } - - static class TestSessionFactory extends SessionFactory { - - protected TestSessionFactory(RealmConfig config, SSLService sslService, ThreadPool threadPool) { - super(config, sslService, threadPool); - } - - @Override - public void session(String user, SecureString password, ActionListener listener) { - listener.onResponse(null); - } - } -}