From f1ad01bd3ae467242ede2d999104012af135a734 Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 27 Jan 2014 12:12:10 +0000 Subject: [PATCH 1/8] Significant_terms aggregation identifies terms that are significant rather than merely popular in a set. Significance is related to the changes in document frequency observed between everyday use in the corpus and frequency observed in the result set. The asciidocs include extensive details on the applications of this feature. --- .../significantterms-aggregation.asciidoc | 410 ++++++++++++++++++ .../aggregations/AggregationBuilders.java | 5 + .../aggregations/AggregationModule.java | 2 + .../TransportAggregationModule.java | 6 + .../BucketSignificancePriorityQueue.java | 35 ++ .../significant/InternalSignificantTerms.java | 300 +++++++++++++ .../significant/SignificantLongTerms.java | 166 +++++++ .../SignificantLongTermsAggregator.java | 155 +++++++ .../significant/SignificantStringTerms.java | 165 +++++++ .../SignificantStringTermsAggregator.java | 276 ++++++++++++ .../bucket/significant/SignificantTerms.java | 76 ++++ .../SignificantTermsAggregatorFactory.java | 181 ++++++++ .../significant/SignificantTermsBuilder.java | 83 ++++ .../significant/SignificantTermsParser.java | 211 +++++++++ .../significant/UnmappedSignificantTerms.java | 91 ++++ .../UnmappedSignificantTermsAggregator.java | 66 +++ .../bucket/SignificantTermsTests.java | 161 +++++++ 17 files changed, 2389 insertions(+) create mode 100644 docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc create mode 100644 src/main/java/org/elasticsearch/search/aggregations/bucket/significant/BucketSignificancePriorityQueue.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTerms.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTerms.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTerms.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/bucket/significant/UnmappedSignificantTerms.java create mode 100644 src/main/java/org/elasticsearch/search/aggregations/bucket/significant/UnmappedSignificantTermsAggregator.java create mode 100644 src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsTests.java diff --git a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc new file mode 100644 index 0000000000000..e394e00a88c3e --- /dev/null +++ b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc @@ -0,0 +1,410 @@ +[[search-aggregations-bucket-significantterms-aggregation]] +=== Significant Terms 
+
+An aggregation that returns interesting or unusual occurrences of terms in a set.
+
+.Example use cases:
+* Suggesting "H5N1" when users search for "bird flu" in text
+* Identifying the merchant that is the "common point of compromise" from the transaction history of credit card owners reporting loss
+* Suggesting keywords relating to stock symbol $ATI for an automated news classifier
+* Spotting the fraudulent doctor who is diagnosing more than his fair share of whiplash injuries
+* Spotting the tire manufacturer who has a disproportionate number of blow-outs
+
+In all these cases the terms being selected are not simply the most popular terms in a set.
+They are the terms that have undergone a significant change in popularity measured between a _foreground_ and _background_ set.
+If the term "H5N1" only exists in 5 documents in a 10 million document index and yet is found in 4 of the 100 documents that make up a user's search results,
+that is significant and probably very relevant to their search. 5/10,000,000 vs 4/100 is a big swing in frequency.
+
+==== Single-set analysis
+
+In the simplest case, the _foreground_ set of interest is the search results matched by a query and the _background_
+set used for statistical comparisons is the index or indices from which the results were gathered.
+
+Example:
+
+[source,js]
+--------------------------------------------------
+{
+    "query" : {
+        "query_string" : {"query":"British AND Transport AND Police"}
+    },
+    "aggregations" : {
+        "significantCrimeTypes" : {
+            "significant_terms" : { "field" : "crimeType" }
+        }
+    }
+}
+--------------------------------------------------
+
+Response:
+
+[source,js]
+--------------------------------------------------
+{
+    ...
+
+    "aggregations" : {
+        "significantCrimeTypes" : {
+            "subset_size": 47347,
+            "superset_size": 5064554,
+            "buckets" : [
+                {
+                    "key": "Bicycle theft",
+                    "doc_count": 3640,
+                    "significance_score": 0.371235374214817,
+                    "superset_doc_freq": 66799
+                }
+                ...
+            ]
+        }
+    }
+}
+--------------------------------------------------
+
+When querying an index of all crimes from all police forces, what these results show is that the British Transport Police force
+stands out as a force dealing with a disproportionately large number of bicycle thefts. Ordinarily, bicycle thefts represent only 1 in
+every 100 crimes, but for the British Transport Police, who handle crime on railways and stations, 7 in every 100 crimes is
+a bike theft. This is a significant seven-fold increase in frequency, and so this anomaly was highlighted as the top crime type.
+
+The problem with using a query to spot anomalies is that it only gives us one subset to use for comparisons.
+To discover the anomalies for all the other police forces we would have to repeat the query for each of them,
+which is a tedious way to look for unusual patterns in an index.
+
+==== Multi-set analysis
+
+A simpler way to perform analysis across multiple categories is to use a parent-level aggregation to segment the data ready for analysis.
+
+Example using a parent aggregation for segmentation:
+
+[source,js]
+--------------------------------------------------
+{
+    "aggregations": {
+        "forces": {
+            "terms": {"field": "force"},
+            "aggregations": {
+                "significantCrimeTypes": {
+                    "significant_terms": {"field": "crimeType"}
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+Response:
+
+[source,js]
+--------------------------------------------------
+{
+    ...
+
+    "aggregations": {
+        "forces": {
+            "buckets": [
+                {
+                    "key": "Metropolitan Police Service",
+                    "doc_count": 894038,
+                    "significantCrimeTypes": {
+                        "subset_size": 894038,
+                        "superset_size": 5064554,
+                        "buckets": [
+                            {
+                                "key": "Robbery",
+                                "doc_count": 27617,
+                                "significance_score": 0.0599,
+                                "superset_doc_freq": 53182
+                            },
+                            ...
+                        ]
+                    }
+                },
+                {
+                    "key": "British Transport Police",
+                    "doc_count": 47347,
+                    "significantCrimeTypes": {
+                        "subset_size": 47347,
+                        "superset_size": 5064554,
+                        "buckets": [
+                            {
+                                "key": "Bicycle theft",
+                                "doc_count": 3640,
+                                "significance_score": 0.371,
+                                "superset_doc_freq": 66799
+                            },
+                            ...
+                        ]
+                    }
+                }
+            ]
+        }
+    }
+}
+--------------------------------------------------
+
+Now we have anomaly detection for each of the police forces using a single request.
+
+We can use other forms of top-level aggregations to segment our data, for example segmenting by geographic
+area to identify unusual hot-spots of a particular crime type:
+
+[source,js]
+--------------------------------------------------
+{
+    "aggs": {
+        "hotspots": {
+            "geohash_grid": {
+                "field": "location",
+                "precision": 5
+            },
+            "aggs": {
+                "significantCrimeTypes": {
+                    "significant_terms": {"field": "crimeType"}
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+This example uses the `geohash_grid` aggregation to create result buckets that represent geographic areas, and inside each
+bucket we can identify anomalous levels of a crime type in these tightly-focused areas, e.g.
+
+* Airports exhibit unusual numbers of weapon confiscations
+* Universities show uplifts of bicycle thefts
+
+At a higher geohash_grid zoom-level with larger coverage areas we would start to see where an entire police force may be
+tackling an unusual volume of a particular crime type.
+
+Obviously a time-based top-level segmentation would help identify current trends for each point in time,
+where a simple `terms` aggregation would typically show the very popular "constants" that persist across all time slots.
+
+.What are the significance_scores?
+**********************************
+The numbers returned as scores are primarily intended for ranking different suggestions sensibly, rather than for being easily understood by end users.
+The scores are derived from the doc frequencies in the _foreground_ and _background_ sets. The _absolute_ change in popularity (foregroundPercent - backgroundPercent) would favour
+common terms whereas the _relative_ change in popularity (foregroundPercent / backgroundPercent) would favour rare terms.
+Rare vs common is essentially a precision vs recall balance and so the absolute and relative changes are multiplied to provide a sweet spot between precision and recall.
+**********************************
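+
+To make this blend concrete, the "Bicycle theft" numbers from the British Transport Police response above can be
+reproduced with the exact calculation implemented by this change's `getSampledTermSignificance` method (see
+`InternalSignificantTerms` below):
+
+[source,java]
+--------------------------------------------------
+// Worked example using the numbers from the British Transport Police response above.
+double subsetProbability = 3640.0 / 47347.0;        // ~0.0769 - share of this force's crimes
+double supersetProbability = 66799.0 / 5064554.0;   // ~0.0132 - share of all crimes in the index
+double absoluteChange = subsetProbability - supersetProbability; // ~0.0637, favours common terms
+double relativeChange = subsetProbability / supersetProbability; // ~5.83, favours rare terms
+double score = absoluteChange * relativeChange;     // ~0.371, the significance_score reported above
+--------------------------------------------------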
+
+
+=== Use on free-text fields
+
+The significant_terms aggregation can be used effectively on tokenized free-text fields to suggest:
+
+* keywords for refining end-user searches
+* keywords for use in percolator queries
+
+WARNING: Picking a free-text field as the subject of a significant terms analysis can be expensive! It will attempt
+to load every unique word into RAM. It is recommended to only use this on smaller indices. In future we may provide an option
+to load only a sample of top-matching documents and use their contents to count word frequencies on the fly.
+
+.Use the _"like this but not this"_ pattern
+**********************************
+You can spot mis-categorized content by first searching a structured field, e.g. `category:adultMovie`, and then using
+significant_terms on the free-text "movieDescription" field. Take the suggested words (I'll leave them to your imagination) and then search for all movies NOT marked as category:adultMovie but containing these keywords.
+You now have a ranked list of badly-categorized movies that you should reclassify or at least remove from the "familyFriendly" category.
+Using the `minimum_should_match` setting of the `terms` query with the keywords will help control the balance of precision/recall in the result set, i.e.
+a high setting would return a small number of relevant results packed full of keywords, while a setting of "1" would produce a more exhaustive result set with all documents containing _any_ keyword.
+**********************************
+
+.Hot tip: Show significant_terms in context
+**********************************
+Free-text significant_terms are much more easily understood when viewed in context. Take the results of `significant_terms` suggestions from a
+free-text field and use them in a `terms` query on the same field with a `highlight` clause to present users with example snippets of documents. When the terms
+are presented unstemmed, highlighted, with the right case, in the right order and with some context, their significance/meaning is more readily apparent.
+**********************************
+
+
+=== Limitations
+
+===== Single _background_ comparison base
+The above examples show how to select the _foreground_ set for analysis using a query or parent aggregation to filter, but currently there is no means of specifying
+a _background_ set other than the index from which all results are ultimately drawn. Sometimes it may prove useful to use a different
+background set as the basis for comparisons, e.g. to first select the tweets for the TV show "XFactor" and then look
+for significant terms in the subset of that content which is from this week.
+
+===== No scripts
+Unlike the `terms` aggregation, it is currently not possible to use script-generated terms for counting purposes.
+Because of the way the significant_terms aggregation must consider both _foreground_ and _background_ frequencies,
+it would be prohibitively expensive to use a script on the entire index to obtain background frequencies for comparisons.
+
+===== No analysis of floating point fields
+Floating point fields are currently not supported as the subject of significant_terms analysis.
+While integer or long fields can be used to represent concepts like bank account numbers or category numbers, which
+can be interesting to track, floating point fields are usually used to represent quantities of something.
+As such, individual floating point terms are not useful for this form of frequency analysis.
+
+===== Use as a parent aggregation
+If there is the equivalent of a `match_all` query, or no query criteria providing a subset of the index, the significant_terms aggregation should not be used as the
+top-most aggregation - in this scenario the _foreground_ set is exactly the same as the _background_ set and
+so there is no difference in document frequencies to observe and from which to make sensible suggestions.
+
+Another consideration is that the significant_terms aggregation produces many candidate results at shard level
+that are only later pruned on the reducing node once all statistics from all shards are merged. As a result,
+it can be inefficient and costly in terms of RAM to embed large child aggregations under a significant_terms
+aggregation that later discards many candidate terms.
+It is advisable in these cases to perform two searches - the first to provide a rationalized list of
+significant_terms and then add this shortlist of terms to a second query to go back and fetch the required child aggregations, as in the sketch below.
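+
+A minimal sketch of this two-search pattern using the Java API. The index and field names are illustrative, and the
+builder's `field` setter is assumed to follow the style of the other aggregation builders:
+
+[source,java]
+--------------------------------------------------
+// Phase 1: fetch only the significant terms - no expensive child aggregations yet.
+SearchResponse first = client.prepareSearch("crimes")
+        .setQuery(QueryBuilders.termQuery("force", "British Transport Police"))
+        .addAggregation(AggregationBuilders.significantTerms("sigCrimeTypes")
+                .field("crimeType"))
+        .execute().actionGet();
+
+// Collect the pruned shortlist of significant terms from the response.
+SignificantTerms sig = first.getAggregations().get("sigCrimeTypes");
+List<String> shortlist = new ArrayList<String>();
+for (SignificantTerms.Bucket bucket : sig.buckets()) {
+    shortlist.add(bucket.getKey());
+}
+
+// Phase 2: query just the shortlisted terms and attach the child aggregations here.
+SearchResponse second = client.prepareSearch("crimes")
+        .setQuery(QueryBuilders.termsQuery("crimeType", shortlist.toArray(new String[shortlist.size()])))
+        .addAggregation(AggregationBuilders.terms("crimeTypes").field("crimeType")
+                .subAggregation(AggregationBuilders.avg("avgOffenderAge").field("offenderAge")))
+        .execute().actionGet();
+--------------------------------------------------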
+
+===== Approximate counts
+The counts of how many documents contain a term, as provided in the results, are based on summing the samples returned from each shard and
+as such may be:
+
+* low, if certain shards did not provide figures for a given term in their top sample
+* high, when considering the background frequency, as it may count occurrences found in deleted documents
+
+Like most design decisions, this is the basis of a trade-off in which we have chosen to provide fast performance at the cost of some (typically small) inaccuracies.
+However, the `size` and `shard_size` settings covered in the next section provide tools to help control the accuracy levels.
+
+
+=== Parameters
+
+
+==== Size & Shard Size
+
+The `size` parameter can be set to define how many term buckets should be returned out of the overall terms list. By
+default, the node coordinating the search process will request each shard to provide its own top `size` term buckets
+and once all shards respond, it will reduce the results to the final list that will then be returned to the client.
+This means that if the number of unique terms is greater than `size`, the returned list is slightly off and not accurate
+(it could be that the term counts are slightly off and it could even be that a term that should have been in the top
+`size` buckets was not returned).
+
+The higher the requested `size` is, the more accurate the results will be, but also, the more expensive it will be to
+compute the final results (both due to bigger priority queues that are managed on a shard level and due to bigger data
+transfers between the nodes and the client).
+
+The `shard_size` parameter can be used to control the volume of candidate terms produced by each shard.
+Low-frequency terms can turn out to be the most interesting ones once all results are combined, so the
+significant_terms aggregation can produce higher-quality results when the `shard_size` parameter is set to
+values significantly higher than the `size` setting. This ensures that a bigger volume of promising candidate terms is given
+a consolidated review by the reducing node before the final selection. Obviously large candidate term lists
+will cause extra network traffic and RAM usage, so this is a quality/cost trade-off that needs to be balanced.
+
+NOTE: `shard_size` cannot be smaller than `size` (as it doesn't make much sense). When it is, Elasticsearch will
+      override it and reset it to be equal to `size`.
+
+==== Minimum document count
+
+It is possible to only return terms that match more than a configured number of hits using the `min_doc_count` option:
+
+[source,js]
+--------------------------------------------------
+{
+    "aggs" : {
+        "tags" : {
+            "significant_terms" : {
+                "field" : "tag",
+                "min_doc_count": 10
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+The above aggregation would only return tags which have been found in 10 hits or more. The default value is `3`.
+
+WARNING: Setting `min_doc_count` to `1` is generally not advised as it tends to return terms that
+         are typos or other bizarre curiosities. Finding more than one instance of a term helps
+         reinforce that, while still rare, the term was not the result of a one-off accident. The
+         default value of 3 is used to provide a minimum weight-of-evidence.
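+
+The same parameters can also be set through the Java API's `SignificantTermsBuilder` added in this change. A small
+sketch - the `size`, `shardSize` and `minDocCount` setters are assumed here to mirror the existing `TermsBuilder` and
+are not confirmed by this patch:
+
+[source,java]
+--------------------------------------------------
+// Assumed TermsBuilder-style setters; "crimes" is an illustrative index name.
+SearchRequestBuilder request = client.prepareSearch("crimes")
+        .addAggregation(AggregationBuilders.significantTerms("tags")
+                .field("tag")
+                .size(10)           // number of term buckets in the final reduced result
+                .shardSize(100)     // candidate buckets gathered from each shard before reduction
+                .minDocCount(10));  // drop terms matching fewer than 10 documents
+--------------------------------------------------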
+
+
+==== Filtering Values
+
+It is possible (although rarely required) to filter the values for which buckets will be created. This can be done using the `include` and
+`exclude` parameters which are based on regular expressions. This functionality mirrors the features
+offered by the `terms` aggregation.
+
+
+[source,js]
+--------------------------------------------------
+{
+    "aggs" : {
+        "tags" : {
+            "significant_terms" : {
+                "field" : "tags",
+                "include" : ".*sport.*",
+                "exclude" : "water_.*"
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+In the above example, buckets will be created for all the tags that have the word `sport` in them, except those starting
+with `water_` (so the tag `water_sports` will not be aggregated). The `include` regular expression will determine what
+values are "allowed" to be aggregated, while the `exclude` determines the values that should not be aggregated. When
+both are defined, the `exclude` has precedence, meaning the `include` is evaluated first and only then the `exclude`.
+
+The regular expressions are based on the Java(TM) http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html[Pattern],
+and as such, it is also possible to pass in flags that will determine how the compiled regular expression will work:
+
+[source,js]
+--------------------------------------------------
+{
+    "aggs" : {
+        "tags" : {
+            "significant_terms" : {
+                "field" : "tags",
+                "include" : {
+                    "pattern" : ".*sport.*",
+                    "flags" : "CANON_EQ|CASE_INSENSITIVE" <1>
+                },
+                "exclude" : {
+                    "pattern" : "water_.*",
+                    "flags" : "CANON_EQ|CASE_INSENSITIVE"
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+<1> the flags are concatenated using the `|` character as a separator
+
+The possible flags that can be used are:
+http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CANON_EQ[`CANON_EQ`],
+http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CASE_INSENSITIVE[`CASE_INSENSITIVE`],
+http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#COMMENTS[`COMMENTS`],
+http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#DOTALL[`DOTALL`],
+http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#LITERAL[`LITERAL`],
+http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#MULTILINE[`MULTILINE`],
+http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CASE[`UNICODE_CASE`],
+http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CHARACTER_CLASS[`UNICODE_CHARACTER_CLASS`] and
+http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNIX_LINES[`UNIX_LINES`]
+
+==== Execution hint
+
+There are two mechanisms by which terms aggregations can be executed: either by using field values directly in order to aggregate
+data per-bucket (`map`), or by using ordinals of the field values instead of the values themselves (`ordinals`). Although the
+latter execution mode can be expected to be slightly faster, it is only available for use when the underlying data source exposes
+those terms ordinals. Moreover, it may actually be slower if most field values are unique.
Elasticsearch tries to have sensible +defaults when it comes to the execution mode that should be used, but in case you know that an execution mode may perform better +than the other one, you have the ability to provide Elasticsearch with a hint: + +[source,js] +-------------------------------------------------- +{ + "aggs" : { + "tags" : { + "significant_terms" : { + "field" : "tags", + "execution_hint": "map" <1> + } + } + } +} +-------------------------------------------------- + +<1> the possible values are `map` and `ordinals` + +Please note that Elasticsearch will ignore this execution hint if it is not applicable. diff --git a/src/main/java/org/elasticsearch/search/aggregations/AggregationBuilders.java b/src/main/java/org/elasticsearch/search/aggregations/AggregationBuilders.java index be0ad1eaa4861..d5f45f9c99d5a 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/AggregationBuilders.java +++ b/src/main/java/org/elasticsearch/search/aggregations/AggregationBuilders.java @@ -29,6 +29,7 @@ import org.elasticsearch.search.aggregations.bucket.range.date.DateRangeBuilder; import org.elasticsearch.search.aggregations.bucket.range.geodistance.GeoDistanceBuilder; import org.elasticsearch.search.aggregations.bucket.range.ipv4.IPv4RangeBuilder; +import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsBuilder; import org.elasticsearch.search.aggregations.bucket.terms.TermsBuilder; import org.elasticsearch.search.aggregations.metrics.avg.AvgBuilder; import org.elasticsearch.search.aggregations.metrics.max.MaxBuilder; @@ -103,6 +104,10 @@ public static GeoHashGridBuilder geohashGrid(String name) { return new GeoHashGridBuilder(name); } + public static SignificantTermsBuilder significantTerms(String name) { + return new SignificantTermsBuilder(name); + } + public static DateHistogramBuilder dateHistogram(String name) { return new DateHistogramBuilder(name); } diff --git a/src/main/java/org/elasticsearch/search/aggregations/AggregationModule.java b/src/main/java/org/elasticsearch/search/aggregations/AggregationModule.java index 3475f3de9a1c4..66ca55fefc532 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/AggregationModule.java +++ b/src/main/java/org/elasticsearch/search/aggregations/AggregationModule.java @@ -32,6 +32,7 @@ import org.elasticsearch.search.aggregations.bucket.range.date.DateRangeParser; import org.elasticsearch.search.aggregations.bucket.range.geodistance.GeoDistanceParser; import org.elasticsearch.search.aggregations.bucket.range.ipv4.IpRangeParser; +import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsParser; import org.elasticsearch.search.aggregations.bucket.terms.TermsParser; import org.elasticsearch.search.aggregations.metrics.avg.AvgParser; import org.elasticsearch.search.aggregations.metrics.max.MaxParser; @@ -65,6 +66,7 @@ public AggregationModule() { parsers.add(MissingParser.class); parsers.add(FilterParser.class); parsers.add(TermsParser.class); + parsers.add(SignificantTermsParser.class); parsers.add(RangeParser.class); parsers.add(DateRangeParser.class); parsers.add(IpRangeParser.class); diff --git a/src/main/java/org/elasticsearch/search/aggregations/TransportAggregationModule.java b/src/main/java/org/elasticsearch/search/aggregations/TransportAggregationModule.java index b97145f357ddd..5a3089304f377 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/TransportAggregationModule.java +++ b/src/main/java/org/elasticsearch/search/aggregations/TransportAggregationModule.java @@ 
-30,6 +30,9 @@ import org.elasticsearch.search.aggregations.bucket.range.date.InternalDateRange; import org.elasticsearch.search.aggregations.bucket.range.geodistance.InternalGeoDistance; import org.elasticsearch.search.aggregations.bucket.range.ipv4.InternalIPv4Range; +import org.elasticsearch.search.aggregations.bucket.significant.SignificantLongTerms; +import org.elasticsearch.search.aggregations.bucket.significant.SignificantStringTerms; +import org.elasticsearch.search.aggregations.bucket.significant.UnmappedSignificantTerms; import org.elasticsearch.search.aggregations.bucket.terms.DoubleTerms; import org.elasticsearch.search.aggregations.bucket.terms.LongTerms; import org.elasticsearch.search.aggregations.bucket.terms.StringTerms; @@ -67,6 +70,9 @@ protected void configure() { InternalMissing.registerStreams(); StringTerms.registerStreams(); LongTerms.registerStreams(); + SignificantStringTerms.registerStreams(); + SignificantLongTerms.registerStreams(); + UnmappedSignificantTerms.registerStreams(); InternalGeoHashGrid.registerStreams(); DoubleTerms.registerStreams(); UnmappedTerms.registerStreams(); diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/BucketSignificancePriorityQueue.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/BucketSignificancePriorityQueue.java new file mode 100644 index 0000000000000..8cc51428c4988 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/BucketSignificancePriorityQueue.java @@ -0,0 +1,35 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.aggregations.bucket.significant; + +import org.apache.lucene.util.PriorityQueue; + +public class BucketSignificancePriorityQueue extends PriorityQueue { + + + public BucketSignificancePriorityQueue(int size) { + super(size); + } + + @Override + protected boolean lessThan(SignificantTerms.Bucket o1, SignificantTerms.Bucket o2) { + return o1.getSignificanceScore() < o2.getSignificanceScore(); + } +} diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java new file mode 100644 index 0000000000000..2a27b16353c38 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java @@ -0,0 +1,300 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. 
Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.bucket.significant; + +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import org.elasticsearch.cache.recycler.CacheRecycler; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Streamable; +import org.elasticsearch.common.xcontent.ToXContent; +import org.elasticsearch.search.aggregations.Aggregations; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.InternalAggregations; + +import java.io.IOException; +import java.util.*; + +/** + * + */ +public abstract class InternalSignificantTerms extends InternalAggregation implements SignificantTerms, ToXContent, Streamable { + + public static abstract class Bucket extends SignificantTerms.Bucket { + + long bucketOrd; + protected InternalAggregations aggregations; + double score; + + protected Bucket(long subsetDf, long subsetSize, long supersetDf, long supersetSize, InternalAggregations aggregations) { + super(subsetDf, subsetSize, supersetDf, supersetSize); + this.aggregations = aggregations; + assert subsetDf <= supersetDf; + updateScore(); + } + + @Override + public long getSubsetDf(){ + return subsetDf; + } + + @Override + public long getSupersetDf(){ + return supersetDf; + } + + @Override + public long getSupersetSize(){ + return supersetSize; + } + + @Override + public long getSubsetSize(){ + return subsetSize; + } + + /** + * Calculates the significance of a term in a sample against a background of + * normal distributions by comparing the changes in frequency. This is the heart + * of the significant terms feature. + * + * TODO - allow pluggable scoring implementations + * + * @param subsetFreq The frequency of the term in the selected sample + * @param subsetSize The size of the selected sample (typically number of docs) + * @param supersetFreq The frequency of the term in the superset from which the sample was taken + * @param supersetSize The size of the superset from which the sample was taken (typically number of docs) + * @return a "significance" score + */ + public static final double getSampledTermSignificance(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) { + if ((subsetSize == 0) || (supersetSize == 0)) { + // avoid any divide by zero issues + return 0; + } + + double subsetProbability = (double) subsetFreq / (double) subsetSize; + double supersetProbability = (double) supersetFreq / (double) supersetSize; + + // Using absoluteProbabilityChange alone favours very common words e.g. you, we etc + // because a doubling in popularity of a common term is a big percent difference + // whereas a rare term would have to achieve a hundred-fold increase in popularity to + // achieve the same difference measure. 
+ // In favouring common words as suggested features for search we would get high + // recall but low precision. + double absoluteProbabilityChange = subsetProbability - supersetProbability; + if (absoluteProbabilityChange <= 0) { + return 0; + } + // Using relativeProbabilityChange tends to favour rarer terms e.g.mis-spellings or + // unique URLs. + // A very low-probability term can very easily double in popularity due to the low + // numbers required to do so whereas a high-probability term would have to add many + // extra individual sightings to achieve the same shift. + // In favouring rare words as suggested features for search we would get high + // precision but low recall. + double relativeProbabilityChange = (subsetProbability / supersetProbability); + + // A blend of the above metrics - favours medium-rare terms to strike a useful + // balance between precision and recall. + double score = absoluteProbabilityChange * relativeProbabilityChange; + return score; + } + + public void updateScore() { + score=getSampledTermSignificance(subsetDf,subsetSize,supersetDf,supersetSize); + } + + @Override + public long getDocCount() { + return subsetDf; + } + + @Override + public Aggregations getAggregations() { + return aggregations; + } + + public Bucket reduce(List buckets, CacheRecycler cacheRecycler) { + if (buckets.size() == 1) { + return buckets.get(0); + } + Bucket reduced = null; + List aggregationsList = new ArrayList(buckets.size()); + for (Bucket bucket : buckets) { + if (reduced == null) { + reduced = bucket; + } else { + reduced.subsetDf += bucket.subsetDf; + reduced.supersetDf += bucket.supersetDf; + reduced.updateScore(); + } + aggregationsList.add(bucket.aggregations); + } + assert reduced.subsetDf <= reduced.supersetDf; + reduced.aggregations = InternalAggregations.reduce(aggregationsList, cacheRecycler); + return reduced; + } + + @Override + public double getSignificanceScore() { + return score; + } + } + + protected int requiredSize; + protected long minDocCount; + protected Collection buckets; + protected Map bucketMap; + protected long subsetSize; + protected long supersetSize; + + protected InternalSignificantTerms() {} // for serialization + + protected InternalSignificantTerms(long subsetSize, long supersetSize, String name, int requiredSize, long minDocCount, Collection buckets) { + super(name); + this.requiredSize = requiredSize; + this.minDocCount = minDocCount; + this.buckets = buckets; + this.subsetSize = subsetSize; + this.supersetSize = supersetSize; + } + + @Override + public Iterator iterator() { + Object o = buckets.iterator(); + return (Iterator) o; + } + + @Override + public Collection buckets() { + Object o = buckets; + return (Collection) o; + } + + @Override + public SignificantTerms.Bucket getByTerm(String term) { + if (bucketMap == null) { + bucketMap = Maps.newHashMapWithExpectedSize(buckets.size()); + for (Bucket bucket : buckets) { + bucketMap.put(bucket.getKey(), bucket); + } + } + return bucketMap.get(term); + } + + @Override + public InternalSignificantTerms reduce(ReduceContext reduceContext) { + List aggregations = reduceContext.aggregations(); + if (aggregations.size() == 1) { + InternalSignificantTerms terms = (InternalSignificantTerms) aggregations.get(0); + terms.trimExcessEntries(); + return terms; + } + InternalSignificantTerms reduced = null; + + long globalSubsetSize = 0; + long globalSupersetSize = 0; + // Compute the overall result set size and the corpus size using the + // top-level Aggregations from each shard + for 
(InternalAggregation aggregation : aggregations) { + InternalSignificantTerms terms = (InternalSignificantTerms) aggregation; + globalSubsetSize += terms.subsetSize; + globalSupersetSize += terms.supersetSize; + } + Map> buckets = null; + for (InternalAggregation aggregation : aggregations) { + InternalSignificantTerms terms = (InternalSignificantTerms) aggregation; + if (terms instanceof UnmappedSignificantTerms) { + continue; + } + if (reduced == null) { + reduced = terms; + } + if (buckets == null) { + buckets = new HashMap>(terms.buckets.size()); + } + for (Bucket bucket : terms.buckets) { + List existingBuckets = buckets.get(bucket.getKey()); + if (existingBuckets == null) { + existingBuckets = new ArrayList(aggregations.size()); + buckets.put(bucket.getKey(), existingBuckets); + } + // Adjust the buckets with the global stats representing the + // total size of the pots from which the stats are drawn + bucket.subsetSize = globalSubsetSize; + bucket.supersetSize = globalSupersetSize; + bucket.updateScore(); + existingBuckets.add(bucket); + } + } + + if (reduced == null) { + // there are only unmapped terms, so we just return the first one + // (no need to reduce) + return (UnmappedSignificantTerms) aggregations.get(0); + } + + final int size = Math.min(requiredSize, buckets.size()); + BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size); + for (Map.Entry> entry : buckets.entrySet()) { + List sameTermBuckets = entry.getValue(); + final Bucket b = sameTermBuckets.get(0).reduce(sameTermBuckets, reduceContext.cacheRecycler()); + if((b.score>0)&& (b.subsetDf >= minDocCount)) { + ordered.insertWithOverflow(b); + } + } + Bucket[] list = new Bucket[ordered.size()]; + for (int i = ordered.size() - 1; i >= 0; i--) { + list[i] = (Bucket) ordered.pop(); + } + reduced.buckets = Arrays.asList(list); + reduced.subsetSize = globalSubsetSize; + reduced.supersetSize = globalSupersetSize; + return reduced; + } + + final void trimExcessEntries() { + //TODO is this sorted in the desired order? + final List newBuckets = Lists.newArrayList(); + for (Bucket b : buckets) { + if (newBuckets.size() >= requiredSize) { + break; + } + if (b.subsetDf >= minDocCount) { + newBuckets.add(b); + } + } + buckets = newBuckets; + } + + // 0 actually means unlimited + protected static int readSize(StreamInput in) throws IOException { + final int size = in.readVInt(); + return size == 0 ? Integer.MAX_VALUE : size; + } + + protected static void writeSize(int size, StreamOutput out) throws IOException { + if (size == Integer.MAX_VALUE) { + size = 0; + } + out.writeVInt(size); + } + +} diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTerms.java new file mode 100644 index 0000000000000..a7ec384adff7a --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTerms.java @@ -0,0 +1,166 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.aggregations.bucket.significant; + +import com.google.common.primitives.Longs; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.text.StringText; +import org.elasticsearch.common.text.Text; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.aggregations.AggregationStreams; +import org.elasticsearch.search.aggregations.InternalAggregations; +import org.elasticsearch.search.aggregations.support.numeric.ValueFormatter; +import org.elasticsearch.search.aggregations.support.numeric.ValueFormatterStreams; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * + */ +public class SignificantLongTerms extends InternalSignificantTerms { + + public static final Type TYPE = new Type("significant_terms", "siglterms"); + + public static AggregationStreams.Stream STREAM = new AggregationStreams.Stream() { + @Override + public SignificantLongTerms readResult(StreamInput in) throws IOException { + SignificantLongTerms buckets = new SignificantLongTerms(); + buckets.readFrom(in); + return buckets; + } + }; + + public static void registerStreams() { + AggregationStreams.registerStream(STREAM, TYPE.stream()); + } + + + static class Bucket extends InternalSignificantTerms.Bucket { + + long term; + + public Bucket(long subsetDf, long subsetSize, long supersetDf, long supersetSize, long term, InternalAggregations aggregations) { + super(subsetDf, subsetSize, supersetDf, supersetSize, aggregations); + this.term = term; + } + + @Override + public Text getKeyAsText() { + return new StringText(String.valueOf(term)); + } + + @Override + public Number getKeyAsNumber() { + return term; + } + + @Override + int compareTerm(SignificantTerms.Bucket other) { + return Longs.compare(term, other.getKeyAsNumber().longValue()); + } + + @Override + public String getKey() { + return Long.toString(term); + } + + } + + private ValueFormatter valueFormatter; + + SignificantLongTerms() {} // for serialization + + public SignificantLongTerms(long subsetSize, long supersetSize, String name, ValueFormatter valueFormatter, int requiredSize, long minDocCount, Collection buckets) { + super(subsetSize, supersetSize,name, requiredSize, minDocCount, buckets); + this.valueFormatter = valueFormatter; + } + + @Override + public Type type() { + return TYPE; + } + + + @Override + public void readFrom(StreamInput in) throws IOException { + this.name = in.readString(); + this.valueFormatter = ValueFormatterStreams.readOptional(in); + this.requiredSize = readSize(in); + this.minDocCount = in.readVLong(); + this.subsetSize = in.readVLong(); + this.supersetSize = in.readVLong(); + + int size = in.readVInt(); + List buckets = new ArrayList(size); + for (int i = 0; i < size; i++) { + long subsetDf=in.readVLong(); + long supersetDf=in.readVLong(); + long term=in.readLong(); + buckets.add(new Bucket(subsetDf, subsetSize, supersetDf, + supersetSize, term, 
InternalAggregations.readAggregations(in))); + } + this.buckets = buckets; + this.bucketMap = null; + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeString(name); + ValueFormatterStreams.writeOptional(valueFormatter, out); + writeSize(requiredSize, out); + out.writeVLong(minDocCount); + out.writeVLong(subsetSize); + out.writeVLong(supersetSize); + out.writeVInt(buckets.size()); + for (InternalSignificantTerms.Bucket bucket : buckets) { + out.writeVLong(((Bucket) bucket).subsetDf); + out.writeVLong(((Bucket) bucket).supersetDf); + out.writeLong(((Bucket) bucket).term); + ((InternalAggregations) bucket.getAggregations()).writeTo(out); + } + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(name); + builder.field("subset_size", subsetSize); + builder.field("superset_size", supersetSize); + builder.startArray(CommonFields.BUCKETS); + for (InternalSignificantTerms.Bucket bucket : buckets) { + builder.startObject(); + builder.field(CommonFields.KEY, ((Bucket) bucket).term); + if (valueFormatter != null) { + builder.field(CommonFields.KEY_AS_STRING, valueFormatter.format(((Bucket) bucket).term)); + } + builder.field(CommonFields.DOC_COUNT, bucket.getDocCount()); + builder.field("significance_score", bucket.score); + builder.field("superset_doc_freq", bucket.supersetDf); + ((InternalAggregations) bucket.getAggregations()).toXContentInternal(builder, params); + builder.endObject(); + } + builder.endArray(); + builder.endObject(); + return builder; + } + +} diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java new file mode 100644 index 0000000000000..41521fbf43083 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java @@ -0,0 +1,155 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.aggregations.bucket.significant; + +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.elasticsearch.common.lease.Releasables; +import org.elasticsearch.index.fielddata.LongValues; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.bucket.BucketsAggregator; +import org.elasticsearch.search.aggregations.bucket.LongHash; +import org.elasticsearch.search.aggregations.support.AggregationContext; +import org.elasticsearch.search.aggregations.support.numeric.NumericValuesSource; +import org.elasticsearch.search.internal.ContextIndexSearcher; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; + +/** + * + */ +public class SignificantLongTermsAggregator extends BucketsAggregator { + + private final int requiredSize; + private final int shardSize; + private final long minDocCount; + private final NumericValuesSource valuesSource; + private final LongHash bucketOrds; + private LongValues values; + protected int numCollectedDocs; + private SignificantTermsAggregatorFactory termsAggFactory; + + + + public SignificantLongTermsAggregator(String name, AggregatorFactories factories, NumericValuesSource valuesSource, SignificantTermsAggregatorFactory termsAggFactory, + long estimatedBucketCount, int requiredSize, int shardSize, long minDocCount, AggregationContext aggregationContext, Aggregator parent) { + super(name, BucketAggregationMode.PER_BUCKET, factories, estimatedBucketCount, aggregationContext, parent); + this.valuesSource = valuesSource; + this.termsAggFactory = termsAggFactory; + this.requiredSize = requiredSize; + this.shardSize = shardSize; + this.minDocCount = minDocCount; + bucketOrds = new LongHash(estimatedBucketCount, aggregationContext.pageCacheRecycler()); + } + + @Override + public boolean shouldCollect() { + return true; + } + + @Override + public void setNextReader(AtomicReaderContext reader) { + values = valuesSource.longValues(); + } + + @Override + public void collect(int doc, long owningBucketOrdinal) throws IOException { + assert owningBucketOrdinal == 0; + numCollectedDocs++; + + final int valuesCount = values.setDocument(doc); + + for (int i = 0; i < valuesCount; ++i) { + final long val = values.nextValue(); + long bucketOrdinal = bucketOrds.add(val); + if (bucketOrdinal < 0) { // already seen + bucketOrdinal = - 1 - bucketOrdinal; + } + collectBucket(doc, bucketOrdinal); + } + } + + @Override + public SignificantLongTerms buildAggregation(long owningBucketOrdinal) { + assert owningBucketOrdinal == 0; + + final int size = (int) Math.min(bucketOrds.size(), shardSize); + + ContextIndexSearcher searcher = context.searchContext().searcher(); + IndexReader topReader = searcher.getIndexReader(); + int supersetSize = topReader.numDocs(); + int subsetSize = numCollectedDocs; + + BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size); + SignificantLongTerms.Bucket spare = null; + for (long i = 0; i < bucketOrds.capacity(); ++i) { + final long ord = bucketOrds.id(i); + if (ord < 0) { + // slot is not allocated + continue; + } + + if (spare == null) { + spare = new SignificantLongTerms.Bucket(0, 0, 0, 0, 0, null); + } + spare.term = bucketOrds.key(i); + spare.subsetDf = bucketDocCount(ord); + spare.subsetSize = subsetSize; + spare.supersetDf = termsAggFactory.getBackgroundFrequency(topReader, spare.term); + 
spare.supersetSize = supersetSize; + assert spare.subsetDf <= spare.supersetDf; + // During shard-local down-selection we use subset/superset stats + // that are for this shard only + // Back at the central reducer these properties will be updated with + // global stats + spare.updateScore(); + + spare.bucketOrd = ord; + spare = (SignificantLongTerms.Bucket) ordered.insertWithOverflow(spare); + } + + final InternalSignificantTerms.Bucket[] list = new InternalSignificantTerms.Bucket[ordered.size()]; + for (int i = ordered.size() - 1; i >= 0; --i) { + final SignificantLongTerms.Bucket bucket = (SignificantLongTerms.Bucket) ordered.pop(); + bucket.aggregations = bucketAggregations(bucket.bucketOrd); + list[i] = bucket; + } + return new SignificantLongTerms(subsetSize, supersetSize, name, valuesSource.formatter(), requiredSize, minDocCount, + Arrays.asList(list)); + } + + @Override + public SignificantLongTerms buildEmptyAggregation() { + // We need to account for the significance of a miss in our global stats + // - provide corpus size as context + ContextIndexSearcher searcher = context.searchContext().searcher(); + IndexReader topReader = searcher.getIndexReader(); + int supersetSize = topReader.numDocs(); + return new SignificantLongTerms(0, supersetSize, name, valuesSource.formatter(), requiredSize, minDocCount, Collections.emptyList()); + } + + @Override + public void doRelease() { + Releasables.release(bucketOrds); + } + +} diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTerms.java new file mode 100644 index 0000000000000..9fddcca1dae11 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTerms.java @@ -0,0 +1,165 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.aggregations.bucket.significant; + +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.bytes.BytesArray; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.text.BytesText; +import org.elasticsearch.common.text.Text; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.aggregations.AggregationStreams; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.InternalAggregations; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * + */ +public class SignificantStringTerms extends InternalSignificantTerms { + + public static final InternalAggregation.Type TYPE = new Type("significant_terms", "sigsterms"); + + public static AggregationStreams.Stream STREAM = new AggregationStreams.Stream() { + @Override + public SignificantStringTerms readResult(StreamInput in) throws IOException { + SignificantStringTerms buckets = new SignificantStringTerms(); + buckets.readFrom(in); + return buckets; + } + }; + + public static void registerStreams() { + AggregationStreams.registerStream(STREAM, TYPE.stream()); + } + + + public static class Bucket extends InternalSignificantTerms.Bucket { + + final BytesRef termBytes; + + + public Bucket(BytesRef term, long subsetDf, long subsetSize, long supersetDf, long supersetSize,InternalAggregations aggregations) { + super(subsetDf, subsetSize,supersetDf,supersetSize,aggregations); + this.termBytes = term; + } + + + @Override + public Text getKeyAsText() { + return new BytesText(new BytesArray(termBytes)); + } + + @Override + public Number getKeyAsNumber() { + // this method is needed for scripted numeric faceting + return Double.parseDouble(termBytes.utf8ToString()); + } + + @Override + int compareTerm(SignificantTerms.Bucket other) { + return BytesRef.getUTF8SortedAsUnicodeComparator().compare(termBytes, ((Bucket) other).termBytes); + } + + + @Override + public String getKey() { + return termBytes.utf8ToString(); + } + + } + + SignificantStringTerms() {} // for serialization + + public SignificantStringTerms(long subsetSize, long supersetSize,String name, int requiredSize, long minDocCount, Collection buckets) { + super(subsetSize, supersetSize, name, requiredSize, minDocCount, buckets); + } + + @Override + public Type type() { + return TYPE; + } + + @Override + public void readFrom(StreamInput in) throws IOException { + this.name = in.readString(); + this.requiredSize = readSize(in); + this.minDocCount = in.readVLong(); + this.subsetSize = in.readVLong(); + this.supersetSize = in.readVLong(); + int size = in.readVInt(); + List buckets = new ArrayList(size); + for (int i = 0; i < size; i++) { + BytesRef term = in.readBytesRef(); + long subsetDf= in.readVLong(); + long supersetDf= in.readVLong(); + buckets.add(new Bucket(term,subsetDf, subsetSize, supersetDf, + supersetSize, InternalAggregations.readAggregations(in))); + } + this.buckets = buckets; + this.bucketMap = null; + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeString(name); + writeSize(requiredSize, out); + out.writeVLong(minDocCount); + out.writeVLong(subsetSize); + out.writeVLong(supersetSize); + out.writeVInt(buckets.size()); + for (InternalSignificantTerms.Bucket bucket : buckets) { + out.writeBytesRef(((Bucket) bucket).termBytes); + 
out.writeVLong(((Bucket) bucket).subsetDf); + out.writeVLong(((Bucket) bucket).supersetDf); + ((InternalAggregations) bucket.getAggregations()).writeTo(out); + } + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(name); + builder.field("subset_size", subsetSize); + builder.field("superset_size", supersetSize); + builder.startArray(CommonFields.BUCKETS); + for (InternalSignificantTerms.Bucket bucket : buckets) { + //There is a condition (presumably when only one shard has a bucket?) where reduce is not called + // and I end up with buckets that contravene the user's min_doc_count criteria in my reducer + if(bucket.subsetDf>=minDocCount){ + builder.startObject(); + builder.field(CommonFields.KEY, ((Bucket) bucket).termBytes); + //TODO change reference to "doc" count/freq etc - we may be used to count freq of entities that are not docs + builder.field(CommonFields.DOC_COUNT, bucket.getDocCount()); + builder.field("significance_score", bucket.score); + builder.field("superset_doc_freq", bucket.supersetDf); + ((InternalAggregations) bucket.getAggregations()).toXContentInternal(builder, params); + builder.endObject(); + } + } + builder.endArray(); + builder.endObject(); + return builder; + } + +} diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java new file mode 100644 index 0000000000000..1cdcfc2ddebc8 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java @@ -0,0 +1,276 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.aggregations.bucket.significant; + +import com.google.common.collect.UnmodifiableIterator; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.lease.Releasables; +import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.common.util.LongArray; +import org.elasticsearch.index.fielddata.BytesValues; +import org.elasticsearch.index.fielddata.ordinals.Ordinals; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.bucket.BucketsAggregator; +import org.elasticsearch.search.aggregations.bucket.BytesRefHash; +//import org.elasticsearch.search.aggregations.bucket.significant.StringTerms.Bucket; +//import org.elasticsearch.search.aggregations.bucket.terms.Terms; +//import org.elasticsearch.search.aggregations.bucket.terms.support.BucketPriorityQueue; +import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude; +import org.elasticsearch.search.aggregations.support.AggregationContext; +import org.elasticsearch.search.aggregations.support.ValuesSource; +import org.elasticsearch.search.aggregations.support.bytes.BytesValuesSource; +import org.elasticsearch.search.internal.ContextIndexSearcher; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; + +/** + * An aggregator of significant string values. + */ +public class SignificantStringTermsAggregator extends BucketsAggregator { + + private final ValuesSource valuesSource; + private final int requiredSize; + private final int shardSize; + private final long minDocCount; + protected final BytesRefHash bucketOrds; + private final IncludeExclude includeExclude; + private BytesValues values; + protected int numCollectedDocs; + private SignificantTermsAggregatorFactory termsAggFactory; + + public SignificantStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource, SignificantTermsAggregatorFactory termsAggFactory, long estimatedBucketCount, + int requiredSize, int shardSize, long minDocCount, + IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent) { + + super(name, BucketAggregationMode.PER_BUCKET, factories, estimatedBucketCount, aggregationContext, parent); + this.valuesSource = valuesSource; + this.termsAggFactory = termsAggFactory; + this.requiredSize = requiredSize; + this.shardSize = shardSize; + this.minDocCount = minDocCount; + this.includeExclude = includeExclude; + bucketOrds = new BytesRefHash(estimatedBucketCount, aggregationContext.pageCacheRecycler()); + } + + @Override + public boolean shouldCollect() { + return true; + } + + @Override + public void setNextReader(AtomicReaderContext reader) { + values = valuesSource.bytesValues(); + } + + @Override + public void collect(int doc, long owningBucketOrdinal) throws IOException { + numCollectedDocs++; + assert owningBucketOrdinal == 0; + final int valuesCount = values.setDocument(doc); + + for (int i = 0; i < valuesCount; ++i) { + final BytesRef bytes = values.nextValue(); + if (includeExclude != null && !includeExclude.accept(bytes)) { + continue; + } + final int hash = values.currentValueHash(); + assert hash == bytes.hashCode(); + long bucketOrdinal = bucketOrds.add(bytes, hash); + if (bucketOrdinal < 0) { // already seen + bucketOrdinal = - 1 - 
bucketOrdinal; + } + //TODO this system of counting is only based on doc volumes. + // There are scenarios where count distinct of *entities* is + // required e.g. see https://docs.google.com/a/elasticsearch.com/presentation/d/17jkxrsmSq6Gpd2mKAIO4949jSCgkSIM3j2KIOQ8oEEI/edit?usp=sharing + // and the section on credit card fraud which involves counting + // unique payee references not volumes of docs + collectBucket(doc, bucketOrdinal); + } + } + + /** Returns an iterator over the field data terms. */ + private static Iterator terms(final BytesValues.WithOrdinals bytesValues, boolean reverse) { + final Ordinals.Docs ordinals = bytesValues.ordinals(); + if (reverse) { + return new UnmodifiableIterator() { + + long i = ordinals.getMaxOrd() - 1; + + @Override + public boolean hasNext() { + return i >= Ordinals.MIN_ORDINAL; + } + + @Override + public BytesRef next() { + bytesValues.getValueByOrd(i--); + return bytesValues.copyShared(); + } + + }; + } else { + return new UnmodifiableIterator() { + + long i = Ordinals.MIN_ORDINAL; + + @Override + public boolean hasNext() { + return i < ordinals.getMaxOrd(); + } + + @Override + public BytesRef next() { + bytesValues.getValueByOrd(i++); + return bytesValues.copyShared(); + } + + }; + } + } + + + + @Override + public SignificantStringTerms buildAggregation(long owningBucketOrdinal) { + assert owningBucketOrdinal == 0; + + final int size = (int) Math.min(bucketOrds.size(), shardSize); + + ContextIndexSearcher searcher = context.searchContext().searcher(); + IndexReader topReader = searcher.getIndexReader(); + int supersetSize = topReader.numDocs(); + int subsetSize = numCollectedDocs; + + BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size); + SignificantStringTerms.Bucket spare = null; + for (int i = 0; i < bucketOrds.size(); i++) { + if (spare == null) { + spare = new SignificantStringTerms.Bucket(new BytesRef(), 0, 0, 0, 0, null); + } + + bucketOrds.get(i, spare.termBytes); + spare.subsetDf = bucketDocCount(i); + spare.subsetSize = subsetSize; + spare.supersetDf = termsAggFactory.getBackgroundFrequency(topReader, spare.termBytes); + spare.supersetSize = supersetSize; + assert spare.subsetDf <= spare.supersetDf; + // During shard-local down-selection we use subset/superset stats + // that are for this shard only + // Back at the central reducer these properties will be updated with + // global stats + spare.updateScore(); + + spare.bucketOrd = i; + spare = (SignificantStringTerms.Bucket) ordered.insertWithOverflow(spare); + } + + final InternalSignificantTerms.Bucket[] list = new InternalSignificantTerms.Bucket[ordered.size()]; + for (int i = ordered.size() - 1; i >= 0; --i) { + final SignificantStringTerms.Bucket bucket = (SignificantStringTerms.Bucket) ordered.pop(); + bucket.aggregations = bucketAggregations(bucket.bucketOrd); + list[i] = bucket; + } + + return new SignificantStringTerms(subsetSize, supersetSize, name, requiredSize, minDocCount, Arrays.asList(list)); + } + + @Override + public SignificantStringTerms buildEmptyAggregation() { + // We need to account for the significance of a miss in our global stats + // - provide corpus size as context + ContextIndexSearcher searcher = context.searchContext().searcher(); + IndexReader topReader = searcher.getIndexReader(); + int supersetSize = topReader.numDocs(); + return new SignificantStringTerms(0, supersetSize, name, requiredSize, minDocCount, Collections. 
emptyList()); + } + + @Override + public void doRelease() { + Releasables.release(bucketOrds); + } + + /** + * Extension of SignificantStringTermsAggregator that caches bucket ords using terms ordinals. + */ + public static class WithOrdinals extends SignificantStringTermsAggregator { + + private final BytesValuesSource.WithOrdinals valuesSource; + private BytesValues.WithOrdinals bytesValues; + private Ordinals.Docs ordinals; + private LongArray ordinalToBucket; + + public WithOrdinals(String name, AggregatorFactories factories, BytesValuesSource.WithOrdinals valuesSource, SignificantTermsAggregatorFactory indexedFieldName, + long esitmatedBucketCount, int requiredSize, int shardSize, long minDocCount, AggregationContext aggregationContext, Aggregator parent) { + super(name, factories, valuesSource, indexedFieldName, esitmatedBucketCount, requiredSize, shardSize, minDocCount, null, aggregationContext, parent); + this.valuesSource = valuesSource; + } + + @Override + public void setNextReader(AtomicReaderContext reader) { + bytesValues = valuesSource.bytesValues(); + ordinals = bytesValues.ordinals(); + final long maxOrd = ordinals.getMaxOrd(); + if (ordinalToBucket == null || ordinalToBucket.size() < maxOrd) { + if (ordinalToBucket != null) { + ordinalToBucket.release(); + } + ordinalToBucket = BigArrays.newLongArray(BigArrays.overSize(maxOrd), context().pageCacheRecycler(), false); + } + ordinalToBucket.fill(0, maxOrd, -1L); + } + + @Override + public void collect(int doc, long owningBucketOrdinal) throws IOException { + assert owningBucketOrdinal == 0 : "this is a per_bucket aggregator"; + numCollectedDocs++; + final int valuesCount = ordinals.setDocument(doc); + + for (int i = 0; i < valuesCount; ++i) { + final long ord = ordinals.nextOrd(); + long bucketOrd = ordinalToBucket.get(ord); + if (bucketOrd < 0) { // unlikely condition on a low-cardinality + // field + final BytesRef bytes = bytesValues.getValueByOrd(ord); + final int hash = bytesValues.currentValueHash(); + assert hash == bytes.hashCode(); + bucketOrd = bucketOrds.add(bytes, hash); + if (bucketOrd < 0) { // already seen in another segment + bucketOrd = -1 - bucketOrd; + } + ordinalToBucket.set(ord, bucketOrd); + } + + collectBucket(doc, bucketOrd); + } + } + + @Override + public void doRelease() { + Releasables.release(bucketOrds, ordinalToBucket); + } + } + +} + diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTerms.java new file mode 100644 index 0000000000000..fdf343791d017 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTerms.java @@ -0,0 +1,76 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.elasticsearch.search.aggregations.bucket.significant;
+
+import org.elasticsearch.search.aggregations.Aggregation;
+import org.elasticsearch.search.aggregations.bucket.MultiBucketsAggregation;
+
+import java.util.Collection;
+
+/**
+ *
+ */
+public interface SignificantTerms extends Aggregation, Iterable<SignificantTerms.Bucket> {
+
+    static abstract class Bucket implements MultiBucketsAggregation.Bucket {
+
+        long subsetDf;
+        long subsetSize;
+        long supersetDf;
+        long supersetSize;
+
+        Bucket(long subsetDf, long subsetSize, long supersetDf, long supersetSize) {
+            this.subsetDf = subsetDf;
+            this.subsetSize = subsetSize;
+            this.supersetDf = supersetDf;
+            this.supersetSize = supersetSize;
+        }
+
+        public abstract Number getKeyAsNumber();
+
+        abstract int compareTerm(SignificantTerms.Bucket other);
+
+        public abstract double getSignificanceScore();
+
+        public long getSubsetDf() {
+            return subsetDf;
+        }
+
+        public long getSupersetDf() {
+            return supersetDf;
+        }
+
+        public long getSupersetSize() {
+            return supersetSize;
+        }
+
+        public long getSubsetSize() {
+            return subsetSize;
+        }
+
+    }
+
+    Collection<Bucket> buckets();
+
+    Bucket getByTerm(String term);
+
+}
diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java
new file mode 100644
index 0000000000000..6ab5e8b4c9960
--- /dev/null
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java
@@ -0,0 +1,181 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +package org.elasticsearch.search.aggregations.bucket.significant; + +import com.carrotsearch.hppc.ObjectObjectOpenHashMap; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.ElasticsearchException; +import org.elasticsearch.ElasticsearchIllegalArgumentException; +import org.elasticsearch.common.lucene.HashedBytesRef; +import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.search.aggregations.AggregationExecutionException; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.Aggregator.BucketAggregationMode; +//import org.elasticsearch.search.aggregations.bucket.terms.LongTermsAggregator; +//import org.elasticsearch.search.aggregations.bucket.terms.DoubleTermsAggregator; +//import org.elasticsearch.search.aggregations.bucket.terms.LongTermsAggregator; +//import org.elasticsearch.search.aggregations.bucket.terms.StringTermsAggregator; +import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude; +import org.elasticsearch.search.aggregations.support.AggregationContext; +import org.elasticsearch.search.aggregations.support.ValueSourceAggregatorFactory; +import org.elasticsearch.search.aggregations.support.ValuesSource; +import org.elasticsearch.search.aggregations.support.ValuesSourceConfig; +import org.elasticsearch.search.aggregations.support.bytes.BytesValuesSource; +import org.elasticsearch.search.aggregations.support.numeric.NumericValuesSource; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; + +/** + * + */ +public class SignificantTermsAggregatorFactory extends ValueSourceAggregatorFactory { + + public static final String EXECUTION_HINT_VALUE_MAP = "map"; + public static final String EXECUTION_HINT_VALUE_ORDINALS = "ordinals"; + + private final int requiredSize; + private final int shardSize; + private final long minDocCount; + private final IncludeExclude includeExclude; + private final String executionHint; + private final String indexedFieldName; + private FieldMapper mapper; + + public SignificantTermsAggregatorFactory(String name, ValuesSourceConfig valueSourceConfig, int requiredSize, int shardSize, long minDocCount, IncludeExclude includeExclude, String executionHint) { + super(name, SignificantStringTerms.TYPE.name(), valueSourceConfig); + this.requiredSize = requiredSize; + this.shardSize = shardSize; + this.minDocCount = minDocCount; + this.includeExclude = includeExclude; + this.executionHint = executionHint; + this.indexedFieldName=valuesSourceConfig.fieldContext().field(); + SearchContext context = SearchContext.current(); + mapper = context.smartNameFieldMapper(indexedFieldName); + + + + } + + @Override + protected Aggregator createUnmapped(AggregationContext aggregationContext, Aggregator parent) { + return new UnmappedSignificantTermsAggregator(name, requiredSize, minDocCount, aggregationContext, parent); + } + + private static boolean hasParentBucketAggregator(Aggregator parent) { + if (parent == null) { + return false; + } else if (parent.bucketAggregationMode() == BucketAggregationMode.PER_BUCKET) { + return true; + } else { + return hasParentBucketAggregator(parent.parent()); + } + } + + @Override + protected Aggregator create(ValuesSource valuesSource, long expectedBucketsCount, AggregationContext aggregationContext, Aggregator parent) { + long estimatedBucketCount = valuesSource.metaData().maxAtomicUniqueValuesCount(); + if 
(estimatedBucketCount < 0) {
+            // there isn't an estimate available - 50 should be a good start
+            estimatedBucketCount = 50;
+        }
+
+        // Add an upper bound to the estimate as some atomic field data implementations (e.g. future binary doc
+        // values) are not going to know their exact cardinality and will return upper bounds from
+        // AtomicFieldData.getNumberUniqueValues() that may be largely over-estimated. The value chosen here is
+        // arbitrary, just to play nice with typical CPU caches.
+        //
+        // Another reason is that it may be faster to resize upon growth than to start directly with the
+        // appropriate size, and not all values are necessarily visited by the matching documents anyway.
+        estimatedBucketCount = Math.min(estimatedBucketCount, 512);
+
+        if (valuesSource instanceof BytesValuesSource) {
+            if (executionHint != null && !executionHint.equals(EXECUTION_HINT_VALUE_MAP) && !executionHint.equals(EXECUTION_HINT_VALUE_ORDINALS)) {
+                throw new ElasticsearchIllegalArgumentException("execution_hint can only be '" + EXECUTION_HINT_VALUE_MAP + "' or '" + EXECUTION_HINT_VALUE_ORDINALS + "', not " + executionHint);
+            }
+            String execution = executionHint;
+            if (!(valuesSource instanceof BytesValuesSource.WithOrdinals)) {
+                execution = EXECUTION_HINT_VALUE_MAP;
+            } else if (includeExclude != null) {
+                execution = EXECUTION_HINT_VALUE_MAP;
+            }
+            if (execution == null) {
+                if ((valuesSource instanceof BytesValuesSource.WithOrdinals)
+                        && !hasParentBucketAggregator(parent)) {
+                    execution = EXECUTION_HINT_VALUE_ORDINALS;
+                } else {
+                    execution = EXECUTION_HINT_VALUE_MAP;
+                }
+            }
+            assert execution != null;
+
+            if (execution.equals(EXECUTION_HINT_VALUE_ORDINALS)) {
+                assert includeExclude == null;
+                return new SignificantStringTermsAggregator.WithOrdinals(name, factories, (BytesValuesSource.WithOrdinals) valuesSource, this, estimatedBucketCount, requiredSize, shardSize, minDocCount, aggregationContext, parent);
+            } else {
+                return new SignificantStringTermsAggregator(name, factories, valuesSource, this, estimatedBucketCount, requiredSize, shardSize, minDocCount, includeExclude, aggregationContext, parent);
+            }
+        }
+
+        if (includeExclude != null) {
+            throw new AggregationExecutionException("Aggregation [" + name + "] cannot support the include/exclude " +
+                    "settings as it can only be applied to string values");
+        }
+
+        if (valuesSource instanceof NumericValuesSource) {
+            if (((NumericValuesSource) valuesSource).isFloatingPoint()) {
+                throw new UnsupportedOperationException("No support for examining floating point numerics");
+            }
+            return new SignificantLongTermsAggregator(name, factories, (NumericValuesSource) valuesSource, this, estimatedBucketCount, requiredSize, shardSize, minDocCount, aggregationContext, parent);
+        }
+
+        throw new AggregationExecutionException("significant_terms aggregation cannot be applied to field [" + valuesSourceConfig.fieldContext().field() +
+                "]. It can only be applied to numeric or string fields.");
+    }
+
+    // Cache used to avoid multiple aggs hitting IndexReaders for docFreq info for the same term
+    final ObjectObjectOpenHashMap<HashedBytesRef, Integer> cachedDocFreqs = new ObjectObjectOpenHashMap<HashedBytesRef, Integer>();
+    HashedBytesRef spare = new HashedBytesRef();
+
+    // Many child aggs may ask for the same docFreq information so cache docFreq values for these terms
+    public long getBackgroundFrequency(IndexReader topReader, BytesRef termBytes) {
+        spare.reset(termBytes, termBytes.hashCode());
+        Integer result = cachedDocFreqs.get(spare);
+        if (result == null) {
+            try {
+                result = topReader.docFreq(new Term(indexedFieldName, termBytes));
+                HashedBytesRef key = new HashedBytesRef(BytesRef.deepCopyOf(termBytes), spare.hash);
+                cachedDocFreqs.put(key, result);
+            } catch (IOException e) {
+                throw new ElasticsearchException("IOException reading document frequency", e);
+            }
+        }
+        return result;
+    }
+
+    // Numeric terms are converted to their indexed BytesRef form before the cached docFreq lookup above
+    public long getBackgroundFrequency(IndexReader topReader, long term) {
+        BytesRef indexedVal = mapper.indexedValueForSearch(term);
+        return getBackgroundFrequency(topReader, indexedVal);
+    }
+
+}
diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java
new file mode 100644
index 0000000000000..074a201f026b2
--- /dev/null
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.search.aggregations.bucket.significant;
+
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.search.aggregations.AggregationBuilder;
+
+import java.io.IOException;
+
+/**
+ * Builder for the significant_terms aggregation.
+ */
+public class SignificantTermsBuilder extends AggregationBuilder<SignificantTermsBuilder> {
+
+    private String field;
+    private int requiredSize = SignificantTermsParser.DEFAULT_REQUIRED_SIZE;
+    private int shardSize = SignificantTermsParser.DEFAULT_SHARD_SIZE;
+    private int minDocCount = SignificantTermsParser.DEFAULT_MIN_DOC_COUNT;
+
+    public SignificantTermsBuilder(String name) {
+        //TODO what if chosen field type is long not string? would this work?
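+        // Note: the type name passed to super() below is only the aggregation's request name
+        // ("significant_terms"); SignificantTermsParser resolves string vs numeric handling from
+        // the field mapping at parse time, so long fields should work through this builder too.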
+        super(name, SignificantStringTerms.TYPE.name());
+    }
+
+    public SignificantTermsBuilder field(String field) {
+        this.field = field;
+        return this;
+    }
+
+    public SignificantTermsBuilder size(int requiredSize) {
+        this.requiredSize = requiredSize;
+        return this;
+    }
+
+    public SignificantTermsBuilder shardSize(int shardSize) {
+        this.shardSize = shardSize;
+        return this;
+    }
+
+    public SignificantTermsBuilder minDocCount(int minDocCount) {
+        this.minDocCount = minDocCount;
+        return this;
+    }
+
+    @Override
+    protected XContentBuilder internalXContent(XContentBuilder builder, Params params) throws IOException {
+        builder.startObject();
+        if (field != null) {
+            builder.field("field", field);
+        }
+        if (minDocCount != SignificantTermsParser.DEFAULT_MIN_DOC_COUNT) {
+            builder.field("min_doc_count", minDocCount);
+        }
+        if (requiredSize != SignificantTermsParser.DEFAULT_REQUIRED_SIZE) {
+            builder.field("size", requiredSize);
+        }
+        if (shardSize != SignificantTermsParser.DEFAULT_SHARD_SIZE) {
+            builder.field("shard_size", shardSize);
+        }
+
+        return builder.endObject();
+    }
+
+}
diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java
new file mode 100644
index 0000000000000..0cb3cd795a052
--- /dev/null
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java
@@ -0,0 +1,211 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +package org.elasticsearch.search.aggregations.bucket.significant; + +import org.elasticsearch.common.regex.Regex; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.index.fielddata.IndexFieldData; +import org.elasticsearch.index.fielddata.IndexNumericFieldData; +import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.index.mapper.core.DateFieldMapper; +import org.elasticsearch.index.mapper.ip.IpFieldMapper; +import org.elasticsearch.search.SearchParseException; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.AggregatorFactory; +import org.elasticsearch.search.aggregations.bucket.BucketUtils; +import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude; +import org.elasticsearch.search.aggregations.support.FieldContext; +import org.elasticsearch.search.aggregations.support.ValuesSourceConfig; +import org.elasticsearch.search.aggregations.support.bytes.BytesValuesSource; +import org.elasticsearch.search.aggregations.support.numeric.NumericValuesSource; +import org.elasticsearch.search.aggregations.support.numeric.ValueFormatter; +import org.elasticsearch.search.aggregations.support.numeric.ValueParser; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.regex.Pattern; + +/** + * + */ +public class SignificantTermsParser implements Aggregator.Parser { + + @Override + public String type() { + return SignificantStringTerms.TYPE.name(); + } + + public static final int DEFAULT_REQUIRED_SIZE=10; + public static final int DEFAULT_SHARD_SIZE=0; + //Typically need more than one occurrence of something for it to be statistically significant + public static final int DEFAULT_MIN_DOC_COUNT = 3; + + @Override + public AggregatorFactory parse(String aggregationName, XContentParser parser, SearchContext context) throws IOException { + + String field = null; + int requiredSize = DEFAULT_REQUIRED_SIZE; + int shardSize = DEFAULT_SHARD_SIZE; + String format = null; + String include = null; + int includeFlags = 0; // 0 means no flags + String exclude = null; + int excludeFlags = 0; // 0 means no flags + String executionHint = null; + long minDocCount = DEFAULT_MIN_DOC_COUNT; + + XContentParser.Token token; + String currentFieldName = null; + while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { + if (token == XContentParser.Token.FIELD_NAME) { + currentFieldName = parser.currentName(); + } else if (token == XContentParser.Token.VALUE_STRING) { + if ("field".equals(currentFieldName)) { + field = parser.text(); + } else if ("include".equals(currentFieldName)) { + include = parser.text(); + } else if ("exclude".equals(currentFieldName)) { + exclude = parser.text(); + } else if ("execution_hint".equals(currentFieldName) || "executionHint".equals(currentFieldName)) { + executionHint = parser.text(); + } else { + throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "]."); + } + } else if (token == XContentParser.Token.VALUE_NUMBER) { + if ("size".equals(currentFieldName)) { + requiredSize = parser.intValue(); + } else if ("shard_size".equals(currentFieldName) || "shardSize".equals(currentFieldName)) { + shardSize = parser.intValue(); + } else if ("min_doc_count".equals(currentFieldName) || "minDocCount".equals(currentFieldName)) { + minDocCount = parser.intValue(); + } else { + throw new SearchParseException(context, "Unknown key for a " 
+ token + " in [" + aggregationName + "]: [" + currentFieldName + "]."); + } + } else if (token == XContentParser.Token.START_OBJECT) { + if ("include".equals(currentFieldName)) { + while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { + if (token == XContentParser.Token.FIELD_NAME) { + currentFieldName = parser.currentName(); + } else if (token == XContentParser.Token.VALUE_STRING) { + if ("pattern".equals(currentFieldName)) { + include = parser.text(); + } else if ("flags".equals(currentFieldName)) { + includeFlags = Regex.flagsFromString(parser.text()); + } + } else if (token == XContentParser.Token.VALUE_NUMBER) { + if ("flags".equals(currentFieldName)) { + includeFlags = parser.intValue(); + } + } + } + } else if ("exclude".equals(currentFieldName)) { + while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { + if (token == XContentParser.Token.FIELD_NAME) { + currentFieldName = parser.currentName(); + } else if (token == XContentParser.Token.VALUE_STRING) { + if ("pattern".equals(currentFieldName)) { + exclude = parser.text(); + } else if ("flags".equals(currentFieldName)) { + excludeFlags = Regex.flagsFromString(parser.text()); + } + } else if (token == XContentParser.Token.VALUE_NUMBER) { + if ("flags".equals(currentFieldName)) { + excludeFlags = parser.intValue(); + } + } + } + } else { + throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "]."); + } + } else { + throw new SearchParseException(context, "Unexpected token " + token + " in [" + aggregationName + "]."); + } + } + + if (shardSize == DEFAULT_SHARD_SIZE) { + //The user has not made a shardSize selection . + //Use default heuristic to avoid any wrong-ranking caused by distributed counting + //but request double the usual amount. + //We typically need more than the number of "top" terms requested by other aggregations + //as the significance algorithm is in less of a position to down-select at shard-level - + //some of the things we want to find have only one occurrence on each shard and as + // such are impossible to differentiate from non-significant terms at that early stage. + shardSize = 2 * BucketUtils.suggestShardSideQueueSize(requiredSize, context.numberOfShards()); + + } + + // shard_size cannot be smaller than size as we need to at least fetch entries from every shards in order to return + if (shardSize < requiredSize) { + shardSize = requiredSize; + } + + IncludeExclude includeExclude = null; + if (include != null || exclude != null) { + Pattern includePattern = include != null ? Pattern.compile(include, includeFlags) : null; + Pattern excludePattern = exclude != null ? Pattern.compile(exclude, excludeFlags) : null; + includeExclude = new IncludeExclude(includePattern, excludePattern); + } + + + FieldMapper mapper = context.smartNameFieldMapper(field); + if (mapper == null) { + ValuesSourceConfig config = new ValuesSourceConfig(BytesValuesSource.class); + config.unmapped(true); + return new SignificantTermsAggregatorFactory(aggregationName, config, requiredSize, shardSize, minDocCount, includeExclude, executionHint); + } + IndexFieldData indexFieldData = context.fieldData().getForField(mapper); + + ValuesSourceConfig config; + + if (mapper instanceof DateFieldMapper) { + DateFieldMapper dateMapper = (DateFieldMapper) mapper; + ValueFormatter formatter = format == null ? 
+ new ValueFormatter.DateTime(dateMapper.dateTimeFormatter()) : + new ValueFormatter.DateTime(format); + config = new ValuesSourceConfig(NumericValuesSource.class) + .formatter(formatter) + .parser(new ValueParser.DateMath(dateMapper.dateMathParser())); + + } else if (mapper instanceof IpFieldMapper) { + config = new ValuesSourceConfig(NumericValuesSource.class) + .formatter(ValueFormatter.IPv4) + .parser(ValueParser.IPv4); + + } else if (indexFieldData instanceof IndexNumericFieldData) { + config = new ValuesSourceConfig(NumericValuesSource.class); + if (format != null) { + config.formatter(new ValueFormatter.Number.Pattern(format)); + } + + } else { + config = new ValuesSourceConfig(BytesValuesSource.class); + // TODO: it will make sense to set false instead here if the aggregator factory uses + // ordinals instead of hash tables + config.needsHashes(true); + } + + config.fieldContext(new FieldContext(field, indexFieldData)); + // We need values to be unique to be able to run terms aggs efficiently + config.ensureUnique(true); + + return new SignificantTermsAggregatorFactory(aggregationName, config, requiredSize, shardSize, minDocCount, includeExclude, executionHint); + } + + +} diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/UnmappedSignificantTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/UnmappedSignificantTerms.java new file mode 100644 index 0000000000000..683c6048b5247 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/UnmappedSignificantTerms.java @@ -0,0 +1,91 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.aggregations.bucket.significant; + +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.search.aggregations.AggregationStreams; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.Map; + +/** + * + */ +public class UnmappedSignificantTerms extends InternalSignificantTerms { + + public static final Type TYPE = new Type("significant_terms", "umsigterms"); + + private static final Collection BUCKETS = Collections.emptyList(); + private static final Map BUCKETS_MAP = Collections.emptyMap(); + + public static AggregationStreams.Stream STREAM = new AggregationStreams.Stream() { + @Override + public UnmappedSignificantTerms readResult(StreamInput in) throws IOException { + UnmappedSignificantTerms buckets = new UnmappedSignificantTerms(); + buckets.readFrom(in); + return buckets; + } + }; + + public static void registerStreams() { + AggregationStreams.registerStream(STREAM, TYPE.stream()); + } + + UnmappedSignificantTerms() {} // for serialization + + public UnmappedSignificantTerms(String name, int requiredSize, long minDocCount) { + //We pass zero for index/subset sizes because for the purpose of significant term analysis + // we assume an unmapped index's size is irrelevant to the proceedings. + super(0,0,name, requiredSize, minDocCount, BUCKETS); + } + + @Override + public Type type() { + return TYPE; + } + + @Override + public void readFrom(StreamInput in) throws IOException { + this.name = in.readString(); + this.requiredSize = readSize(in); + this.minDocCount = in.readVLong(); + this.buckets = BUCKETS; + this.bucketMap = BUCKETS_MAP; + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeString(name); + writeSize(requiredSize, out); + out.writeVLong(minDocCount); + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(name); + builder.startArray(CommonFields.BUCKETS).endArray(); + builder.endObject(); + return builder; + } + +} diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/UnmappedSignificantTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/UnmappedSignificantTermsAggregator.java new file mode 100644 index 0000000000000..c607d13e4321f --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/UnmappedSignificantTermsAggregator.java @@ -0,0 +1,66 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.aggregations.bucket.significant; + +import org.apache.lucene.index.AtomicReaderContext; +import org.elasticsearch.search.aggregations.Aggregator; +import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.InternalAggregation; +import org.elasticsearch.search.aggregations.support.AggregationContext; + +import java.io.IOException; + +/** + * + */ +public class UnmappedSignificantTermsAggregator extends Aggregator { + + private final int requiredSize; + private final long minDocCount; + + public UnmappedSignificantTermsAggregator(String name, int requiredSize, long minDocCount, AggregationContext aggregationContext, Aggregator parent) { + super(name, BucketAggregationMode.PER_BUCKET, AggregatorFactories.EMPTY, 0, aggregationContext, parent); + this.requiredSize = requiredSize; + this.minDocCount = minDocCount; + } + + @Override + public boolean shouldCollect() { + return false; + } + + @Override + public void setNextReader(AtomicReaderContext reader) { + } + + @Override + public void collect(int doc, long owningBucketOrdinal) throws IOException { + } + + @Override + public InternalAggregation buildAggregation(long owningBucketOrdinal) { + assert owningBucketOrdinal == 0; + return new UnmappedSignificantTerms(name, requiredSize, minDocCount); + } + + @Override + public InternalAggregation buildEmptyAggregation() { + return new UnmappedSignificantTerms(name, requiredSize, minDocCount); + } +} diff --git a/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsTests.java b/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsTests.java new file mode 100644 index 0000000000000..8a9b34eddbf2b --- /dev/null +++ b/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsTests.java @@ -0,0 +1,161 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.aggregations.bucket; + +import org.elasticsearch.action.admin.indices.refresh.RefreshRequest; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.action.search.SearchType; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.query.TermQueryBuilder; +import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms; +import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms.Bucket; +import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsBuilder; +import org.elasticsearch.test.ElasticsearchIntegrationTest; +import org.junit.Before; +import org.junit.Test; + +import java.util.HashMap; + +import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS; +import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS; +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; + +/** + * + */ +public class SignificantTermsTests extends ElasticsearchIntegrationTest { + + @Override + public Settings indexSettings() { + return ImmutableSettings.builder() + .put("index.number_of_shards", between(1, 5)) + .put("index.number_of_replicas", between(0, 1)) + .build(); + } + + public static final int MUSIC_CATEGORY=1; + public static final int OTHER_CATEGORY=2; + public static final int SNOWBOARDING_CATEGORY=3; + + @Before + public void init() throws Exception { + assertAcked(prepareCreate("test").setSettings(SETTING_NUMBER_OF_SHARDS, 5, SETTING_NUMBER_OF_REPLICAS, 0) + .addMapping("fact", jsonBuilder() + .startObject() + .startObject("fact") + .startObject("_routing") + .field("required", true) + .field("path", "routingID") + .endObject() + .startObject("properties") + .startObject("routingID") + .field("type", "string") + .field("index", "not_analyzed") + .endObject() + .startObject("factCategory") + .field("type", "integer") + .field("index", "not_analyzed") + .endObject() + .startObject("Description") + .field("type", "string") + .field("index", "analyzed") + .endObject() + .endObject() + .endObject() + .endObject())); + ensureGreen(); + String data[]= { + "A\t1\tpaul weller was lead singer of the jam before the style council", + "B\t1\tpaul weller left the jam to form the style council", + "A\t2\tpaul smith is a designer in the fashion industry", + "B\t1\tthe stranglers are a group originally from guildford", + "A\t1\tafter disbanding the style council in 1985 paul weller became a solo artist", + "B\t1\tjean jaques burnel is a bass player in the stranglers and has a black belt in karate", + "A\t1\tmalcolm owen was the lead singer of the ruts", + "B\t1\tpaul weller has denied any possibility of a reunion of the jam", + "A\t1\tformer frontman of the jam paul weller became the father of twins", + "B\t2\tex-england football star paul gascoigne has re-emerged following recent disappearance", + "A\t2\tdavid smith has recently denied connections with the mafia", + "B\t1\tthe damned's new rose single was considered the first 'punk' single in the UK", + "A\t1\tthe sex pistols broke up after a few short years together", + "B\t1\tpaul gascoigne was a midfielder for england football team", + "A\t3\tcraig kelly became the first world champion snowboarder and has a memorial at baldface lodge", + "B\t3\tterje haakonsen has credited craig kelly 
as his snowboard mentor", + "A\t3\tterje haakonsen and craig kelly were some of the first snowboarders sponsored by burton snowboards", + "B\t3\tlike craig kelly before him terje won the mt baker banked slalom many times - once riding switch", + "A\t3\tterje haakonsen has been a team rider for burton snowboards for over 20 years" + }; + + for (int i = 0; i < data.length; i++) { + String[] parts=data[i].split("\t"); + client().prepareIndex("test", "fact", ""+i).setSource("routingID", parts[0], + "factCategory", parts[1], + "Description", parts[2] + ).get(); + } + client().admin().indices().refresh(new RefreshRequest("test")).get(); + } + + + @Test + public void textAnalysis() throws Exception { + SearchResponse response = client().prepareSearch("test") + .setSearchType(SearchType.QUERY_AND_FETCH) + .setQuery(new TermQueryBuilder("_all", "terje")) + .setFrom(0).setSize(60).setExplain(true) + .addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("Description") + .minDocCount(2) + ) + .execute() + .actionGet(); + SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); + HashMaptopWords=new HashMap(); + for (Bucket topTerm : topTerms ){ + topWords.put(topTerm.getKey(),topTerm); + } + assertTrue( topWords.containsKey("haakonsen")); + assertTrue( topWords.containsKey("craig")); + assertTrue( topWords.containsKey("kelly")); + assertTrue( topWords.containsKey("burton")); + assertTrue( topWords.containsKey("snowboards")); + Bucket kellyTerm=topWords.get("kelly"); + assertEquals(3, kellyTerm.getSubsetDf()); + assertEquals(4, kellyTerm.getSupersetDf()); + } + + @Test + public void structuredAnalysis() throws Exception { + SearchResponse response = client().prepareSearch("test") + .setSearchType(SearchType.QUERY_AND_FETCH) + .setQuery(new TermQueryBuilder("_all", "terje")) + .setFrom(0).setSize(60).setExplain(true) + .addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("factCategory") + .minDocCount(2) + ) + .execute() + .actionGet(); + SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); + Number topCategory = topTerms.buckets().iterator().next().getKeyAsNumber(); + assertTrue(topCategory.equals(new Long(SNOWBOARDING_CATEGORY))); + } + + +} From 08efb7e5e8a0f565f0282695e7962c47549fb4ad Mon Sep 17 00:00:00 2001 From: markharwood Date: Tue, 18 Feb 2014 13:47:18 +0000 Subject: [PATCH 2/8] Updated following @jpountz review: docs tidy up and made InternalSignificantTerms use new readSize and writeSize methods in base class. 
Also added support and tests for unmapped indices, --- .../significantterms-aggregation.asciidoc | 127 +++++++++--------- .../significant/InternalSignificantTerms.java | 16 --- .../SignificantTermsAggregatorFactory.java | 12 +- .../bucket/SignificantTermsTests.java | 71 +++++++--- 4 files changed, 125 insertions(+), 101 deletions(-) diff --git a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc index e394e00a88c3e..4c3a0353ef6e6 100644 --- a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc +++ b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc @@ -25,15 +25,14 @@ Example: [source,js] -------------------------------------------------- { - "query" : - { - "query_string" : {"query":"British AND Transport AND Police"} - }, - "aggregations" : { - "significantCrimeTypes" : { - "significant_terms" : { "field" : "crimeType" } - } - } + "query" : { + "terms" : {"force" : [ "British Transport Police" ]} + }, + "aggregations" : { + "significantCrimeTypes" : { + "significant_terms" : { "field" : "crimeType" } + } + } } -------------------------------------------------- @@ -46,16 +45,16 @@ Response: "aggregations" : { "significantCrimeTypes" : { - "subset_size": 47347, - "superset_size": 5064554, - "buckets" : [ - { - "key": "Bicycle theft", - "doc_count": 3640, - "significance_score": 0.371235374214817, - "superset_doc_freq": 66799 - } - ... + "subset_size": 47347, + "superset_size": 5064554, + "buckets" : [ + { + "key": "Bicycle theft", + "doc_count": 3640, + "significance_score": 0.371235374214817, + "superset_doc_freq": 66799 + } + ... ] } } @@ -63,7 +62,7 @@ Response: -------------------------------------------------- When querying an index of all crimes from all police forces, what these results show is that the British Transport Police force -stand out as a force dealing with a disproportionately large number of bicycle thefts. Ordinarily, bicycle thefts represent only one 1 in +stand out as a force dealing with a disproportionately large number of bicycle thefts. Ordinarily, bicycle thefts represent only 1 in every 100 crimes but for the British Transport Police, who handle crime on railways and stations, 7 in every 100 crimes is a bike theft. This is a significant seven-fold increase in frequency and so this anomaly was highlighted as the top crime type. @@ -104,44 +103,44 @@ Response: ... "aggregations": { - "forces": { - "buckets": [ - { - "key": "Metropolitan Police Service", - "doc_count": 894038, - "significantCrimeTypes": { - "subset_size": 894038, - "superset_size": 5064554, - "buckets": [ - { - "key": "Robbery", - "doc_count": 27617, - "significance_score": 0.0599, - "superset_doc_freq": 53182 - }, - ... - ] - } - }, - { - "key": "British Transport Police", - "doc_count": 47347, - "significantCrimeTypes": { - "subset_size": 47347, - "superset_size": 5064554, - "buckets": [ - { - "key": "Bicycle theft", - "doc_count": 3640, - "significance_score": 0.371, - "superset_doc_freq": 66799 - }, - ... - ] - } - } - ] - } + "forces": { + "buckets": [ + { + "key": "Metropolitan Police Service", + "doc_count": 894038, + "significantCrimeTypes": { + "subset_size": 894038, + "superset_size": 5064554, + "buckets": [ + { + "key": "Robbery", + "doc_count": 27617, + "significance_score": 0.0599, + "superset_doc_freq": 53182 + }, + ... 
+ ] + } + }, + { + "key": "British Transport Police", + "doc_count": 47347, + "significantCrimeTypes": { + "subset_size": 47347, + "superset_size": 5064554, + "buckets": [ + { + "key": "Bicycle theft", + "doc_count": 3640, + "significance_score": 0.371, + "superset_doc_freq": 66799 + }, + ... + ] + } + } + ] + } } -------------------------------------------------- @@ -213,8 +212,11 @@ to load only a sample of top-matching documents and use their contents to count You can spot mis-categorized content by first searching a structured field e.g. `category:adultMovie` and use significant_terms on the free-text "movieDescription" field. Take the suggested words (I'll leave them to your imagination) and then search for all movies NOT marked as category:adultMovie but containing these keywords. You now have a ranked list of badly-categorized movies that you should reclassify or at least remove from the "familyFriendly" category. + +The significance score from each term can also provide a useful `boost` setting to sort matches. Using the `minimum_should_match` setting of the `terms` query with the keywords will help control the balance of precision/recall in the result set i.e a high setting would have a small number of relevant results packed full of keywords and a setting of "1" would produce a more exhaustive results set with all documents containing _any_ keyword. + ********************************** .Hot tip: Show significant_terms in context @@ -233,10 +235,11 @@ a _background_ set other than the index from which all results are ultimately dr background set as the basis for comparisons e.g. to first select the tweets for the TV show "XFactor" and then look for significant terms in a subset of that content which is from this week. -===== No scripts +===== Significant terms must be indexed values Unlike the terms aggregation it is currently not possible to use script-generated terms for counting purposes. Because of the way the significant_terms aggregation must consider both _foreground_ and _background_ frequencies it would be prohibitively expensive to use a script on the entire index to obtain background frequencies for comparisons. +Also DocValues are not supported as sources of term data for similar reasons. ===== No analysis of floating point fields Floating point fields are currently not supported as the subject of significant_terms analysis. @@ -313,9 +316,9 @@ It is possible to only return terms that match more than a configured number of The above aggregation would only return tags which have been found in 10 hits or more. Default value is `3`. WARNING: Setting `min_doc_count` to `1` is generally not advised as it tends to return terms that - are typos or other bizarre curiosities. Finding more than one instance of a term helps - reinforce that, while still rare, the term was not the result of a one-off accident. The - default value of 3 is used to provide a minimum weight-of-evidence. + are typos or other bizarre curiosities. Finding more than one instance of a term helps + reinforce that, while still rare, the term was not the result of a one-off accident. The + default value of 3 is used to provide a minimum weight-of-evidence. 
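+
+For reference, the same constraint can be expressed through the Java API added in this
+patch - a minimal sketch (the index, field and client names here are illustrative only):
+
+[source,java]
+--------------------------------------------------
+// Only return tags observed in at least 10 of the matching documents
+SearchResponse response = client.prepareSearch("stackoverflow")
+        .setQuery(new TermQueryBuilder("_all", "kibana"))
+        .addAggregation(new SignificantTermsBuilder("significantTags")
+                .field("tags")
+                .minDocCount(10))
+        .execute().actionGet();
+--------------------------------------------------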
==== Filtering Values diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java index 2a27b16353c38..c9b3f57bbd5d2 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java @@ -21,15 +21,12 @@ import com.google.common.collect.Lists; import com.google.common.collect.Maps; import org.elasticsearch.cache.recycler.CacheRecycler; -import org.elasticsearch.common.io.stream.StreamInput; -import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Streamable; import org.elasticsearch.common.xcontent.ToXContent; import org.elasticsearch.search.aggregations.Aggregations; import org.elasticsearch.search.aggregations.InternalAggregation; import org.elasticsearch.search.aggregations.InternalAggregations; -import java.io.IOException; import java.util.*; /** @@ -284,17 +281,4 @@ final void trimExcessEntries() { buckets = newBuckets; } - // 0 actually means unlimited - protected static int readSize(StreamInput in) throws IOException { - final int size = in.readVInt(); - return size == 0 ? Integer.MAX_VALUE : size; - } - - protected static void writeSize(int size, StreamOutput out) throws IOException { - if (size == Integer.MAX_VALUE) { - size = 0; - } - out.writeVInt(size); - } - } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java index 6ab5e8b4c9960..4c27a8501a91b 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java @@ -57,7 +57,7 @@ public class SignificantTermsAggregatorFactory extends ValueSourceAggregatorFact private final long minDocCount; private final IncludeExclude includeExclude; private final String executionHint; - private final String indexedFieldName; + private String indexedFieldName; private FieldMapper mapper; public SignificantTermsAggregatorFactory(String name, ValuesSourceConfig valueSourceConfig, int requiredSize, int shardSize, long minDocCount, IncludeExclude includeExclude, String executionHint) { @@ -67,12 +67,10 @@ public SignificantTermsAggregatorFactory(String name, ValuesSourceConfig valueSo this.minDocCount = minDocCount; this.includeExclude = includeExclude; this.executionHint = executionHint; - this.indexedFieldName=valuesSourceConfig.fieldContext().field(); - SearchContext context = SearchContext.current(); - mapper = context.smartNameFieldMapper(indexedFieldName); - - - + if (!valueSourceConfig.unmapped()) { + this.indexedFieldName = valuesSourceConfig.fieldContext().field(); + mapper = SearchContext.current().smartNameFieldMapper(indexedFieldName); + } } @Override diff --git a/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsTests.java b/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsTests.java index 8a9b34eddbf2b..2a94068004bf6 100644 --- a/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsTests.java +++ 
b/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsTests.java @@ -37,6 +37,7 @@ import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS; import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; +import static org.hamcrest.Matchers.equalTo; /** * @@ -81,6 +82,9 @@ public void init() throws Exception { .endObject() .endObject() .endObject())); + + createIndex("idx_unmapped"); + ensureGreen(); String data[]= { "A\t1\tpaul weller was lead singer of the jam before the style council", @@ -114,6 +118,36 @@ public void init() throws Exception { client().admin().indices().refresh(new RefreshRequest("test")).get(); } + @Test + public void structuredAnalysis() throws Exception { + SearchResponse response = client().prepareSearch("test") + .setSearchType(SearchType.QUERY_AND_FETCH) + .setQuery(new TermQueryBuilder("_all", "terje")) + .setFrom(0).setSize(60).setExplain(true) + .addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("factCategory") + .minDocCount(2) + ) + .execute() + .actionGet(); + SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); + Number topCategory = topTerms.buckets().iterator().next().getKeyAsNumber(); + assertTrue(topCategory.equals(new Long(SNOWBOARDING_CATEGORY))); + } + + @Test + public void unmapped() throws Exception { + SearchResponse response = client().prepareSearch("idx_unmapped") + .setSearchType(SearchType.QUERY_AND_FETCH) + .setQuery(new TermQueryBuilder("_all", "terje")) + .setFrom(0).setSize(60).setExplain(true) + .addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("factCategory") + .minDocCount(2) + ) + .execute() + .actionGet(); + SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); + assertThat(topTerms.buckets().size(), equalTo(0)); + } @Test public void textAnalysis() throws Exception { @@ -127,6 +161,26 @@ public void textAnalysis() throws Exception { .execute() .actionGet(); SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); + checkExpectedStringTermsFound(topTerms); + } + + @Test + public void partiallyUnmapped() throws Exception { + SearchResponse response = client().prepareSearch("idx_unmapped","test") + .setSearchType(SearchType.QUERY_AND_FETCH) + .setQuery(new TermQueryBuilder("_all", "terje")) + .setFrom(0).setSize(60).setExplain(true) + .addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("Description") + .minDocCount(2) + ) + .execute() + .actionGet(); + SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); + checkExpectedStringTermsFound(topTerms); + } + + + private void checkExpectedStringTermsFound(SignificantTerms topTerms) { HashMaptopWords=new HashMap(); for (Bucket topTerm : topTerms ){ topWords.put(topTerm.getKey(),topTerm); @@ -140,22 +194,7 @@ public void textAnalysis() throws Exception { assertEquals(3, kellyTerm.getSubsetDf()); assertEquals(4, kellyTerm.getSupersetDf()); } - - @Test - public void structuredAnalysis() throws Exception { - SearchResponse response = client().prepareSearch("test") - .setSearchType(SearchType.QUERY_AND_FETCH) - .setQuery(new TermQueryBuilder("_all", "terje")) - .setFrom(0).setSize(60).setExplain(true) - .addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("factCategory") - .minDocCount(2) - ) - .execute() - .actionGet(); - SignificantTerms topTerms = 
response.getAggregations().get("mySignificantTerms"); - Number topCategory = topTerms.buckets().iterator().next().getKeyAsNumber(); - assertTrue(topCategory.equals(new Long(SNOWBOARDING_CATEGORY))); - } + } From b224a17f903419ef3011a915173e5857fa000da7 Mon Sep 17 00:00:00 2001 From: markharwood Date: Wed, 19 Feb 2014 16:05:02 +0000 Subject: [PATCH 3/8] Changed Significant(Long/String)TermsAggregator classes to inherit from (Long/String)TermsAggregators, changed visibility of member variables to allow for this. Some minor documentation changes --- .../significantterms-aggregation.asciidoc | 7 +- .../significant/InternalSignificantTerms.java | 3 +- .../SignificantLongTermsAggregator.java | 59 ++------- .../SignificantStringTermsAggregator.java | 119 +++--------------- .../SignificantTermsAggregatorFactory.java | 6 +- .../significant/SignificantTermsBuilder.java | 3 +- .../bucket/terms/LongTermsAggregator.java | 15 +-- .../bucket/terms/StringTermsAggregator.java | 11 +- 8 files changed, 45 insertions(+), 178 deletions(-) diff --git a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc index 4c3a0353ef6e6..b8d62f6c5969a 100644 --- a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc +++ b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc @@ -62,8 +62,8 @@ Response: -------------------------------------------------- When querying an index of all crimes from all police forces, what these results show is that the British Transport Police force -stand out as a force dealing with a disproportionately large number of bicycle thefts. Ordinarily, bicycle thefts represent only 1 in -every 100 crimes but for the British Transport Police, who handle crime on railways and stations, 7 in every 100 crimes is +stand out as a force dealing with a disproportionately large number of bicycle thefts. Ordinarily, bicycle thefts represent only 1% of crimes (66799/5064554) +but for the British Transport Police, who handle crime on railways and stations, 7% of crimes (3640/47347) is a bike theft. This is a significant seven-fold increase in frequency and so this anomaly was highlighted as the top crime type. The problem with using a query to spot anomalies is it only gives us one subset to use for comparisons. @@ -204,8 +204,7 @@ The significant_terms aggregation can be used effectively on tokenized free-text * keywords for use in percolator queries WARNING: Picking a free-text field as the subject of a significant terms analysis can be expensive! It will attempt -to load every unique word into RAM. It is recommended to only use this on smaller indices. In future we may provide an option -to load only a sample of top-matching documents and use their contents to count word frequencies on-the-fly +to load every unique word into RAM. It is recommended to only use this on smaller indices. 
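+
+As a sketch of this free-text use case via the Java API (the index, field and client names
+are illustrative only), the discovered keywords and their scores can be read back as follows:
+
+[source,java]
+--------------------------------------------------
+// Surface significant keywords from a free-text field for the matching docs
+SearchResponse response = client.prepareSearch("movies")
+        .setQuery(new TermQueryBuilder("movieDescription", "zombie"))
+        .addAggregation(new SignificantTermsBuilder("keywords")
+                .field("movieDescription"))
+        .execute().actionGet();
+SignificantTerms keywords = response.getAggregations().get("keywords");
+for (SignificantTerms.Bucket bucket : keywords) {
+    // subset/superset document frequencies underpin the significance score
+    System.out.println(bucket.getKey() + " " + bucket.getSignificanceScore());
+}
+--------------------------------------------------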
.Use the _"like this but not this"_ pattern ********************************** diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java index c9b3f57bbd5d2..d0fa0b17e1e3e 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java @@ -253,7 +253,7 @@ public InternalSignificantTerms reduce(ReduceContext reduceContext) { for (Map.Entry> entry : buckets.entrySet()) { List sameTermBuckets = entry.getValue(); final Bucket b = sameTermBuckets.get(0).reduce(sameTermBuckets, reduceContext.cacheRecycler()); - if((b.score>0)&& (b.subsetDf >= minDocCount)) { + if ((b.score > 0) && (b.subsetDf >= minDocCount)) { ordered.insertWithOverflow(b); } } @@ -268,7 +268,6 @@ public InternalSignificantTerms reduce(ReduceContext reduceContext) { } final void trimExcessEntries() { - //TODO is this sorted in the desired order? final List newBuckets = Lists.newArrayList(); for (Bucket b : buckets) { if (newBuckets.size() >= requiredSize) { diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java index 41521fbf43083..89beb73d66bff 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java @@ -18,14 +18,10 @@ */ package org.elasticsearch.search.aggregations.bucket.significant; -import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; -import org.elasticsearch.common.lease.Releasables; -import org.elasticsearch.index.fielddata.LongValues; import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.AggregatorFactories; -import org.elasticsearch.search.aggregations.bucket.BucketsAggregator; -import org.elasticsearch.search.aggregations.bucket.LongHash; +import org.elasticsearch.search.aggregations.bucket.terms.LongTermsAggregator; import org.elasticsearch.search.aggregations.support.AggregationContext; import org.elasticsearch.search.aggregations.support.numeric.NumericValuesSource; import org.elasticsearch.search.internal.ContextIndexSearcher; @@ -37,55 +33,22 @@ /** * */ -public class SignificantLongTermsAggregator extends BucketsAggregator { - - private final int requiredSize; - private final int shardSize; - private final long minDocCount; - private final NumericValuesSource valuesSource; - private final LongHash bucketOrds; - private LongValues values; - protected int numCollectedDocs; - private SignificantTermsAggregatorFactory termsAggFactory; - - +public class SignificantLongTermsAggregator extends LongTermsAggregator { - public SignificantLongTermsAggregator(String name, AggregatorFactories factories, NumericValuesSource valuesSource, SignificantTermsAggregatorFactory termsAggFactory, - long estimatedBucketCount, int requiredSize, int shardSize, long minDocCount, AggregationContext aggregationContext, Aggregator parent) { - super(name, BucketAggregationMode.PER_BUCKET, factories, estimatedBucketCount, aggregationContext, parent); - this.valuesSource = valuesSource; + public 
SignificantLongTermsAggregator(String name, AggregatorFactories factories, NumericValuesSource valuesSource, + long estimatedBucketCount, int requiredSize, int shardSize, long minDocCount, + AggregationContext aggregationContext, Aggregator parent,SignificantTermsAggregatorFactory termsAggFactory) { + super(name, factories, valuesSource, estimatedBucketCount, null, requiredSize, shardSize, minDocCount, aggregationContext, parent); this.termsAggFactory = termsAggFactory; - this.requiredSize = requiredSize; - this.shardSize = shardSize; - this.minDocCount = minDocCount; - bucketOrds = new LongHash(estimatedBucketCount, aggregationContext.pageCacheRecycler()); } - @Override - public boolean shouldCollect() { - return true; - } - - @Override - public void setNextReader(AtomicReaderContext reader) { - values = valuesSource.longValues(); - } + protected int numCollectedDocs; + private SignificantTermsAggregatorFactory termsAggFactory; @Override public void collect(int doc, long owningBucketOrdinal) throws IOException { - assert owningBucketOrdinal == 0; + super.collect(doc,owningBucketOrdinal); numCollectedDocs++; - - final int valuesCount = values.setDocument(doc); - - for (int i = 0; i < valuesCount; ++i) { - final long val = values.nextValue(); - long bucketOrdinal = bucketOrds.add(val); - if (bucketOrdinal < 0) { // already seen - bucketOrdinal = - 1 - bucketOrdinal; - } - collectBucket(doc, bucketOrdinal); - } } @Override @@ -147,9 +110,5 @@ public SignificantLongTerms buildEmptyAggregation() { return new SignificantLongTerms(0, supersetSize, name, valuesSource.formatter(), requiredSize, minDocCount, Collections.emptyList()); } - @Override - public void doRelease() { - Releasables.release(bucketOrds); - } } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java index 1cdcfc2ddebc8..a9463c1b9e93c 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java @@ -18,7 +18,6 @@ */ package org.elasticsearch.search.aggregations.bucket.significant; -import com.google.common.collect.UnmodifiableIterator; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.BytesRef; @@ -29,11 +28,7 @@ import org.elasticsearch.index.fielddata.ordinals.Ordinals; import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.AggregatorFactories; -import org.elasticsearch.search.aggregations.bucket.BucketsAggregator; -import org.elasticsearch.search.aggregations.bucket.BytesRefHash; -//import org.elasticsearch.search.aggregations.bucket.significant.StringTerms.Bucket; -//import org.elasticsearch.search.aggregations.bucket.terms.Terms; -//import org.elasticsearch.search.aggregations.bucket.terms.support.BucketPriorityQueue; +import org.elasticsearch.search.aggregations.bucket.terms.StringTermsAggregator; import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude; import org.elasticsearch.search.aggregations.support.AggregationContext; import org.elasticsearch.search.aggregations.support.ValuesSource; @@ -43,115 +38,29 @@ import java.io.IOException; import java.util.Arrays; import java.util.Collections; -import java.util.Iterator; /** * An 
aggregator of significant string values. */ -public class SignificantStringTermsAggregator extends BucketsAggregator { +public class SignificantStringTermsAggregator extends StringTermsAggregator { - private final ValuesSource valuesSource; - private final int requiredSize; - private final int shardSize; - private final long minDocCount; - protected final BytesRefHash bucketOrds; - private final IncludeExclude includeExclude; - private BytesValues values; protected int numCollectedDocs; private SignificantTermsAggregatorFactory termsAggFactory; - - public SignificantStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource, SignificantTermsAggregatorFactory termsAggFactory, long estimatedBucketCount, - int requiredSize, int shardSize, long minDocCount, - IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent) { - - super(name, BucketAggregationMode.PER_BUCKET, factories, estimatedBucketCount, aggregationContext, parent); - this.valuesSource = valuesSource; - this.termsAggFactory = termsAggFactory; - this.requiredSize = requiredSize; - this.shardSize = shardSize; - this.minDocCount = minDocCount; - this.includeExclude = includeExclude; - bucketOrds = new BytesRefHash(estimatedBucketCount, aggregationContext.pageCacheRecycler()); - } - - @Override - public boolean shouldCollect() { - return true; - } - - @Override - public void setNextReader(AtomicReaderContext reader) { - values = valuesSource.bytesValues(); + + public SignificantStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource, + long estimatedBucketCount, int requiredSize, int shardSize, long minDocCount, + IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory) { + super(name, factories, valuesSource, estimatedBucketCount, null, requiredSize, shardSize, minDocCount, includeExclude, aggregationContext, + parent); + this.termsAggFactory=termsAggFactory; } @Override public void collect(int doc, long owningBucketOrdinal) throws IOException { + super.collect(doc,owningBucketOrdinal); numCollectedDocs++; - assert owningBucketOrdinal == 0; - final int valuesCount = values.setDocument(doc); - - for (int i = 0; i < valuesCount; ++i) { - final BytesRef bytes = values.nextValue(); - if (includeExclude != null && !includeExclude.accept(bytes)) { - continue; - } - final int hash = values.currentValueHash(); - assert hash == bytes.hashCode(); - long bucketOrdinal = bucketOrds.add(bytes, hash); - if (bucketOrdinal < 0) { // already seen - bucketOrdinal = - 1 - bucketOrdinal; - } - //TODO this system of counting is only based on doc volumes. - // There are scenarios where count distinct of *entities* is - // required e.g. see https://docs.google.com/a/elasticsearch.com/presentation/d/17jkxrsmSq6Gpd2mKAIO4949jSCgkSIM3j2KIOQ8oEEI/edit?usp=sharing - // and the section on credit card fraud which involves counting - // unique payee references not volumes of docs - collectBucket(doc, bucketOrdinal); - } - } - - /** Returns an iterator over the field data terms. 
*/ - private static Iterator terms(final BytesValues.WithOrdinals bytesValues, boolean reverse) { - final Ordinals.Docs ordinals = bytesValues.ordinals(); - if (reverse) { - return new UnmodifiableIterator() { - - long i = ordinals.getMaxOrd() - 1; - - @Override - public boolean hasNext() { - return i >= Ordinals.MIN_ORDINAL; - } - - @Override - public BytesRef next() { - bytesValues.getValueByOrd(i--); - return bytesValues.copyShared(); - } - - }; - } else { - return new UnmodifiableIterator() { - - long i = Ordinals.MIN_ORDINAL; - - @Override - public boolean hasNext() { - return i < ordinals.getMaxOrd(); - } - - @Override - public BytesRef next() { - bytesValues.getValueByOrd(i++); - return bytesValues.copyShared(); - } - - }; - } } - - - + @Override public SignificantStringTerms buildAggregation(long owningBucketOrdinal) { assert owningBucketOrdinal == 0; @@ -221,9 +130,9 @@ public static class WithOrdinals extends SignificantStringTermsAggregator { private Ordinals.Docs ordinals; private LongArray ordinalToBucket; - public WithOrdinals(String name, AggregatorFactories factories, BytesValuesSource.WithOrdinals valuesSource, SignificantTermsAggregatorFactory indexedFieldName, - long esitmatedBucketCount, int requiredSize, int shardSize, long minDocCount, AggregationContext aggregationContext, Aggregator parent) { - super(name, factories, valuesSource, indexedFieldName, esitmatedBucketCount, requiredSize, shardSize, minDocCount, null, aggregationContext, parent); + public WithOrdinals(String name, AggregatorFactories factories, BytesValuesSource.WithOrdinals valuesSource, + long esitmatedBucketCount, int requiredSize, int shardSize, long minDocCount, AggregationContext aggregationContext, Aggregator parent,SignificantTermsAggregatorFactory termsAggFactory) { + super(name, factories, valuesSource, esitmatedBucketCount, requiredSize, shardSize, minDocCount, null, aggregationContext, parent, termsAggFactory); this.valuesSource = valuesSource; } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java index 4c27a8501a91b..173f1bb645d9d 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java @@ -126,9 +126,9 @@ protected Aggregator create(ValuesSource valuesSource, long expectedBucketsCount if (execution.equals(EXECUTION_HINT_VALUE_ORDINALS)) { assert includeExclude == null; - return new SignificantStringTermsAggregator.WithOrdinals(name, factories, (BytesValuesSource.WithOrdinals) valuesSource, this, estimatedBucketCount, requiredSize, shardSize, minDocCount, aggregationContext, parent); + return new SignificantStringTermsAggregator.WithOrdinals(name, factories, (BytesValuesSource.WithOrdinals) valuesSource, estimatedBucketCount, requiredSize, shardSize, minDocCount, aggregationContext, parent, this ); } else { - return new SignificantStringTermsAggregator(name, factories, valuesSource, this, estimatedBucketCount, requiredSize, shardSize, minDocCount, includeExclude, aggregationContext, parent); + return new SignificantStringTermsAggregator(name, factories, valuesSource, estimatedBucketCount, requiredSize, shardSize, minDocCount, includeExclude, aggregationContext, parent, this); } } @@ -142,7 +142,7 @@ protected Aggregator 
create(ValuesSource valuesSource, long expectedBucketsCount if (((NumericValuesSource) valuesSource).isFloatingPoint()) { throw new UnsupportedOperationException("No support for examining floating point numerics"); } - return new SignificantLongTermsAggregator(name, factories, (NumericValuesSource) valuesSource, this, estimatedBucketCount, requiredSize, shardSize, minDocCount, aggregationContext, parent); + return new SignificantLongTermsAggregator(name, factories, (NumericValuesSource) valuesSource, estimatedBucketCount, requiredSize, shardSize, minDocCount, aggregationContext, parent,this); } throw new AggregationExecutionException("significant_terms aggregation cannot be applied to field [" + valuesSourceConfig.fieldContext().field() + diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java index 074a201f026b2..19a77318ecf38 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java @@ -25,7 +25,7 @@ import java.io.IOException; /** - * Creates an aggregation based on bucketing points into GeoHashes + * Creates an aggregation that finds interesting or unusual occurrences of terms in a result set * */ public class SignificantTermsBuilder extends AggregationBuilder { @@ -37,7 +37,6 @@ public class SignificantTermsBuilder extends AggregationBuilderemptyList()); } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java index 74f1f5a65fe1f..1e7d48f936ff5 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java @@ -32,6 +32,7 @@ import org.elasticsearch.index.fielddata.ordinals.Ordinals; import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.AggregatorFactories; +import org.elasticsearch.search.aggregations.InternalAggregation; import org.elasticsearch.search.aggregations.bucket.BucketsAggregator; import org.elasticsearch.common.util.BytesRefHash; import org.elasticsearch.search.aggregations.bucket.terms.support.BucketPriorityQueue; @@ -50,9 +51,9 @@ public class StringTermsAggregator extends BucketsAggregator { private final ValuesSource valuesSource; private final InternalOrder order; - private final int requiredSize; - private final int shardSize; - private final long minDocCount; + protected final int requiredSize; + protected final int shardSize; + protected final long minDocCount; protected final BytesRefHash bucketOrds; private final IncludeExclude includeExclude; private BytesValues values; @@ -142,7 +143,7 @@ public BytesRef next() { } @Override - public StringTerms buildAggregation(long owningBucketOrdinal) { + public InternalAggregation buildAggregation(long owningBucketOrdinal) { assert owningBucketOrdinal == 0; if (minDocCount == 0 && (order != InternalOrder.COUNT_DESC || bucketOrds.size() < requiredSize)) { @@ -244,7 +245,7 @@ public boolean apply(BytesRef input) { } @Override - public StringTerms buildEmptyAggregation() { + public InternalAggregation buildEmptyAggregation() { return new StringTerms(name, order, requiredSize, 
minDocCount, Collections.emptyList()); } From fae1f42ca9629c99da46960cd614eb3046808531 Mon Sep 17 00:00:00 2001 From: markharwood Date: Thu, 20 Feb 2014 10:42:38 +0000 Subject: [PATCH 4/8] Removed redundant code, name changes to fields in results JSON --- .../significantterms-aggregation.asciidoc | 46 +++++++++---------- .../significant/SignificantLongTerms.java | 7 ++- .../significant/SignificantStringTerms.java | 8 ++-- .../SignificantStringTermsAggregator.java | 4 -- .../SignificantTermsAggregatorFactory.java | 4 -- 5 files changed, 28 insertions(+), 41 deletions(-) diff --git a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc index b8d62f6c5969a..66c83341679c5 100644 --- a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc +++ b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc @@ -45,14 +45,13 @@ Response: "aggregations" : { "significantCrimeTypes" : { - "subset_size": 47347, - "superset_size": 5064554, + "doc_count": 47347, "buckets" : [ { "key": "Bicycle theft", "doc_count": 3640, - "significance_score": 0.371235374214817, - "superset_doc_freq": 66799 + "score": 0.371235374214817, + "bg_count": 66799 } ... ] @@ -109,14 +108,13 @@ Response: "key": "Metropolitan Police Service", "doc_count": 894038, "significantCrimeTypes": { - "subset_size": 894038, - "superset_size": 5064554, + "doc_count": 894038, "buckets": [ { "key": "Robbery", "doc_count": 27617, - "significance_score": 0.0599, - "superset_doc_freq": 53182 + "score": 0.0599, + "bg_count": 53182 }, ... ] @@ -126,14 +124,13 @@ Response: "key": "British Transport Police", "doc_count": 47347, "significantCrimeTypes": { - "subset_size": 47347, - "superset_size": 5064554, + "doc_count": 47347, "buckets": [ { "key": "Bicycle theft", "doc_count": 3640, - "significance_score": 0.371, - "superset_doc_freq": 66799 + "score": 0.371, + "bg_count": 66799 }, ... ] @@ -185,7 +182,7 @@ where a simple `terms` aggregation would typically show the very popular "consta -.What are the significance_scores? +.How are the scores calculated? ********************************** The numbers returned for scores are primarily intended for ranking different suggestions sensibly rather than something easily understood by end users. The scores are derived from the doc frequencies in _foreground_ and _background_ sets. The _absolute_ change in popularity (foregroundPercent - backgroundPercent) would favour @@ -218,12 +215,14 @@ a high setting would have a small number of relevant results packed full of keyw ********************************** -.Hot tip: Show significant_terms in context -********************************** +[TIP] +============ +.Show significant_terms in context + Free-text significant_terms are much more easily understood when viewed in context. Take the results of `significant_terms` suggestions from a free-text field and use them in a `terms` query on the same field with a `highlight` clause to present users with example snippets of documents. When the terms are presented unstemmed, highlighted, with the right case, in the right order and with some context, their significance/meaning is more readily apparent. 
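For example, a minimal sketch of this pattern using the Java test-style client seen elsewhere in this change; the field name is illustrative and `addHighlightedField` is an assumed highlighter API rather than something introduced by this patch:

[source,java]
--------------------------------------------------
// Sketch: OR the suggested terms together and request highlighted snippets
// so users can see each suggested term in context.
// 'response' is the SearchResponse of the significant_terms request above.
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
BoolQueryBuilder contextQuery = QueryBuilders.boolQuery();
for (SignificantTerms.Bucket bucket : topTerms) {
    contextQuery.should(QueryBuilders.termQuery("description", bucket.getKey()));
}
client().prepareSearch("test")
        .setQuery(contextQuery)
        .addHighlightedField("description") // assumed highlighter API
        .get();
--------------------------------------------------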
-********************************** +============ === Limitations @@ -253,7 +252,7 @@ so there is no difference in document frequencies to observe and from which to m Another consideration is that the significant_terms aggregation produces many candidate results at shard level that are only later pruned on the reducing node once all statistics from all shards are merged. As a result, -it can be inefficient and costly in terms of RAM to embed large child aggregations under a signigicant_terms +it can be inefficient and costly in terms of RAM to embed large child aggregations under a significant_terms aggregation that later discards many candidate terms. It is advisable in these cases to perform two searches - the first to provide a rationalized list of significant_terms and then add this shortlist of terms to a second query to go back and fetch the required child aggregations. @@ -265,7 +264,7 @@ as such may be: * high when considering the background frequency as it may count occurrences found in deleted documents Like most design decisions, this is the basis of a trade-off in which we have chosen to provide fast performance at the cost of some (typically small) inaccuracies. -However, the "size" and "shard size" settings covered in the next section provide tools to help control the accuracy levels. +However, the `size` and `shard size` settings covered in the next section provide tools to help control the accuracy levels. === Parameters @@ -274,17 +273,16 @@ However, the "size" and "shard size" settings covered in the next section provid ==== Size & Shard Size The `size` parameter can be set to define how many term buckets should be returned out of the overall terms list. By -default, the node coordinating the search process will request each shard to provide its own top `size` term buckets +default, the node coordinating the search process will request each shard to provide its own top term buckets and once all shards respond, it will reduce the results to the final list that will then be returned to the client. -This means that if the number of unique terms is greater than `size`, the returned list is slightly off and not accurate +If the number of unique terms is greater than `size`, the returned list can be slightly off and not accurate (it could be that the term counts are slightly off and it could even be that a term that should have been in the top size buckets was not returned). -The higher the requested `size` is, the more accurate the results will be, but also, the more expensive it will be to -compute the final results (both due to bigger priority queues that are managed on a shard level and due to bigger data -transfers between the nodes and the client). +To ensure better accuracy a multiple of the final `size` is used as the number of terms to request from each shard +using a heuristic based on the number of shards. To take manual control of this setting the `shard_size` parameter +can be used to control the volumes of candidate terms produced by each shard. -The `shard_size` parameter can be used to control the volumes of candidate terms produced by each shard. Low-frequency terms can turn out to be the most interesting ones once all results are combined so the significant_terms aggregation can produce higher-quality results when the `shard_size` parameter is set to values significantly higher than the `size` setting. 
This ensures that a bigger volume of promising candidate terms are given diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTerms.java index a7ec384adff7a..a6aff73cab1cb 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTerms.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTerms.java @@ -143,8 +143,7 @@ public void writeTo(StreamOutput out) throws IOException { @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject(name); - builder.field("subset_size", subsetSize); - builder.field("superset_size", supersetSize); + builder.field("doc_count", subsetSize); builder.startArray(CommonFields.BUCKETS); for (InternalSignificantTerms.Bucket bucket : buckets) { builder.startObject(); @@ -153,8 +152,8 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws builder.field(CommonFields.KEY_AS_STRING, valueFormatter.format(((Bucket) bucket).term)); } builder.field(CommonFields.DOC_COUNT, bucket.getDocCount()); - builder.field("significance_score", bucket.score); - builder.field("superset_doc_freq", bucket.supersetDf); + builder.field("score", bucket.score); + builder.field("bg_count", bucket.supersetDf); ((InternalAggregations) bucket.getAggregations()).toXContentInternal(builder, params); builder.endObject(); } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTerms.java index 9fddcca1dae11..c7badc5cff06f 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTerms.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTerms.java @@ -140,8 +140,7 @@ public void writeTo(StreamOutput out) throws IOException { @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject(name); - builder.field("subset_size", subsetSize); - builder.field("superset_size", supersetSize); + builder.field("doc_count", subsetSize); builder.startArray(CommonFields.BUCKETS); for (InternalSignificantTerms.Bucket bucket : buckets) { //There is a condition (presumably when only one shard has a bucket?) 
where reduce is not called @@ -149,10 +148,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws if(bucket.subsetDf>=minDocCount){ builder.startObject(); builder.field(CommonFields.KEY, ((Bucket) bucket).termBytes); - //TODO change reference to "doc" count/freq etc - we may be used to count freq of entities that are not docs builder.field(CommonFields.DOC_COUNT, bucket.getDocCount()); - builder.field("significance_score", bucket.score); - builder.field("superset_doc_freq", bucket.supersetDf); + builder.field("score", bucket.score); + builder.field("bg_count", bucket.supersetDf); ((InternalAggregations) bucket.getAggregations()).toXContentInternal(builder, params); builder.endObject(); } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java index a9463c1b9e93c..de6100f8cc86d 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java @@ -175,10 +175,6 @@ public void collect(int doc, long owningBucketOrdinal) throws IOException { } } - @Override - public void doRelease() { - Releasables.release(bucketOrds, ordinalToBucket); - } } } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java index 173f1bb645d9d..0faba3da15b52 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java @@ -29,10 +29,6 @@ import org.elasticsearch.search.aggregations.AggregationExecutionException; import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.Aggregator.BucketAggregationMode; -//import org.elasticsearch.search.aggregations.bucket.terms.LongTermsAggregator; -//import org.elasticsearch.search.aggregations.bucket.terms.DoubleTermsAggregator; -//import org.elasticsearch.search.aggregations.bucket.terms.LongTermsAggregator; -//import org.elasticsearch.search.aggregations.bucket.terms.StringTermsAggregator; import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude; import org.elasticsearch.search.aggregations.support.AggregationContext; import org.elasticsearch.search.aggregations.support.ValueSourceAggregatorFactory; From 9387416856ec5b7b927c347c308a8f98c0600b6f Mon Sep 17 00:00:00 2001 From: markharwood Date: Tue, 11 Mar 2014 11:41:47 +0000 Subject: [PATCH 5/8] Switched docFreq cache in SignificantTermsAggregatorFactory to BytesRefHash + IntArray instead of hppc collection. Code formatting changes. 
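The ordinal contract this relies on is roughly the following (a simplified
sketch of the new getBackgroundFrequency code in this patch, shown here for
reference only):

    // BytesRefHash.add() returns a fresh ordinal for an unseen term, or
    // (-1 - existingOrd) when the term was already added.
    long termOrd = cachedTermOrds.add(termBytes);
    if (termOrd < 0) {
        termOrd = -1 - termOrd;             // already seen: recover the ordinal
        result = termDocFreqs.get(termOrd); // docFreq cached earlier
    } else {
        result = topReader.docFreq(new Term(indexedFieldName, termBytes));
        termDocFreqs = BigArrays.grow(termDocFreqs, termOrd + 1);
        termDocFreqs.set(termOrd, result);  // IntArray indexed by term ordinal
    }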
--- .../significant/InternalSignificantTerms.java | 2 +- .../SignificantTermsAggregatorFactory.java | 33 ++++++++++++------- .../significant/SignificantTermsBuilder.java | 8 ++--- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java index d0fa0b17e1e3e..93a888ca79747 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java @@ -115,7 +115,7 @@ public static final double getSampledTermSignificance(long subsetFreq, long subs } public void updateScore() { - score=getSampledTermSignificance(subsetDf,subsetSize,supersetDf,supersetSize); + score = getSampledTermSignificance(subsetDf, subsetSize, supersetDf, supersetSize); } @Override diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java index 0faba3da15b52..79c5f143cac2e 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java @@ -18,17 +18,19 @@ */ package org.elasticsearch.search.aggregations.bucket.significant; -import com.carrotsearch.hppc.ObjectObjectOpenHashMap; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.util.BytesRef; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.ElasticsearchIllegalArgumentException; -import org.elasticsearch.common.lucene.HashedBytesRef; +import org.elasticsearch.cache.recycler.PageCacheRecycler; +import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.common.util.IntArray; import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.search.aggregations.AggregationExecutionException; import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.Aggregator.BucketAggregationMode; +import org.elasticsearch.search.aggregations.bucket.BytesRefHash; import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude; import org.elasticsearch.search.aggregations.support.AggregationContext; import org.elasticsearch.search.aggregations.support.ValueSourceAggregatorFactory; @@ -47,6 +49,7 @@ public class SignificantTermsAggregatorFactory extends ValueSourceAggregatorFact public static final String EXECUTION_HINT_VALUE_MAP = "map"; public static final String EXECUTION_HINT_VALUE_ORDINALS = "ordinals"; + static final int INITIAL_NUM_TERM_FREQS_CACHED = 512; private final int requiredSize; private final int shardSize; @@ -55,6 +58,8 @@ public class SignificantTermsAggregatorFactory extends ValueSourceAggregatorFact private final String executionHint; private String indexedFieldName; private FieldMapper mapper; + private IntArray termDocFreqs; + private BytesRefHash cachedTermOrds; public SignificantTermsAggregatorFactory(String name, ValuesSourceConfig valueSourceConfig, int requiredSize, int shardSize, long minDocCount, IncludeExclude includeExclude, String executionHint) { super(name, SignificantStringTerms.TYPE.name(), 
valueSourceConfig); @@ -67,6 +72,10 @@ public SignificantTermsAggregatorFactory(String name, ValuesSourceConfig valueSo this.indexedFieldName = valuesSourceConfig.fieldContext().field(); mapper = SearchContext.current().smartNameFieldMapper(indexedFieldName); } + PageCacheRecycler pageCacheRecycler = SearchContext.current().pageCacheRecycler(); + termDocFreqs = BigArrays.newIntArray(INITIAL_NUM_TERM_FREQS_CACHED, pageCacheRecycler, true); + cachedTermOrds = new BytesRefHash(INITIAL_NUM_TERM_FREQS_CACHED, pageCacheRecycler); + } @Override @@ -145,22 +154,22 @@ protected Aggregator create(ValuesSource valuesSource, long expectedBucketsCount "]. It can only be applied to numeric or string fields."); } - //Cache used to avoid multiple aggs hitting IndexReaders for docFreq info for the same term - final ObjectObjectOpenHashMap cachedDocFreqs = new ObjectObjectOpenHashMap(); - HashedBytesRef spare = new HashedBytesRef(); - - //Many child aggs may ask for the same docFreq information so cache docFreq values for these terms + // Many child aggs may ask for the same docFreq information so cache docFreq + // values for these terms public long getBackgroundFrequency(IndexReader topReader, BytesRef termBytes) { - spare.reset(termBytes, termBytes.hashCode()); - Integer result = cachedDocFreqs.get(spare); - if (result == null) { + int result = 0; + long termOrd = cachedTermOrds.add(termBytes); + if (termOrd < 0) { // already seen, return the cached docFreq + termOrd = -1 - termOrd; + result = termDocFreqs.get(termOrd); + } else { // cache miss - read the terms' frequency in this shard and cache it try { result = topReader.docFreq(new Term(indexedFieldName, termBytes)); - HashedBytesRef key = new HashedBytesRef(BytesRef.deepCopyOf(termBytes), spare.hash); - cachedDocFreqs.put(key, result); } catch (IOException e) { throw new ElasticsearchException("IOException reading document frequency", e); } + termDocFreqs = BigArrays.grow(termDocFreqs, termOrd + 1); + termDocFreqs.set(termOrd, result); } return result; } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java index 19a77318ecf38..1773d5fccc18c 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java @@ -32,9 +32,9 @@ public class SignificantTermsBuilder extends AggregationBuilder Date: Wed, 12 Mar 2014 10:08:06 +0000 Subject: [PATCH 6/8] Rebased on latest master and added related changes to memory management --- .../SignificantLongTermsAggregator.java | 5 +++ .../significant/SignificantStringTerms.java | 2 +- .../SignificantStringTermsAggregator.java | 13 ++++++-- .../SignificantTermsAggregatorFactory.java | 33 ++++++++++++++----- .../UnmappedSignificantTermsAggregator.java | 11 ++++++- 5 files changed, 50 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java index 89beb73d66bff..66c1e48445a7f 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java @@ -19,6 
+19,7 @@ package org.elasticsearch.search.aggregations.bucket.significant; import org.apache.lucene.index.IndexReader; +import org.elasticsearch.common.lease.Releasables; import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.AggregatorFactories; import org.elasticsearch.search.aggregations.bucket.terms.LongTermsAggregator; @@ -110,5 +111,9 @@ public SignificantLongTerms buildEmptyAggregation() { return new SignificantLongTerms(0, supersetSize, name, valuesSource.formatter(), requiredSize, minDocCount, Collections.emptyList()); } + @Override + public void doRelease() { + Releasables.release(bucketOrds, termsAggFactory); + } } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTerms.java index c7badc5cff06f..cc41171691e00 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTerms.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTerms.java @@ -57,7 +57,7 @@ public static void registerStreams() { public static class Bucket extends InternalSignificantTerms.Bucket { - final BytesRef termBytes; + BytesRef termBytes; public Bucket(BytesRef term, long subsetDf, long subsetSize, long supersetDf, long supersetSize,InternalAggregations aggregations) { diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java index de6100f8cc86d..291b39450b08c 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java @@ -45,7 +45,7 @@ public class SignificantStringTermsAggregator extends StringTermsAggregator { protected int numCollectedDocs; - private SignificantTermsAggregatorFactory termsAggFactory; + protected SignificantTermsAggregatorFactory termsAggFactory; public SignificantStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource, long estimatedBucketCount, int requiredSize, int shardSize, long minDocCount, @@ -98,6 +98,8 @@ public SignificantStringTerms buildAggregation(long owningBucketOrdinal) { final InternalSignificantTerms.Bucket[] list = new InternalSignificantTerms.Bucket[ordered.size()]; for (int i = ordered.size() - 1; i >= 0; --i) { final SignificantStringTerms.Bucket bucket = (SignificantStringTerms.Bucket) ordered.pop(); + // the terms are owned by the BytesRefHash, we need to pull a copy since the BytesRef hash data may be recycled at some point + bucket.termBytes = BytesRef.deepCopyOf(bucket.termBytes); bucket.aggregations = bucketAggregations(bucket.bucketOrd); list[i] = bucket; } @@ -117,7 +119,7 @@ public SignificantStringTerms buildEmptyAggregation() { @Override public void doRelease() { - Releasables.release(bucketOrds); + Releasables.release(bucketOrds, termsAggFactory); } /** @@ -145,7 +147,7 @@ public void setNextReader(AtomicReaderContext reader) { if (ordinalToBucket != null) { ordinalToBucket.release(); } - ordinalToBucket = BigArrays.newLongArray(BigArrays.overSize(maxOrd), context().pageCacheRecycler(), false); + ordinalToBucket = context().bigArrays().newLongArray(BigArrays.overSize(maxOrd), false); } 
ordinalToBucket.fill(0, maxOrd, -1L); } @@ -174,6 +176,11 @@ public void collect(int doc, long owningBucketOrdinal) throws IOException { collectBucket(doc, bucketOrd); } } + + @Override + public void doRelease() { + Releasables.release(bucketOrds, termsAggFactory, ordinalToBucket); + } } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java index 79c5f143cac2e..8063c9fc3d146 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java @@ -23,14 +23,14 @@ import org.apache.lucene.util.BytesRef; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.ElasticsearchIllegalArgumentException; -import org.elasticsearch.cache.recycler.PageCacheRecycler; +import org.elasticsearch.common.lease.Releasable; import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.common.util.BytesRefHash; import org.elasticsearch.common.util.IntArray; import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.search.aggregations.AggregationExecutionException; import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.Aggregator.BucketAggregationMode; -import org.elasticsearch.search.aggregations.bucket.BytesRefHash; import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude; import org.elasticsearch.search.aggregations.support.AggregationContext; import org.elasticsearch.search.aggregations.support.ValueSourceAggregatorFactory; @@ -45,7 +45,7 @@ /** * */ -public class SignificantTermsAggregatorFactory extends ValueSourceAggregatorFactory { +public class SignificantTermsAggregatorFactory extends ValueSourceAggregatorFactory implements Releasable { public static final String EXECUTION_HINT_VALUE_MAP = "map"; public static final String EXECUTION_HINT_VALUE_ORDINALS = "ordinals"; @@ -60,6 +60,7 @@ public class SignificantTermsAggregatorFactory extends ValueSourceAggregatorFact private FieldMapper mapper; private IntArray termDocFreqs; private BytesRefHash cachedTermOrds; + private BigArrays bigArrays; public SignificantTermsAggregatorFactory(String name, ValuesSourceConfig valueSourceConfig, int requiredSize, int shardSize, long minDocCount, IncludeExclude includeExclude, String executionHint) { super(name, SignificantStringTerms.TYPE.name(), valueSourceConfig); @@ -72,15 +73,14 @@ public SignificantTermsAggregatorFactory(String name, ValuesSourceConfig valueSo this.indexedFieldName = valuesSourceConfig.fieldContext().field(); mapper = SearchContext.current().smartNameFieldMapper(indexedFieldName); } - PageCacheRecycler pageCacheRecycler = SearchContext.current().pageCacheRecycler(); - termDocFreqs = BigArrays.newIntArray(INITIAL_NUM_TERM_FREQS_CACHED, pageCacheRecycler, true); - cachedTermOrds = new BytesRefHash(INITIAL_NUM_TERM_FREQS_CACHED, pageCacheRecycler); - + bigArrays = SearchContext.current().bigArrays(); + termDocFreqs = bigArrays.newIntArray(INITIAL_NUM_TERM_FREQS_CACHED, true); + cachedTermOrds = new BytesRefHash(INITIAL_NUM_TERM_FREQS_CACHED, bigArrays); } @Override protected Aggregator createUnmapped(AggregationContext aggregationContext, Aggregator parent) { - return new UnmappedSignificantTermsAggregator(name, requiredSize, minDocCount, 
aggregationContext, parent); + return new UnmappedSignificantTermsAggregator(name, requiredSize, minDocCount, aggregationContext, parent, this); } private static boolean hasParentBucketAggregator(Aggregator parent) { @@ -168,12 +168,14 @@ public long getBackgroundFrequency(IndexReader topReader, BytesRef termBytes) { } catch (IOException e) { throw new ElasticsearchException("IOException reading document frequency", e); } - termDocFreqs = BigArrays.grow(termDocFreqs, termOrd + 1); + termDocFreqs = bigArrays.grow(termDocFreqs, termOrd + 1); termDocFreqs.set(termOrd, result); } return result; } + + // Many child aggs may ask for the same docFreq information so cache docFreq // values for these terms public long getBackgroundFrequency(IndexReader topReader, long term) { @@ -181,4 +183,17 @@ public long getBackgroundFrequency(IndexReader topReader, long term) { return getBackgroundFrequency(topReader, indexedVal); } + @Override + public boolean release() throws ElasticsearchException { + if (cachedTermOrds != null) { + cachedTermOrds.release(); + cachedTermOrds = null; + } + if (termDocFreqs != null) { + termDocFreqs.release(); + termDocFreqs = null; + } + return true; + } + } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/UnmappedSignificantTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/UnmappedSignificantTermsAggregator.java index c607d13e4321f..d415a7517f069 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/UnmappedSignificantTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/UnmappedSignificantTermsAggregator.java @@ -19,6 +19,7 @@ package org.elasticsearch.search.aggregations.bucket.significant; import org.apache.lucene.index.AtomicReaderContext; +import org.elasticsearch.common.lease.Releasables; import org.elasticsearch.search.aggregations.Aggregator; import org.elasticsearch.search.aggregations.AggregatorFactories; import org.elasticsearch.search.aggregations.InternalAggregation; @@ -33,11 +34,13 @@ public class UnmappedSignificantTermsAggregator extends Aggregator { private final int requiredSize; private final long minDocCount; + private SignificantTermsAggregatorFactory termsAggFactory; - public UnmappedSignificantTermsAggregator(String name, int requiredSize, long minDocCount, AggregationContext aggregationContext, Aggregator parent) { + public UnmappedSignificantTermsAggregator(String name, int requiredSize, long minDocCount, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory) { super(name, BucketAggregationMode.PER_BUCKET, AggregatorFactories.EMPTY, 0, aggregationContext, parent); this.requiredSize = requiredSize; this.minDocCount = minDocCount; + this.termsAggFactory = termsAggFactory; } @Override @@ -63,4 +66,10 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) { public InternalAggregation buildEmptyAggregation() { return new UnmappedSignificantTerms(name, requiredSize, minDocCount); } + + @Override + protected void doRelease() { + Releasables.release(termsAggFactory); + } + } From b058ba6243952394dc6541e0240be996c53648a0 Mon Sep 17 00:00:00 2001 From: markharwood Date: Wed, 12 Mar 2014 17:41:19 +0000 Subject: [PATCH 7/8] =?UTF-8?q?Updates=20from=20@uboness=20review:=20int?= =?UTF-8?q?=20counts=20changed=20to=20longs,=20asciidoc=20examples=20chang?= =?UTF-8?q?ed=20to=20lowercase,=20base=20class=20change=20to=20Significant?= 
=?UTF-8?q?Terms,=20code=20formatting,=20parser=20parses=20=E2=80=9Cformat?= =?UTF-8?q?=E2=80=9D=20field.=20I=E2=80=99ve=20added=20a=20=E2=80=9CTODO?= =?UTF-8?q?=E2=80=9D=20comment=20for=20the=20refactoring=20suggestion=20he?= =?UTF-8?q?re:=20https://github.com/elasticsearch/elasticsearch/pull/5146#?= =?UTF-8?q?discussion=5Fr10500246=20-=20as=20this=20should=20be=20consider?= =?UTF-8?q?ed=20as=20part=20of=20future=20changes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../significantterms-aggregation.asciidoc | 8 +- .../significant/InternalSignificantTerms.java | 4 +- .../SignificantLongTermsAggregator.java | 6 +- .../significant/SignificantStringTerms.java | 2 +- .../SignificantStringTermsAggregator.java | 6 +- .../bucket/significant/SignificantTerms.java | 10 +-- .../SignificantTermsAggregatorFactory.java | 17 +++-- .../significant/SignificantTermsParser.java | 2 + .../bucket/SignificantTermsTests.java | 73 ++++++------------- 9 files changed, 52 insertions(+), 76 deletions(-) diff --git a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc index 66c83341679c5..4a2c798af0900 100644 --- a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc +++ b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc @@ -30,7 +30,7 @@ Example: }, "aggregations" : { "significantCrimeTypes" : { - "significant_terms" : { "field" : "crimeType" } + "significant_terms" : { "field" : "crime_type" } } } } @@ -86,7 +86,7 @@ Example using a parent aggregation for segmentation: "terms": {"field": "force"}, "aggregations": { "significantCrimeTypes": { - "significant_terms": {"field": "crimeType"} + "significant_terms": {"field": "crime_type"} } } } @@ -159,7 +159,7 @@ area to identify unusual hot-spots of a particular crime type: }, "aggs": { "significantCrimeTypes": { - "significant_terms": {"field": "crimeType"} + "significant_terms": {"field": "crime_type"} } } } @@ -206,7 +206,7 @@ to load every unique word into RAM. It is recommended to only use this on smalle .Use the _"like this but not this"_ pattern ********************************** You can spot mis-categorized content by first searching a structured field e.g. `category:adultMovie` and use significant_terms on the -free-text "movieDescription" field. Take the suggested words (I'll leave them to your imagination) and then search for all movies NOT marked as category:adultMovie but containing these keywords. +free-text "movie_description" field. Take the suggested words (I'll leave them to your imagination) and then search for all movies NOT marked as category:adultMovie but containing these keywords. You now have a ranked list of badly-categorized movies that you should reclassify or at least remove from the "familyFriendly" category. The significance score from each term can also provide a useful `boost` setting to sort matches. 
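A minimal sketch of that two-step workflow using the builder added in this change; the index, field and category names here are invented for illustration:

[source,java]
--------------------------------------------------
// Step 1: find terms that characterise the suspect category.
SearchResponse response = client().prepareSearch("movies")
        .setQuery(new TermQueryBuilder("category", "adultMovie"))
        .addAggregation(new SignificantTermsBuilder("keywords").field("movie_description"))
        .get();
SignificantTerms keywords = response.getAggregations().get("keywords");

// Step 2: search for documents NOT in the category that still contain the
// keywords; each term's significance score could also be used as a boost.
BoolQueryBuilder suspects = QueryBuilders.boolQuery()
        .mustNot(QueryBuilders.termQuery("category", "adultMovie"));
for (SignificantTerms.Bucket bucket : keywords) {
    suspects.should(QueryBuilders.termQuery("movie_description", bucket.getKey()));
}
client().prepareSearch("movies").setQuery(suspects).get();
--------------------------------------------------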
diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java index 93a888ca79747..1466439083cf4 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/InternalSignificantTerms.java @@ -180,13 +180,13 @@ public Iterator iterator() { } @Override - public Collection buckets() { + public Collection getBuckets() { Object o = buckets; return (Collection) o; } @Override - public SignificantTerms.Bucket getByTerm(String term) { + public SignificantTerms.Bucket getBucketByKey(String term) { if (bucketMap == null) { bucketMap = Maps.newHashMapWithExpectedSize(buckets.size()); for (Bucket bucket : buckets) { diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java index 66c1e48445a7f..72db7961b0649 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantLongTermsAggregator.java @@ -43,7 +43,7 @@ public SignificantLongTermsAggregator(String name, AggregatorFactories factories this.termsAggFactory = termsAggFactory; } - protected int numCollectedDocs; + protected long numCollectedDocs; private SignificantTermsAggregatorFactory termsAggFactory; @Override @@ -60,8 +60,8 @@ public SignificantLongTerms buildAggregation(long owningBucketOrdinal) { ContextIndexSearcher searcher = context.searchContext().searcher(); IndexReader topReader = searcher.getIndexReader(); - int supersetSize = topReader.numDocs(); - int subsetSize = numCollectedDocs; + long supersetSize = topReader.numDocs(); + long subsetSize = numCollectedDocs; BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size); SignificantLongTerms.Bucket spare = null; diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTerms.java index cc41171691e00..37d51c93b8f02 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTerms.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTerms.java @@ -73,7 +73,7 @@ public Text getKeyAsText() { @Override public Number getKeyAsNumber() { - // this method is needed for scripted numeric faceting + // this method is needed for scripted numeric aggregations return Double.parseDouble(termBytes.utf8ToString()); } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java index 291b39450b08c..6b8ce91ff7f74 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java @@ -44,7 +44,7 @@ */ public class SignificantStringTermsAggregator extends StringTermsAggregator { - protected int numCollectedDocs; + protected long 
numCollectedDocs; protected SignificantTermsAggregatorFactory termsAggFactory; public SignificantStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource, @@ -69,8 +69,8 @@ public SignificantStringTerms buildAggregation(long owningBucketOrdinal) { ContextIndexSearcher searcher = context.searchContext().searcher(); IndexReader topReader = searcher.getIndexReader(); - int supersetSize = topReader.numDocs(); - int subsetSize = numCollectedDocs; + long supersetSize = topReader.numDocs(); + long subsetSize = numCollectedDocs; BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size); SignificantStringTerms.Bucket spare = null; diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTerms.java index fdf343791d017..cff13bbfa9787 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTerms.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTerms.java @@ -18,7 +18,6 @@ */ package org.elasticsearch.search.aggregations.bucket.significant; -import org.elasticsearch.search.aggregations.Aggregation; import org.elasticsearch.search.aggregations.bucket.MultiBucketsAggregation; import java.util.Collection; @@ -26,7 +25,7 @@ /** * */ -public interface SignificantTerms extends Aggregation, Iterable { +public interface SignificantTerms extends MultiBucketsAggregation, Iterable { static abstract class Bucket implements MultiBucketsAggregation.Bucket { @@ -68,9 +67,10 @@ public long getSubsetSize(){ } - Collection buckets(); - - Bucket getByTerm(String term); + @Override + Collection getBuckets(); + @Override + Bucket getBucketByKey(String key); } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java index 8063c9fc3d146..b4aed7d21738d 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java @@ -24,6 +24,7 @@ import org.elasticsearch.ElasticsearchException; import org.elasticsearch.ElasticsearchIllegalArgumentException; import org.elasticsearch.common.lease.Releasable; +import org.elasticsearch.common.lease.Releasables; import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.common.util.BytesRefHash; import org.elasticsearch.common.util.IntArray; @@ -154,8 +155,12 @@ protected Aggregator create(ValuesSource valuesSource, long expectedBucketsCount "]. It can only be applied to numeric or string fields."); } - // Many child aggs may ask for the same docFreq information so cache docFreq - // values for these terms + // Many child aggs may ask for the same docFreq information so here we cache docFreq + // values for these terms. + // TODO this should be re-factored into a more generic system for efficiently checking frequencies of things + // In future we may need to a) check the frequency in a set other than the index e.g. a subset and b) check + // the frequency of an entity other than an a single indexed term e.g. a numeric range. + // This is likely to require some careful design. 
diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java
index 0cb3cd795a052..1f498ea3e2da8 100644
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java
@@ -78,6 +78,8 @@ public AggregatorFactory parse(String aggregationName, XContentParser parser, Se
             } else if (token == XContentParser.Token.VALUE_STRING) {
                 if ("field".equals(currentFieldName)) {
                     field = parser.text();
+                } else if ("format".equals(currentFieldName)) {
+                    format = parser.text();
                 } else if ("include".equals(currentFieldName)) {
                     include = parser.text();
                 } else if ("exclude".equals(currentFieldName)) {
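
The newly parsed format option is a value-format pattern for rendering numeric term keys. Its plumbing is not visible in this hunk, so the snippet below only illustrates the assumed pattern semantics (DecimalFormat-style padding); the pattern string and class are examples, not the patch's internals:

    import java.text.DecimalFormat;

    public class FormatPatternDemo {
        public static void main(String[] args) {
            // A pattern like "0000" zero-pads numeric keys when buckets are rendered.
            DecimalFormat pattern = new DecimalFormat("0000");
            System.out.println(pattern.format(3));  // prints 0003
            System.out.println(pattern.format(42)); // prints 0042
        }
    }
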
council", "B\t1\tpaul weller left the jam to form the style council", "A\t2\tpaul smith is a designer in the fashion industry", @@ -108,14 +85,12 @@ public void init() throws Exception { "A\t3\tterje haakonsen has been a team rider for burton snowboards for over 20 years" }; - for (int i = 0; i < data.length; i++) { - String[] parts=data[i].split("\t"); - client().prepareIndex("test", "fact", ""+i).setSource("routingID", parts[0], - "factCategory", parts[1], - "Description", parts[2] - ).get(); - } - client().admin().indices().refresh(new RefreshRequest("test")).get(); + for (int i = 0; i < data.length; i++) { + String[] parts = data[i].split("\t"); + client().prepareIndex("test", "fact", "" + i) + .setSource("routing_id", parts[0], "fact_category", parts[1], "description", parts[2]).get(); + } + client().admin().indices().refresh(new RefreshRequest("test")).get(); } @Test @@ -124,13 +99,12 @@ public void structuredAnalysis() throws Exception { .setSearchType(SearchType.QUERY_AND_FETCH) .setQuery(new TermQueryBuilder("_all", "terje")) .setFrom(0).setSize(60).setExplain(true) - .addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("factCategory") - .minDocCount(2) - ) + .addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("fact_category") + .minDocCount(2)) .execute() .actionGet(); SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); - Number topCategory = topTerms.buckets().iterator().next().getKeyAsNumber(); + Number topCategory = topTerms.getBuckets().iterator().next().getKeyAsNumber(); assertTrue(topCategory.equals(new Long(SNOWBOARDING_CATEGORY))); } @@ -140,13 +114,12 @@ public void unmapped() throws Exception { .setSearchType(SearchType.QUERY_AND_FETCH) .setQuery(new TermQueryBuilder("_all", "terje")) .setFrom(0).setSize(60).setExplain(true) - .addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("factCategory") - .minDocCount(2) - ) + .addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("fact_category") + .minDocCount(2)) .execute() .actionGet(); SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); - assertThat(topTerms.buckets().size(), equalTo(0)); + assertThat(topTerms.getBuckets().size(), equalTo(0)); } @Test @@ -155,9 +128,8 @@ public void textAnalysis() throws Exception { .setSearchType(SearchType.QUERY_AND_FETCH) .setQuery(new TermQueryBuilder("_all", "terje")) .setFrom(0).setSize(60).setExplain(true) - .addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("Description") - .minDocCount(2) - ) + .addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description") + .minDocCount(2)) .execute() .actionGet(); SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); @@ -170,9 +142,8 @@ public void partiallyUnmapped() throws Exception { .setSearchType(SearchType.QUERY_AND_FETCH) .setQuery(new TermQueryBuilder("_all", "terje")) .setFrom(0).setSize(60).setExplain(true) - .addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("Description") - .minDocCount(2) - ) + .addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description") + .minDocCount(2)) .execute() .actionGet(); SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); From bcf27ade2b4e84d694a1c64f029376ef7df0e113 Mon Sep 17 00:00:00 2001 From: markharwood Date: Thu, 13 Mar 2014 11:40:31 +0000 Subject: [PATCH 8/8] =?UTF-8?q?Added=20=E2=80=9Cexperimental=E2=80=9D=20no?= 
From bcf27ade2b4e84d694a1c64f029376ef7df0e113 Mon Sep 17 00:00:00 2001
From: markharwood
Date: Thu, 13 Mar 2014 11:40:31 +0000
Subject: [PATCH 8/8] Added “experimental” notices to documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../bucket/significantterms-aggregation.asciidoc    | 10 ++++++++++
 .../bucket/significant/SignificantTermsBuilder.java |  6 ++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc
index 4a2c798af0900..88712142e8b3b 100644
--- a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc
+++ b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc
@@ -3,6 +3,16 @@
 
 An aggregation that returns interesting or unusual occurrences of terms in a set.
 
+.Experimental!
+[IMPORTANT]
+=====
+This feature is marked as experimental, and may be subject to change in the
+future. If you use this feature, please let us know your experience with it!
+=====
+
+coming[1.1.0]
+
+
 .Example use cases:
 * Suggesting "H5N1" when users search for "bird flu" in text
 * Identifying the merchant that is the "common point of compromise" from the transaction history of credit card owners reporting loss
diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java
index 1773d5fccc18c..183ef6645f958 100644
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsBuilder.java
@@ -25,8 +25,10 @@
 import java.io.IOException;
 
 /**
- * Creates an aggregation that finds interesting or unusual occurrences of terms in a result set
- *
+ * Creates an aggregation that finds interesting or unusual occurrences of terms in a result set.
+ *
+ * This feature is marked as experimental, and may be subject to change in the future. If you
+ * use this feature, please let us know your experience with it!
  */
 public class SignificantTermsBuilder extends AggregationBuilder {
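
For reference, the builder that carries this javadoc is exercised by the tests earlier in this patch series; a minimal request sketch, with the index, query, field, and aggregation name taken from those tests:

    SearchResponse response = client().prepareSearch("test")
            .setQuery(new TermQueryBuilder("_all", "terje"))
            .addAggregation(new SignificantTermsBuilder("mySignificantTerms")
                    .field("description")
                    .minDocCount(2))
            .execute().actionGet();
    SignificantTerms significant = response.getAggregations().get("mySignificantTerms");
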