diff --git a/server/src/main/java/org/apache/lucene/analysis/miscellaneous/DuplicateByteSequenceSpotter.java b/server/src/main/java/org/apache/lucene/analysis/miscellaneous/DuplicateByteSequenceSpotter.java
index 7a58eaa1375f1..25ceb67211abd 100644
--- a/server/src/main/java/org/apache/lucene/analysis/miscellaneous/DuplicateByteSequenceSpotter.java
+++ b/server/src/main/java/org/apache/lucene/analysis/miscellaneous/DuplicateByteSequenceSpotter.java
@@ -25,20 +25,18 @@
* A Trie structure for analysing byte streams for duplicate sequences. Bytes
* from a stream are added one at a time using the addByte method and the number
* of times it has been seen as part of a sequence is returned.
- *
+ *
* The minimum required length for a duplicate sequence detected is 6 bytes.
- *
+ *
* The design goals are to maximize speed of lookup while minimizing the space
* required to do so. This has led to a hybrid solution for representing the
* bytes that make up a sequence in the trie.
- *
+ *
* If we have 6 bytes in sequence e.g. abcdef then they are represented as
* object nodes in the tree as follows:
*
* (a)-(b)-(c)-(def as an int)
*
- *
- *
* {@link RootTreeNode} objects are used for the first two levels of the tree
* (representing bytes a and b in the example sequence). The combinations of
* objects at these 2 levels are few so internally these objects allocate an
@@ -61,11 +59,9 @@
* reached
*
* halting any growth of the tree
*
- *
* Tests on real-world-text show that the size of the tree is a multiple of the
* input text where that multiplier varies between 10 and 5 times as the content
* size increased from 10 to 100 megabytes of content.
- *
*/
public class DuplicateByteSequenceSpotter {
public static final int TREE_DEPTH = 6;
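
Aside: the javadoc above compresses the last three bytes of a six-byte sequence into a single int. A minimal sketch of that packing, with hypothetical helper names (the real class stores the packed value inside its tree nodes):

    // Sketch only: pack bytes d, e, f into one int, as the javadoc describes.
    static int packTail(byte d, byte e, byte f) {
        return ((d & 0xFF) << 16) | ((e & 0xFF) << 8) | (f & 0xFF);
    }

    // Reverse the packing, e.g. for tests or debugging.
    static byte[] unpackTail(int packed) {
        return new byte[] { (byte) (packed >>> 16), (byte) (packed >>> 8), (byte) packed };
    }
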
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java
index 05dbdfdb965f0..b467d6cefd375 100644
--- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java
+++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java
@@ -47,22 +47,23 @@
import java.util.function.BiConsumer;
import java.util.function.Function;
+import java.util.function.LongConsumer;
import java.util.function.Supplier;
/**
* An aggregator of string values that hashes the strings on the fly rather
* than up front like the {@link GlobalOrdinalsStringTermsAggregator}.
*/
public class MapStringTermsAggregator extends AbstractStringTermsAggregator {
+ private final CollectorSource collectorSource;
private final ResultStrategy<?, ?> resultStrategy;
- private final ValuesSource valuesSource;
private final BytesKeyedBucketOrds bucketOrds;
private final IncludeExclude.StringFilter includeExclude;
public MapStringTermsAggregator(
String name,
AggregatorFactories factories,
+ CollectorSource collectorSource,
Function<MapStringTermsAggregator, ResultStrategy<?, ?>> resultStrategy,
- ValuesSource valuesSource,
BucketOrder order,
DocValueFormat format,
BucketCountThresholds bucketCountThresholds,
@@ -75,56 +76,39 @@ public MapStringTermsAggregator(
Map<String, Object> metadata
) throws IOException {
super(name, factories, context, parent, order, format, bucketCountThresholds, collectionMode, showTermDocCountError, metadata);
+ this.collectorSource = collectorSource;
this.resultStrategy = resultStrategy.apply(this); // ResultStrategy needs a reference to the Aggregator to do its job.
- this.valuesSource = valuesSource;
this.includeExclude = includeExclude;
bucketOrds = BytesKeyedBucketOrds.build(context.bigArrays(), collectsFromSingleBucket);
}
@Override
public ScoreMode scoreMode() {
- if (valuesSource != null && valuesSource.needsScores()) {
+ if (collectorSource.needsScores()) {
return ScoreMode.COMPLETE;
}
return super.scoreMode();
}
@Override
- public LeafBucketCollector getLeafCollector(LeafReaderContext ctx,
- final LeafBucketCollector sub) throws IOException {
- SortedBinaryDocValues values = valuesSource.bytesValues(ctx);
- return resultStrategy.wrapCollector(new LeafBucketCollectorBase(sub, values) {
- final BytesRefBuilder previous = new BytesRefBuilder();
-
- @Override
- public void collect(int doc, long owningBucketOrd) throws IOException {
- if (false == values.advanceExact(doc)) {
- return;
- }
- int valuesCount = values.docValueCount();
-
- // SortedBinaryDocValues don't guarantee uniqueness so we
- // need to take care of dups
- previous.clear();
- for (int i = 0; i < valuesCount; ++i) {
- final BytesRef bytes = values.nextValue();
- if (includeExclude != null && false == includeExclude.accept(bytes)) {
- continue;
- }
- if (i > 0 && previous.get().equals(bytes)) {
- continue;
- }
+ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException {
+ return resultStrategy.wrapCollector(
+ collectorSource.getLeafCollector(
+ includeExclude,
+ ctx,
+ sub,
+ this::addRequestCircuitBreakerBytes,
+ (s, doc, owningBucketOrd, bytes) -> {
long bucketOrdinal = bucketOrds.add(owningBucketOrd, bytes);
if (bucketOrdinal < 0) { // already seen
bucketOrdinal = -1 - bucketOrdinal;
- collectExistingBucket(sub, doc, bucketOrdinal);
+ collectExistingBucket(s, doc, bucketOrdinal);
} else {
- collectBucket(sub, doc, bucketOrdinal);
+ collectBucket(s, doc, bucketOrdinal);
}
- previous.copyBytes(bytes);
}
- }
- });
+ )
+ );
}
@Override
@@ -146,7 +130,82 @@ public void collectDebugInfo(BiConsumer<String, Object> add) {
@Override
public void doClose() {
- Releasables.close(bucketOrds, resultStrategy);
+ Releasables.close(collectorSource, resultStrategy, bucketOrds);
+ }
+
+ /**
+ * Abstraction on top of building collectors to fetch values.
+ */
+ public interface CollectorSource extends Releasable {
+ boolean needsScores();
+
+ LeafBucketCollector getLeafCollector(
+ IncludeExclude.StringFilter includeExclude,
+ LeafReaderContext ctx,
+ LeafBucketCollector sub,
+ LongConsumer addRequestCircuitBreakerBytes,
+ CollectConsumer consumer
+ ) throws IOException;
+ }
+ @FunctionalInterface
+ public interface CollectConsumer {
+ void accept(LeafBucketCollector sub, int doc, long owningBucketOrd, BytesRef bytes) throws IOException;
+ }
+
+ /**
+ * Fetch values from a {@link ValuesSource}.
+ */
+ public static class ValuesSourceCollectorSource implements CollectorSource {
+ private final ValuesSource valuesSource;
+
+ public ValuesSourceCollectorSource(ValuesSource valuesSource) {
+ this.valuesSource = valuesSource;
+ }
+
+ @Override
+ public boolean needsScores() {
+ return valuesSource.needsScores();
+ }
+
+ @Override
+ public LeafBucketCollector getLeafCollector(
+ IncludeExclude.StringFilter includeExclude,
+ LeafReaderContext ctx,
+ LeafBucketCollector sub,
+ LongConsumer addRequestCircuitBreakerBytes,
+ CollectConsumer consumer
+ ) throws IOException {
+ SortedBinaryDocValues values = valuesSource.bytesValues(ctx);
+ return new LeafBucketCollectorBase(sub, values) {
+ final BytesRefBuilder previous = new BytesRefBuilder();
+
+ @Override
+ public void collect(int doc, long owningBucketOrd) throws IOException {
+ if (false == values.advanceExact(doc)) {
+ return;
+ }
+ int valuesCount = values.docValueCount();
+
+ // SortedBinaryDocValues don't guarantee uniqueness so we
+ // need to take care of dups
+ previous.clear();
+ for (int i = 0; i < valuesCount; ++i) {
+ BytesRef bytes = values.nextValue();
+ if (includeExclude != null && false == includeExclude.accept(bytes)) {
+ continue;
+ }
+ if (i > 0 && previous.get().equals(bytes)) {
+ continue;
+ }
+ previous.copyBytes(bytes);
+ consumer.accept(sub, doc, owningBucketOrd, bytes);
+ }
+ }
+ };
+ }
+
+ @Override
+ public void close() {}
}
/**
@@ -270,6 +329,12 @@ private InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws
* Builds results for the standard {@code terms} aggregation.
*/
class StandardTermsResults extends ResultStrategy<StringTerms, StringTerms.Bucket> {
+ private final ValuesSource valuesSource;
+
+ StandardTermsResults(ValuesSource valuesSource) {
+ this.valuesSource = valuesSource;
+ }
+
@Override
String describe() {
return "terms";
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificanceLookup.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificanceLookup.java
index 4a6764fb4acd2..dbe48bb040f92 100644
--- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificanceLookup.java
+++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificanceLookup.java
@@ -36,10 +36,11 @@
import org.elasticsearch.common.util.BytesRefHash;
import org.elasticsearch.common.util.LongArray;
import org.elasticsearch.common.util.LongHash;
+import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryShardContext;
+import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
-import org.elasticsearch.search.aggregations.support.ValuesSourceConfig;
import java.io.IOException;
@@ -62,14 +63,17 @@ interface BackgroundFrequencyForLong extends Releasable {
}
private final QueryShardContext context;
- private final ValuesSourceConfig config;
+ private final MappedFieldType fieldType;
+ private final DocValueFormat format;
private final Query backgroundFilter;
private final int supersetNumDocs;
private TermsEnum termsEnum;
- SignificanceLookup(QueryShardContext context, ValuesSourceConfig config, QueryBuilder backgroundFilter) throws IOException {
+ SignificanceLookup(QueryShardContext context, MappedFieldType fieldType, DocValueFormat format, QueryBuilder backgroundFilter)
+ throws IOException {
this.context = context;
- this.config = config;
+ this.fieldType = fieldType;
+ this.format = format;
this.backgroundFilter = backgroundFilter == null ? null : backgroundFilter.toQuery(context);
/*
* We need to use a superset size that includes deleted docs or we
@@ -129,7 +133,7 @@ public void close() {
* Get the background frequency of a {@link BytesRef} term.
*/
private long getBackgroundFrequency(BytesRef term) throws IOException {
- return getBackgroundFrequency(config.fieldContext().fieldType().termQuery(config.format().format(term).toString(), context));
+ return getBackgroundFrequency(fieldType.termQuery(format.format(term).toString(), context));
}
/**
@@ -174,7 +178,7 @@ public void close() {
* Get the background frequency of a {@code long} term.
*/
private long getBackgroundFrequency(long term) throws IOException {
- return getBackgroundFrequency(config.fieldContext().fieldType().termQuery(config.format().format(term).toString(), context));
+ return getBackgroundFrequency(fieldType.termQuery(format.format(term).toString(), context));
}
private long getBackgroundFrequency(Query query) throws IOException {
@@ -201,7 +205,7 @@ private TermsEnum getTermsEnum(String field) throws IOException {
return termsEnum;
}
IndexReader reader = context.getIndexReader();
- termsEnum = new FilterableTermsEnum(reader, config.fieldContext().field(), PostingsEnum.NONE, backgroundFilter);
+ termsEnum = new FilterableTermsEnum(reader, fieldType.name(), PostingsEnum.NONE, backgroundFilter);
return termsEnum;
}
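
SignificanceLookup now depends only on the field type and doc value format rather than the whole ValuesSourceConfig. Conceptually, the background frequency it returns is a filtered document count; a hedged sketch in plain Lucene terms (the helper is hypothetical and ignores the per-term caching the real class layers on top):

    // Count docs matching the term's query, intersected with the optional
    // background filter.
    static long backgroundFrequency(IndexSearcher searcher, Query termQuery, Query backgroundFilter) throws IOException {
        if (backgroundFilter == null) {
            return searcher.count(termQuery);
        }
        return searcher.count(
            new BooleanQuery.Builder().add(termQuery, BooleanClause.Occur.FILTER)
                .add(backgroundFilter, BooleanClause.Occur.FILTER)
                .build()
        );
    }
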
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorFactory.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorFactory.java
index 5e92cc9edee1a..174a4c1fcef98 100644
--- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorFactory.java
+++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorFactory.java
@@ -227,7 +227,12 @@ protected Aggregator doCreateInternal(SearchContext searchContext,
bucketCountThresholds.setShardSize(2 * BucketUtils.suggestShardSideQueueSize(bucketCountThresholds.getRequiredSize()));
}
- SignificanceLookup lookup = new SignificanceLookup(queryShardContext, config, backgroundFilter);
+ SignificanceLookup lookup = new SignificanceLookup(
+ queryShardContext,
+ config.fieldContext().fieldType(),
+ config.format(),
+ backgroundFilter
+ );
return sigTermsAggregatorSupplier.build(name, factories, config.getValuesSource(), config.format(),
bucketCountThresholds, includeExclude, executionHint, searchContext, parent,
@@ -256,8 +261,8 @@ Aggregator create(String name,
return new MapStringTermsAggregator(
name,
factories,
+ new MapStringTermsAggregator.ValuesSourceCollectorSource(valuesSource),
a -> a.new SignificantTermsResults(lookup, significanceHeuristic, collectsFromSingleBucket),
- valuesSource,
null,
format,
bucketCountThresholds,
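
Call sites that still have a ValuesSource in hand wrap it in the adapter, so terms and significant_terms behavior is unchanged while other callers can plug in their own term source. Assuming a valuesSource in scope, the wiring is one line:

    // Adapter from the old ValuesSource world to the new CollectorSource one.
    MapStringTermsAggregator.CollectorSource source =
        new MapStringTermsAggregator.ValuesSourceCollectorSource(valuesSource);
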
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTextAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTextAggregator.java
deleted file mode 100644
index 87036bf69a987..0000000000000
--- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTextAggregator.java
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.elasticsearch.search.aggregations.bucket.terms;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.DeDuplicatingTokenFilter;
-import org.apache.lucene.analysis.miscellaneous.DuplicateByteSequenceSpotter;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
-import org.elasticsearch.common.lease.Releasables;
-import org.elasticsearch.common.util.BytesRefHash;
-import org.elasticsearch.index.mapper.MappedFieldType;
-import org.elasticsearch.search.DocValueFormat;
-import org.elasticsearch.search.aggregations.Aggregator;
-import org.elasticsearch.search.aggregations.AggregatorFactories;
-import org.elasticsearch.search.aggregations.InternalAggregation;
-import org.elasticsearch.search.aggregations.LeafBucketCollector;
-import org.elasticsearch.search.aggregations.LeafBucketCollectorBase;
-import org.elasticsearch.search.aggregations.bucket.BucketsAggregator;
-import org.elasticsearch.search.aggregations.bucket.terms.IncludeExclude.StringFilter;
-import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregator.BucketCountThresholds;
-import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
-import org.elasticsearch.search.internal.ContextIndexSearcher;
-import org.elasticsearch.search.internal.SearchContext;
-import org.elasticsearch.search.lookup.SourceLookup;
-
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Map;
-import java.util.stream.Collectors;
-
-import static java.util.Collections.emptyList;
-
-public class SignificantTextAggregator extends BucketsAggregator {
-
- private final StringFilter includeExclude;
- protected final BucketCountThresholds bucketCountThresholds;
- protected long numCollectedDocs;
- private final BytesRefHash bucketOrds;
- private final SignificanceHeuristic significanceHeuristic;
- private SignificantTextAggregatorFactory termsAggFactory;
- private final DocValueFormat format = DocValueFormat.RAW;
- private final String fieldName;
- private final String[] sourceFieldNames;
- private DuplicateByteSequenceSpotter dupSequenceSpotter = null ;
- private long lastTrieSize;
- private static final int MEMORY_GROWTH_REPORTING_INTERVAL_BYTES = 5000;
-
-
-
- public SignificantTextAggregator(String name, AggregatorFactories factories,
- SearchContext context, Aggregator parent,
- BucketCountThresholds bucketCountThresholds, IncludeExclude.StringFilter includeExclude,
- SignificanceHeuristic significanceHeuristic, SignificantTextAggregatorFactory termsAggFactory,
- String fieldName, String [] sourceFieldNames, boolean filterDuplicateText,
- Map<String, Object> metadata) throws IOException {
- super(name, factories, context, parent, metadata);
- this.bucketCountThresholds = bucketCountThresholds;
- this.includeExclude = includeExclude;
- this.significanceHeuristic = significanceHeuristic;
- this.termsAggFactory = termsAggFactory;
- this.fieldName = fieldName;
- this.sourceFieldNames = sourceFieldNames;
- bucketOrds = new BytesRefHash(1, context.bigArrays());
- if(filterDuplicateText){
- dupSequenceSpotter = new DuplicateByteSequenceSpotter();
- lastTrieSize = dupSequenceSpotter.getEstimatedSizeInBytes();
- }
- }
-
-
-
-
- @Override
- public LeafBucketCollector getLeafCollector(LeafReaderContext ctx,
- final LeafBucketCollector sub) throws IOException {
- final BytesRefBuilder previous = new BytesRefBuilder();
- return new LeafBucketCollectorBase(sub, null) {
-
- @Override
- public void collect(int doc, long bucket) throws IOException {
- collectFromSource(doc, bucket, fieldName, sourceFieldNames);
- numCollectedDocs++;
- if (dupSequenceSpotter != null) {
- dupSequenceSpotter.startNewSequence();
- }
- }
-
- private void processTokenStream(int doc, long bucket, TokenStream ts, BytesRefHash inDocTerms, String fieldText)
- throws IOException{
- if (dupSequenceSpotter != null) {
- ts = new DeDuplicatingTokenFilter(ts, dupSequenceSpotter);
- }
- CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
- ts.reset();
- try {
- while (ts.incrementToken()) {
- if (dupSequenceSpotter != null) {
- long newTrieSize = dupSequenceSpotter.getEstimatedSizeInBytes();
- long growth = newTrieSize - lastTrieSize;
- // Only update the circuitbreaker after
- if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) {
- addRequestCircuitBreakerBytes(growth);
- lastTrieSize = newTrieSize;
- }
- }
- previous.clear();
- previous.copyChars(termAtt);
- BytesRef bytes = previous.get();
- if (inDocTerms.add(bytes) >= 0) {
- if (includeExclude == null || includeExclude.accept(bytes)) {
- long bucketOrdinal = bucketOrds.add(bytes);
- if (bucketOrdinal < 0) { // already seen
- bucketOrdinal = -1 - bucketOrdinal;
- collectExistingBucket(sub, doc, bucketOrdinal);
- } else {
- collectBucket(sub, doc, bucketOrdinal);
- }
- }
- }
- }
-
- } finally{
- ts.close();
- }
- }
-
- private void collectFromSource(int doc, long bucket, String indexedFieldName, String[] sourceFieldNames) throws IOException {
- MappedFieldType fieldType = context.getQueryShardContext().fieldMapper(indexedFieldName);
- if(fieldType == null){
- throw new IllegalArgumentException("Aggregation [" + name + "] cannot process field ["+indexedFieldName
- +"] since it is not present");
- }
-
- SourceLookup sourceLookup = context.lookup().source();
- sourceLookup.setSegmentAndDocument(ctx, doc);
- BytesRefHash inDocTerms = new BytesRefHash(256, context.bigArrays());
-
- try {
- for (String sourceField : sourceFieldNames) {
- List