diff --git a/server/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java b/server/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java index 7dc63a8daac78..eaadd399abbca 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java @@ -18,7 +18,6 @@ */ package org.elasticsearch.search.suggest.completion; -import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.BulkScorer; import org.apache.lucene.search.CollectionTerminatedException; @@ -34,9 +33,7 @@ import org.elasticsearch.search.suggest.Suggester; import java.io.IOException; -import java.util.ArrayList; import java.util.Collections; -import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; @@ -59,15 +56,17 @@ protected Suggest.Suggestion> contexts = Collections.emptyMap(); - if (fieldType.hasContextMappings() && suggestDoc.getContexts().isEmpty() == false) { - contexts = fieldType.getContextMappings().getNamedContexts(suggestDoc.getContexts()); + if (fieldType.hasContextMappings()) { + List rawContexts = collector.getContexts(suggestDoc.doc); + if (rawContexts.size() > 0) { + contexts = fieldType.getContextMappings().getNamedContexts(rawContexts); + } } if (numResult++ < suggestionContext.getSize()) { CompletionSuggestion.Entry.Option option = new CompletionSuggestion.Entry.Option(suggestDoc.doc, @@ -97,120 +96,4 @@ private static void suggest(IndexSearcher searcher, CompletionQuery query, TopSu } } } - - /** - * TODO: this should be refactored and moved to lucene see https://issues.apache.org/jira/browse/LUCENE-6880 - * - * Custom collector that returns top documents from the completion suggester. - * When suggestions are augmented with contexts values this collector groups suggestions coming from the same document - * but matching different contexts together. Each document is counted as 1 entry and the provided size is the expected number - * of documents that should be returned (not the number of suggestions). - * This collector is also able to filter duplicate suggestion coming from different documents. - * When different contexts match the same suggestion form only the best one (sorted by weight) is kept. - * In order to keep this feature fast, the de-duplication of suggestions with different contexts is done - * only on the top N*num_contexts (where N is the number of documents to return) suggestions per segment. - * This means that skip_duplicates will visit at most N*num_contexts suggestions per segment to find unique suggestions - * that match the input. If more than N*num_contexts suggestions are duplicated with different contexts this collector - * will not be able to return more than one suggestion even when N is greater than 1. - **/ - private static final class TopDocumentsCollector extends TopSuggestDocsCollector { - - /** - * Holds a list of suggest meta data for a doc - */ - private static final class SuggestDoc extends TopSuggestDocs.SuggestScoreDoc { - - private List suggestScoreDocs; - - SuggestDoc(int doc, CharSequence key, CharSequence context, float score) { - super(doc, key, context, score); - } - - void add(CharSequence key, CharSequence context, float score) { - if (suggestScoreDocs == null) { - suggestScoreDocs = new ArrayList<>(1); - } - suggestScoreDocs.add(new TopSuggestDocs.SuggestScoreDoc(doc, key, context, score)); - } - - public List getKeys() { - if (suggestScoreDocs == null) { - return Collections.singletonList(key); - } else { - List keys = new ArrayList<>(suggestScoreDocs.size() + 1); - keys.add(key); - for (TopSuggestDocs.SuggestScoreDoc scoreDoc : suggestScoreDocs) { - keys.add(scoreDoc.key); - } - return keys; - } - } - - public List getContexts() { - if (suggestScoreDocs == null) { - if (context != null) { - return Collections.singletonList(context); - } else { - return Collections.emptyList(); - } - } else { - List contexts = new ArrayList<>(suggestScoreDocs.size() + 1); - contexts.add(context); - for (TopSuggestDocs.SuggestScoreDoc scoreDoc : suggestScoreDocs) { - contexts.add(scoreDoc.context); - } - return contexts; - } - } - } - - private final Map docsMap; - - TopDocumentsCollector(int num, boolean skipDuplicates) { - super(Math.max(1, num), skipDuplicates); - this.docsMap = new LinkedHashMap<>(num); - } - - @Override - public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException { - int globalDoc = docID + docBase; - if (docsMap.containsKey(globalDoc)) { - docsMap.get(globalDoc).add(key, context, score); - } else { - docsMap.put(globalDoc, new SuggestDoc(globalDoc, key, context, score)); - super.collect(docID, key, context, score); - } - } - - @Override - public TopSuggestDocs get() throws IOException { - TopSuggestDocs entries = super.get(); - if (entries.scoreDocs.length == 0) { - return TopSuggestDocs.EMPTY; - } - // The parent class returns suggestions, not documents, and dedup only the surface form (without contexts). - // The following code groups suggestions matching different contexts by document id and dedup the surface form + contexts - // if needed (skip_duplicates). - int size = entries.scoreDocs.length; - final List suggestDocs = new ArrayList<>(size); - final CharArraySet seenSurfaceForms = doSkipDuplicates() ? new CharArraySet(size, false) : null; - for (TopSuggestDocs.SuggestScoreDoc suggestEntry : entries.scoreLookupDocs()) { - final SuggestDoc suggestDoc; - if (docsMap != null) { - suggestDoc = docsMap.get(suggestEntry.doc); - } else { - suggestDoc = new SuggestDoc(suggestEntry.doc, suggestEntry.key, suggestEntry.context, suggestEntry.score); - } - if (doSkipDuplicates()) { - if (seenSurfaceForms.contains(suggestDoc.key)) { - continue; - } - seenSurfaceForms.add(suggestDoc.key); - } - suggestDocs.add(suggestDoc); - } - return new TopSuggestDocs(entries.totalHits, - suggestDocs.toArray(new TopSuggestDocs.SuggestScoreDoc[0])); - } - } } diff --git a/server/src/main/java/org/elasticsearch/search/suggest/completion/TopSuggestGroupDocsCollector.java b/server/src/main/java/org/elasticsearch/search/suggest/completion/TopSuggestGroupDocsCollector.java new file mode 100644 index 0000000000000..879b8114b2a50 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/search/suggest/completion/TopSuggestGroupDocsCollector.java @@ -0,0 +1,232 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.suggest.completion; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.CollectionTerminatedException; +import org.apache.lucene.search.TotalHits; +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.document.TopSuggestDocs; +import org.apache.lucene.search.suggest.document.TopSuggestDocsCollector; +import org.apache.lucene.util.PriorityQueue; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * + * Custom {@link TopSuggestDocsCollector} that returns top documents from the completion suggester. + *

+ * TODO: this should be refactored when https://issues.apache.org/jira/browse/LUCENE-8529 is fixed. + * Unlike the parent class, this collector uses the surface form to tie-break suggestions with identical + * scores. + *

+ * This collector groups suggestions coming from the same document but matching different contexts + * or surface form together. When different contexts or surface forms match the same suggestion form only + * the best one per document (sorted by weight) is kept. + *

+ * This collector is also able to filter duplicate suggestion coming from different documents. + * In order to keep this feature fast, the de-duplication of suggestions with different contexts is done + * only on the top N*num_contexts (where N is the number of documents to return) suggestions per segment. + * This means that skip_duplicates will visit at most N*num_contexts suggestions per segment to find unique suggestions + * that match the input. If more than N*num_contexts suggestions are duplicated with different contexts this collector + * will not be able to return more than one suggestion even when N is greater than 1. + **/ +class TopSuggestGroupDocsCollector extends TopSuggestDocsCollector { + private final class SuggestScoreDocPriorityQueue extends PriorityQueue { + /** + * Creates a new priority queue of the specified size. + */ + private SuggestScoreDocPriorityQueue(int size) { + super(size); + } + + @Override + protected boolean lessThan(TopSuggestDocs.SuggestScoreDoc a, TopSuggestDocs.SuggestScoreDoc b) { + if (a.score == b.score) { + // tie break by completion key + int cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key); + // prefer smaller doc id, in case of a tie + return cmp != 0 ? cmp > 0 : a.doc > b.doc; + } + return a.score < b.score; + } + + /** + * Returns the top N results in descending order. + */ + public TopSuggestDocs.SuggestScoreDoc[] getResults() { + int size = size(); + TopSuggestDocs.SuggestScoreDoc[] res = new TopSuggestDocs.SuggestScoreDoc[size]; + for (int i = size - 1; i >= 0; i--) { + res[i] = pop(); + } + return res; + } + } + + + private final SuggestScoreDocPriorityQueue priorityQueue; + private final int num; + + /** Only set if we are deduplicating hits: holds all per-segment hits until the end, when we dedup them */ + private final List pendingResults; + + /** Only set if we are deduplicating hits: holds all surface forms seen so far in the current segment */ + final CharArraySet seenSurfaceForms; + + /** Document base offset for the current Leaf */ + protected int docBase; + + private Map> docContexts = new HashMap<>(); + + /** + * Sole constructor + * + * Collects at most num completions + * with corresponding document and weight + */ + TopSuggestGroupDocsCollector(int num, boolean skipDuplicates) { + super(1, skipDuplicates); + if (num <= 0) { + throw new IllegalArgumentException("'num' must be > 0"); + } + this.num = num; + this.priorityQueue = new SuggestScoreDocPriorityQueue(num); + if (skipDuplicates) { + seenSurfaceForms = new CharArraySet(num, false); + pendingResults = new ArrayList<>(); + } else { + seenSurfaceForms = null; + pendingResults = null; + } + } + + /** + * Returns the contexts associated with the provided doc. + */ + public List getContexts(int doc) { + return docContexts.getOrDefault(doc, Collections.emptyList()); + } + + @Override + protected boolean doSkipDuplicates() { + return seenSurfaceForms != null; + } + + @Override + public int getCountToCollect() { + return num; + } + + @Override + protected void doSetNextReader(LeafReaderContext context) throws IOException { + docBase = context.docBase; + if (seenSurfaceForms != null) { + seenSurfaceForms.clear(); + // NOTE: this also clears the priorityQueue: + for (TopSuggestDocs.SuggestScoreDoc hit : priorityQueue.getResults()) { + pendingResults.add(hit); + } + } + } + + @Override + public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException { + int globalDoc = docID + docBase; + boolean isDuplicate = docContexts.containsKey(globalDoc); + List contexts = docContexts.computeIfAbsent(globalDoc, k -> new ArrayList<>()); + if (context != null) { + contexts.add(context); + } + if (isDuplicate) { + return; + } + TopSuggestDocs.SuggestScoreDoc current = new TopSuggestDocs.SuggestScoreDoc(globalDoc, key, context, score); + if (current == priorityQueue.insertWithOverflow(current)) { + // if the current SuggestScoreDoc has overflown from pq, + // we can assume all of the successive collections from + // this leaf will be overflown as well + // TODO: reuse the overflow instance? + throw new CollectionTerminatedException(); + } + } + + @Override + public TopSuggestDocs get() throws IOException { + + TopSuggestDocs.SuggestScoreDoc[] suggestScoreDocs; + + if (seenSurfaceForms != null) { + // NOTE: this also clears the priorityQueue: + for (TopSuggestDocs.SuggestScoreDoc hit : priorityQueue.getResults()) { + pendingResults.add(hit); + } + + // Deduplicate all hits: we already dedup'd efficiently within each segment by + // truncating the FST top paths search, but across segments there may still be dups: + seenSurfaceForms.clear(); + + // TODO: we could use a priority queue here to make cost O(N * log(num)) instead of O(N * log(N)), where N = O(num * + // numSegments), but typically numSegments is smallish and num is smallish so this won't matter much in practice: + + Collections.sort(pendingResults, + (a, b) -> { + // sort by higher score + int cmp = Float.compare(b.score, a.score); + if (cmp == 0) { + // tie break by completion key + cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key); + if (cmp == 0) { + // prefer smaller doc id, in case of a tie + cmp = Integer.compare(a.doc, b.doc); + } + } + return cmp; + }); + + List hits = new ArrayList<>(); + + for (TopSuggestDocs.SuggestScoreDoc hit : pendingResults) { + if (seenSurfaceForms.contains(hit.key) == false) { + seenSurfaceForms.add(hit.key); + hits.add(hit); + if (hits.size() == num) { + break; + } + } + } + suggestScoreDocs = hits.toArray(new TopSuggestDocs.SuggestScoreDoc[0]); + } else { + suggestScoreDocs = priorityQueue.getResults(); + } + + if (suggestScoreDocs.length > 0) { + return new TopSuggestDocs(new TotalHits(suggestScoreDocs.length, TotalHits.Relation.EQUAL_TO), suggestScoreDocs); + } else { + return TopSuggestDocs.EMPTY; + } + } + +} diff --git a/server/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchIT.java b/server/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchIT.java index 52893a3c0322b..839ae89146aa8 100644 --- a/server/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchIT.java +++ b/server/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchIT.java @@ -54,6 +54,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; @@ -91,6 +92,41 @@ protected Collection> nodePlugins() { return Arrays.asList(InternalSettingsPlugin.class); } + public void testTieBreak() throws Exception { + final CompletionMappingBuilder mapping = new CompletionMappingBuilder(); + mapping.indexAnalyzer("keyword"); + createIndexAndMapping(mapping); + + int numDocs = randomIntBetween(3, 50); + List indexRequestBuilders = new ArrayList<>(); + Set entrySet = new HashSet<>(); + for (int i = 0; i < numDocs; i++) { + String value = "a" + randomValueOtherThanMany(v -> entrySet.contains(v), + () -> randomAlphaOfLengthBetween(1, 10)); + entrySet.add(value); + indexRequestBuilders.add(client().prepareIndex(INDEX, TYPE, "" + i) + .setSource(jsonBuilder() + .startObject() + .startObject(FIELD) + .field("input", value) + .field("weight", 10) + .endObject() + .endObject() + )); + } + String[] entries = entrySet.stream() + .toArray(String[]::new); + Arrays.sort(entries); + indexRandom(true, indexRequestBuilders); + for (int i = 1; i < numDocs; i++) { + CompletionSuggestionBuilder prefix = SuggestBuilders.completionSuggestion(FIELD) + .prefix("a") + .size(i); + String[] topEntries = Arrays.copyOfRange(entries, 0, i); + assertSuggestions("foo", prefix, topEntries); + } + } + public void testPrefix() throws Exception { final CompletionMappingBuilder mapping = new CompletionMappingBuilder(); createIndexAndMapping(mapping);