elastic · jimczi · Oct 19, 2018 · Oct 16, 2018 · Oct 16, 2018 · Oct 16, 2018
diff --git a/server/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java b/server/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java
@@ -18,7 +18,6 @@
  */
 package org.elasticsearch.search.suggest.completion;
 
-import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.search.BulkScorer;
 import org.apache.lucene.search.CollectionTerminatedException;
@@ -34,9 +33,7 @@
 import org.elasticsearch.search.suggest.Suggester;
 
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Collections;
-import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -59,15 +56,17 @@ protected Suggest.Suggestion<? extends Suggest.Suggestion.Entry<? extends Sugges
                 new Text(spare.toString()), 0, spare.length());
             completionSuggestion.addTerm(completionSuggestEntry);
             int shardSize = suggestionContext.getShardSize() != null ? suggestionContext.getShardSize() : suggestionContext.getSize();
-            TopSuggestDocsCollector collector = new TopDocumentsCollector(shardSize, suggestionContext.isSkipDuplicates());
+            TopSuggestGroupDocsCollector collector = new TopSuggestGroupDocsCollector(shardSize, suggestionContext.isSkipDuplicates());
             suggest(searcher, suggestionContext.toQuery(), collector);
             int numResult = 0;
-            for (TopSuggestDocs.SuggestScoreDoc suggestScoreDoc : collector.get().scoreLookupDocs()) {
-                TopDocumentsCollector.SuggestDoc suggestDoc = (TopDocumentsCollector.SuggestDoc) suggestScoreDoc;
+            for (TopSuggestDocs.SuggestScoreDoc suggestDoc : collector.get().scoreLookupDocs()) {
                 // collect contexts
                 Map<String, Set<CharSequence>> contexts = Collections.emptyMap();
-                if (fieldType.hasContextMappings() && suggestDoc.getContexts().isEmpty() == false) {
-                    contexts = fieldType.getContextMappings().getNamedContexts(suggestDoc.getContexts());
+                if (fieldType.hasContextMappings()) {
+                    List<CharSequence> rawContexts = collector.getContexts(suggestDoc.doc);
+                    if (rawContexts.size() > 0) {
+                        contexts = fieldType.getContextMappings().getNamedContexts(rawContexts);
+                    }
                 }
                 if (numResult++ < suggestionContext.getSize()) {
                     CompletionSuggestion.Entry.Option option = new CompletionSuggestion.Entry.Option(suggestDoc.doc,
@@ -97,120 +96,4 @@ private static void suggest(IndexSearcher searcher, CompletionQuery query, TopSu
             }
         }
     }
-
-    /**
-     * TODO: this should be refactored and moved to lucene see https://issues.apache.org/jira/browse/LUCENE-6880
-     *
-     * Custom collector that returns top documents from the completion suggester.
-     * When suggestions are augmented with contexts values this collector groups suggestions coming from the same document
-     * but matching different contexts together. Each document is counted as 1 entry and the provided size is the expected number
-     * of documents that should be returned (not the number of suggestions).
-     * This collector is also able to filter duplicate suggestion coming from different documents.
-     * When different contexts match the same suggestion form only the best one (sorted by weight) is kept.
-     * In order to keep this feature fast, the de-duplication of suggestions with different contexts is done
-     * only on the top N*num_contexts (where N is the number of documents to return) suggestions per segment.
-     * This means that skip_duplicates will visit at most N*num_contexts suggestions per segment to find unique suggestions
-     * that match the input. If more than N*num_contexts suggestions are duplicated with different contexts this collector
-     * will not be able to return more than one suggestion even when N is greater than 1.
-     **/
-    private static final class TopDocumentsCollector extends TopSuggestDocsCollector {
-
-        /**
-         * Holds a list of suggest meta data for a doc
-         */
-        private static final class SuggestDoc extends TopSuggestDocs.SuggestScoreDoc {
-
-            private List<TopSuggestDocs.SuggestScoreDoc> suggestScoreDocs;
-
-            SuggestDoc(int doc, CharSequence key, CharSequence context, float score) {
-                super(doc, key, context, score);
-            }
-
-            void add(CharSequence key, CharSequence context, float score) {
-                if (suggestScoreDocs == null) {
-                    suggestScoreDocs = new ArrayList<>(1);
-                }
-                suggestScoreDocs.add(new TopSuggestDocs.SuggestScoreDoc(doc, key, context, score));
-            }
-
-            public List<CharSequence> getKeys() {
-                if (suggestScoreDocs == null) {
-                    return Collections.singletonList(key);
-                } else {
-                    List<CharSequence> keys = new ArrayList<>(suggestScoreDocs.size() + 1);
-                    keys.add(key);
-                    for (TopSuggestDocs.SuggestScoreDoc scoreDoc : suggestScoreDocs) {
-                        keys.add(scoreDoc.key);
-                    }
-                    return keys;
-                }
-            }
-
-            public List<CharSequence> getContexts() {
-                if (suggestScoreDocs == null) {
-                    if (context != null) {
-                        return Collections.singletonList(context);
-                    } else {
-                        return Collections.emptyList();
-                    }
-                } else {
-                    List<CharSequence> contexts = new ArrayList<>(suggestScoreDocs.size() + 1);
-                    contexts.add(context);
-                    for (TopSuggestDocs.SuggestScoreDoc scoreDoc : suggestScoreDocs) {
-                        contexts.add(scoreDoc.context);
-                    }
-                    return contexts;
-                }
-            }
-        }
-
-        private final Map<Integer, SuggestDoc> docsMap;
-
-        TopDocumentsCollector(int num, boolean skipDuplicates) {
-            super(Math.max(1, num), skipDuplicates);
-            this.docsMap = new LinkedHashMap<>(num);
-        }
-
-        @Override
-        public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
-            int globalDoc = docID + docBase;
-            if (docsMap.containsKey(globalDoc)) {
-                docsMap.get(globalDoc).add(key, context, score);
-            } else {
-                docsMap.put(globalDoc, new SuggestDoc(globalDoc, key, context, score));
-                super.collect(docID, key, context, score);
-            }
-        }
-
-        @Override
-        public TopSuggestDocs get() throws IOException {
-            TopSuggestDocs entries = super.get();
-            if (entries.scoreDocs.length == 0) {
-                return TopSuggestDocs.EMPTY;
-            }
-            // The parent class returns suggestions, not documents, and dedup only the surface form (without contexts).
-            // The following code groups suggestions matching different contexts by document id and dedup the surface form + contexts
-            // if needed (skip_duplicates).
-            int size = entries.scoreDocs.length;
-            final List<TopSuggestDocs.SuggestScoreDoc> suggestDocs = new ArrayList<>(size);
-            final CharArraySet seenSurfaceForms = doSkipDuplicates() ? new CharArraySet(size, false) : null;
-            for (TopSuggestDocs.SuggestScoreDoc suggestEntry : entries.scoreLookupDocs()) {
-                final SuggestDoc suggestDoc;
-                if (docsMap != null) {
-                    suggestDoc = docsMap.get(suggestEntry.doc);
-                } else {
-                    suggestDoc = new SuggestDoc(suggestEntry.doc, suggestEntry.key, suggestEntry.context, suggestEntry.score);
-                }
-                if (doSkipDuplicates()) {
-                    if (seenSurfaceForms.contains(suggestDoc.key)) {
-                        continue;
-                    }
-                    seenSurfaceForms.add(suggestDoc.key);
-                }
-                suggestDocs.add(suggestDoc);
-            }
-            return new TopSuggestDocs(entries.totalHits,
-                suggestDocs.toArray(new TopSuggestDocs.SuggestScoreDoc[0]));
-        }
-    }
 }
diff --git a/...c/main/java/org/elasticsearch/search/suggest/completion/TopSuggestGroupDocsCollector.java b/...c/main/java/org/elasticsearch/search/suggest/completion/TopSuggestGroupDocsCollector.java
@@ -0,0 +1,232 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.search.suggest.completion;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.CollectionTerminatedException;
+import org.apache.lucene.search.TotalHits;
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.document.TopSuggestDocs;
+import org.apache.lucene.search.suggest.document.TopSuggestDocsCollector;
+import org.apache.lucene.util.PriorityQueue;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ *
+ * Custom {@link TopSuggestDocsCollector} that returns top documents from the completion suggester.
+ * <p>
+ * TODO: this should be refactored when https://issues.apache.org/jira/browse/LUCENE-8529 is fixed.
+ * Unlike the parent class, this collector uses the surface form to tie-break suggestions with identical
+ * scores.
+ * <p>
+ * This collector groups together suggestions coming from the same document but matching different contexts
+ * or surface forms. When different contexts or surface forms match the same suggestion form, only
+ * the best one per document (sorted by weight) is kept.
+ * <p>
+ * This collector is also able to filter duplicate suggestions coming from different documents.
+ * In order to keep this feature fast, the de-duplication of suggestions with different contexts is done
+ * only on the top N*num_contexts (where N is the number of documents to return) suggestions per segment.
+ * This means that skip_duplicates will visit at most N*num_contexts suggestions per segment to find unique suggestions
+ * that match the input. If more than N*num_contexts suggestions are duplicated with different contexts this collector
+ * will not be able to return more than one suggestion even when N is greater than 1.
+ **/
+class TopSuggestGroupDocsCollector extends TopSuggestDocsCollector {
+    /** Bounded queue ordering suggestions by score, then completion key, then doc id (see {@link #lessThan}). */
+    private static final class SuggestScoreDocPriorityQueue extends PriorityQueue<TopSuggestDocs.SuggestScoreDoc> {
+        /**
+         * Creates a new priority queue of the specified size.
+         */
+        private SuggestScoreDocPriorityQueue(int size) {
+            super(size);
+        }
+
+        @Override
+        protected boolean lessThan(TopSuggestDocs.SuggestScoreDoc a, TopSuggestDocs.SuggestScoreDoc b) {
+            if (a.score == b.score) {
+                // tie break by completion key
+                int cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
+                // prefer smaller doc id, in case of a tie
+                return cmp != 0 ? cmp > 0 : a.doc > b.doc;
+            }
+            return a.score < b.score;
+        }
+
+        /**
+         * Returns the top N results in descending order.
+         */
+        public TopSuggestDocs.SuggestScoreDoc[] getResults() {
+            int size = size();
+            TopSuggestDocs.SuggestScoreDoc[] res = new TopSuggestDocs.SuggestScoreDoc[size];
+            for (int i = size - 1; i >= 0; i--) {
+                res[i] = pop();
+            }
+            return res;
+        }
+    }
+
+    private final SuggestScoreDocPriorityQueue priorityQueue;
+    private final int num;
+
+    /** Only set if we are deduplicating hits: holds all per-segment hits until the end, when we dedup them */
+    private final List<TopSuggestDocs.SuggestScoreDoc> pendingResults;
+
+    /** Only set if we are deduplicating hits: holds all surface forms seen so far in the current segment */
+    final CharArraySet seenSurfaceForms;
+
+    /** Document base offset for the current Leaf */
+    protected int docBase;
+
+    /** Contexts collected so far, keyed by global doc id (segment-local doc id + {@link #docBase}) */
+    private final Map<Integer, List<CharSequence>> docContexts = new HashMap<>();
+
+    /**
+     * Sole constructor
+     *
+     * Collects at most <code>num</code> completions with corresponding document and weight
+     * @throws IllegalArgumentException if <code>num</code> is not strictly positive
+     */
+    TopSuggestGroupDocsCollector(int num, boolean skipDuplicates) {
+        super(1, skipDuplicates);
+        if (num <= 0) {
+            throw new IllegalArgumentException("'num' must be > 0");
+        }
+        this.num = num;
+        this.priorityQueue = new SuggestScoreDocPriorityQueue(num);
+        if (skipDuplicates) {
+            seenSurfaceForms = new CharArraySet(num, false);
+            pendingResults = new ArrayList<>();
+        } else {
+            seenSurfaceForms = null;
+            pendingResults = null;
+        }
+    }
+
+    /**
+     * Returns the contexts collected for the provided global <code>doc</code>, or an empty list if there are none.
+     */
+    public List<CharSequence> getContexts(int doc) {
+        return docContexts.getOrDefault(doc, Collections.emptyList());
+    }
+
+    @Override
+    protected boolean doSkipDuplicates() {
+        return seenSurfaceForms != null;
+    }
+
+    @Override
+    public int getCountToCollect() {
+        return num;
+    }
+
+    @Override
+    protected void doSetNextReader(LeafReaderContext context) throws IOException {
+        // NOTE(review): the parent presumably keeps its own per-leaf state (shadowed here); delegate so it is reset too - confirm
+        super.doSetNextReader(context);
+        docBase = context.docBase;
+        if (seenSurfaceForms != null) {
+            seenSurfaceForms.clear();
+            // NOTE: this also clears the priorityQueue:
+            Collections.addAll(pendingResults, priorityQueue.getResults());
+        }
+    }
+
+    /** Records every non-null context for the document, but queues only the first suggestion collected for each document. */
+    @Override
+    public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
+        int globalDoc = docID + docBase;
+        boolean isDuplicate = docContexts.containsKey(globalDoc);
+        List<CharSequence> contexts = docContexts.computeIfAbsent(globalDoc, k -> new ArrayList<>());
+        if (context != null) {
+            contexts.add(context);
+        }
+        if (isDuplicate) {
+            return;
+        }
+        TopSuggestDocs.SuggestScoreDoc current = new TopSuggestDocs.SuggestScoreDoc(globalDoc, key, context, score);
+        if (current == priorityQueue.insertWithOverflow(current)) {
+            // the current SuggestScoreDoc overflowed from the pq, so we can assume that all of the
+            // successive collections from this leaf would overflow as well
+            // TODO: reuse the overflow instance?
+            throw new CollectionTerminatedException();
+        }
+    }
+
+    /**
+     * Returns the collected suggestions, best first. When duplicates are skipped, the pending hits of all
+     * leaves are sorted and only the first hit per surface form is kept, up to <code>num</code> hits.
+     */
+    @Override
+    public TopSuggestDocs get() throws IOException {
+        TopSuggestDocs.SuggestScoreDoc[] suggestScoreDocs;
+        if (seenSurfaceForms != null) {
+            // NOTE: this also clears the priorityQueue:
+            Collections.addAll(pendingResults, priorityQueue.getResults());
+
+            // Deduplicate all hits: we already dedup'd efficiently within each segment by
+            // truncating the FST top paths search, but across segments there may still be dups:
+            seenSurfaceForms.clear();
+
+            // TODO: we could use a priority queue here to make cost O(N * log(num)) instead of O(N * log(N)), where N = O(num *
+            // numSegments), but typically numSegments is smallish and num is smallish so this won't matter much in practice:
+
+            pendingResults.sort(
+                (a, b) -> {
+                    // sort by higher score
+                    int cmp = Float.compare(b.score, a.score);
+                    if (cmp == 0) {
+                        // tie break by completion key
+                        cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
+                        if (cmp == 0) {
+                            // prefer smaller doc id, in case of a tie
+                            cmp = Integer.compare(a.doc, b.doc);
+                        }
+                    }
+                    return cmp;
+                });
+
+            List<TopSuggestDocs.SuggestScoreDoc> hits = new ArrayList<>();
+
+            for (TopSuggestDocs.SuggestScoreDoc hit : pendingResults) {
+                if (seenSurfaceForms.contains(hit.key) == false) {
+                    seenSurfaceForms.add(hit.key);
+                    hits.add(hit);
+                    if (hits.size() == num) {
+                        break;
+                    }
+                }
+            }
+            suggestScoreDocs = hits.toArray(new TopSuggestDocs.SuggestScoreDoc[0]);
+        } else {
+            suggestScoreDocs = priorityQueue.getResults();
+        }
+
+        if (suggestScoreDocs.length > 0) {
+            return new TopSuggestDocs(new TotalHits(suggestScoreDocs.length, TotalHits.Relation.EQUAL_TO), suggestScoreDocs);
+        } else {
+            return TopSuggestDocs.EMPTY;
+        }
+    }
+}