Fix completion suggester's score tie-break (#34508)

jimczi · jimczi · commit ba43df6ed838 · 2018-10-19T20:02:21.000+02:00
The shard suggestion sort uses a different tie-break than the one that is used to merge different shards responses. The former uses the internal document identifier when scores are the same whereas the latter compares the surface form first. Because of this discrepancy some suggestion outputs are linked to the wrong documents because the merge sort reorders the shard suggestions differently. This change fixes this bug by duplicating the Lucene collector in order to be able to apply the same tiebreak strategy than the merge sort. This logic will be removed when https://issues.apache.org/jira/browse/LUCENE-8529 is fixed. Closes #34378
diff --git a/server/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java b/server/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java
@@ -18,7 +18,6 @@
  */
 package org.elasticsearch.search.suggest.completion;
 
-import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.search.BulkScorer;
 import org.apache.lucene.search.CollectionTerminatedException;
@@ -34,9 +33,7 @@
 import org.elasticsearch.search.suggest.Suggester;
 
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Collections;
-import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -59,15 +56,17 @@ protected Suggest.Suggestion<? extends Suggest.Suggestion.Entry<? extends Sugges
                 new Text(spare.toString()), 0, spare.length());
             completionSuggestion.addTerm(completionSuggestEntry);
             int shardSize = suggestionContext.getShardSize() != null ? suggestionContext.getShardSize() : suggestionContext.getSize();
-            TopSuggestDocsCollector collector = new TopDocumentsCollector(shardSize, suggestionContext.isSkipDuplicates());
+            TopSuggestGroupDocsCollector collector = new TopSuggestGroupDocsCollector(shardSize, suggestionContext.isSkipDuplicates());
             suggest(searcher, suggestionContext.toQuery(), collector);
             int numResult = 0;
-            for (TopSuggestDocs.SuggestScoreDoc suggestScoreDoc : collector.get().scoreLookupDocs()) {
-                TopDocumentsCollector.SuggestDoc suggestDoc = (TopDocumentsCollector.SuggestDoc) suggestScoreDoc;
+            for (TopSuggestDocs.SuggestScoreDoc suggestDoc : collector.get().scoreLookupDocs()) {
                 // collect contexts
                 Map<String, Set<CharSequence>> contexts = Collections.emptyMap();
-                if (fieldType.hasContextMappings() && suggestDoc.getContexts().isEmpty() == false) {
-                    contexts = fieldType.getContextMappings().getNamedContexts(suggestDoc.getContexts());
+                if (fieldType.hasContextMappings()) {
+                    List<CharSequence> rawContexts = collector.getContexts(suggestDoc.doc);
+                    if (rawContexts.size() > 0) {
+                        contexts = fieldType.getContextMappings().getNamedContexts(rawContexts);
+                    }
                 }
                 if (numResult++ < suggestionContext.getSize()) {
                     CompletionSuggestion.Entry.Option option = new CompletionSuggestion.Entry.Option(suggestDoc.doc,
@@ -97,120 +96,4 @@ private static void suggest(IndexSearcher searcher, CompletionQuery query, TopSu
             }
         }
     }
-
-    /**
-     * TODO: this should be refactored and moved to lucene see https://issues.apache.org/jira/browse/LUCENE-6880
-     *
-     * Custom collector that returns top documents from the completion suggester.
-     * When suggestions are augmented with contexts values this collector groups suggestions coming from the same document
-     * but matching different contexts together. Each document is counted as 1 entry and the provided size is the expected number
-     * of documents that should be returned (not the number of suggestions).
-     * This collector is also able to filter duplicate suggestion coming from different documents.
-     * When different contexts match the same suggestion form only the best one (sorted by weight) is kept.
-     * In order to keep this feature fast, the de-duplication of suggestions with different contexts is done
-     * only on the top N*num_contexts (where N is the number of documents to return) suggestions per segment.
-     * This means that skip_duplicates will visit at most N*num_contexts suggestions per segment to find unique suggestions
-     * that match the input. If more than N*num_contexts suggestions are duplicated with different contexts this collector
-     * will not be able to return more than one suggestion even when N is greater than 1.
-     **/
-    private static final class TopDocumentsCollector extends TopSuggestDocsCollector {
-
-        /**
-         * Holds a list of suggest meta data for a doc
-         */
-        private static final class SuggestDoc extends TopSuggestDocs.SuggestScoreDoc {
-
-            private List<TopSuggestDocs.SuggestScoreDoc> suggestScoreDocs;
-
-            SuggestDoc(int doc, CharSequence key, CharSequence context, float score) {
-                super(doc, key, context, score);
-            }
-
-            void add(CharSequence key, CharSequence context, float score) {
-                if (suggestScoreDocs == null) {
-                    suggestScoreDocs = new ArrayList<>(1);
-                }
-                suggestScoreDocs.add(new TopSuggestDocs.SuggestScoreDoc(doc, key, context, score));
-            }
-
-            public List<CharSequence> getKeys() {
-                if (suggestScoreDocs == null) {
-                    return Collections.singletonList(key);
-                } else {
-                    List<CharSequence> keys = new ArrayList<>(suggestScoreDocs.size() + 1);
-                    keys.add(key);
-                    for (TopSuggestDocs.SuggestScoreDoc scoreDoc : suggestScoreDocs) {
-                        keys.add(scoreDoc.key);
-                    }
-                    return keys;
-                }
-            }
-
-            public List<CharSequence> getContexts() {
-                if (suggestScoreDocs == null) {
-                    if (context != null) {
-                        return Collections.singletonList(context);
-                    } else {
-                        return Collections.emptyList();
-                    }
-                } else {
-                    List<CharSequence> contexts = new ArrayList<>(suggestScoreDocs.size() + 1);
-                    contexts.add(context);
-                    for (TopSuggestDocs.SuggestScoreDoc scoreDoc : suggestScoreDocs) {
-                        contexts.add(scoreDoc.context);
-                    }
-                    return contexts;
-                }
-            }
-        }
-
-        private final Map<Integer, SuggestDoc> docsMap;
-
-        TopDocumentsCollector(int num, boolean skipDuplicates) {
-            super(Math.max(1, num), skipDuplicates);
-            this.docsMap = new LinkedHashMap<>(num);
-        }
-
-        @Override
-        public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
-            int globalDoc = docID + docBase;
-            if (docsMap.containsKey(globalDoc)) {
-                docsMap.get(globalDoc).add(key, context, score);
-            } else {
-                docsMap.put(globalDoc, new SuggestDoc(globalDoc, key, context, score));
-                super.collect(docID, key, context, score);
-            }
-        }
-
-        @Override
-        public TopSuggestDocs get() throws IOException {
-            TopSuggestDocs entries = super.get();
-            if (entries.scoreDocs.length == 0) {
-                return TopSuggestDocs.EMPTY;
-            }
-            // The parent class returns suggestions, not documents, and dedup only the surface form (without contexts).
-            // The following code groups suggestions matching different contexts by document id and dedup the surface form + contexts
-            // if needed (skip_duplicates).
-            int size = entries.scoreDocs.length;
-            final List<TopSuggestDocs.SuggestScoreDoc> suggestDocs = new ArrayList(size);
-            final CharArraySet seenSurfaceForms = doSkipDuplicates() ? new CharArraySet(size, false) : null;
-            for (TopSuggestDocs.SuggestScoreDoc suggestEntry : entries.scoreLookupDocs()) {
-                final SuggestDoc suggestDoc;
-                if (docsMap != null) {
-                    suggestDoc = docsMap.get(suggestEntry.doc);
-                } else {
-                    suggestDoc = new SuggestDoc(suggestEntry.doc, suggestEntry.key, suggestEntry.context, suggestEntry.score);
-                }
-                if (doSkipDuplicates()) {
-                    if (seenSurfaceForms.contains(suggestDoc.key)) {
-                        continue;
-                    }
-                    seenSurfaceForms.add(suggestDoc.key);
-                }
-                suggestDocs.add(suggestDoc);
-            }
-            return new TopSuggestDocs((int) entries.totalHits,
-                suggestDocs.toArray(new TopSuggestDocs.SuggestScoreDoc[0]), entries.getMaxScore());
-        }
-    }
 }
diff --git a/server/src/main/java/org/elasticsearch/search/suggest/completion/TopSuggestGroupDocsCollector.java b/server/src/main/java/org/elasticsearch/search/suggest/completion/TopSuggestGroupDocsCollector.java
@@ -0,0 +1,232 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.search.suggest.completion;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.CollectionTerminatedException;
+import org.apache.lucene.search.TotalHits;
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.document.TopSuggestDocs;
+import org.apache.lucene.search.suggest.document.TopSuggestDocsCollector;
+import org.apache.lucene.util.PriorityQueue;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ *
+ * Custom {@link TopSuggestDocsCollector} that returns top documents from the completion suggester.
+ * <p>
+ * TODO: this should be refactored when https://issues.apache.org/jira/browse/LUCENE-8529 is fixed.
+ * Unlike the parent class, this collector uses the surface form to tie-break suggestions with identical
+ * scores.
+ * <p>
+ * This collector groups suggestions coming from the same document but matching different contexts
+ * or surface form together. When different contexts or surface forms match the same suggestion form only
+ * the best one per document (sorted by weight) is kept.
+ * <p>
+ * This collector is also able to filter duplicate suggestion coming from different documents.
+ * In order to keep this feature fast, the de-duplication of suggestions with different contexts is done
+ * only on the top N*num_contexts (where N is the number of documents to return) suggestions per segment.
+ * This means that skip_duplicates will visit at most N*num_contexts suggestions per segment to find unique suggestions
+ * that match the input. If more than N*num_contexts suggestions are duplicated with different contexts this collector
+ * will not be able to return more than one suggestion even when N is greater than 1.
+ **/
+class TopSuggestGroupDocsCollector extends TopSuggestDocsCollector {
+    private final class SuggestScoreDocPriorityQueue extends PriorityQueue<TopSuggestDocs.SuggestScoreDoc> {
+        /**
+         * Creates a new priority queue of the specified size.
+         */
+        private SuggestScoreDocPriorityQueue(int size) {
+            super(size);
+        }
+
+        @Override
+        protected boolean lessThan(TopSuggestDocs.SuggestScoreDoc a, TopSuggestDocs.SuggestScoreDoc b) {
+            if (a.score == b.score) {
+                // tie break by completion key
+                int cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
+                // prefer smaller doc id, in case of a tie
+                return cmp != 0 ? cmp > 0 : a.doc > b.doc;
+            }
+            return a.score < b.score;
+        }
+
+        /**
+         * Returns the top N results in descending order.
+         */
+        public TopSuggestDocs.SuggestScoreDoc[] getResults() {
+            int size = size();
+            TopSuggestDocs.SuggestScoreDoc[] res = new TopSuggestDocs.SuggestScoreDoc[size];
+            for (int i = size - 1; i >= 0; i--) {
+                res[i] = pop();
+            }
+            return res;
+        }
+    }
+
+
+    private final SuggestScoreDocPriorityQueue priorityQueue;
+    private final int num;
+
+    /** Only set if we are deduplicating hits: holds all per-segment hits until the end, when we dedup them */
+    private final List<TopSuggestDocs.SuggestScoreDoc> pendingResults;
+
+    /** Only set if we are deduplicating hits: holds all surface forms seen so far in the current segment */
+    final CharArraySet seenSurfaceForms;
+
+    /** Document base offset for the current Leaf */
+    protected int docBase;
+
+    private Map<Integer, List<CharSequence>> docContexts = new HashMap<>();
+
+    /**
+     * Sole constructor
+     *
+     * Collects at most <code>num</code> completions
+     * with corresponding document and weight
+     */
+    TopSuggestGroupDocsCollector(int num, boolean skipDuplicates) {
+        super(1, skipDuplicates);
+        if (num <= 0) {
+            throw new IllegalArgumentException("'num' must be > 0");
+        }
+        this.num = num;
+        this.priorityQueue = new SuggestScoreDocPriorityQueue(num);
+        if (skipDuplicates) {
+            seenSurfaceForms = new CharArraySet(num, false);
+            pendingResults = new ArrayList<>();
+        } else {
+            seenSurfaceForms = null;
+            pendingResults = null;
+        }
+    }
+
+    /**
+     * Returns the contexts associated with the provided <code>doc</code>.
+     */
+    public List<CharSequence> getContexts(int doc) {
+        return docContexts.getOrDefault(doc, Collections.emptyList());
+    }
+
+    @Override
+    protected boolean doSkipDuplicates() {
+        return seenSurfaceForms != null;
+    }
+
+    @Override
+    public int getCountToCollect() {
+        return num;
+    }
+
+    @Override
+    protected void doSetNextReader(LeafReaderContext context) throws IOException {
+        docBase = context.docBase;
+        if (seenSurfaceForms != null) {
+            seenSurfaceForms.clear();
+            // NOTE: this also clears the priorityQueue:
+            for (TopSuggestDocs.SuggestScoreDoc hit : priorityQueue.getResults()) {
+                pendingResults.add(hit);
+            }
+        }
+    }
+
+    @Override
+    public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
+        int globalDoc = docID + docBase;
+        boolean isDuplicate = docContexts.containsKey(globalDoc);
+        List<CharSequence> contexts = docContexts.computeIfAbsent(globalDoc, k -> new ArrayList<>());
+        if (context != null) {
+            contexts.add(context);
+        }
+        if (isDuplicate) {
+            return;
+        }
+        TopSuggestDocs.SuggestScoreDoc current = new TopSuggestDocs.SuggestScoreDoc(globalDoc, key, context, score);
+        if (current == priorityQueue.insertWithOverflow(current)) {
+            // if the current SuggestScoreDoc has overflown from pq,
+            // we can assume all of the successive collections from
+            // this leaf will be overflown as well
+            // TODO: reuse the overflow instance?
+            throw new CollectionTerminatedException();
+        }
+    }
+
+    @Override
+    public TopSuggestDocs get() throws IOException {
+
+        TopSuggestDocs.SuggestScoreDoc[] suggestScoreDocs;
+
+        if (seenSurfaceForms != null) {
+            // NOTE: this also clears the priorityQueue:
+            for (TopSuggestDocs.SuggestScoreDoc hit : priorityQueue.getResults()) {
+                pendingResults.add(hit);
+            }
+
+            // Deduplicate all hits: we already dedup'd efficiently within each segment by
+            // truncating the FST top paths search, but across segments there may still be dups:
+            seenSurfaceForms.clear();
+
+            // TODO: we could use a priority queue here to make cost O(N * log(num)) instead of O(N * log(N)), where N = O(num *
+            // numSegments), but typically numSegments is smallish and num is smallish so this won't matter much in practice:
+
+            Collections.sort(pendingResults,
+                (a, b) -> {
+                    // sort by higher score
+                    int cmp = Float.compare(b.score, a.score);
+                    if (cmp == 0) {
+                        // tie break by completion key
+                        cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
+                        if (cmp == 0) {
+                            // prefer smaller doc id, in case of a tie
+                            cmp = Integer.compare(a.doc, b.doc);
+                        }
+                    }
+                    return cmp;
+                });
+
+            List<TopSuggestDocs.SuggestScoreDoc> hits = new ArrayList<>();
+
+            for (TopSuggestDocs.SuggestScoreDoc hit : pendingResults) {
+                if (seenSurfaceForms.contains(hit.key) == false) {
+                    seenSurfaceForms.add(hit.key);
+                    hits.add(hit);
+                    if (hits.size() == num) {
+                        break;
+                    }
+                }
+            }
+            suggestScoreDocs = hits.toArray(new TopSuggestDocs.SuggestScoreDoc[0]);
+        } else {
+            suggestScoreDocs = priorityQueue.getResults();
+        }
+
+        if (suggestScoreDocs.length > 0) {
+            return new TopSuggestDocs(new TotalHits(suggestScoreDocs.length, TotalHits.Relation.EQUAL_TO), suggestScoreDocs);
+        } else {
+            return TopSuggestDocs.EMPTY;
+        }
+    }
+
+}
diff --git a/server/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchIT.java b/server/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchIT.java