
Commit ba43df6

Fix completion suggester's score tie-break (#34508)
The shard-level suggestion sort uses a different tie-break than the one used to merge the responses of different shards. The former falls back to the internal document identifier when scores are equal, whereas the latter compares the surface form first. Because of this discrepancy, some suggestion outputs are linked to the wrong documents: the merge sort reorders the shard suggestions differently. This change fixes the bug by duplicating the Lucene collector so that it can apply the same tie-break strategy as the merge sort. This logic will be removed when https://issues.apache.org/jira/browse/LUCENE-8529 is fixed.

Closes #34378
1 parent c5b6770 commit ba43df6
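
For reference, the ordering that the shard collector and the merge sort now share is: higher score first, then the surface form (the completion key), then the smaller document id. The following is a minimal, self-contained sketch of that comparator; the TieBreakDemo class and its sample data are illustrative and not part of this commit, and it only assumes Lucene's suggest module on the classpath.

import java.util.Arrays;
import java.util.Comparator;

import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.document.TopSuggestDocs;

public class TieBreakDemo {

    // Same ordering as the merge sort: higher score first, then surface form, then smaller doc id.
    static final Comparator<TopSuggestDocs.SuggestScoreDoc> TIE_BREAK = (a, b) -> {
        int cmp = Float.compare(b.score, a.score);                       // higher score wins
        if (cmp == 0) {
            cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);  // then the surface form
        }
        if (cmp == 0) {
            cmp = Integer.compare(a.doc, b.doc);                         // then the smaller doc id
        }
        return cmp;
    };

    public static void main(String[] args) {
        TopSuggestDocs.SuggestScoreDoc[] docs = {
            new TopSuggestDocs.SuggestScoreDoc(7, "nirvana", null, 1.0f),
            new TopSuggestDocs.SuggestScoreDoc(3, "nirvana", null, 1.0f),
            new TopSuggestDocs.SuggestScoreDoc(5, "nevermind", null, 1.0f),
        };
        Arrays.sort(docs, TIE_BREAK);
        for (TopSuggestDocs.SuggestScoreDoc doc : docs) {
            // prints: nevermind (doc 5), nirvana (doc 3), nirvana (doc 7)
            System.out.println(doc.key + " (doc " + doc.doc + ")");
        }
    }
}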


3 files changed: +275 -124 lines changed


server/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java

Lines changed: 7 additions & 124 deletions
@@ -18,7 +18,6 @@
  */
 package org.elasticsearch.search.suggest.completion;
 
-import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.search.BulkScorer;
 import org.apache.lucene.search.CollectionTerminatedException;
@@ -34,9 +33,7 @@
 import org.elasticsearch.search.suggest.Suggester;
 
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Collections;
-import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -59,15 +56,17 @@ protected Suggest.Suggestion<? extends Suggest.Suggestion.Entry<? extends Sugges
                 new Text(spare.toString()), 0, spare.length());
             completionSuggestion.addTerm(completionSuggestEntry);
             int shardSize = suggestionContext.getShardSize() != null ? suggestionContext.getShardSize() : suggestionContext.getSize();
-            TopSuggestDocsCollector collector = new TopDocumentsCollector(shardSize, suggestionContext.isSkipDuplicates());
+            TopSuggestGroupDocsCollector collector = new TopSuggestGroupDocsCollector(shardSize, suggestionContext.isSkipDuplicates());
             suggest(searcher, suggestionContext.toQuery(), collector);
             int numResult = 0;
-            for (TopSuggestDocs.SuggestScoreDoc suggestScoreDoc : collector.get().scoreLookupDocs()) {
-                TopDocumentsCollector.SuggestDoc suggestDoc = (TopDocumentsCollector.SuggestDoc) suggestScoreDoc;
+            for (TopSuggestDocs.SuggestScoreDoc suggestDoc : collector.get().scoreLookupDocs()) {
                 // collect contexts
                 Map<String, Set<CharSequence>> contexts = Collections.emptyMap();
-                if (fieldType.hasContextMappings() && suggestDoc.getContexts().isEmpty() == false) {
-                    contexts = fieldType.getContextMappings().getNamedContexts(suggestDoc.getContexts());
+                if (fieldType.hasContextMappings()) {
+                    List<CharSequence> rawContexts = collector.getContexts(suggestDoc.doc);
+                    if (rawContexts.size() > 0) {
+                        contexts = fieldType.getContextMappings().getNamedContexts(rawContexts);
+                    }
                 }
                 if (numResult++ < suggestionContext.getSize()) {
                     CompletionSuggestion.Entry.Option option = new CompletionSuggestion.Entry.Option(suggestDoc.doc,
@@ -97,120 +96,4 @@ private static void suggest(IndexSearcher searcher, CompletionQuery query, TopSu
             }
         }
     }
-
-    /**
-     * TODO: this should be refactored and moved to lucene see https://issues.apache.org/jira/browse/LUCENE-6880
-     *
-     * Custom collector that returns top documents from the completion suggester.
-     * When suggestions are augmented with contexts values this collector groups suggestions coming from the same document
-     * but matching different contexts together. Each document is counted as 1 entry and the provided size is the expected number
-     * of documents that should be returned (not the number of suggestions).
-     * This collector is also able to filter duplicate suggestion coming from different documents.
-     * When different contexts match the same suggestion form only the best one (sorted by weight) is kept.
-     * In order to keep this feature fast, the de-duplication of suggestions with different contexts is done
-     * only on the top N*num_contexts (where N is the number of documents to return) suggestions per segment.
-     * This means that skip_duplicates will visit at most N*num_contexts suggestions per segment to find unique suggestions
-     * that match the input. If more than N*num_contexts suggestions are duplicated with different contexts this collector
-     * will not be able to return more than one suggestion even when N is greater than 1.
-     **/
-    private static final class TopDocumentsCollector extends TopSuggestDocsCollector {
-
-        /**
-         * Holds a list of suggest meta data for a doc
-         */
-        private static final class SuggestDoc extends TopSuggestDocs.SuggestScoreDoc {
-
-            private List<TopSuggestDocs.SuggestScoreDoc> suggestScoreDocs;
-
-            SuggestDoc(int doc, CharSequence key, CharSequence context, float score) {
-                super(doc, key, context, score);
-            }
-
-            void add(CharSequence key, CharSequence context, float score) {
-                if (suggestScoreDocs == null) {
-                    suggestScoreDocs = new ArrayList<>(1);
-                }
-                suggestScoreDocs.add(new TopSuggestDocs.SuggestScoreDoc(doc, key, context, score));
-            }
-
-            public List<CharSequence> getKeys() {
-                if (suggestScoreDocs == null) {
-                    return Collections.singletonList(key);
-                } else {
-                    List<CharSequence> keys = new ArrayList<>(suggestScoreDocs.size() + 1);
-                    keys.add(key);
-                    for (TopSuggestDocs.SuggestScoreDoc scoreDoc : suggestScoreDocs) {
-                        keys.add(scoreDoc.key);
-                    }
-                    return keys;
-                }
-            }
-
-            public List<CharSequence> getContexts() {
-                if (suggestScoreDocs == null) {
-                    if (context != null) {
-                        return Collections.singletonList(context);
-                    } else {
-                        return Collections.emptyList();
-                    }
-                } else {
-                    List<CharSequence> contexts = new ArrayList<>(suggestScoreDocs.size() + 1);
-                    contexts.add(context);
-                    for (TopSuggestDocs.SuggestScoreDoc scoreDoc : suggestScoreDocs) {
-                        contexts.add(scoreDoc.context);
-                    }
-                    return contexts;
-                }
-            }
-        }
-
-        private final Map<Integer, SuggestDoc> docsMap;
-
-        TopDocumentsCollector(int num, boolean skipDuplicates) {
-            super(Math.max(1, num), skipDuplicates);
-            this.docsMap = new LinkedHashMap<>(num);
-        }
-
-        @Override
-        public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
-            int globalDoc = docID + docBase;
-            if (docsMap.containsKey(globalDoc)) {
-                docsMap.get(globalDoc).add(key, context, score);
-            } else {
-                docsMap.put(globalDoc, new SuggestDoc(globalDoc, key, context, score));
-                super.collect(docID, key, context, score);
-            }
-        }
-
-        @Override
-        public TopSuggestDocs get() throws IOException {
-            TopSuggestDocs entries = super.get();
-            if (entries.scoreDocs.length == 0) {
-                return TopSuggestDocs.EMPTY;
-            }
-            // The parent class returns suggestions, not documents, and dedup only the surface form (without contexts).
-            // The following code groups suggestions matching different contexts by document id and dedup the surface form + contexts
-            // if needed (skip_duplicates).
-            int size = entries.scoreDocs.length;
-            final List<TopSuggestDocs.SuggestScoreDoc> suggestDocs = new ArrayList(size);
-            final CharArraySet seenSurfaceForms = doSkipDuplicates() ? new CharArraySet(size, false) : null;
-            for (TopSuggestDocs.SuggestScoreDoc suggestEntry : entries.scoreLookupDocs()) {
-                final SuggestDoc suggestDoc;
-                if (docsMap != null) {
-                    suggestDoc = docsMap.get(suggestEntry.doc);
-                } else {
-                    suggestDoc = new SuggestDoc(suggestEntry.doc, suggestEntry.key, suggestEntry.context, suggestEntry.score);
-                }
-                if (doSkipDuplicates()) {
-                    if (seenSurfaceForms.contains(suggestDoc.key)) {
-                        continue;
-                    }
-                    seenSurfaceForms.add(suggestDoc.key);
-                }
-                suggestDocs.add(suggestDoc);
-            }
-            return new TopSuggestDocs((int) entries.totalHits,
-                suggestDocs.toArray(new TopSuggestDocs.SuggestScoreDoc[0]), entries.getMaxScore());
-        }
-    }
 }
server/src/main/java/org/elasticsearch/search/suggest/completion/TopSuggestGroupDocsCollector.java (new file)

Lines changed: 232 additions & 0 deletions
@@ -0,0 +1,232 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.search.suggest.completion;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.CollectionTerminatedException;
+import org.apache.lucene.search.TotalHits;
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.document.TopSuggestDocs;
+import org.apache.lucene.search.suggest.document.TopSuggestDocsCollector;
+import org.apache.lucene.util.PriorityQueue;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ *
+ * Custom {@link TopSuggestDocsCollector} that returns top documents from the completion suggester.
+ * <p>
+ * TODO: this should be refactored when https://issues.apache.org/jira/browse/LUCENE-8529 is fixed.
+ * Unlike the parent class, this collector uses the surface form to tie-break suggestions with identical
+ * scores.
+ * <p>
+ * This collector groups suggestions coming from the same document but matching different contexts
+ * or surface form together. When different contexts or surface forms match the same suggestion form only
+ * the best one per document (sorted by weight) is kept.
+ * <p>
+ * This collector is also able to filter duplicate suggestion coming from different documents.
+ * In order to keep this feature fast, the de-duplication of suggestions with different contexts is done
+ * only on the top N*num_contexts (where N is the number of documents to return) suggestions per segment.
+ * This means that skip_duplicates will visit at most N*num_contexts suggestions per segment to find unique suggestions
+ * that match the input. If more than N*num_contexts suggestions are duplicated with different contexts this collector
+ * will not be able to return more than one suggestion even when N is greater than 1.
+ **/
+class TopSuggestGroupDocsCollector extends TopSuggestDocsCollector {
+    private final class SuggestScoreDocPriorityQueue extends PriorityQueue<TopSuggestDocs.SuggestScoreDoc> {
+        /**
+         * Creates a new priority queue of the specified size.
+         */
+        private SuggestScoreDocPriorityQueue(int size) {
+            super(size);
+        }
+
+        @Override
+        protected boolean lessThan(TopSuggestDocs.SuggestScoreDoc a, TopSuggestDocs.SuggestScoreDoc b) {
+            if (a.score == b.score) {
+                // tie break by completion key
+                int cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
+                // prefer smaller doc id, in case of a tie
+                return cmp != 0 ? cmp > 0 : a.doc > b.doc;
+            }
+            return a.score < b.score;
+        }
+
+        /**
+         * Returns the top N results in descending order.
+         */
+        public TopSuggestDocs.SuggestScoreDoc[] getResults() {
+            int size = size();
+            TopSuggestDocs.SuggestScoreDoc[] res = new TopSuggestDocs.SuggestScoreDoc[size];
+            for (int i = size - 1; i >= 0; i--) {
+                res[i] = pop();
+            }
+            return res;
+        }
+    }
+
+
+    private final SuggestScoreDocPriorityQueue priorityQueue;
+    private final int num;
+
+    /** Only set if we are deduplicating hits: holds all per-segment hits until the end, when we dedup them */
+    private final List<TopSuggestDocs.SuggestScoreDoc> pendingResults;
+
+    /** Only set if we are deduplicating hits: holds all surface forms seen so far in the current segment */
+    final CharArraySet seenSurfaceForms;
+
+    /** Document base offset for the current Leaf */
+    protected int docBase;
+
+    private Map<Integer, List<CharSequence>> docContexts = new HashMap<>();
+
+    /**
+     * Sole constructor
+     *
+     * Collects at most <code>num</code> completions
+     * with corresponding document and weight
+     */
+    TopSuggestGroupDocsCollector(int num, boolean skipDuplicates) {
+        super(1, skipDuplicates);
+        if (num <= 0) {
+            throw new IllegalArgumentException("'num' must be > 0");
+        }
+        this.num = num;
+        this.priorityQueue = new SuggestScoreDocPriorityQueue(num);
+        if (skipDuplicates) {
+            seenSurfaceForms = new CharArraySet(num, false);
+            pendingResults = new ArrayList<>();
+        } else {
+            seenSurfaceForms = null;
+            pendingResults = null;
+        }
+    }
+
+    /**
+     * Returns the contexts associated with the provided <code>doc</code>.
+     */
+    public List<CharSequence> getContexts(int doc) {
+        return docContexts.getOrDefault(doc, Collections.emptyList());
+    }
+
+    @Override
+    protected boolean doSkipDuplicates() {
+        return seenSurfaceForms != null;
+    }
+
+    @Override
+    public int getCountToCollect() {
+        return num;
+    }
+
+    @Override
+    protected void doSetNextReader(LeafReaderContext context) throws IOException {
+        docBase = context.docBase;
+        if (seenSurfaceForms != null) {
+            seenSurfaceForms.clear();
+            // NOTE: this also clears the priorityQueue:
+            for (TopSuggestDocs.SuggestScoreDoc hit : priorityQueue.getResults()) {
+                pendingResults.add(hit);
+            }
+        }
+    }
+
+    @Override
+    public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
+        int globalDoc = docID + docBase;
+        boolean isDuplicate = docContexts.containsKey(globalDoc);
+        List<CharSequence> contexts = docContexts.computeIfAbsent(globalDoc, k -> new ArrayList<>());
+        if (context != null) {
+            contexts.add(context);
+        }
+        if (isDuplicate) {
+            return;
+        }
+        TopSuggestDocs.SuggestScoreDoc current = new TopSuggestDocs.SuggestScoreDoc(globalDoc, key, context, score);
+        if (current == priorityQueue.insertWithOverflow(current)) {
+            // if the current SuggestScoreDoc has overflown from pq,
+            // we can assume all of the successive collections from
+            // this leaf will be overflown as well
+            // TODO: reuse the overflow instance?
+            throw new CollectionTerminatedException();
+        }
+    }
+
+    @Override
+    public TopSuggestDocs get() throws IOException {
+
+        TopSuggestDocs.SuggestScoreDoc[] suggestScoreDocs;
+
+        if (seenSurfaceForms != null) {
+            // NOTE: this also clears the priorityQueue:
+            for (TopSuggestDocs.SuggestScoreDoc hit : priorityQueue.getResults()) {
+                pendingResults.add(hit);
+            }
+
+            // Deduplicate all hits: we already dedup'd efficiently within each segment by
+            // truncating the FST top paths search, but across segments there may still be dups:
+            seenSurfaceForms.clear();
+
+            // TODO: we could use a priority queue here to make cost O(N * log(num)) instead of O(N * log(N)), where N = O(num *
+            // numSegments), but typically numSegments is smallish and num is smallish so this won't matter much in practice:
+
+            Collections.sort(pendingResults,
+                (a, b) -> {
+                    // sort by higher score
+                    int cmp = Float.compare(b.score, a.score);
+                    if (cmp == 0) {
+                        // tie break by completion key
+                        cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
+                        if (cmp == 0) {
+                            // prefer smaller doc id, in case of a tie
+                            cmp = Integer.compare(a.doc, b.doc);
+                        }
+                    }
+                    return cmp;
+                });
+
+            List<TopSuggestDocs.SuggestScoreDoc> hits = new ArrayList<>();
+
+            for (TopSuggestDocs.SuggestScoreDoc hit : pendingResults) {
+                if (seenSurfaceForms.contains(hit.key) == false) {
+                    seenSurfaceForms.add(hit.key);
+                    hits.add(hit);
+                    if (hits.size() == num) {
+                        break;
+                    }
+                }
+            }
+            suggestScoreDocs = hits.toArray(new TopSuggestDocs.SuggestScoreDoc[0]);
+        } else {
+            suggestScoreDocs = priorityQueue.getResults();
+        }
+
+        if (suggestScoreDocs.length > 0) {
+            return new TopSuggestDocs(new TotalHits(suggestScoreDocs.length, TotalHits.Relation.EQUAL_TO), suggestScoreDocs);
+        } else {
+            return TopSuggestDocs.EMPTY;
+        }
+    }
+
+}
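
A note on the early termination in collect() above: Lucene's PriorityQueue.insertWithOverflow returns the element that was pushed out, so getting the just-inserted element back means it did not make the top-N, and since the suggester feeds each segment's suggestions in descending score order (as the comment in collect() notes) the rest of the leaf can be skipped. Below is a minimal sketch of that pattern; the BoundedQueueDemo class and its sample scores are illustrative only, assuming the same lucene-core dependency (Lucene 8) as this change.

import org.apache.lucene.search.CollectionTerminatedException;
import org.apache.lucene.util.PriorityQueue;

public class BoundedQueueDemo {

    // A bounded queue whose top() is the smallest score, so the weakest entry is evicted first.
    static final class ScoreQueue extends PriorityQueue<Float> {
        ScoreQueue(int maxSize) {
            super(maxSize);
        }

        @Override
        protected boolean lessThan(Float a, Float b) {
            return a < b;
        }
    }

    public static void main(String[] args) {
        ScoreQueue queue = new ScoreQueue(2);
        float[] descendingScores = {3.0f, 2.0f, 1.0f, 0.5f};
        try {
            for (float score : descendingScores) {
                Float current = score;
                // insertWithOverflow returns the evicted element; reference equality with the
                // element we just offered means it did not make the queue, and because the
                // scores only get smaller from here, nothing that follows can make it either.
                if (current == queue.insertWithOverflow(current)) {
                    throw new CollectionTerminatedException();
                }
            }
        } catch (CollectionTerminatedException e) {
            System.out.println("terminated early, kept " + queue.size() + " entries"); // kept 2
        }
    }
}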
