Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
*/
package org.elasticsearch.search.suggest.completion;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.CollectionTerminatedException;
Expand All @@ -34,9 +33,7 @@
import org.elasticsearch.search.suggest.Suggester;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand All @@ -59,15 +56,17 @@ protected Suggest.Suggestion<? extends Suggest.Suggestion.Entry<? extends Sugges
new Text(spare.toString()), 0, spare.length());
completionSuggestion.addTerm(completionSuggestEntry);
int shardSize = suggestionContext.getShardSize() != null ? suggestionContext.getShardSize() : suggestionContext.getSize();
TopSuggestDocsCollector collector = new TopDocumentsCollector(shardSize, suggestionContext.isSkipDuplicates());
TopSuggestGroupDocsCollector collector = new TopSuggestGroupDocsCollector(shardSize, suggestionContext.isSkipDuplicates());
suggest(searcher, suggestionContext.toQuery(), collector);
int numResult = 0;
for (TopSuggestDocs.SuggestScoreDoc suggestScoreDoc : collector.get().scoreLookupDocs()) {
TopDocumentsCollector.SuggestDoc suggestDoc = (TopDocumentsCollector.SuggestDoc) suggestScoreDoc;
for (TopSuggestDocs.SuggestScoreDoc suggestDoc : collector.get().scoreLookupDocs()) {
// collect contexts
Map<String, Set<CharSequence>> contexts = Collections.emptyMap();
if (fieldType.hasContextMappings() && suggestDoc.getContexts().isEmpty() == false) {
contexts = fieldType.getContextMappings().getNamedContexts(suggestDoc.getContexts());
if (fieldType.hasContextMappings()) {
List<CharSequence> rawContexts = collector.getContexts(suggestDoc.doc);
if (rawContexts.size() > 0) {
contexts = fieldType.getContextMappings().getNamedContexts(rawContexts);
}
}
if (numResult++ < suggestionContext.getSize()) {
CompletionSuggestion.Entry.Option option = new CompletionSuggestion.Entry.Option(suggestDoc.doc,
Expand Down Expand Up @@ -97,120 +96,4 @@ private static void suggest(IndexSearcher searcher, CompletionQuery query, TopSu
}
}
}

/**
 * TODO: this should be refactored and moved to lucene see https://issues.apache.org/jira/browse/LUCENE-6880
 *
 * Custom collector that returns top documents from the completion suggester.
 * When suggestions are augmented with context values this collector groups suggestions coming from the same document
 * but matching different contexts together. Each document is counted as 1 entry and the provided size is the expected number
 * of documents that should be returned (not the number of suggestions).
 * This collector is also able to filter duplicate suggestions coming from different documents.
 * When different contexts match the same suggestion form only the best one (sorted by weight) is kept.
 * In order to keep this feature fast, the de-duplication of suggestions with different contexts is done
 * only on the top N*num_contexts (where N is the number of documents to return) suggestions per segment.
 * This means that skip_duplicates will visit at most N*num_contexts suggestions per segment to find unique suggestions
 * that match the input. If more than N*num_contexts suggestions are duplicated with different contexts this collector
 * will not be able to return more than one suggestion even when N is greater than 1.
 **/
private static final class TopDocumentsCollector extends TopSuggestDocsCollector {

    /**
     * Holds a list of suggest meta data for a doc.
     * The first suggestion collected for the document is stored in the inherited fields,
     * every further suggestion for the same document is appended through {@link #add}.
     */
    private static final class SuggestDoc extends TopSuggestDocs.SuggestScoreDoc {

        /** Additional suggestions for this document; stays {@code null} until {@link #add} is called. */
        private List<TopSuggestDocs.SuggestScoreDoc> suggestScoreDocs;

        SuggestDoc(int doc, CharSequence key, CharSequence context, float score) {
            super(doc, key, context, score);
        }

        /**
         * Records one more suggestion (key, context and score) that matched this document.
         */
        void add(CharSequence key, CharSequence context, float score) {
            if (suggestScoreDocs == null) {
                suggestScoreDocs = new ArrayList<>(1);
            }
            suggestScoreDocs.add(new TopSuggestDocs.SuggestScoreDoc(doc, key, context, score));
        }

        /**
         * Returns the key of the first suggestion followed by the keys of all suggestions added afterwards.
         */
        public List<CharSequence> getKeys() {
            if (suggestScoreDocs == null) {
                return Collections.singletonList(key);
            }
            List<CharSequence> keys = new ArrayList<>(suggestScoreDocs.size() + 1);
            keys.add(key);
            for (TopSuggestDocs.SuggestScoreDoc scoreDoc : suggestScoreDocs) {
                keys.add(scoreDoc.key);
            }
            return keys;
        }

        /**
         * Returns the non-null contexts of all suggestions recorded for this document, first suggestion first.
         * Suggestions without a context contribute nothing, so the result never contains {@code null}
         * and is empty when no suggestion carried a context.
         */
        public List<CharSequence> getContexts() {
            if (suggestScoreDocs == null) {
                if (context != null) {
                    return Collections.singletonList(context);
                }
                return Collections.emptyList();
            }
            List<CharSequence> contexts = new ArrayList<>(suggestScoreDocs.size() + 1);
            // apply the same null guard as the single-suggestion case above
            if (context != null) {
                contexts.add(context);
            }
            for (TopSuggestDocs.SuggestScoreDoc scoreDoc : suggestScoreDocs) {
                if (scoreDoc.context != null) {
                    contexts.add(scoreDoc.context);
                }
            }
            return contexts;
        }
    }

    /** Grouped suggestions keyed by global (docBase-adjusted) document id, in collection order. */
    private final Map<Integer, SuggestDoc> docsMap;

    /**
     * @param num            expected number of documents to return; values below 1 are treated as 1
     * @param skipDuplicates whether suggestions sharing a surface form should be returned only once
     */
    TopDocumentsCollector(int num, boolean skipDuplicates) {
        super(Math.max(1, num), skipDuplicates);
        // clamp like the parent size so a non-positive num cannot produce an illegal initial capacity
        this.docsMap = new LinkedHashMap<>(Math.max(1, num));
    }

    /**
     * Forwards the first suggestion of a document to the parent collector; every further suggestion
     * for an already seen document is only attached to that document's {@link SuggestDoc}.
     */
    @Override
    public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
        int globalDoc = docID + docBase;
        SuggestDoc known = docsMap.get(globalDoc);
        if (known != null) {
            known.add(key, context, score);
        } else {
            docsMap.put(globalDoc, new SuggestDoc(globalDoc, key, context, score));
            super.collect(docID, key, context, score);
        }
    }

    /**
     * Returns the entries selected by the parent collector, each replaced by the grouped
     * {@link SuggestDoc} of its document. When duplicates are skipped, only the first
     * entry per surface form (key) is kept.
     */
    @Override
    public TopSuggestDocs get() throws IOException {
        TopSuggestDocs entries = super.get();
        if (entries.scoreDocs.length == 0) {
            return TopSuggestDocs.EMPTY;
        }
        // The parent class returns suggestions, not documents, and dedup only the surface form (without contexts).
        // The following code groups suggestions matching different contexts by document id and dedup the surface form + contexts
        // if needed (skip_duplicates).
        int size = entries.scoreDocs.length;
        final boolean skipDuplicates = doSkipDuplicates();
        final List<TopSuggestDocs.SuggestScoreDoc> suggestDocs = new ArrayList<>(size);
        final CharArraySet seenSurfaceForms = skipDuplicates ? new CharArraySet(size, false) : null;
        for (TopSuggestDocs.SuggestScoreDoc suggestEntry : entries.scoreLookupDocs()) {
            // docsMap is always populated by collect(), so the grouped entry is looked up directly
            final SuggestDoc suggestDoc = docsMap.get(suggestEntry.doc);
            if (skipDuplicates) {
                if (seenSurfaceForms.contains(suggestDoc.key)) {
                    continue;
                }
                seenSurfaceForms.add(suggestDoc.key);
            }
            suggestDocs.add(suggestDoc);
        }
        return new TopSuggestDocs(entries.totalHits,
            suggestDocs.toArray(new TopSuggestDocs.SuggestScoreDoc[0]));
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.suggest.completion;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.CollectionTerminatedException;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.document.TopSuggestDocs;
import org.apache.lucene.search.suggest.document.TopSuggestDocsCollector;
import org.apache.lucene.util.PriorityQueue;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Custom {@link TopSuggestDocsCollector} that returns top documents from the completion suggester.
 * <p>
 * TODO: this should be refactored when https://issues.apache.org/jira/browse/LUCENE-8529 is fixed.
 * Unlike the parent class, this collector uses the surface form to tie-break suggestions with identical
 * scores.
 * <p>
 * This collector groups suggestions coming from the same document but matching different contexts
 * or surface forms together. When different contexts or surface forms match the same document, only
 * the first suggestion collected for that document is kept as a hit (presumably the best one, as
 * suggestions are expected to arrive sorted by weight); the contexts of the other suggestions remain
 * available through {@link #getContexts(int)}.
 * <p>
 * This collector is also able to filter duplicate suggestions coming from different documents.
 * In order to keep this feature fast, the de-duplication of suggestions with different contexts is done
 * only on the top N*num_contexts (where N is the number of documents to return) suggestions per segment.
 * This means that skip_duplicates will visit at most N*num_contexts suggestions per segment to find unique suggestions
 * that match the input. If more than N*num_contexts suggestions are duplicated with different contexts this collector
 * will not be able to return more than one suggestion even when N is greater than 1.
 **/
class TopSuggestGroupDocsCollector extends TopSuggestDocsCollector {

    /**
     * Bounded queue whose least element is the worst hit: lowest score, then greatest key,
     * then greatest doc id.
     */
    private static final class SuggestScoreDocPriorityQueue extends PriorityQueue<TopSuggestDocs.SuggestScoreDoc> {
        /**
         * Creates a new priority queue of the specified size.
         */
        private SuggestScoreDocPriorityQueue(int size) {
            super(size);
        }

        @Override
        protected boolean lessThan(TopSuggestDocs.SuggestScoreDoc a, TopSuggestDocs.SuggestScoreDoc b) {
            if (a.score == b.score) {
                // tie break by completion key
                int cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
                // prefer smaller doc id, in case of a tie
                return cmp != 0 ? cmp > 0 : a.doc > b.doc;
            }
            return a.score < b.score;
        }

        /**
         * Returns the top N results in descending order.
         * The queue is emptied as a side effect.
         */
        public TopSuggestDocs.SuggestScoreDoc[] getResults() {
            int size = size();
            TopSuggestDocs.SuggestScoreDoc[] res = new TopSuggestDocs.SuggestScoreDoc[size];
            // pop() yields the worst remaining hit, so fill the array back to front
            for (int i = size - 1; i >= 0; i--) {
                res[i] = pop();
            }
            return res;
        }
    }

    private final SuggestScoreDocPriorityQueue priorityQueue;
    private final int num;

    /** Only set if we are deduplicating hits: holds all per-segment hits until the end, when we dedup them */
    private final List<TopSuggestDocs.SuggestScoreDoc> pendingResults;

    /** Only set if we are deduplicating hits: holds all surface forms seen so far in the current segment */
    final CharArraySet seenSurfaceForms;

    /** Document base offset for the current Leaf */
    protected int docBase;

    /** Contexts of every suggestion collected so far, keyed by global (docBase-adjusted) document id */
    private final Map<Integer, List<CharSequence>> docContexts = new HashMap<>();

    /**
     * Sole constructor
     *
     * Collects at most <code>num</code> completions
     * with corresponding document and weight
     *
     * @param num            maximum number of documents to return, must be greater than 0
     * @param skipDuplicates whether hits sharing a surface form should be returned only once
     * @throws IllegalArgumentException if <code>num</code> is not greater than 0
     */
    TopSuggestGroupDocsCollector(int num, boolean skipDuplicates) {
        // the parent is built with a size of 1; the hits are tracked by this class' own queue
        super(1, skipDuplicates);
        if (num <= 0) {
            throw new IllegalArgumentException("'num' must be > 0");
        }
        this.num = num;
        this.priorityQueue = new SuggestScoreDocPriorityQueue(num);
        if (skipDuplicates) {
            seenSurfaceForms = new CharArraySet(num, false);
            pendingResults = new ArrayList<>();
        } else {
            seenSurfaceForms = null;
            pendingResults = null;
        }
    }

    /**
     * Returns the contexts associated with the provided <code>doc</code>,
     * or an empty list if no suggestion was collected for it.
     */
    public List<CharSequence> getContexts(int doc) {
        return docContexts.getOrDefault(doc, Collections.emptyList());
    }

    @Override
    protected boolean doSkipDuplicates() {
        return seenSurfaceForms != null;
    }

    @Override
    public int getCountToCollect() {
        return num;
    }

    @Override
    protected void doSetNextReader(LeafReaderContext context) throws IOException {
        docBase = context.docBase;
        if (seenSurfaceForms != null) {
            seenSurfaceForms.clear();
            // park the hits of the previous segment so the queue starts empty for the new one
            drainQueueIntoPendingResults();
        }
    }

    /**
     * Records the context of the suggestion for its document and, if this is the first suggestion
     * seen for that document, offers it to the priority queue.
     *
     * @throws CollectionTerminatedException if the new hit did not make it into the queue
     */
    @Override
    public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
        int globalDoc = docID + docBase;
        List<CharSequence> contexts = docContexts.get(globalDoc);
        final boolean isDuplicate = contexts != null;
        if (isDuplicate == false) {
            contexts = new ArrayList<>();
            docContexts.put(globalDoc, contexts);
        }
        if (context != null) {
            contexts.add(context);
        }
        if (isDuplicate) {
            // the document is already represented by an earlier suggestion, only its contexts grow
            return;
        }
        TopSuggestDocs.SuggestScoreDoc current = new TopSuggestDocs.SuggestScoreDoc(globalDoc, key, context, score);
        if (current == priorityQueue.insertWithOverflow(current)) {
            // if the current SuggestScoreDoc has overflown from pq,
            // we can assume all of the successive collections from
            // this leaf will be overflown as well
            // TODO: reuse the overflow instance?
            throw new CollectionTerminatedException();
        }
    }

    /**
     * Returns the collected hits, best first. When duplicates are skipped, the hits of all segments
     * are merged and only the best hit per surface form is kept, up to <code>num</code> hits.
     */
    @Override
    public TopSuggestDocs get() throws IOException {
        final TopSuggestDocs.SuggestScoreDoc[] suggestScoreDocs;
        if (seenSurfaceForms != null) {
            drainQueueIntoPendingResults();
            suggestScoreDocs = dedupPendingResults();
        } else {
            suggestScoreDocs = priorityQueue.getResults();
        }

        if (suggestScoreDocs.length > 0) {
            return new TopSuggestDocs(new TotalHits(suggestScoreDocs.length, TotalHits.Relation.EQUAL_TO), suggestScoreDocs);
        } else {
            return TopSuggestDocs.EMPTY;
        }
    }

    /** Moves every hit of the priority queue into {@link #pendingResults}; the queue is empty afterwards. */
    private void drainQueueIntoPendingResults() {
        // NOTE: getResults() also clears the priorityQueue
        Collections.addAll(pendingResults, priorityQueue.getResults());
    }

    /** Sorts {@link #pendingResults} best first and returns up to <code>num</code> hits with distinct surface forms. */
    private TopSuggestDocs.SuggestScoreDoc[] dedupPendingResults() {
        // Deduplicate all hits: we already dedup'd efficiently within each segment by
        // truncating the FST top paths search, but across segments there may still be dups:
        seenSurfaceForms.clear();

        // TODO: we could use a priority queue here to make cost O(N * log(num)) instead of O(N * log(N)), where N = O(num *
        // numSegments), but typically numSegments is smallish and num is smallish so this won't matter much in practice:
        pendingResults.sort(TopSuggestGroupDocsCollector::compareBestFirst);

        List<TopSuggestDocs.SuggestScoreDoc> hits = new ArrayList<>();
        for (TopSuggestDocs.SuggestScoreDoc hit : pendingResults) {
            if (seenSurfaceForms.contains(hit.key) == false) {
                seenSurfaceForms.add(hit.key);
                hits.add(hit);
                if (hits.size() == num) {
                    break;
                }
            }
        }
        return hits.toArray(new TopSuggestDocs.SuggestScoreDoc[0]);
    }

    /** Orders hits by higher score, then by completion key, then by smaller doc id. */
    private static int compareBestFirst(TopSuggestDocs.SuggestScoreDoc a, TopSuggestDocs.SuggestScoreDoc b) {
        // sort by higher score
        int cmp = Float.compare(b.score, a.score);
        if (cmp == 0) {
            // tie break by completion key
            cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
            if (cmp == 0) {
                // prefer smaller doc id, in case of a tie
                cmp = Integer.compare(a.doc, b.doc);
            }
        }
        return cmp;
    }

}
Loading