Skip to content

Commit b8c352f

Browse files
authored
Add support for fragment_length in the unified highlighter (#23431)
* Add support for fragment_length in the unified highlighter This commit introduces a new break iterator (a BoundedBreakIterator) designed for the unified highlighter that is able to limit the size of fragments produced by generic break iterators like `sentence`. The `unified` highlighter now supports `boundary_scanner`, which can be `words` or `sentence`. The `sentence` mode will use the bounded break iterator in order to limit the size of the sentence to `fragment_length`. When sentences bigger than `fragment_length` are produced, this mode will break the sentence at the next word boundary **after** `fragment_length` is reached.
1 parent c462d7d commit b8c352f

File tree

10 files changed

+761
-336
lines changed

10 files changed

+761
-336
lines changed
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.lucene.search.uhighlight;
20+
21+
import java.text.BreakIterator;
22+
import java.text.CharacterIterator;
23+
import java.util.Locale;
24+
25+
/**
 * A {@link BreakIterator} that reports passages found by a primary break iterator while
 * bounding their size with a maximum length. Whenever the primary iterator yields a passage
 * longer than {@code maxLen}, a secondary break iterator is consulted to re-split it around
 * the requested offset.
 * This helps with iterators such as `sentence` that can produce very large passages on
 * semi-structured text.
 *
 * WARNING: only {@link #preceding(int)} followed by {@link #following(int)} is supported;
 * the other navigation methods throw. The class is designed to be driven by the
 * {@link UnifiedHighlighter}.
 **/
public class BoundedBreakIteratorScanner extends BreakIterator {
    /** First level iterator that produces the candidate windows. */
    private final BreakIterator primaryBreak;
    /** Second level iterator used to re-split windows larger than {@link #maxLen}. */
    private final BreakIterator secondaryBreak;
    /** Size above which a window gets re-split. */
    private final int maxLen;

    /** Offset expected by the next {@link #following(int)} call (last preceding offset minus one). */
    private int lastPrecedingOffset;
    /** Bounds of the last window returned by the primary iterator. */
    private int windowStart;
    private int windowEnd;
    /** Bounds of the passage currently exposed to the caller. */
    private int passageStart;
    private int passageEnd;

    private BoundedBreakIteratorScanner(BreakIterator mainBreak,
                                        BreakIterator innerBreak,
                                        int maxLen) {
        this.primaryBreak = mainBreak;
        this.secondaryBreak = innerBreak;
        this.maxLen = maxLen;
        reset();
    }

    /**
     * Returns a {@link BreakIterator#getSentenceInstance(Locale)} bounded to maxLen.
     * Passages that exceed maxLen are re-split with a {@link BreakIterator#getWordInstance(Locale)}.
     */
    public static BreakIterator getSentence(Locale locale, int maxLen) {
        final BreakIterator sentences = BreakIterator.getSentenceInstance(locale);
        final BreakIterator words = BreakIterator.getWordInstance(locale);
        return new BoundedBreakIteratorScanner(sentences, words, maxLen);
    }

    /** Returns the text held by the primary break iterator. */
    @Override
    public CharacterIterator getText() {
        return primaryBreak.getText();
    }

    /** Clears the scanning state and hands the new text to both underlying iterators. */
    @Override
    public void setText(CharacterIterator newText) {
        reset();
        primaryBreak.setText(newText);
        secondaryBreak.setText(newText);
    }

    /** Clears the scanning state and hands the new text to both underlying iterators. */
    @Override
    public void setText(String newText) {
        reset();
        primaryBreak.setText(newText);
        secondaryBreak.setText(newText);
    }

    /** Puts every piece of scanning state back to its initial value. */
    private void reset() {
        lastPrecedingOffset = windowStart = windowEnd = passageStart = -1;
        passageEnd = 0;
    }

    /**
     * Computes the passage surrounding {@code offset} and returns its start.
     * The end of that passage is then available through {@link #following(int)} called with
     * {@code offset - 1}. See {@link FieldHighlighter} for usage.
     *
     * @throws IllegalArgumentException if {@code offset} is smaller than the value recorded
     *         by the previous call (the previous offset minus one)
     */
    @Override
    public int preceding(int offset) {
        if (offset < lastPrecedingOffset) {
            throw new IllegalArgumentException("offset < lastPrecedingOffset: " +
                "usage doesn't look like UnifiedHighlighter");
        }

        final boolean insideWindow = windowStart < offset && offset < windowEnd;
        if (insideWindow) {
            // still within the last primary window: continue right after the previous passage
            passageStart = passageEnd;
            passageEnd = windowEnd;
        } else {
            // ask the primary iterator for a new window around the offset
            final int start = primaryBreak.preceding(offset);
            windowStart = start;
            passageStart = start;
            final int end = primaryBreak.following(offset - 1);
            windowEnd = end;
            passageEnd = end;
        }

        if (passageEnd - passageStart > maxLen) {
            shrinkAround(offset);
        }
        lastPrecedingOffset = offset - 1;
        return passageStart;
    }

    /**
     * Narrows the current passage with the secondary iterator: the left boundary is searched
     * first, then the passage is extended to the right with whatever size remains.
     */
    private void shrinkAround(int offset) {
        final int leftTarget = offset - maxLen;
        if (leftTarget > passageStart) {
            passageStart = Math.max(passageStart, secondaryBreak.preceding(leftTarget));
        }
        final int remaining = Math.max(0, maxLen - (offset - passageStart));
        final int rightTarget = offset + remaining;
        if (rightTarget < windowEnd) {
            passageEnd = Math.min(windowEnd, secondaryBreak.following(rightTarget));
        }
    }

    /**
     * Returns the end of the passage computed by the last call to preceding(offset+1).
     * See {@link FieldHighlighter} for usage.
     *
     * @throws IllegalArgumentException if {@code offset} is not the value recorded by the last
     *         {@link #preceding(int)} call, or if the current passage end is -1
     */
    @Override
    public int following(int offset) {
        if (offset == lastPrecedingOffset && passageEnd != -1) {
            return passageEnd;
        }
        throw new IllegalArgumentException("offset != lastPrecedingOffset: " +
            "usage doesn't look like UnifiedHighlighter");
    }

    /** Returns the end offset of the current passage. */
    @Override
    public int current() {
        return this.passageEnd;
    }

    /** Not supported: always throws {@link IllegalStateException}. */
    @Override
    public int first() {
        throw new IllegalStateException("first() should not be called in this context");
    }

    /** Not supported: always throws {@link IllegalStateException}. */
    @Override
    public int next() {
        throw new IllegalStateException("next() should not be called in this context");
    }

    /** Not supported: always throws {@link IllegalStateException}. */
    @Override
    public int last() {
        throw new IllegalStateException("last() should not be called in this context");
    }

    /** Not supported: always throws {@link IllegalStateException}. */
    @Override
    public int next(int n) {
        throw new IllegalStateException("next(n) should not be called in this context");
    }

    /** Not supported: always throws {@link IllegalStateException}. */
    @Override
    public int previous() {
        throw new IllegalStateException("previous() should not be called in this context");
    }
}
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.lucene.search.uhighlight;
21+
22+
import java.text.BreakIterator;
23+
import java.util.Locale;
24+
25+
import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
26+
27+
/**
28+
* Custom {@link FieldHighlighter} that creates a single passage bounded to {@code noMatchSize} when
29+
* no highlights were found.
30+
*/
31+
class CustomFieldHighlighter extends FieldHighlighter {
32+
private static final Passage[] EMPTY_PASSAGE = new Passage[0];
33+
34+
private final Locale breakIteratorLocale;
35+
private final int noMatchSize;
36+
private final String fieldValue;
37+
38+
CustomFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy,
39+
Locale breakIteratorLocale, BreakIterator breakIterator,
40+
PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages,
41+
PassageFormatter passageFormatter, int noMatchSize, String fieldValue) {
42+
super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages,
43+
maxNoHighlightPassages, passageFormatter);
44+
this.breakIteratorLocale = breakIteratorLocale;
45+
this.noMatchSize = noMatchSize;
46+
this.fieldValue = fieldValue;
47+
}
48+
49+
@Override
50+
protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) {
51+
if (noMatchSize > 0) {
52+
int pos = 0;
53+
while (pos < fieldValue.length() && fieldValue.charAt(pos) == MULTIVAL_SEP_CHAR) {
54+
pos ++;
55+
}
56+
if (pos < fieldValue.length()) {
57+
int end = fieldValue.indexOf(MULTIVAL_SEP_CHAR, pos);
58+
if (end == -1) {
59+
end = fieldValue.length();
60+
}
61+
if (noMatchSize+pos < end) {
62+
BreakIterator bi = BreakIterator.getWordInstance(breakIteratorLocale);
63+
bi.setText(fieldValue);
64+
// Finds the next word boundary **after** noMatchSize.
65+
end = bi.following(noMatchSize + pos);
66+
if (end == BreakIterator.DONE) {
67+
end = fieldValue.length();
68+
}
69+
}
70+
Passage passage = new Passage();
71+
passage.setScore(Float.NaN);
72+
passage.setStartOffset(pos);
73+
passage.setEndOffset(end);
74+
return new Passage[]{passage};
75+
}
76+
}
77+
return EMPTY_PASSAGE;
78+
}
79+
}

core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java

Lines changed: 30 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@
3333
import org.apache.lucene.search.spans.SpanOrQuery;
3434
import org.apache.lucene.search.spans.SpanQuery;
3535
import org.apache.lucene.search.spans.SpanTermQuery;
36+
import org.apache.lucene.util.BytesRef;
37+
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
3638
import org.elasticsearch.common.Nullable;
3739
import org.elasticsearch.common.lucene.all.AllTermQuery;
3840
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
@@ -47,6 +49,7 @@
4749
import java.util.List;
4850
import java.util.Locale;
4951
import java.util.Map;
52+
import java.util.Set;
5053

5154
/**
5255
* Subclass of the {@link UnifiedHighlighter} that works for a single field in a single document.
@@ -57,37 +60,41 @@
5760
* Supports both returning empty snippets and non highlighted snippets when no highlighting can be performed.
5861
*/
5962
public class CustomUnifiedHighlighter extends UnifiedHighlighter {
63+
public static final char MULTIVAL_SEP_CHAR = (char) 0;
6064
private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];
6165

6266
private final String fieldValue;
6367
private final PassageFormatter passageFormatter;
6468
private final BreakIterator breakIterator;
65-
private final boolean returnNonHighlightedSnippets;
69+
private final Locale breakIteratorLocale;
70+
private final int noMatchSize;
6671

6772
/**
6873
* Creates a new instance of {@link CustomUnifiedHighlighter}
6974
*
7075
* @param analyzer the analyzer used for the field at index time, used for multi term queries internally
7176
* @param passageFormatter our own {@link CustomPassageFormatter}
72-
* which generates snippets in forms of {@link Snippet} objects
77+
* which generates snippets in forms of {@link Snippet} objects
78+
* @param breakIteratorLocale the {@link Locale} to use for dividing text into passages.
79+
* If null {@link Locale#ROOT} is used
7380
* @param breakIterator the {@link BreakIterator} to use for dividing text into passages.
74-
* If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
75-
* @param fieldValue the original field values as constructor argument, loaded from the _source field or
76-
* the relevant stored field.
77-
* @param returnNonHighlightedSnippets whether non highlighted snippets should be
78-
* returned rather than empty snippets when no highlighting can be performed
81+
* If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
82+
* @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR
83+
* @param noMatchSize The size of the text that should be returned when no highlighting can be performed
7984
*/
8085
public CustomUnifiedHighlighter(IndexSearcher searcher,
8186
Analyzer analyzer,
8287
PassageFormatter passageFormatter,
88+
@Nullable Locale breakIteratorLocale,
8389
@Nullable BreakIterator breakIterator,
8490
String fieldValue,
85-
boolean returnNonHighlightedSnippets) {
91+
int noMatchSize) {
8692
super(searcher, analyzer);
8793
this.breakIterator = breakIterator;
94+
this.breakIteratorLocale = breakIteratorLocale == null ? Locale.ROOT : breakIteratorLocale;
8895
this.passageFormatter = passageFormatter;
8996
this.fieldValue = fieldValue;
90-
this.returnNonHighlightedSnippets = returnNonHighlightedSnippets;
97+
this.noMatchSize = noMatchSize;
9198
}
9299

93100
/**
@@ -111,16 +118,13 @@ public Snippet[] highlightField(String field, Query query, int docId, int maxPas
111118
@Override
112119
protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
113120
int cacheCharsThreshold) throws IOException {
114-
//we only highlight one field, one document at a time
121+
// we only highlight one field, one document at a time
115122
return Collections.singletonList(new String[]{fieldValue});
116123
}
117124

118125
@Override
119126
protected BreakIterator getBreakIterator(String field) {
120-
if (breakIterator != null) {
121-
return breakIterator;
122-
}
123-
return super.getBreakIterator(field);
127+
return breakIterator;
124128
}
125129

126130
@Override
@@ -129,11 +133,18 @@ protected PassageFormatter getFormatter(String field) {
129133
}
130134

131135
@Override
132-
protected int getMaxNoHighlightPassages(String field) {
133-
if (returnNonHighlightedSnippets) {
134-
return 1;
135-
}
136-
return 0;
136+
protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
137+
BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms);
138+
Set<HighlightFlag> highlightFlags = getFlags(field);
139+
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
140+
CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
141+
OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
142+
BreakIterator breakIterator = new SplittingBreakIterator(getBreakIterator(field),
143+
UnifiedHighlighter.MULTIVAL_SEP_CHAR);
144+
FieldOffsetStrategy strategy =
145+
getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
146+
return new CustomFieldHighlighter(field, strategy, breakIteratorLocale, breakIterator,
147+
getScorer(field), maxPassages, (noMatchSize > 0 ? 1 : 0), getFormatter(field), noMatchSize, fieldValue);
137148
}
138149

139150
@Override
@@ -146,7 +157,6 @@ protected Collection<Query> preSpanQueryRewrite(Query query) {
146157
return rewriteCustomQuery(query);
147158
}
148159

149-
150160
/**
151161
* Translate custom queries into queries that are supported by the unified highlighter.
152162
*/

0 commit comments

Comments
 (0)