Skip to content

Commit 87c841d

Browse files
authored
Include all sentences smaller than fragment_size in the unified highlighter (#28132)
The unified highlighter selects a single sentence per fragment, starting from the offset of the first highlighted term. This change modifies this selection and allows more than one sentence in a single fragment. The expansion is done forward only (to the right of the matching offset): a following sentence is added to the current fragment if and only if the overall size of the fragment does not exceed the maximum length (fragment_size). We should also add a way to expand the left context with the surrounding sentences, but this is currently avoided because the unified highlighter in Lucene uses only the first offset that matches the query to derive the start and end offsets of the next fragment. If we expanded on the left, we could split multiple terms that would otherwise be grouped together. Breaking this limitation implies some changes in the core of the unified highlighter. Closes #28089
1 parent 3c032f8 commit 87c841d

File tree

3 files changed

+125
-43
lines changed

3 files changed

+125
-43
lines changed

core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,23 @@
2323
import java.util.Locale;
2424

2525
/**
26-
* A custom break iterator that scans text to find break-delimited passages bounded by
27-
* a provided maximum length. This class delegates the boundary search to a first level
28-
* break iterator. When this break iterator finds a passage greater than the maximum length
26+
* A custom break iterator that is used to find break-delimited passages bounded by
27+
* a provided maximum length in the {@link UnifiedHighlighter} context.
28+
* This class uses a {@link BreakIterator} to find the last break after the provided offset
29+
* that would create a passage smaller than <code>maxLen</code>.
30+
* If the {@link BreakIterator} cannot find a passage smaller than the maximum length,
2931
* a secondary break iterator is used to re-split the passage at the first boundary after
3032
* maximum length.
33+
*
3134
* This is useful to split passages created by {@link BreakIterator}s like `sentence` that
3235
* can create big outliers on semi-structured text.
3336
*
37+
*
3438
* WARNING: This break iterator is designed to work with the {@link UnifiedHighlighter}.
39+
*
40+
* TODO: We should be able to create passages incrementally, starting from the offset of the first match and expanding or not
41+
* depending on the offsets of subsequent matches. This is currently impossible because {@link FieldHighlighter} uses
42+
* only the first matching offset to derive the start and end of each passage.
3543
**/
3644
public class BoundedBreakIteratorScanner extends BreakIterator {
3745
private final BreakIterator mainBreak;
@@ -93,7 +101,15 @@ public int preceding(int offset) {
93101
innerEnd = windowEnd;
94102
} else {
95103
windowStart = innerStart = mainBreak.preceding(offset);
96-
windowEnd = innerEnd = mainBreak.following(offset-1);
104+
windowEnd = innerEnd = mainBreak.following(offset - 1);
105+
// expand to next break until we reach maxLen
106+
while (innerEnd - innerStart < maxLen) {
107+
int newEnd = mainBreak.following(innerEnd);
108+
if (newEnd == DONE || (newEnd - innerStart) > maxLen) {
109+
break;
110+
}
111+
windowEnd = innerEnd = newEnd;
112+
}
97113
}
98114

99115
if (innerEnd - innerStart > maxLen) {

core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,20 @@ public void testSentenceBoundedBreakIterator() throws Exception {
184184
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
185185
}
186186

187+
public void testSmallSentenceBoundedBreakIterator() throws Exception {
188+
final String[] inputs = {
189+
"A short sentence. Followed by a bigger sentence that should be truncated. And a last short sentence."
190+
};
191+
final String[] outputs = {
192+
"A short <b>sentence</b>.",
193+
"Followed by a bigger <b>sentence</b>",
194+
"And a last short <b>sentence</b>"
195+
};
196+
TermQuery query = new TermQuery(new Term("text", "sentence"));
197+
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
198+
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 20), 0, outputs);
199+
}
200+
187201
public void testRepeat() throws Exception {
188202
final String[] inputs = {
189203
"Fun fun fun fun fun fun fun fun fun fun"
@@ -205,4 +219,25 @@ public void testRepeat() throws Exception {
205219
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
206220
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
207221
}
222+
223+
public void testGroupSentences() throws Exception {
224+
final String[] inputs = {
225+
"Two words. Followed by many words in a big sentence. One. Two. Three. And more words."
226+
};
227+
final String[] outputs = {
228+
"<b>Two</b> <b>words</b>.",
229+
"Followed by many <b>words</b>",
230+
"<b>One</b>. <b>Two</b>. <b>Three</b>.",
231+
"And more <b>words</b>.",
232+
};
233+
BooleanQuery query = new BooleanQuery.Builder()
234+
.add(new TermQuery(new Term("text", "one")), BooleanClause.Occur.SHOULD)
235+
.add(new TermQuery(new Term("text", "two")), BooleanClause.Occur.SHOULD)
236+
.add(new TermQuery(new Term("text", "three")), BooleanClause.Occur.SHOULD)
237+
.add(new TermQuery(new Term("text", "words")), BooleanClause.Occur.SHOULD)
238+
.build();
239+
assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
240+
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 20), 0, outputs);
241+
}
242+
208243
}

0 commit comments

Comments
 (0)