Include all sentences smaller than fragment_size in the unified highlighter (#28132)

jimczi · web-flow · commit 87c841d17875 · 2018-01-11T13:26:11.000+01:00
The unified highlighter selects a single sentence per fragment, starting from the offset of the first highlighted term. This change modifies that selection to allow more than one sentence in a single fragment. The expansion is done forward only (to the right of the matching offset): subsequent sentences are added to the current fragment as long as the overall size of the fragment does not exceed the maximum length (fragment_size). We should also add a way to expand the left context with the surrounding sentences, but this is currently avoided because the unified highlighter in Lucene uses only the first offset that matches the query to derive the start and end offsets of the next fragment. If we expanded on the left, we could split multiple terms that would otherwise be grouped together. Lifting this limitation requires changes in the core of the unified highlighter. Closes #28089
diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java b/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
@@ -23,15 +23,23 @@
 import java.util.Locale;
 
 /**
- * A custom break iterator that scans text to find break-delimited passages bounded by
- * a provided maximum length. This class delegates the boundary search to a first level
- * break iterator. When this break iterator finds a passage greater than the maximum length
+ * A custom break iterator that is used to find break-delimited passages bounded by
+ * a provided maximum length in the {@link UnifiedHighlighter} context.
+ * This class uses a {@link BreakIterator} to find the last break after the provided offset
+ * that would create a passage smaller than <code>maxLen</code>.
+ * If the {@link BreakIterator} cannot find a passage smaller than the maximum length,
  * a secondary break iterator is used to re-split the passage at the first boundary after
  * maximum length.
+ *
  * This is useful to split passages created by {@link BreakIterator}s like `sentence` that
  * can create big outliers on semi-structured text.
  *
+ *
  * WARNING: This break iterator is designed to work with the {@link UnifiedHighlighter}.
+ *
+ * TODO: We should be able to create passages incrementally, starting from the offset of the first match and expanding or not
+ * depending on the offsets of subsequent matches. This is currently impossible because {@link FieldHighlighter} uses
+ * only the first matching offset to derive the start and end of each passage.
  **/
 public class BoundedBreakIteratorScanner extends BreakIterator {
     private final BreakIterator mainBreak;
@@ -93,7 +101,15 @@ public int preceding(int offset) {
             innerEnd = windowEnd;
         } else {
             windowStart = innerStart = mainBreak.preceding(offset);
-            windowEnd = innerEnd = mainBreak.following(offset-1);
+            windowEnd = innerEnd = mainBreak.following(offset - 1);
+            // expand to next break until we reach maxLen
+            while (innerEnd - innerStart < maxLen) {
+                int newEnd = mainBreak.following(innerEnd);
+                if (newEnd == DONE || (newEnd - innerStart) > maxLen) {
+                    break;
+                }
+                windowEnd = innerEnd = newEnd;
+            }
         }
 
         if (innerEnd - innerStart > maxLen) {
diff --git a/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java
@@ -184,6 +184,20 @@ public void testSentenceBoundedBreakIterator() throws Exception {
             BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
     }
 
+    public void testSmallSentenceBoundedBreakIterator() throws Exception {
+        final String[] inputs = {
+            "A short sentence. Followed by a bigger sentence that should be truncated. And a last short sentence."
+        };
+        final String[] outputs = {
+            "A short <b>sentence</b>.",
+            "Followed by a bigger <b>sentence</b>",
+            "And a last short <b>sentence</b>"
+        };
+        TermQuery query = new TermQuery(new Term("text", "sentence"));
+        assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
+            BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 20), 0, outputs);
+    }
+
     public void testRepeat() throws Exception {
         final String[] inputs = {
             "Fun  fun fun  fun  fun  fun  fun  fun  fun  fun"
@@ -205,4 +219,25 @@ public void testRepeat() throws Exception {
         assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
             BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs);
     }
+
+    public void testGroupSentences() throws Exception {
+        final String[] inputs = {
+            "Two words. Followed by many words in a big sentence. One. Two. Three. And more words."
+        };
+        final String[] outputs = {
+            "<b>Two</b> <b>words</b>.",
+            "Followed by many <b>words</b>",
+            "<b>One</b>. <b>Two</b>. <b>Three</b>.",
+            "And more <b>words</b>.",
+        };
+        BooleanQuery query = new BooleanQuery.Builder()
+            .add(new TermQuery(new Term("text", "one")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "two")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "three")), BooleanClause.Occur.SHOULD)
+            .add(new TermQuery(new Term("text", "words")), BooleanClause.Occur.SHOULD)
+            .build();
+        assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT,
+            BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 20), 0, outputs);
+    }
+
 }
diff --git a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java