From 792160af87b403b4a493e653e5377a919a490395 Mon Sep 17 00:00:00 2001
From: Jim Ferenczi
Date: Wed, 1 Mar 2017 13:31:50 +0100
Subject: [PATCH 1/2] Add support for fragment_length in the unified highlighter

This commit introduces a new break iterator (a BoundedBreakIteratorScanner) designed for the
unified highlighter that can limit the size of fragments produced by a generic break iterator
like `sentence`.
The `unified` highlighter now supports `boundary_scanner`, which can be set to `words` or
`sentence`. The `sentence` mode uses the bounded break iterator in order to limit the size of
each sentence to `fragment_length`. When a sentence is bigger than `fragment_length`, this mode
breaks it at the next word boundary **after** `fragment_length` is reached.
---
 .../BoundedBreakIteratorScanner.java          | 149 +++++++++
 .../uhighlight/CustomFieldHighlighter.java    |  59 ++++
 .../uhighlight/CustomUnifiedHighlighter.java  |  50 +--
 .../highlight/FastVectorHighlighter.java      | 110 +++---
 .../subphase/highlight/HighlightBuilder.java  |   2 +-
 .../highlight/UnifiedHighlighter.java         |  44 ++-
 .../BoundedBreakIteratorScannerTests.java     | 111 +++++++
 .../CustomUnifiedHighlighterTests.java        | 314 ++++++++----------
 .../highlight/HighlighterSearchIT.java        | 167 ++++++----
 .../search/request/highlighting.asciidoc     |  22 +-
 10 files changed, 692 insertions(+), 336 deletions(-)
 create mode 100644 core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
 create mode 100644 core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java
 create mode 100644 core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java

diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java b/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
new file mode 100644
index 0000000000000..6aeb1b8ab7e20
--- /dev/null
+++ b/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.text.BreakIterator;
+import java.text.CharacterIterator;
+import java.util.Locale;
+
+/**
+ * A custom break iterator that scans text to find break-delimited passages bounded by a provided {@code maxLen}.
+ * This scanner uses two levels of {@link BreakIterator} to bound the passage size "close" to {@code maxLen}.
+ * This is useful to "bound" {@link BreakIterator}s like `sentence` that can create big outliers on
+ * semi-structured text.
+ *
+ * @warning This break iterator is designed to work with the {@link UnifiedHighlighter} only.
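+ *
+ * For example (illustrative numbers, not prescribed by this patch): with {@code maxLen = 40},
+ * a single 120-character sentence is not returned as one oversized passage; instead
+ * {@link #preceding(int)} and {@link #following(int)} cut the passage at the first word
+ * boundary found once the 40-character budget around the first matching term is exhausted.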
+ **/ +public class BoundedBreakIteratorScanner extends BreakIterator { + private final BreakIterator mainBreak; + private final BreakIterator innerBreak; + private final int maxLen; + + private int lastPrecedingOffset = -1; + private int windowStart = -1; + private int windowEnd = -1; + private int innerStart = -1; + private int innerEnd = -1; + + private BoundedBreakIteratorScanner(BreakIterator mainBreak, BreakIterator innerBreak, int maxLen) { + this.mainBreak = mainBreak; + this.innerBreak = innerBreak; + this.maxLen = maxLen; + } + + @Override + public CharacterIterator getText() { + return mainBreak.getText(); + } + + @Override + public void setText(CharacterIterator newText) { + reset(); + mainBreak.setText(newText); + innerBreak.setText(newText); + } + + @Override + public void setText(String newText) { + reset(); + mainBreak.setText(newText); + innerBreak.setText(newText); + } + + private void reset() { + lastPrecedingOffset = -1; + windowStart = -1; + windowEnd = -1; + innerStart = -1; + innerEnd = -1; + } + + @Override + public int preceding(int offset) { + assert(offset > lastPrecedingOffset); + if (offset > windowStart && offset < windowEnd) { + innerStart = innerEnd; + innerEnd = windowEnd; + } else { + windowStart = innerStart = mainBreak.preceding(offset); + windowEnd = innerEnd = mainBreak.following(offset-1); + } + + if (innerEnd - innerStart > maxLen) { + // the current split is too big, + // so starting from the current term we try to find boundaries on the left first + if (offset - maxLen > innerStart) { + innerStart = Math.max(innerStart, innerBreak.preceding(offset - maxLen)); + } + // and then we try to expand the passage to the right with the remaining size + int remaining = Math.max(0, maxLen - (offset - innerStart)); + if (offset + remaining < windowEnd) { + innerEnd = Math.min(windowEnd, innerBreak.following(offset + remaining)); + } + } + lastPrecedingOffset = offset - 1; + return innerStart; + } + + @Override + public int following(int offset) { + assert(offset == lastPrecedingOffset && innerEnd != -1); + return innerEnd; + } + + /** + * Returns a {@link BreakIterator#getSentenceInstance(Locale)} bounded to {@code maxLen}. + * Secondary boundaries are found using a {@link BreakIterator#getWordInstance(Locale)}. 
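+     * For instance (a usage sketch, shown for illustration only),
+     * {@code getSentence(Locale.ROOT, 100)} returns a sentence iterator whose passages stay
+     * close to 100 characters on semi-structured text.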
+ */ + public static BreakIterator getSentence(Locale locale, int maxLen) { + final BreakIterator sBreak = BreakIterator.getSentenceInstance(locale); + final BreakIterator wBreak = BreakIterator.getWordInstance(locale); + return new BoundedBreakIteratorScanner(sBreak, wBreak, maxLen); + } + + @Override + public int first() { + return 0; + } + + @Override + public int current() { + return 0; + } + + @Override + public int next() { + throw new IllegalStateException("next() should not be called in this context"); + } + + @Override + public int last() { + throw new IllegalStateException("last should not be called in this context"); + } + + @Override + public int next(int n) { + throw new IllegalStateException("next(n) should not be call"); + } + + @Override + public int previous() { + throw new IllegalStateException("previous should not be call"); + } +} diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java new file mode 100644 index 0000000000000..9515648355f04 --- /dev/null +++ b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java @@ -0,0 +1,59 @@ +package org.apache.lucene.search.uhighlight; + +import java.text.BreakIterator; +import java.util.Locale; + +import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR; + +/** + * Custom {@link FieldHighlighter} that creates a single passage bounded to {@code noMatchSize} when + * no highlights were found. + */ +class CustomFieldHighlighter extends FieldHighlighter { + private static final Passage[] EMPTY_PASSAGE = new Passage[0]; + + private final Locale breakIteratorLocale; + private final int noMatchSize; + private final String fieldValue; + + public CustomFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy, + Locale breakIteratorLocale, BreakIterator breakIterator, + PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages, + PassageFormatter passageFormatter, int noMatchSize, String fieldValue) { + super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages, maxNoHighlightPassages, passageFormatter); + this.breakIteratorLocale = breakIteratorLocale; + this.noMatchSize = noMatchSize; + this.fieldValue = fieldValue; + } + + @Override + protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) { + if (noMatchSize > 0) { + int pos = 0; + while (pos < fieldValue.length() && fieldValue.charAt(pos) == MULTIVAL_SEP_CHAR) { + pos ++; + } + if (pos < fieldValue.length()) { + int end = fieldValue.indexOf(MULTIVAL_SEP_CHAR, pos); + if (end == -1) { + end = fieldValue.length(); + } + if (noMatchSize < end) { + BreakIterator bi = BreakIterator.getWordInstance(breakIteratorLocale); + bi.setText(fieldValue); + // Finds the next word boundary **after** noMatchSize. 
+                    end = bi.following(noMatchSize);
+                    if (end == BreakIterator.DONE) {
+                        end = fieldValue.length();
+                    }
+                }
+                Passage passage = new Passage();
+                passage.setScore(Float.NaN);
+                passage.setStartOffset(0);
+                passage.setEndOffset(end);
+                return new Passage[]{passage};
+            }
+        }
+        return EMPTY_PASSAGE;
+    }
+}
diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java
index 4f1ec5fdb83a4..4a20fb0478f9a 100644
--- a/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java
+++ b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java
@@ -33,6 +33,8 @@
 import org.apache.lucene.search.spans.SpanOrQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.lucene.all.AllTermQuery;
 import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
@@ -47,6 +49,7 @@
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Set;
 
 /**
  * Subclass of the {@link UnifiedHighlighter} that works for a single field in a single document.
@@ -57,37 +60,41 @@
  * Supports both returning empty snippets and non highlighted snippets when no highlighting can be performed.
  */
 public class CustomUnifiedHighlighter extends UnifiedHighlighter {
+    public static final char MULTIVAL_SEP_CHAR = (char) 0;
     private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];
 
     private final String fieldValue;
     private final PassageFormatter passageFormatter;
     private final BreakIterator breakIterator;
-    private final boolean returnNonHighlightedSnippets;
+    private final Locale breakIteratorLocale;
+    private final int noMatchSize;
 
     /**
      * Creates a new instance of {@link CustomUnifiedHighlighter}
      *
      * @param analyzer the analyzer used for the field at index time, used for multi term queries internally
      * @param passageFormatter our own {@link CustomPassageFormatter}
-     *                    which generates snippets in forms of {@link Snippet} objects
+     *                         which generates snippets in the form of {@link Snippet} objects
+     * @param breakIteratorLocale the {@link Locale} to use for dividing text into passages.
+     *                            If null, {@link Locale#ROOT} is used
      * @param breakIterator the {@link BreakIterator} to use for dividing text into passages.
-     *                    If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
-     * @param fieldValue the original field values as constructor argument, loaded from the _source field or
-     *                    the relevant stored field.
-     * @param returnNonHighlightedSnippets whether non highlighted snippets should be
-     *                    returned rather than empty snippets when no highlighting can be performed
+     *                      If null, {@link BreakIterator#getSentenceInstance(Locale)} is used.
+     * @param fieldValue the original field values delimited by {@code MULTIVAL_SEP_CHAR}
+     * @param noMatchSize the size of the text that should be returned when no highlighting can be performed
     */
     public CustomUnifiedHighlighter(IndexSearcher searcher, Analyzer analyzer,
                                     PassageFormatter passageFormatter,
+                                    @Nullable Locale breakIteratorLocale,
                                     @Nullable BreakIterator breakIterator,
                                     String fieldValue,
-                                    boolean returnNonHighlightedSnippets) {
+                                    int noMatchSize) {
         super(searcher, analyzer);
         this.breakIterator = breakIterator;
+        this.breakIteratorLocale = breakIteratorLocale == null ?
+            Locale.ROOT : breakIteratorLocale;
         this.passageFormatter = passageFormatter;
         this.fieldValue = fieldValue;
-        this.returnNonHighlightedSnippets = returnNonHighlightedSnippets;
+        this.noMatchSize = noMatchSize;
     }
 
     /**
@@ -111,16 +118,13 @@ public Snippet[] highlightField(String field, Query query, int docId, int maxPas
     @Override
     protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter, int cacheCharsThreshold) throws IOException {
-        //we only highlight one field, one document at a time
+        // we only highlight one field, one document at a time
         return Collections.singletonList(new String[]{fieldValue});
     }
 
     @Override
     protected BreakIterator getBreakIterator(String field) {
-        if (breakIterator != null) {
-            return breakIterator;
-        }
-        return super.getBreakIterator(field);
+        return breakIterator;
     }
 
     @Override
@@ -129,11 +133,18 @@ protected PassageFormatter getFormatter(String field) {
     }
 
     @Override
-    protected int getMaxNoHighlightPassages(String field) {
-        if (returnNonHighlightedSnippets) {
-            return 1;
-        }
-        return 0;
+    protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
+        BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms);
+        Set<HighlightFlag> highlightFlags = getFlags(field);
+        PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
+        CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
+        OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
+        BreakIterator breakIterator = new SplittingBreakIterator(getBreakIterator(field),
+            UnifiedHighlighter.MULTIVAL_SEP_CHAR);
+        FieldOffsetStrategy strategy =
+            getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
+        return new CustomFieldHighlighter(field, strategy, breakIteratorLocale, breakIterator,
+            getScorer(field), maxPassages, (noMatchSize > 0 ? 1 : 0), getFormatter(field), noMatchSize, fieldValue);
     }
 
     @Override
@@ -146,7 +157,6 @@ protected Collection<Query> preSpanQueryRewrite(Query query) {
         return rewriteCustomQuery(query);
     }
 
-
     /**
      * Translate custom queries in queries that are supported by the unified highlighter.
      */
diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java
index 0c15f1106b19c..b1d557e851a6c 100644
--- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java
+++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java
@@ -52,13 +52,14 @@ public class FastVectorHighlighter implements Highlighter {
 
     private static final BoundaryScanner DEFAULT_SIMPLE_BOUNDARY_SCANNER = new SimpleBoundaryScanner();
-    private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
-        BreakIterator.getSentenceInstance(Locale.ROOT));
-    private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
-        BreakIterator.getWordInstance(Locale.ROOT));
+    private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER =
+        new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(Locale.ROOT));
+    private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER =
+        new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(Locale.ROOT));
+
+    public static final Setting<Boolean> SETTING_TV_HIGHLIGHT_MULTI_VALUE =
+        Setting.boolSetting("search.highlight.term_vector_multi_value", true, Setting.Property.NodeScope);
 
-    public static final Setting<Boolean> SETTING_TV_HIGHLIGHT_MULTI_VALUE = Setting.boolSetting("search.highlight.term_vector_multi_value",
-        true, Setting.Property.NodeScope);
     private static final String CACHE_KEY = "highlight-fsv";
 
     private final Boolean termVectorMultiValue;
@@ -74,11 +75,12 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
         FieldMapper mapper = highlighterContext.mapper;
 
         if (canHighlight(mapper) == false) {
-            throw new IllegalArgumentException("the field [" + highlighterContext.fieldName
-                + "] should be indexed with term vector with position offsets to be used with fast vector highlighter");
+            throw new IllegalArgumentException("the field [" + highlighterContext.fieldName +
+                "] should be indexed with term vector with position offsets to be used with fast vector highlighter");
         }
 
-        Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
+        Encoder encoder = field.fieldOptions().encoder().equals("html") ?
+            HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
 
         if (!hitContext.cache().containsKey(CACHE_KEY)) {
             hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
@@ -90,21 +92,21 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
             if (field.fieldOptions().requireFieldMatch()) {
                 if (cache.fieldMatchFieldQuery == null) {
                     /*
-                     * we use top level reader to rewrite the query against all readers, with use caching it across hits (and across
-                     * readers...)
+                     * we use top level reader to rewrite the query against all readers,
+                     * while caching it across hits (and across readers...)
                     */
-                    cache.fieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query, hitContext.topLevelReader(),
-                        true, field.fieldOptions().requireFieldMatch());
+                    cache.fieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query,
+                        hitContext.topLevelReader(), true, field.fieldOptions().requireFieldMatch());
                 }
                 fieldQuery = cache.fieldMatchFieldQuery;
             } else {
                 if (cache.noFieldMatchFieldQuery == null) {
                     /*
-                     * we use top level reader to rewrite the query against all readers, with use caching it across hits (and across
-                     * readers...)
+                     * we use top level reader to rewrite the query against all readers,
+                     * while caching it across hits (and across readers...)
                     */
-                    cache.noFieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query, hitContext.topLevelReader(),
-                        true, field.fieldOptions().requireFieldMatch());
+                    cache.noFieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query,
+                        hitContext.topLevelReader(), true, field.fieldOptions().requireFieldMatch());
                 }
                 fieldQuery = cache.noFieldMatchFieldQuery;
             }
@@ -128,7 +130,7 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
                 }
             } else {
                 fragListBuilder = field.fieldOptions().fragmentOffset() == -1 ?
-                    new SimpleFragListBuilder() : new SimpleFragListBuilder(field.fieldOptions().fragmentOffset());
+                        new SimpleFragListBuilder() : new SimpleFragListBuilder(field.fieldOptions().fragmentOffset());
                 if (field.fieldOptions().scoreOrdered()) {
                     if (!forceSource && mapper.fieldType().stored()) {
                         fragmentsBuilder = new ScoreOrderFragmentsBuilder(field.fieldOptions().preTags(),
@@ -142,7 +144,8 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
                         fragmentsBuilder = new SimpleFragmentsBuilder(mapper, field.fieldOptions().preTags(),
                             field.fieldOptions().postTags(), boundaryScanner);
                     } else {
-                        fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.fieldOptions().preTags(),
+                        fragmentsBuilder =
+                            new SourceSimpleFragmentsBuilder(mapper, context, field.fieldOptions().preTags(),
                                 field.fieldOptions().postTags(), boundaryScanner);
                     }
                 }
@@ -153,8 +156,8 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
             entry.fragmentsBuilder = fragmentsBuilder;
             if (cache.fvh == null) {
                 // parameters to FVH are not requires since:
-                // first two booleans are not relevant since they are set on the CustomFieldQuery (phrase and fieldMatch)
-                // fragment builders are used explicitly
+                // first two booleans are not relevant since they are set on the CustomFieldQuery
+                // (phrase and fieldMatch); fragment builders are used explicitly
                 cache.fvh = new org.apache.lucene.search.vectorhighlight.FastVectorHighlighter();
             }
             CustomFieldQuery.highlightFilters.set(field.fieldOptions().highlightFilter());
@@ -172,13 +175,14 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
             // we highlight against the low level reader and docId, because if we load source, we want to reuse it if possible
             // Only send matched fields if they were requested to save time.
if (field.fieldOptions().matchedFields() != null && !field.fieldOptions().matchedFields().isEmpty()) { - fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.fieldType().name(), - field.fieldOptions().matchedFields(), fragmentCharSize, numberOfFragments, entry.fragListBuilder, - entry.fragmentsBuilder, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder); + fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), + mapper.fieldType().name(), field.fieldOptions().matchedFields(), fragmentCharSize, + numberOfFragments, entry.fragListBuilder, entry.fragmentsBuilder, field.fieldOptions().preTags(), + field.fieldOptions().postTags(), encoder); } else { - fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.fieldType().name(), - fragmentCharSize, numberOfFragments, entry.fragListBuilder, entry.fragmentsBuilder, field.fieldOptions().preTags(), - field.fieldOptions().postTags(), encoder); + fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), + mapper.fieldType().name(), fragmentCharSize, numberOfFragments, entry.fragListBuilder, + entry.fragmentsBuilder, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder); } if (fragments != null && fragments.length > 0) { @@ -187,11 +191,13 @@ public HighlightField highlight(HighlighterContext highlighterContext) { int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize(); if (noMatchSize > 0) { - // Essentially we just request that a fragment is built from 0 to noMatchSize using the normal fragmentsBuilder + // Essentially we just request that a fragment is built from 0 to noMatchSize using + // the normal fragmentsBuilder FieldFragList fieldFragList = new SimpleFieldFragList(-1 /*ignored*/); fieldFragList.add(0, noMatchSize, Collections.emptyList()); - fragments = entry.fragmentsBuilder.createFragments(hitContext.reader(), hitContext.docId(), mapper.fieldType().name(), - fieldFragList, 1, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder); + fragments = entry.fragmentsBuilder.createFragments(hitContext.reader(), hitContext.docId(), + mapper.fieldType().name(), fieldFragList, 1, field.fieldOptions().preTags(), + field.fieldOptions().postTags(), encoder); if (fragments != null && fragments.length > 0) { return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments)); } @@ -200,7 +206,8 @@ public HighlightField highlight(HighlighterContext highlighterContext) { return null; } catch (Exception e) { - throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e); + throw new FetchPhaseExecutionException(context, + "Failed to highlight field [" + highlighterContext.fieldName + "]", e); } } @@ -212,24 +219,31 @@ public boolean canHighlight(FieldMapper fieldMapper) { private static BoundaryScanner getBoundaryScanner(Field field) { final FieldOptions fieldOptions = field.fieldOptions(); - final Locale boundaryScannerLocale = fieldOptions.boundaryScannerLocale(); - switch(fieldOptions.boundaryScannerType()) { - case SENTENCE: - if (boundaryScannerLocale != null) { - return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(boundaryScannerLocale)); - } - return DEFAULT_SENTENCE_BOUNDARY_SCANNER; - case WORD: - if (boundaryScannerLocale != null) { - return new 
BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(boundaryScannerLocale)); - } - return DEFAULT_WORD_BOUNDARY_SCANNER; - default: - if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN + final Locale boundaryScannerLocale = + fieldOptions.boundaryScannerLocale() != null ? fieldOptions.boundaryScannerLocale() : + Locale.ROOT; + final HighlightBuilder.BoundaryScannerType type = + fieldOptions.boundaryScannerType() != null ? fieldOptions.boundaryScannerType() : + HighlightBuilder.BoundaryScannerType.CHARS; + switch(type) { + case SENTENCE: + if (boundaryScannerLocale != null) { + return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(boundaryScannerLocale)); + } + return DEFAULT_SENTENCE_BOUNDARY_SCANNER; + case WORD: + if (boundaryScannerLocale != null) { + return new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(boundaryScannerLocale)); + } + return DEFAULT_WORD_BOUNDARY_SCANNER; + case CHARS: + if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN || fieldOptions.boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) { - return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars()); - } - return DEFAULT_SIMPLE_BOUNDARY_SCANNER; + return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars()); + } + return DEFAULT_SIMPLE_BOUNDARY_SCANNER; + default: + throw new IllegalArgumentException("Invalid boundary scanner type: " + type.toString()); } } diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java index 45b8c612a7610..c7c9c547b511c 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java @@ -95,7 +95,7 @@ public class HighlightBuilder extends AbstractHighlighterBuilder 0); + field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, + field.fieldOptions().noMatchSize()); numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value } else { //using paragraph separator we make sure that each field value holds a discrete passage for highlighting - String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.PARAGRAPH_SEPARATOR); + String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR); + BreakIterator bi = getBreakIterator(field); highlighter = new CustomUnifiedHighlighter(searcher, analyzer, - mapperHighlighterEntry.passageFormatter, null, fieldValue, field.fieldOptions().noMatchSize() > 0); + mapperHighlighterEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), bi, + fieldValue, field.fieldOptions().noMatchSize()); numberOfFragments = field.fieldOptions().numberOfFragments(); } if (field.fieldOptions().requireFieldMatch()) { @@ -144,11 +151,34 @@ public HighlightField highlight(HighlighterContext highlighterContext) { return null; } - static class HighlighterEntry { + private BreakIterator getBreakIterator(SearchContextHighlight.Field field) { + final SearchContextHighlight.FieldOptions fieldOptions = field.fieldOptions(); + final Locale locale = + fieldOptions.boundaryScannerLocale() != null ? fieldOptions.boundaryScannerLocale() : + Locale.ROOT; + final HighlightBuilder.BoundaryScannerType type = + fieldOptions.boundaryScannerType() != null ? 
fieldOptions.boundaryScannerType() : + HighlightBuilder.BoundaryScannerType.SENTENCE; + int maxLen = fieldOptions.fragmentCharSize(); + switch (type) { + case SENTENCE: + if (maxLen > 0) { + return BoundedBreakIteratorScanner.getSentence(locale, maxLen); + } + return BreakIterator.getSentenceInstance(locale); + case WORD: + // ignore maxLen + return BreakIterator.getWordInstance(locale); + default: + throw new IllegalArgumentException("Invalid boundary scanner type: " + type.toString()); + } + } + + private static class HighlighterEntry { Map mappers = new HashMap<>(); } - static class MapperHighlighterEntry { + private static class MapperHighlighterEntry { final CustomPassageFormatter passageFormatter; private MapperHighlighterEntry(CustomPassageFormatter passageFormatter) { diff --git a/core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java b/core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java new file mode 100644 index 0000000000000..2c41eb3319fd2 --- /dev/null +++ b/core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java @@ -0,0 +1,111 @@ +package org.apache.lucene.search.uhighlight; + +import org.elasticsearch.test.ESTestCase; + +import java.text.BreakIterator; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; + +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; +import static org.hamcrest.Matchers.lessThanOrEqualTo; + +public class BoundedBreakIteratorScannerTests extends ESTestCase { + private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) { + // Generate a random set of unique terms with ascii character + int maxSize = randomIntBetween(5, 20); + String[] vocabulary = new String[maxSize]; + for (int i = 0; i < maxSize; i++) { + if (rarely()) { + vocabulary[i] = randomAsciiOfLengthBetween(50, 200); + } else { + vocabulary[i] = randomAsciiOfLengthBetween(1, 30); + } + } + + // Generate a random text made of random terms separated with word-boundaries and sentence-boundaries. + StringBuilder text = new StringBuilder(); + List offsetList = new ArrayList<> (); + List sizeList = new ArrayList<> (); + // the number of sentences to generate + int numSentences = randomIntBetween(10, 100); + int maxTermLen = 0; + for (int i = 0; i < numSentences; i++) { + // the number of terms in the sentence + int numTerms = randomIntBetween(5, 10); + for (int j = 0; j < numTerms; j++) { + int termId = randomIntBetween(0, vocabulary.length - 1); + String term = vocabulary[termId].toLowerCase(); + if (j == 0) { + // capitalize the first letter of the first term in the sentence + term = term.substring(0, 1).toUpperCase() + term.substring(1); + } else { + // word boundary + String sep = randomFrom(" ", " ", "\t", "#", "\n"); + text.append(sep); + } + maxTermLen = Math.max(term.length(), maxTermLen); + offsetList.add(text.length()); + sizeList.add(term.length()); + text.append(term); + } + // sentence boundary + String boundary = randomFrom("! ", "? ", ". 
", ".\n", ".\n\n"); + text.append(boundary); + } + + int[] sizes = sizeList.stream().mapToInt(i->i).toArray(); + int[] offsets = offsetList.stream().mapToInt(i->i).toArray(); + + bi.setText(text.toString()); + int currentPos = randomIntBetween(0, 20); + int lastEnd = -1; + int maxPassageLen = maxLen+(maxTermLen*2); + while (currentPos < offsets.length) { + // find the passage that contains the current term + int nextOffset = offsets[currentPos]; + int start = bi.preceding(nextOffset+1); + int end = bi.following(nextOffset); + + // check that the passage is valid + assertThat(start, greaterThanOrEqualTo(lastEnd)); + assertThat(end, greaterThan(start)); + assertThat(start, lessThanOrEqualTo(nextOffset)); + assertThat(end, greaterThanOrEqualTo(nextOffset)); + int passageLen = end-start; + assertThat(passageLen, lessThanOrEqualTo(maxPassageLen)); + + // checks that the start and end of the passage are on word boundaries. + int startPos = Arrays.binarySearch(offsets, start); + int endPos = Arrays.binarySearch(offsets, end); + if (startPos < 0) { + int lastWordEnd = offsets[Math.abs(startPos)-2] + sizes[Math.abs(startPos)-2]; + assertThat(start, greaterThanOrEqualTo(lastWordEnd)); + } + if (endPos < 0) { + if (Math.abs(endPos)-2 < offsets.length) { + int lastWordEnd = offsets[Math.abs(endPos) - 2] + sizes[Math.abs(endPos) - 2]; + assertThat(end, greaterThanOrEqualTo(lastWordEnd)); + } + // advance the position to the end of the current passage + currentPos = (Math.abs(endPos) - 1); + } else { + // advance the position to the end of the current passage + currentPos = endPos; + } + // randomly advance to the next term to highlight + currentPos += randomIntBetween(0, 20); + lastEnd = end; + } + } + + public void testBoundedSentence() { + for (int i = 0; i < 20; i++) { + int maxLen = randomIntBetween(10, 500); + testRandomAsciiTextCase(BoundedBreakIteratorScanner.getSentence(Locale.ROOT, maxLen), + maxLen); + } + } +} diff --git a/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java index 83b42750f92ce..23e867d2573ce 100644 --- a/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java +++ b/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java @@ -20,20 +20,22 @@ package org.apache.lucene.search.uhighlight; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.queries.CommonTermsQuery; import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; import org.apache.lucene.search.TermQuery; @@ -41,219 +43,167 @@ import org.apache.lucene.search.highlight.DefaultEncoder; import 
org.apache.lucene.search.highlight.Snippet; import org.apache.lucene.store.Directory; +import org.elasticsearch.common.Strings; import org.elasticsearch.common.lucene.all.AllTermQuery; import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery; -import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils; import org.elasticsearch.test.ESTestCase; -import java.io.IOException; +import java.text.BreakIterator; +import java.util.Locale; +import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR; import static org.hamcrest.CoreMatchers.equalTo; public class CustomUnifiedHighlighterTests extends ESTestCase { - public void testCustomUnifiedHighlighter() throws Exception { + private void assertHighlightOneDoc(String fieldName, String[] inputs, Analyzer analyzer, Query query, + Locale locale, BreakIterator breakIterator, + int noMatchSize, String[] expectedPassages) throws Exception { Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - offsetsType.setStoreTermVectorOffsets(true); - offsetsType.setStoreTermVectorPositions(true); - offsetsType.setStoreTermVectors(true); - - //good position but only one match - final String firstValue = "This is a test. Just a test1 highlighting from unified highlighter."; - Field body = new Field("body", "", offsetsType); - Document doc = new Document(); - doc.add(body); - body.setStringValue(firstValue); - - //two matches, not the best snippet due to its length though - final String secondValue = "This is the second highlighting value to perform highlighting on a longer text " + - "that gets scored lower."; - Field body2 = new Field("body", "", offsetsType); - doc.add(body2); - body2.setStringValue(secondValue); - - //two matches and short, will be scored highest - final String thirdValue = "This is highlighting the third short highlighting value."; - Field body3 = new Field("body", "", offsetsType); - doc.add(body3); - body3.setStringValue(thirdValue); - - //one match, same as first but at the end, will be scored lower due to its position - final String fourthValue = "Just a test4 highlighting from unified highlighter."; - Field body4 = new Field("body", "", offsetsType); - doc.add(body4); - body4.setStringValue(fourthValue); - - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - String firstHlValue = "Just a test1 highlighting from unified highlighter."; - String secondHlValue = "This is the second highlighting value to perform highlighting on a" + - " longer text that gets scored lower."; - String thirdHlValue = "This is highlighting the third short highlighting value."; - String fourthHlValue = "Just a test4 highlighting from unified highlighter."; - - IndexSearcher searcher = newSearcher(ir); - Query query = new TermQuery(new Term("body", "highlighting")); - - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - - int docId = topDocs.scoreDocs[0].doc; - - String fieldValue = firstValue + HighlightUtils.PARAGRAPH_SEPARATOR + secondValue + - HighlightUtils.PARAGRAPH_SEPARATOR + thirdValue + HighlightUtils.PARAGRAPH_SEPARATOR + fourthValue; - - CustomUnifiedHighlighter highlighter = new 
CustomUnifiedHighlighter(searcher, iwc.getAnalyzer(), - new CustomPassageFormatter("", "", new DefaultEncoder()), null, fieldValue, true); - Snippet[] snippets = highlighter.highlightField("body", query, docId, 5); - - assertThat(snippets.length, equalTo(4)); - - assertThat(snippets[0].getText(), equalTo(firstHlValue)); - assertThat(snippets[1].getText(), equalTo(secondHlValue)); - assertThat(snippets[2].getText(), equalTo(thirdHlValue)); - assertThat(snippets[3].getText(), equalTo(fourthHlValue)); - ir.close(); - dir.close(); - } - - public void testNoMatchSize() throws Exception { - Directory dir = newDirectory(); - Analyzer analyzer = new StandardAnalyzer(); IndexWriterConfig iwc = newIndexWriterConfig(analyzer); - iwc.setMergePolicy(newLogMergePolicy()); + iwc.setMergePolicy(newTieredMergePolicy(random())); RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - offsetsType.setStoreTermVectorOffsets(true); - offsetsType.setStoreTermVectorPositions(true); - offsetsType.setStoreTermVectors(true); - Field body = new Field("body", "", offsetsType); - Field none = new Field("none", "", offsetsType); + FieldType ft = new FieldType(TextField.TYPE_STORED); + ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + ft.freeze(); Document doc = new Document(); - doc.add(body); - doc.add(none); - - String firstValue = "This is a test. Just a test highlighting from unified. Feel free to ignore."; - body.setStringValue(firstValue); - none.setStringValue(firstValue); + for (String input : inputs) { + Field field = new Field(fieldName, "", ft); + field.setStringValue(input); + doc.add(field); + } iw.addDocument(doc); - - IndexReader ir = iw.getReader(); + DirectoryReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); iw.close(); - - Query query = new TermQuery(new Term("none", "highlighting")); - - IndexSearcher searcher = newSearcher(ir); - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); + TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER); assertThat(topDocs.totalHits, equalTo(1)); - int docId = topDocs.scoreDocs[0].doc; - - CustomPassageFormatter passageFormatter = new CustomPassageFormatter("", "", new DefaultEncoder()); - CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, passageFormatter, - null, firstValue, false); - Snippet[] snippets = highlighter.highlightField("body", query, docId, 5); - assertThat(snippets.length, equalTo(0)); - - highlighter = new CustomUnifiedHighlighter(searcher, analyzer, passageFormatter, null, firstValue, true); - snippets = highlighter.highlightField("body", query, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("This is a test.")); - ir.close(); + String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR)); + CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, + new CustomPassageFormatter("", "", new DefaultEncoder()), locale, breakIterator, rawValue, + noMatchSize); + highlighter.setFieldMatcher((name) -> "text".equals(name)); + final Snippet[] snippets = + highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length); + assertEquals(snippets.length, expectedPassages.length); + for (int i = 0; i < snippets.length; i++) { + 
assertEquals(snippets[i].getText(), expectedPassages[i]); + } + reader.close(); dir.close(); } + public void testSimple() throws Exception { + final String[] inputs = { + "This is a test. Just a test1 highlighting from unified highlighter.", + "This is the second highlighting value to perform highlighting on a longer text that gets scored lower.", + "This is highlighting the third short highlighting value.", + "Just a test4 highlighting from unified highlighter." + }; + + String[] expectedPassages = { + "Just a test1 highlighting from unified highlighter.", + "This is the second highlighting value to perform highlighting on a" + + " longer text that gets scored lower.", + "This is highlighting the third short highlighting value.", + "Just a test4 highlighting from unified highlighter." + }; + Query query = new TermQuery(new Term("text", "highlighting")); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), 0, expectedPassages); + } - private IndexReader indexOneDoc(Directory dir, String field, String value, Analyzer analyzer) throws IOException { - IndexWriterConfig iwc = newIndexWriterConfig(analyzer); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType ft = new FieldType(TextField.TYPE_STORED); - ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field textField = new Field(field, "", ft); - Document doc = new Document(); - doc.add(textField); - - textField.setStringValue(value); - iw.addDocument(doc); - IndexReader ir = iw.getReader(); - iw.close(); - return ir; + public void testNoMatchSize() throws Exception { + final String[] inputs = { + "This is a test. Just a test highlighting from unified. Feel free to ignore." + }; + Query query = new TermQuery(new Term("body", "highlighting")); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), 100, inputs); } public void testMultiPhrasePrefixQuery() throws Exception { - Analyzer analyzer = new StandardAnalyzer(); - Directory dir = newDirectory(); - String value = "The quick brown fox."; - IndexReader ir = indexOneDoc(dir, "text", value, analyzer); + final String[] inputs = { + "The quick brown fox." + }; + final String[] outputs = { + "The quick brown fox." 
+ }; MultiPhrasePrefixQuery query = new MultiPhrasePrefixQuery(); query.add(new Term("text", "quick")); query.add(new Term("text", "brown")); query.add(new Term("text", "fo")); - IndexSearcher searcher = newSearcher(ir); - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - int docId = topDocs.scoreDocs[0].doc; - CustomPassageFormatter passageFormatter = new CustomPassageFormatter("", "", new DefaultEncoder()); - CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, - passageFormatter, null, value, false); - Snippet[] snippets = highlighter.highlightField("text", query, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("The quick brown fox.")); - ir.close(); - dir.close(); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs); } - public void testAllTermQuery() throws IOException { - Directory dir = newDirectory(); - String value = "The quick brown fox."; - Analyzer analyzer = new StandardAnalyzer(); - IndexReader ir = indexOneDoc(dir, "all", value, analyzer); - AllTermQuery query = new AllTermQuery(new Term("all", "fox")); - IndexSearcher searcher = newSearcher(ir); - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - int docId = topDocs.scoreDocs[0].doc; - CustomPassageFormatter passageFormatter = new CustomPassageFormatter("", "", new DefaultEncoder()); - CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, - passageFormatter, null, value, false); - Snippet[] snippets = highlighter.highlightField("all", query, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("The quick brown fox.")); - ir.close(); - dir.close(); + public void testAllTermQuery() throws Exception { + final String[] inputs = { + "The quick brown fox." + }; + final String[] outputs = { + "The quick brown fox." + }; + AllTermQuery query = new AllTermQuery(new Term("text", "fox")); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs); } - public void testCommonTermsQuery() throws IOException { - Directory dir = newDirectory(); - String value = "The quick brown fox."; - Analyzer analyzer = new StandardAnalyzer(); - IndexReader ir = indexOneDoc(dir, "text", value, analyzer); + public void testCommonTermsQuery() throws Exception { + final String[] inputs = { + "The quick brown fox." + }; + final String[] outputs = { + "The quick brown fox." 
+ }; CommonTermsQuery query = new CommonTermsQuery(BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, 128); query.add(new Term("text", "quick")); query.add(new Term("text", "brown")); query.add(new Term("text", "fox")); - IndexSearcher searcher = newSearcher(ir); - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - int docId = topDocs.scoreDocs[0].doc; - CustomPassageFormatter passageFormatter = new CustomPassageFormatter("", "", new DefaultEncoder()); - CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, - passageFormatter, null, value, false); - Snippet[] snippets = highlighter.highlightField("text", query, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("The quick brown fox.")); - ir.close(); - dir.close(); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs); + } + + public void testSentenceBoundedBreakIterator() throws Exception { + final String[] inputs = { + "The quick brown fox in a long sentence with another quick brown fox. " + + "Another sentence with brown fox." + }; + final String[] outputs = { + "The quick brown", + "fox in a long", + "with another quick", + "brown fox.", + "sentence with brown", + "fox.", + }; + BooleanQuery query = new BooleanQuery.Builder() + .add(new TermQuery(new Term("text", "quick")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "brown")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "fox")), BooleanClause.Occur.SHOULD) + .build(); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs); + } + + public void testRepeat() throws Exception { + final String[] inputs = { + "Fun fun fun fun fun fun fun fun fun fun" + }; + final String[] outputs = { + "Fun fun fun", + "fun fun", + "fun fun fun", + "fun fun" + }; + Query query = new TermQuery(new Term("text", "fun")); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs); + + query = new PhraseQuery.Builder() + .add(new Term("text", "fun")) + .add(new Term("text", "fun")) + .build(); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs); } } diff --git a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java index 7db99ff3232ee..11496309d47a9 100644 --- a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java +++ b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java @@ -751,52 +751,69 @@ public void testFastVectorHighlighter() throws Exception { assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over")); } - public void testFastVectorHighlighterWithSentenceBoundaryScanner() throws Exception { + public void testHighlighterWithSentenceBoundaryScanner() throws Exception { assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); ensureGreen(); indexRandom(true, client().prepareIndex("test", "type1") .setSource("field1", "A sentence with few words. 
Another sentence with even more words.")); - logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); - SearchSourceBuilder source = searchSource() + for (String type : new String[] {"unified", "fvh"}) { + logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); + SearchSourceBuilder source = searchSource() .query(termQuery("field1", "sentence")) .highlighter(highlight() - .field("field1", 20, 2) - .order("score") - .preTags("").postTags("") - .boundaryScannerType(BoundaryScannerType.SENTENCE)); + .field("field1", 21, 2) + .highlighterType(type) + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.SENTENCE)); + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); - SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + assertHighlight(searchResponse, 0, "field1", 0, 2, anyOf( + equalTo("A sentence with few words"), + equalTo("A sentence with few words. ") + )); - assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A sentence with few words. ")); - assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another sentence with even more words. ")); + assertHighlight(searchResponse, 0, "field1", 1, 2, anyOf( + equalTo("Another sentence with"), + equalTo("Another sentence with even more words. ") + )); + } } - public void testFastVectorHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception { + public void testHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception { assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); ensureGreen(); indexRandom(true, client().prepareIndex("test", "type1") .setSource("field1", "A sentence with few words. Another sentence with even more words.")); - logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); - SearchSourceBuilder source = searchSource() + for (String type : new String[] {"fvh", "unified"}) { + logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); + SearchSourceBuilder source = searchSource() .query(termQuery("field1", "sentence")) .highlighter(highlight() - .field("field1", 20, 2) - .order("score") - .preTags("").postTags("") - .boundaryScannerType(BoundaryScannerType.SENTENCE) - .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); + .field("field1", 21, 2) + .highlighterType(type) + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.SENTENCE) + .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); - SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); - assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A sentence with few words. ")); - assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another sentence with even more words. ")); + assertHighlight(searchResponse, 0, "field1", 0, 2, anyOf( + equalTo("A sentence with few words"), + equalTo("A sentence with few words. ") + )); + + assertHighlight(searchResponse, 0, "field1", 1, 2, anyOf( + equalTo("Another sentence with"), + equalTo("Another sentence with even more words. 
") + )); + } } - public void testFastVectorHighlighterWithWordBoundaryScanner() throws Exception { + public void testHighlighterWithWordBoundaryScanner() throws Exception { assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); ensureGreen(); @@ -804,39 +821,48 @@ public void testFastVectorHighlighterWithWordBoundaryScanner() throws Exception .setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog")); logger.info("--> highlighting and searching on 'field' with word boundary_scanner"); - SearchSourceBuilder source = searchSource() - .query(termQuery("field1", "some")) - .highlighter(highlight() - .field("field1", 23, 1) - .order("score") - .preTags("").postTags("") - .boundaryScannerType(BoundaryScannerType.WORD)); + for (String type : new String[] {"unified", "fvh"}) { + SearchSourceBuilder source = searchSource() + .query(termQuery("field1", "some")) + .highlighter(highlight() + .field("field1", 23, 1) + .highlighterType(type) + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.WORD)); - SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); - assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("some quick and hairy brown")); + assertHighlight(searchResponse, 0, "field1", 0, 1, anyOf( + equalTo("some quick and hairy brown"), + equalTo("some") + )); + } } - public void testFastVectorHighlighterWithWordBoundaryScannerAndLocale() throws Exception { + public void testHighlighterWithWordBoundaryScannerAndLocale() throws Exception { assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); ensureGreen(); indexRandom(true, client().prepareIndex("test", "type1") .setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog")); - logger.info("--> highlighting and searching on 'field' with word boundary_scanner"); - SearchSourceBuilder source = searchSource() + for (String type : new String[] {"unified", "fvh"}) { + SearchSourceBuilder source = searchSource() .query(termQuery("field1", "some")) .highlighter(highlight() - .field("field1", 23, 1) - .order("score") - .preTags("").postTags("") - .boundaryScannerType(BoundaryScannerType.WORD) - .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); + .field("field1", 23, 1) + .highlighterType(type) + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.WORD) + .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); - SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); - assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("some quick and hairy brown")); + assertHighlight(searchResponse, 0, "field1", 0, 1, anyOf( + equalTo("some quick and hairy brown"), + equalTo("some") + )); + } } /** @@ -1841,13 +1867,13 @@ public void testHighlightNoMatchSize() throws IOException { response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); - // Postings hl also works but the fragment is the whole first sentence (size ignored) - field.highlighterType("postings"); + // Unified hl also works but the fragment is longer than the plain highlighter because of the boundary is the word + field.highlighterType("unified"); response = 
client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); + assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); - // Unified hl also works but the fragment is the whole first sentence (size ignored) - field.highlighterType("unified"); + // Postings hl also works but the fragment is the whole first sentence (size ignored) + field.highlighterType("postings"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); @@ -1860,13 +1886,12 @@ public void testHighlightNoMatchSize() throws IOException { response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo(text)); - //no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0) - field.highlighterType("postings"); + field.highlighterType("unified"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); + assertHighlight(response, 0, "text", 0, 1, equalTo(text)); - //no difference using unified hl as the noMatchSize is ignored (just needs to be greater than 0) - field.highlighterType("unified"); + //no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0) + field.highlighterType("postings"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); @@ -1879,13 +1904,13 @@ public void testHighlightNoMatchSize() throws IOException { response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo(text)); - //no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0) - field.highlighterType("postings"); + // unified hl returns the first sentence as the noMatchSize does not cross sentence boundary. 
+ field.highlighterType("unified"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); + assertHighlight(response, 0, "text", 0, 1, equalTo(text)); - //no difference using unified hl as the noMatchSize is ignored (just needs to be greater than 0) - field.highlighterType("unified"); + //no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0) + field.highlighterType("postings"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); @@ -1898,11 +1923,11 @@ public void testHighlightNoMatchSize() throws IOException { response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); - field.highlighterType("postings"); + field.highlighterType("unified"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); + assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); - field.highlighterType("unified"); + field.highlighterType("postings"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); @@ -1947,13 +1972,12 @@ public void testHighlightNoMatchSizeWithMultivaluedFields() throws IOException { response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); - // Postings hl also works but the fragment is the whole first sentence (size ignored) - field.highlighterType("postings"); + field.highlighterType("unified"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); + assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); - // Unified hl also works but the fragment is the whole first sentence (size ignored) - field.highlighterType("unified"); + // Postings hl also works but the fragment is the whole first sentence (size ignored) + field.highlighterType("postings"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); @@ -1980,11 +2004,12 @@ public void testHighlightNoMatchSizeWithMultivaluedFields() throws IOException { .highlighter(new HighlightBuilder().field(field)).get(); assertNotHighlighted(response, 0, "text"); + // except for the unified highlighter which starts from the first string with actual content field.highlighterType("unified"); response = client().prepareSearch("test") .setQuery(idsQueryBuilder) .highlighter(new HighlightBuilder().field(field)).get(); - assertNotHighlighted(response, 0, "text"); + assertHighlight(response, 0, "text", 0, 1, equalTo("I am short")); // But if the field was actually empty then you should get 
no highlighting field
         index("test", "type1", "3", "text", new String[] {});
 
@@ -2031,7 +2056,7 @@ public void testHighlightNoMatchSizeWithMultivaluedFields() throws IOException {
                 .highlighter(new HighlightBuilder().field(field)).get();
         assertNotHighlighted(response, 0, "text");
 
-        field.highlighterType("fvh");
+        field.highlighterType("unified");
         response = client().prepareSearch("test")
                 .setQuery(idsQueryBuilder)
                 .highlighter(new HighlightBuilder().field(field)).get();
@@ -2081,13 +2106,13 @@ public void testHighlightNoMatchSizeNumberOfFragments() throws IOException {
         response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
         assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence"));
 
-        // Postings hl also works but the fragment is the whole first sentence (size ignored)
-        field.highlighterType("postings");
+        field.highlighterType("unified");
         response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
-        assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence."));
+        assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence"));
 
-        // Unified hl also works but the fragment is the whole first sentence (size ignored)
-        field.highlighterType("unified");
+
+        // Postings hl also works but the fragment is the whole first sentence (size ignored)
+        field.highlighterType("postings");
         response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
         assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence."));
 
diff --git a/docs/reference/search/request/highlighting.asciidoc b/docs/reference/search/request/highlighting.asciidoc
index 81f454bb15852..033ba01cfc4bb 100644
--- a/docs/reference/search/request/highlighting.asciidoc
+++ b/docs/reference/search/request/highlighting.asciidoc
@@ -140,6 +140,9 @@ It supports accurate phrase and multi-term (fuzzy, prefix, regex) highlighting a
 * `highlight_query`
 * `pre_tags` and `post_tags`
 * `require_field_match`
+* `boundary_scanner` (`sentence` (**default**) or `word`)
+* `max_fragment_length` (only for `sentence` scanner)
+* `no_match_size`
 
 ==== Force highlighter type
 
@@ -345,7 +348,7 @@ parameter to control the margin to start highlighting from.
 In the case where there is no matching fragment to highlight, the default is
 to not return anything. Instead, we can return a snippet of text from the
 beginning of the field by setting `no_match_size` (default `0`) to the length
-of the text that you want returned. The actual length may be shorter than
+of the text that you want returned. The actual length may be shorter or longer than
 specified as it tries to break on a word boundary.
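+
+For example, the following hypothetical request (index, field names and sizes are
+made up for illustration) returns roughly the first 100 characters of the `comment`
+field whenever the query produces no matching fragment for it:
+
+[source,js]
+--------------------------------------------------
+GET /_search
+{
+    "query" : {
+        "match" : { "title" : "quick brown fox" }
+    },
+    "highlight" : {
+        "fields" : {
+            "comment" : {
+                "fragment_size" : 150,
+                "no_match_size" : 100
+            }
+        }
+    }
+}
+--------------------------------------------------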
When using the postings highlighter it is not possible to control the actual size of
 the snippet, therefore the first sentence gets returned whenever `no_match_size` is
 
@@ -504,21 +507,26 @@ GET /_search
 
 [[boundary-scanners]]
 ==== Boundary Scanners
 
-When highlighting a field using the fast vector highlighter, you can specify
-how to break the highlighted fragments using `boundary_scanner`, which accepts
+When highlighting a field using the unified highlighter or the fast vector highlighter,
+you can specify how to break the highlighted fragments using `boundary_scanner`, which accepts
 the following values:
 
-* `chars` (default): allows to configure which characters (`boundary_chars`)
+* `chars` (default mode for the FVH): lets you configure which characters (`boundary_chars`)
 constitute a boundary for highlighting. It's a single string with each boundary
 character defined in it (defaults to `.,!? \t\n`). It also allows configuring the
 `boundary_max_scan` to control how far to look for boundary characters
-(defaults to `20`).
+(defaults to `20`). Works only with the Fast Vector Highlighter.
 
-* `word` and `sentence`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator]
-to break the highlighted fragments at the next _word_ or _sentence_ boundary.
+* `sentence` and `word`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator]
+to break the highlighted fragments at the next _sentence_ or _word_ boundary.
 You can further specify `boundary_scanner_locale` to control which Locale is used
 to search the text for these boundaries.
 
+[NOTE]
+When used with the `unified` highlighter, the `sentence` scanner splits sentences
+bigger than `fragment_size` at the first word boundary after `fragment_size` is reached.
+You can set `fragment_size` to 0 to never split any sentence.
+
 [[matched-fields]]
 ==== Matched Fields
 
 The Fast Vector Highlighter can combine matches on multiple fields to

From 3d447a039380fedaca7b62880d0afb7101767cd0 Mon Sep 17 00:00:00 2001
From: Jim Ferenczi
Date: Fri, 17 Mar 2017 16:59:01 +0100
Subject: [PATCH 2/2] Address review comments and fix styles

---
 .../BoundedBreakIteratorScanner.java          | 62 +++++++++++++------
 .../uhighlight/CustomFieldHighlighter.java    | 36 ++++++++---
 .../BoundedBreakIteratorScannerTests.java     | 51 +++++++++++----
 3 files changed, 109 insertions(+), 40 deletions(-)

diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java b/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
index 6aeb1b8ab7e20..1cd5fb9340d7f 100644
--- a/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
+++ b/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
@@ -23,12 +23,15 @@
 import java.util.Locale;
 
 /**
- * A custom break iterator that scans text to find break-delimited passages bounded by a provided {@code maxLen}.
- * This scanner uses two level of {@link BreakIterator} to bound passage size "close" to {@maxLen}.
- * This is useful to "bound" {@link BreakIterator}s like `sentence` that can create big outliers on
- * semi-structured text.
+ * A custom break iterator that scans text to find break-delimited passages bounded by
+ * a provided maximum length. This class delegates the boundary search to a first level
+ * break iterator. When this break iterator finds a passage greater than the maximum length,
+ * a secondary break iterator is used to re-split the passage at the first boundary after
+ * the maximum length.
+ * This is useful to split passages created by {@link BreakIterator}s like `sentence` that
+ * can create big outliers on semi-structured text.
  *
- * @warning This break iterator is designed to work with the {@link UnifiedHighlighter} only.
+ * WARNING: This break iterator is designed to work with the {@link UnifiedHighlighter}.
  **/
 public class BoundedBreakIteratorScanner extends BreakIterator {
     private final BreakIterator mainBreak;
@@ -39,9 +42,11 @@ public class BoundedBreakIteratorScanner extends BreakIterator {
     private int windowStart = -1;
     private int windowEnd = -1;
     private int innerStart = -1;
-    private int innerEnd = -1;
+    private int innerEnd = 0;
 
-    private BoundedBreakIteratorScanner(BreakIterator mainBreak, BreakIterator innerBreak, int maxLen) {
+    private BoundedBreakIteratorScanner(BreakIterator mainBreak,
+                                        BreakIterator innerBreak,
+                                        int maxLen) {
         this.mainBreak = mainBreak;
         this.innerBreak = innerBreak;
         this.maxLen = maxLen;
@@ -71,12 +76,18 @@ private void reset() {
         windowStart = -1;
         windowEnd = -1;
         innerStart = -1;
-        innerEnd = -1;
+        innerEnd = 0;
     }
 
+    /**
+     * Must be called with increasing offsets. See {@link FieldHighlighter} for usage.
+     */
     @Override
     public int preceding(int offset) {
-        assert(offset > lastPrecedingOffset);
+        if (offset < lastPrecedingOffset) {
+            throw new IllegalArgumentException("offset < lastPrecedingOffset: " +
+                "usage doesn't look like UnifiedHighlighter");
+        }
         if (offset > windowStart && offset < windowEnd) {
             innerStart = innerEnd;
             innerEnd = windowEnd;
@@ -89,26 +100,35 @@ public int preceding(int offset) {
             // the current split is too big,
             // so starting from the current term we try to find boundaries on the left first
             if (offset - maxLen > innerStart) {
-                innerStart = Math.max(innerStart, innerBreak.preceding(offset - maxLen));
+                innerStart = Math.max(innerStart,
+                    innerBreak.preceding(offset - maxLen));
             }
             // and then we try to expand the passage to the right with the remaining size
             int remaining = Math.max(0, maxLen - (offset - innerStart));
             if (offset + remaining < windowEnd) {
-                innerEnd = Math.min(windowEnd, innerBreak.following(offset + remaining));
+                innerEnd = Math.min(windowEnd,
+                    innerBreak.following(offset + remaining));
             }
         }
         lastPrecedingOffset = offset - 1;
         return innerStart;
     }
 
+    /**
+     * Can be invoked only after a call to preceding(offset+1).
+     * See {@link FieldHighlighter} for usage.
+     */
     @Override
     public int following(int offset) {
-        assert(offset == lastPrecedingOffset && innerEnd != -1);
+        if (offset != lastPrecedingOffset || innerEnd == -1) {
+            throw new IllegalArgumentException("offset != lastPrecedingOffset: " +
+                "usage doesn't look like UnifiedHighlighter");
+        }
        return innerEnd;
     }
 
     /**
-     * Returns a {@link BreakIterator#getSentenceInstance(Locale)} bounded to {@code maxLen}.
+     * Returns a {@link BreakIterator#getSentenceInstance(Locale)} bounded to maxLen.
      * Secondary boundaries are found using a {@link BreakIterator#getWordInstance(Locale)}.
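+     *
+     * A minimal usage sketch (the text and offsets are made up; as documented on
+     * following(), each following(offset) call must come right after a call to
+     * preceding(offset+1)):
+     * <pre>
+     *   BreakIterator bi = BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 100);
+     *   bi.setText(content);
+     *   int start = bi.preceding(matchOffset + 1); // start of the bounded passage
+     *   int end = bi.following(matchOffset);       // end of the same passage
+     * </pre>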
*/ public static BreakIterator getSentence(Locale locale, int maxLen) { @@ -117,14 +137,16 @@ public static BreakIterator getSentence(Locale locale, int maxLen) { return new BoundedBreakIteratorScanner(sBreak, wBreak, maxLen); } + @Override - public int first() { - return 0; + public int current() { + // Returns the last offset of the current split + return this.innerEnd; } @Override - public int current() { - return 0; + public int first() { + throw new IllegalStateException("first() should not be called in this context"); } @Override @@ -134,16 +156,16 @@ public int next() { @Override public int last() { - throw new IllegalStateException("last should not be called in this context"); + throw new IllegalStateException("last() should not be called in this context"); } @Override public int next(int n) { - throw new IllegalStateException("next(n) should not be call"); + throw new IllegalStateException("next(n) should not be called in this context"); } @Override public int previous() { - throw new IllegalStateException("previous should not be call"); + throw new IllegalStateException("previous() should not be called in this context"); } } diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java index 9515648355f04..915e7cc153128 100644 --- a/core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java +++ b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java @@ -1,3 +1,22 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + package org.apache.lucene.search.uhighlight; import java.text.BreakIterator; @@ -16,11 +35,12 @@ class CustomFieldHighlighter extends FieldHighlighter { private final int noMatchSize; private final String fieldValue; - public CustomFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy, - Locale breakIteratorLocale, BreakIterator breakIterator, - PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages, - PassageFormatter passageFormatter, int noMatchSize, String fieldValue) { - super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages, maxNoHighlightPassages, passageFormatter); + CustomFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy, + Locale breakIteratorLocale, BreakIterator breakIterator, + PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages, + PassageFormatter passageFormatter, int noMatchSize, String fieldValue) { + super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages, + maxNoHighlightPassages, passageFormatter); this.breakIteratorLocale = breakIteratorLocale; this.noMatchSize = noMatchSize; this.fieldValue = fieldValue; @@ -38,18 +58,18 @@ protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) { if (end == -1) { end = fieldValue.length(); } - if (noMatchSize < end) { + if (noMatchSize+pos < end) { BreakIterator bi = BreakIterator.getWordInstance(breakIteratorLocale); bi.setText(fieldValue); // Finds the next word boundary **after** noMatchSize. - end = bi.following(noMatchSize); + end = bi.following(noMatchSize + pos); if (end == BreakIterator.DONE) { end = fieldValue.length(); } } Passage passage = new Passage(); passage.setScore(Float.NaN); - passage.setStartOffset(0); + passage.setStartOffset(pos); passage.setEndOffset(end); return new Passage[]{passage}; } diff --git a/core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java b/core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java index 2c41eb3319fd2..0cf62e8ce6c42 100644 --- a/core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java +++ b/core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java @@ -1,3 +1,22 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.lucene.search.uhighlight; import org.elasticsearch.test.ESTestCase; @@ -13,9 +32,14 @@ import static org.hamcrest.Matchers.lessThanOrEqualTo; public class BoundedBreakIteratorScannerTests extends ESTestCase { + private static final String[] WORD_BOUNDARIES = + new String[] { " ", " ", "\t", "#", "\n" }; + private static final String[] SENTENCE_BOUNDARIES = + new String[] { "! ", "? ", ". 
", ".\n", ".\n\n" }; + private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) { // Generate a random set of unique terms with ascii character - int maxSize = randomIntBetween(5, 20); + int maxSize = randomIntBetween(5, 100); String[] vocabulary = new String[maxSize]; for (int i = 0; i < maxSize; i++) { if (rarely()) { @@ -25,7 +49,8 @@ private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) { } } - // Generate a random text made of random terms separated with word-boundaries and sentence-boundaries. + // Generate a random text made of random terms separated with word-boundaries + // and sentence-boundaries. StringBuilder text = new StringBuilder(); List offsetList = new ArrayList<> (); List sizeList = new ArrayList<> (); @@ -37,13 +62,12 @@ private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) { int numTerms = randomIntBetween(5, 10); for (int j = 0; j < numTerms; j++) { int termId = randomIntBetween(0, vocabulary.length - 1); - String term = vocabulary[termId].toLowerCase(); + String term = vocabulary[termId].toLowerCase(Locale.ROOT); if (j == 0) { // capitalize the first letter of the first term in the sentence - term = term.substring(0, 1).toUpperCase() + term.substring(1); + term = term.substring(0, 1).toUpperCase(Locale.ROOT) + term.substring(1); } else { - // word boundary - String sep = randomFrom(" ", " ", "\t", "#", "\n"); + String sep = randomFrom(WORD_BOUNDARIES); text.append(sep); } maxTermLen = Math.max(term.length(), maxTermLen); @@ -51,8 +75,7 @@ private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) { sizeList.add(term.length()); text.append(term); } - // sentence boundary - String boundary = randomFrom("! ", "? ", ". ", ".\n", ".\n\n"); + String boundary = randomFrom(SENTENCE_BOUNDARIES); text.append(boundary); } @@ -81,12 +104,14 @@ private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) { int startPos = Arrays.binarySearch(offsets, start); int endPos = Arrays.binarySearch(offsets, end); if (startPos < 0) { - int lastWordEnd = offsets[Math.abs(startPos)-2] + sizes[Math.abs(startPos)-2]; + int lastWordEnd = + offsets[Math.abs(startPos)-2] + sizes[Math.abs(startPos)-2]; assertThat(start, greaterThanOrEqualTo(lastWordEnd)); } if (endPos < 0) { if (Math.abs(endPos)-2 < offsets.length) { - int lastWordEnd = offsets[Math.abs(endPos) - 2] + sizes[Math.abs(endPos) - 2]; + int lastWordEnd = + offsets[Math.abs(endPos) - 2] + sizes[Math.abs(endPos) - 2]; assertThat(end, greaterThanOrEqualTo(lastWordEnd)); } // advance the position to the end of the current passage @@ -104,8 +129,10 @@ private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) { public void testBoundedSentence() { for (int i = 0; i < 20; i++) { int maxLen = randomIntBetween(10, 500); - testRandomAsciiTextCase(BoundedBreakIteratorScanner.getSentence(Locale.ROOT, maxLen), - maxLen); + testRandomAsciiTextCase( + BoundedBreakIteratorScanner.getSentence(Locale.ROOT, maxLen), + maxLen + ); } } }