@@ -0,0 +1,171 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.uhighlight;

import java.text.BreakIterator;
import java.text.CharacterIterator;
import java.util.Locale;

/**
* A custom break iterator that scans text to find break-delimited passages bounded by
* a provided maximum length. This class delegates the boundary search to a first-level
* break iterator. When that break iterator finds a passage longer than the maximum length,
* a secondary break iterator is used to re-split the passage at the first boundary after
* the maximum length.
* This is useful for splitting passages created by {@link BreakIterator}s such as `sentence`,
* which can produce very large outliers on semi-structured text.
*
* WARNING: This break iterator is designed to work with the {@link UnifiedHighlighter}.
**/
public class BoundedBreakIteratorScanner extends BreakIterator {
private final BreakIterator mainBreak;
private final BreakIterator innerBreak;
private final int maxLen;

private int lastPrecedingOffset = -1;
private int windowStart = -1;
private int windowEnd = -1;
private int innerStart = -1;
private int innerEnd = 0;

private BoundedBreakIteratorScanner(BreakIterator mainBreak,
BreakIterator innerBreak,
int maxLen) {
this.mainBreak = mainBreak;
this.innerBreak = innerBreak;
this.maxLen = maxLen;
}

@Override
public CharacterIterator getText() {
return mainBreak.getText();
}

@Override
public void setText(CharacterIterator newText) {
reset();
mainBreak.setText(newText);
innerBreak.setText(newText);
}

@Override
public void setText(String newText) {
reset();
mainBreak.setText(newText);
innerBreak.setText(newText);
}

private void reset() {
lastPrecedingOffset = -1;
windowStart = -1;
windowEnd = -1;
innerStart = -1;
innerEnd = 0;
}

/**
* Must be called with increasing offset. See {@link FieldHighlighter} for usage.
*/
@Override
public int preceding(int offset) {
if (offset < lastPrecedingOffset) {
throw new IllegalArgumentException("offset < lastPrecedingOffset: " +
"usage doesn't look like UnifiedHighlighter");
}
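// The offset is still inside the current main (e.g. sentence) window: start the new
// inner passage where the previous one ended. Otherwise compute a new main window.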
if (offset > windowStart && offset < windowEnd) {
innerStart = innerEnd;
innerEnd = windowEnd;
} else {
windowStart = innerStart = mainBreak.preceding(offset);
windowEnd = innerEnd = mainBreak.following(offset-1);
}

if (innerEnd - innerStart > maxLen) {
// the current split is too big,
// so starting from the current term we try to find boundaries on the left first
if (offset - maxLen > innerStart) {
innerStart = Math.max(innerStart,
innerBreak.preceding(offset - maxLen));
}
// and then we try to expand the passage to the right with the remaining size
int remaining = Math.max(0, maxLen - (offset - innerStart));
if (offset + remaining < windowEnd) {
innerEnd = Math.min(windowEnd,
innerBreak.following(offset + remaining));
}
}
lastPrecedingOffset = offset - 1;
return innerStart;
}

/**
* Can be invoked only after a call to preceding(offset+1).
* See {@link FieldHighlighter} for usage.
*/
@Override
public int following(int offset) {
if (offset != lastPrecedingOffset || innerEnd == -1) {
throw new IllegalArgumentException("offset != lastPrecedingOffset: " +
"usage doesn't look like UnifiedHighlighter");
}
return innerEnd;
}

/**
* Returns a {@link BreakIterator#getSentenceInstance(Locale)} bounded to maxLen.
* Secondary boundaries are found using a {@link BreakIterator#getWordInstance(Locale)}.
*/
public static BreakIterator getSentence(Locale locale, int maxLen) {
final BreakIterator sBreak = BreakIterator.getSentenceInstance(locale);
final BreakIterator wBreak = BreakIterator.getWordInstance(locale);
return new BoundedBreakIteratorScanner(sBreak, wBreak, maxLen);
}


@Override
public int current() {
// Returns the end offset of the current split
return this.innerEnd;
}

@Override
public int first() {
throw new IllegalStateException("first() should not be called in this context");
}

@Override
public int next() {
throw new IllegalStateException("next() should not be called in this context");
}

@Override
public int last() {
throw new IllegalStateException("last() should not be called in this context");
}

@Override
public int next(int n) {
throw new IllegalStateException("next(n) should not be called in this context");
}

@Override
public int previous() {
throw new IllegalStateException("previous() should not be called in this context");
}
}
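For illustration only, a minimal sketch of how this scanner is meant to be driven; the sample text, the 30-character bound, and the match offset are assumptions and not part of this change. It follows the same call order the UnifiedHighlighter's FieldHighlighter uses: preceding(matchStart + 1) first, then following(matchStart).

import java.text.BreakIterator;
import java.util.Locale;

import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;

public class BoundedBreakIteratorScannerSketch {
    public static void main(String[] args) {
        // Hypothetical single-sentence value that is longer than the 30-char bound.
        String text = "Lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor";
        int matchStart = 39; // assumed start offset of the matched term "adipiscing"

        // Sentence passages, re-split at word boundaries once they exceed 30 chars.
        BreakIterator bi = BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 30);
        bi.setText(text);

        // Same call order as FieldHighlighter: preceding(start + 1), then following(start).
        int passageStart = bi.preceding(matchStart + 1);
        int passageEnd = bi.following(matchStart);

        // Prints a bounded passage around the match instead of the whole sentence,
        // e.g. "ipsum dolor sit amet consectetur adipiscing".
        System.out.println(text.substring(passageStart, passageEnd));
    }
}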
@@ -0,0 +1,79 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.lucene.search.uhighlight;

import java.text.BreakIterator;
import java.util.Locale;

import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;

/**
* Custom {@link FieldHighlighter} that creates a single passage bounded to {@code noMatchSize} when
* no highlights were found.
*/
class CustomFieldHighlighter extends FieldHighlighter {
private static final Passage[] EMPTY_PASSAGE = new Passage[0];

private final Locale breakIteratorLocale;
private final int noMatchSize;
private final String fieldValue;

CustomFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy,
Locale breakIteratorLocale, BreakIterator breakIterator,
PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages,
PassageFormatter passageFormatter, int noMatchSize, String fieldValue) {
super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages,
maxNoHighlightPassages, passageFormatter);
this.breakIteratorLocale = breakIteratorLocale;
this.noMatchSize = noMatchSize;
this.fieldValue = fieldValue;
}

@Override
protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) {
if (noMatchSize > 0) {
int pos = 0;
while (pos < fieldValue.length() && fieldValue.charAt(pos) == MULTIVAL_SEP_CHAR) {
pos ++;
}
if (pos < fieldValue.length()) {
int end = fieldValue.indexOf(MULTIVAL_SEP_CHAR, pos);
if (end == -1) {
end = fieldValue.length();
}
if (noMatchSize+pos < end) {
BreakIterator bi = BreakIterator.getWordInstance(breakIteratorLocale);
bi.setText(fieldValue);
// Finds the first word boundary after noMatchSize + pos so the snippet is not cut mid-word.
end = bi.following(noMatchSize + pos);
if (end == BreakIterator.DONE) {
end = fieldValue.length();
}
}
Passage passage = new Passage();
passage.setScore(Float.NaN);
passage.setStartOffset(pos);
passage.setEndOffset(end);
return new Passage[]{passage};
}
}
return EMPTY_PASSAGE;
}
}
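The key detail above is that the no-match passage is not cut in the middle of a word: the end offset is snapped to the first word boundary after noMatchSize. A standalone sketch of that snapping logic, with an assumed field value and an assumed noMatchSize of 20:

import java.text.BreakIterator;
import java.util.Locale;

public class NoMatchSnippetSketch {
    public static void main(String[] args) {
        String fieldValue = "The quick brown fox jumps over the lazy dog";
        int noMatchSize = 20; // assumed highlighter setting

        int end = fieldValue.length();
        if (noMatchSize < end) {
            BreakIterator bi = BreakIterator.getWordInstance(Locale.ROOT);
            bi.setText(fieldValue);
            // Snap the cut-off to the first word boundary after noMatchSize.
            end = bi.following(noMatchSize);
            if (end == BreakIterator.DONE) {
                end = fieldValue.length();
            }
        }
        // Prints "The quick brown fox jumps" (25 chars) rather than a mid-word cut at 20.
        System.out.println(fieldValue.substring(0, end));
    }
}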
@@ -33,6 +33,8 @@
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.lucene.all.AllTermQuery;
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
@@ -47,6 +49,7 @@
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

/**
* Subclass of the {@link UnifiedHighlighter} that works for a single field in a single document.
@@ -57,37 +60,41 @@
* Supports both returning empty snippets and non-highlighted snippets when no highlighting can be performed.
*/
public class CustomUnifiedHighlighter extends UnifiedHighlighter {
public static final char MULTIVAL_SEP_CHAR = (char) 0;
private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];

private final String fieldValue;
private final PassageFormatter passageFormatter;
private final BreakIterator breakIterator;
private final boolean returnNonHighlightedSnippets;
private final Locale breakIteratorLocale;
private final int noMatchSize;

/**
* Creates a new instance of {@link CustomUnifiedHighlighter}
*
* @param analyzer the analyzer used for the field at index time, used for multi term queries internally
* @param passageFormatter our own {@link CustomPassageFormatter}
which generates snippets in the form of {@link Snippet} objects
which generates snippets in the form of {@link Snippet} objects
* @param breakIteratorLocale the {@link Locale} to use for dividing text into passages.
* If null {@link Locale#ROOT} is used
* @param breakIterator the {@link BreakIterator} to use for dividing text into passages.
* If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
* @param fieldValue the original field values as constructor argument, loaded from the _source field or
* the relevant stored field.
* @param returnNonHighlightedSnippets whether non highlighted snippets should be
* returned rather than empty snippets when no highlighting can be performed
* If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
* @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR
* @param noMatchSize The size of the text that should be returned when no highlighting can be performed
*/
public CustomUnifiedHighlighter(IndexSearcher searcher,
Analyzer analyzer,
PassageFormatter passageFormatter,
@Nullable Locale breakIteratorLocale,
@Nullable BreakIterator breakIterator,
String fieldValue,
boolean returnNonHighlightedSnippets) {
int noMatchSize) {
super(searcher, analyzer);
this.breakIterator = breakIterator;
this.breakIteratorLocale = breakIteratorLocale == null ? Locale.ROOT : breakIteratorLocale;
this.passageFormatter = passageFormatter;
this.fieldValue = fieldValue;
this.returnNonHighlightedSnippets = returnNonHighlightedSnippets;
this.noMatchSize = noMatchSize;
}

/**
@@ -111,16 +118,13 @@ public Snippet[] highlightField(String field, Query query, int docId, int maxPas
@Override
protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
int cacheCharsThreshold) throws IOException {
//we only highlight one field, one document at a time
// we only highlight one field, one document at a time
return Collections.singletonList(new String[]{fieldValue});
}

@Override
protected BreakIterator getBreakIterator(String field) {
if (breakIterator != null) {
return breakIterator;
}
return super.getBreakIterator(field);
return breakIterator;
}

@Override
@@ -129,11 +133,18 @@ protected PassageFormatter getFormatter(String field) {
}

@Override
protected int getMaxNoHighlightPassages(String field) {
if (returnNonHighlightedSnippets) {
return 1;
}
return 0;
protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms);
Set<HighlightFlag> highlightFlags = getFlags(field);
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
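// Wrap the per-field break iterator so passages never span the multi-value separator.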
BreakIterator breakIterator = new SplittingBreakIterator(getBreakIterator(field),
UnifiedHighlighter.MULTIVAL_SEP_CHAR);
FieldOffsetStrategy strategy =
getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
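// A noMatchSize > 0 enables a single fallback passage when nothing can be highlighted.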
return new CustomFieldHighlighter(field, strategy, breakIteratorLocale, breakIterator,
getScorer(field), maxPassages, (noMatchSize > 0 ? 1 : 0), getFormatter(field), noMatchSize, fieldValue);
}

@Override
@@ -146,7 +157,6 @@ protected Collection<Query> preSpanQueryRewrite(Query query) {
return rewriteCustomQuery(query);
}


/**
* Translate custom queries in queries that are supported by the unified highlighter.
*/
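Taken together, a hedged sketch of how the new constructor and highlightField fit with BoundedBreakIteratorScanner and CustomFieldHighlighter. The field name, tags, sizes, and the CustomPassageFormatter/DefaultEncoder arguments are assumptions for illustration; only the signatures visible in this diff come from the change.

import java.io.IOException;
import java.util.Locale;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;
import org.apache.lucene.search.uhighlight.CustomPassageFormatter;
import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
import org.apache.lucene.search.uhighlight.Snippet;

class HighlightWiringSketch {
    // Joins the stored values with MULTIVAL_SEP_CHAR and highlights one field of one document.
    static Snippet[] highlight(IndexSearcher searcher, Analyzer analyzer, Query query,
                               int docId, String[] values) throws IOException {
        String fieldValue = String.join(
            String.valueOf(CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR), values);
        CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
            searcher, analyzer,
            new CustomPassageFormatter("<em>", "</em>", new DefaultEncoder()), // assumed formatter args
            Locale.ROOT,
            BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 100), // bounded sentence passages
            fieldValue,
            100 /* noMatchSize */);
        return highlighter.highlightField("body", query, docId, 5 /* maxPassages */);
    }
}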