From 792160af87b403b4a493e653e5377a919a490395 Mon Sep 17 00:00:00 2001
From: Jim Ferenczi
Date: Wed, 1 Mar 2017 13:31:50 +0100
Subject: [PATCH 1/2] Add support for fragment_length in the unified highlighter

This commit introduces a new break iterator (a BoundedBreakIteratorScanner) designed for the
unified highlighter that can limit the size of fragments produced by a generic break iterator
like `sentence`.
The `unified` highlighter now supports `boundary_scanner`, which can be set to `words` or
`sentence`. The `sentence` mode uses the bounded break iterator in order to limit the size of
each sentence to `fragment_length`. When a sentence is bigger than `fragment_length`, this mode
breaks it at the next word boundary **after** `fragment_length` is reached.
---
 .../BoundedBreakIteratorScanner.java          | 149 +++++++++
 .../uhighlight/CustomFieldHighlighter.java    |  59 ++++
 .../uhighlight/CustomUnifiedHighlighter.java  |  50 +--
 .../highlight/FastVectorHighlighter.java      | 110 +++---
 .../subphase/highlight/HighlightBuilder.java  |   2 +-
 .../highlight/UnifiedHighlighter.java         |  44 ++-
 .../BoundedBreakIteratorScannerTests.java     | 111 +++++++
 .../CustomUnifiedHighlighterTests.java        | 314 ++++++++----------
 .../highlight/HighlighterSearchIT.java        | 167 ++++++----
 .../search/request/highlighting.asciidoc     |  22 +-
 10 files changed, 692 insertions(+), 336 deletions(-)
 create mode 100644 core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
 create mode 100644 core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java
 create mode 100644 core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java

diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java b/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
new file mode 100644
index 0000000000000..6aeb1b8ab7e20
--- /dev/null
+++ b/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.text.BreakIterator;
+import java.text.CharacterIterator;
+import java.util.Locale;
+
+/**
+ * A custom break iterator that scans text to find break-delimited passages bounded by a provided {@code maxLen}.
+ * This scanner uses two levels of {@link BreakIterator} to bound the passage size "close" to {@code maxLen}.
+ * This is useful to "bound" {@link BreakIterator}s like `sentence` that can create big outliers on
+ * semi-structured text.
+ *
+ * @warning This break iterator is designed to work with the {@link UnifiedHighlighter} only.
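+ *
+ * For example (illustrative numbers, not prescribed by this patch): with {@code maxLen = 40},
+ * a single 120-character sentence is not returned as one oversized passage; instead
+ * {@link #preceding(int)} and {@link #following(int)} cut the passage at the first word
+ * boundary found once the 40-character budget around the first matching term is exhausted.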
+ **/ +public class BoundedBreakIteratorScanner extends BreakIterator { + private final BreakIterator mainBreak; + private final BreakIterator innerBreak; + private final int maxLen; + + private int lastPrecedingOffset = -1; + private int windowStart = -1; + private int windowEnd = -1; + private int innerStart = -1; + private int innerEnd = -1; + + private BoundedBreakIteratorScanner(BreakIterator mainBreak, BreakIterator innerBreak, int maxLen) { + this.mainBreak = mainBreak; + this.innerBreak = innerBreak; + this.maxLen = maxLen; + } + + @Override + public CharacterIterator getText() { + return mainBreak.getText(); + } + + @Override + public void setText(CharacterIterator newText) { + reset(); + mainBreak.setText(newText); + innerBreak.setText(newText); + } + + @Override + public void setText(String newText) { + reset(); + mainBreak.setText(newText); + innerBreak.setText(newText); + } + + private void reset() { + lastPrecedingOffset = -1; + windowStart = -1; + windowEnd = -1; + innerStart = -1; + innerEnd = -1; + } + + @Override + public int preceding(int offset) { + assert(offset > lastPrecedingOffset); + if (offset > windowStart && offset < windowEnd) { + innerStart = innerEnd; + innerEnd = windowEnd; + } else { + windowStart = innerStart = mainBreak.preceding(offset); + windowEnd = innerEnd = mainBreak.following(offset-1); + } + + if (innerEnd - innerStart > maxLen) { + // the current split is too big, + // so starting from the current term we try to find boundaries on the left first + if (offset - maxLen > innerStart) { + innerStart = Math.max(innerStart, innerBreak.preceding(offset - maxLen)); + } + // and then we try to expand the passage to the right with the remaining size + int remaining = Math.max(0, maxLen - (offset - innerStart)); + if (offset + remaining < windowEnd) { + innerEnd = Math.min(windowEnd, innerBreak.following(offset + remaining)); + } + } + lastPrecedingOffset = offset - 1; + return innerStart; + } + + @Override + public int following(int offset) { + assert(offset == lastPrecedingOffset && innerEnd != -1); + return innerEnd; + } + + /** + * Returns a {@link BreakIterator#getSentenceInstance(Locale)} bounded to {@code maxLen}. + * Secondary boundaries are found using a {@link BreakIterator#getWordInstance(Locale)}. 
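+     * For instance (a usage sketch, shown for illustration only),
+     * {@code getSentence(Locale.ROOT, 100)} returns a sentence iterator whose passages stay
+     * close to 100 characters on semi-structured text.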
+ */ + public static BreakIterator getSentence(Locale locale, int maxLen) { + final BreakIterator sBreak = BreakIterator.getSentenceInstance(locale); + final BreakIterator wBreak = BreakIterator.getWordInstance(locale); + return new BoundedBreakIteratorScanner(sBreak, wBreak, maxLen); + } + + @Override + public int first() { + return 0; + } + + @Override + public int current() { + return 0; + } + + @Override + public int next() { + throw new IllegalStateException("next() should not be called in this context"); + } + + @Override + public int last() { + throw new IllegalStateException("last should not be called in this context"); + } + + @Override + public int next(int n) { + throw new IllegalStateException("next(n) should not be call"); + } + + @Override + public int previous() { + throw new IllegalStateException("previous should not be call"); + } +} diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java new file mode 100644 index 0000000000000..9515648355f04 --- /dev/null +++ b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java @@ -0,0 +1,59 @@ +package org.apache.lucene.search.uhighlight; + +import java.text.BreakIterator; +import java.util.Locale; + +import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR; + +/** + * Custom {@link FieldHighlighter} that creates a single passage bounded to {@code noMatchSize} when + * no highlights were found. + */ +class CustomFieldHighlighter extends FieldHighlighter { + private static final Passage[] EMPTY_PASSAGE = new Passage[0]; + + private final Locale breakIteratorLocale; + private final int noMatchSize; + private final String fieldValue; + + public CustomFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy, + Locale breakIteratorLocale, BreakIterator breakIterator, + PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages, + PassageFormatter passageFormatter, int noMatchSize, String fieldValue) { + super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages, maxNoHighlightPassages, passageFormatter); + this.breakIteratorLocale = breakIteratorLocale; + this.noMatchSize = noMatchSize; + this.fieldValue = fieldValue; + } + + @Override + protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) { + if (noMatchSize > 0) { + int pos = 0; + while (pos < fieldValue.length() && fieldValue.charAt(pos) == MULTIVAL_SEP_CHAR) { + pos ++; + } + if (pos < fieldValue.length()) { + int end = fieldValue.indexOf(MULTIVAL_SEP_CHAR, pos); + if (end == -1) { + end = fieldValue.length(); + } + if (noMatchSize < end) { + BreakIterator bi = BreakIterator.getWordInstance(breakIteratorLocale); + bi.setText(fieldValue); + // Finds the next word boundary **after** noMatchSize. 
+                    end = bi.following(noMatchSize);
+                    if (end == BreakIterator.DONE) {
+                        end = fieldValue.length();
+                    }
+                }
+                Passage passage = new Passage();
+                passage.setScore(Float.NaN);
+                passage.setStartOffset(0);
+                passage.setEndOffset(end);
+                return new Passage[]{passage};
+            }
+        }
+        return EMPTY_PASSAGE;
+    }
+}
diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java
index 4f1ec5fdb83a4..4a20fb0478f9a 100644
--- a/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java
+++ b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java
@@ -33,6 +33,8 @@
 import org.apache.lucene.search.spans.SpanOrQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.lucene.all.AllTermQuery;
 import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
@@ -47,6 +49,7 @@
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Set;
 
 /**
  * Subclass of the {@link UnifiedHighlighter} that works for a single field in a single document.
@@ -57,37 +60,41 @@
  * Supports both returning empty snippets and non highlighted snippets when no highlighting can be performed.
  */
 public class CustomUnifiedHighlighter extends UnifiedHighlighter {
+    public static final char MULTIVAL_SEP_CHAR = (char) 0;
     private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];
 
     private final String fieldValue;
     private final PassageFormatter passageFormatter;
     private final BreakIterator breakIterator;
-    private final boolean returnNonHighlightedSnippets;
+    private final Locale breakIteratorLocale;
+    private final int noMatchSize;
 
     /**
      * Creates a new instance of {@link CustomUnifiedHighlighter}
      *
      * @param analyzer the analyzer used for the field at index time, used for multi term queries internally
      * @param passageFormatter our own {@link CustomPassageFormatter}
-     *                    which generates snippets in forms of {@link Snippet} objects
+     *                         which generates snippets in the form of {@link Snippet} objects
+     * @param breakIteratorLocale the {@link Locale} to use for dividing text into passages.
+     *                            If null, {@link Locale#ROOT} is used
      * @param breakIterator the {@link BreakIterator} to use for dividing text into passages.
-     *                    If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
-     * @param fieldValue the original field values as constructor argument, loaded from the _source field or
-     *                    the relevant stored field.
-     * @param returnNonHighlightedSnippets whether non highlighted snippets should be
-     *                    returned rather than empty snippets when no highlighting can be performed
+     *                      If null, {@link BreakIterator#getSentenceInstance(Locale)} is used.
+     * @param fieldValue the original field values delimited by {@code MULTIVAL_SEP_CHAR}
+     * @param noMatchSize the size of the text that should be returned when no highlighting can be performed
     */
     public CustomUnifiedHighlighter(IndexSearcher searcher, Analyzer analyzer,
                                     PassageFormatter passageFormatter,
+                                    @Nullable Locale breakIteratorLocale,
                                     @Nullable BreakIterator breakIterator,
                                     String fieldValue,
-                                    boolean returnNonHighlightedSnippets) {
+                                    int noMatchSize) {
         super(searcher, analyzer);
         this.breakIterator = breakIterator;
+        this.breakIteratorLocale = breakIteratorLocale == null ?
+            Locale.ROOT : breakIteratorLocale;
         this.passageFormatter = passageFormatter;
         this.fieldValue = fieldValue;
-        this.returnNonHighlightedSnippets = returnNonHighlightedSnippets;
+        this.noMatchSize = noMatchSize;
     }
 
     /**
@@ -111,16 +118,13 @@ public Snippet[] highlightField(String field, Query query, int docId, int maxPas
     @Override
     protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter, int cacheCharsThreshold) throws IOException {
-        //we only highlight one field, one document at a time
+        // we only highlight one field, one document at a time
         return Collections.singletonList(new String[]{fieldValue});
     }
 
     @Override
     protected BreakIterator getBreakIterator(String field) {
-        if (breakIterator != null) {
-            return breakIterator;
-        }
-        return super.getBreakIterator(field);
+        return breakIterator;
     }
 
     @Override
@@ -129,11 +133,18 @@ protected PassageFormatter getFormatter(String field) {
     }
 
     @Override
-    protected int getMaxNoHighlightPassages(String field) {
-        if (returnNonHighlightedSnippets) {
-            return 1;
-        }
-        return 0;
+    protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
+        BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms);
+        Set<HighlightFlag> highlightFlags = getFlags(field);
+        PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
+        CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
+        OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
+        BreakIterator breakIterator = new SplittingBreakIterator(getBreakIterator(field),
+            UnifiedHighlighter.MULTIVAL_SEP_CHAR);
+        FieldOffsetStrategy strategy =
+            getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
+        return new CustomFieldHighlighter(field, strategy, breakIteratorLocale, breakIterator,
+            getScorer(field), maxPassages, (noMatchSize > 0 ? 1 : 0), getFormatter(field), noMatchSize, fieldValue);
     }
 
     @Override
@@ -146,7 +157,6 @@ protected Collection<Query> preSpanQueryRewrite(Query query) {
         return rewriteCustomQuery(query);
     }
 
-
     /**
      * Translate custom queries in queries that are supported by the unified highlighter.
      */
diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java
index 0c15f1106b19c..b1d557e851a6c 100644
--- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java
+++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java
@@ -52,13 +52,14 @@ public class FastVectorHighlighter implements Highlighter {
 
     private static final BoundaryScanner DEFAULT_SIMPLE_BOUNDARY_SCANNER = new SimpleBoundaryScanner();
-    private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
-        BreakIterator.getSentenceInstance(Locale.ROOT));
-    private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
-        BreakIterator.getWordInstance(Locale.ROOT));
+    private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER =
+        new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(Locale.ROOT));
+    private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER =
+        new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(Locale.ROOT));
+
+    public static final Setting<Boolean> SETTING_TV_HIGHLIGHT_MULTI_VALUE =
+        Setting.boolSetting("search.highlight.term_vector_multi_value", true, Setting.Property.NodeScope);
 
-    public static final Setting<Boolean> SETTING_TV_HIGHLIGHT_MULTI_VALUE = Setting.boolSetting("search.highlight.term_vector_multi_value",
-        true, Setting.Property.NodeScope);
     private static final String CACHE_KEY = "highlight-fsv";
 
     private final Boolean termVectorMultiValue;
@@ -74,11 +75,12 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
         FieldMapper mapper = highlighterContext.mapper;
 
         if (canHighlight(mapper) == false) {
-            throw new IllegalArgumentException("the field [" + highlighterContext.fieldName
-                + "] should be indexed with term vector with position offsets to be used with fast vector highlighter");
+            throw new IllegalArgumentException("the field [" + highlighterContext.fieldName +
+                "] should be indexed with term vector with position offsets to be used with fast vector highlighter");
         }
 
-        Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
+        Encoder encoder = field.fieldOptions().encoder().equals("html") ?
+            HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
 
         if (!hitContext.cache().containsKey(CACHE_KEY)) {
             hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
@@ -90,21 +92,21 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
             if (field.fieldOptions().requireFieldMatch()) {
                 if (cache.fieldMatchFieldQuery == null) {
                     /*
-                     * we use top level reader to rewrite the query against all readers, with use caching it across hits (and across
-                     * readers...)
+                     * we use top level reader to rewrite the query against all readers,
+                     * while caching it across hits (and across readers...)
                     */
-                    cache.fieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query, hitContext.topLevelReader(),
-                        true, field.fieldOptions().requireFieldMatch());
+                    cache.fieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query,
+                        hitContext.topLevelReader(), true, field.fieldOptions().requireFieldMatch());
                 }
                 fieldQuery = cache.fieldMatchFieldQuery;
             } else {
                 if (cache.noFieldMatchFieldQuery == null) {
                     /*
-                     * we use top level reader to rewrite the query against all readers, with use caching it across hits (and across
-                     * readers...)
+                     * we use top level reader to rewrite the query against all readers,
+                     * while caching it across hits (and across readers...)
                     */
-                    cache.noFieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query, hitContext.topLevelReader(),
-                        true, field.fieldOptions().requireFieldMatch());
+                    cache.noFieldMatchFieldQuery = new CustomFieldQuery(highlighterContext.query,
+                        hitContext.topLevelReader(), true, field.fieldOptions().requireFieldMatch());
                 }
                 fieldQuery = cache.noFieldMatchFieldQuery;
             }
@@ -128,7 +130,7 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
                 }
             } else {
                 fragListBuilder = field.fieldOptions().fragmentOffset() == -1 ?
-                    new SimpleFragListBuilder() : new SimpleFragListBuilder(field.fieldOptions().fragmentOffset());
+                        new SimpleFragListBuilder() : new SimpleFragListBuilder(field.fieldOptions().fragmentOffset());
                 if (field.fieldOptions().scoreOrdered()) {
                     if (!forceSource && mapper.fieldType().stored()) {
                         fragmentsBuilder = new ScoreOrderFragmentsBuilder(field.fieldOptions().preTags(),
@@ -142,7 +144,8 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
                         fragmentsBuilder = new SimpleFragmentsBuilder(mapper, field.fieldOptions().preTags(),
                             field.fieldOptions().postTags(), boundaryScanner);
                     } else {
-                        fragmentsBuilder = new SourceSimpleFragmentsBuilder(mapper, context, field.fieldOptions().preTags(),
+                        fragmentsBuilder =
+                            new SourceSimpleFragmentsBuilder(mapper, context, field.fieldOptions().preTags(),
                                 field.fieldOptions().postTags(), boundaryScanner);
                     }
                 }
@@ -153,8 +156,8 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
             entry.fragmentsBuilder = fragmentsBuilder;
             if (cache.fvh == null) {
                 // parameters to FVH are not requires since:
-                // first two booleans are not relevant since they are set on the CustomFieldQuery (phrase and fieldMatch)
-                // fragment builders are used explicitly
+                // first two booleans are not relevant since they are set on the CustomFieldQuery
+                // (phrase and fieldMatch); fragment builders are used explicitly
                 cache.fvh = new org.apache.lucene.search.vectorhighlight.FastVectorHighlighter();
             }
             CustomFieldQuery.highlightFilters.set(field.fieldOptions().highlightFilter());
@@ -172,13 +175,14 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
             // we highlight against the low level reader and docId, because if we load source, we want to reuse it if possible
             // Only send matched fields if they were requested to save time.
if (field.fieldOptions().matchedFields() != null && !field.fieldOptions().matchedFields().isEmpty()) { - fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.fieldType().name(), - field.fieldOptions().matchedFields(), fragmentCharSize, numberOfFragments, entry.fragListBuilder, - entry.fragmentsBuilder, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder); + fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), + mapper.fieldType().name(), field.fieldOptions().matchedFields(), fragmentCharSize, + numberOfFragments, entry.fragListBuilder, entry.fragmentsBuilder, field.fieldOptions().preTags(), + field.fieldOptions().postTags(), encoder); } else { - fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.fieldType().name(), - fragmentCharSize, numberOfFragments, entry.fragListBuilder, entry.fragmentsBuilder, field.fieldOptions().preTags(), - field.fieldOptions().postTags(), encoder); + fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), + mapper.fieldType().name(), fragmentCharSize, numberOfFragments, entry.fragListBuilder, + entry.fragmentsBuilder, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder); } if (fragments != null && fragments.length > 0) { @@ -187,11 +191,13 @@ public HighlightField highlight(HighlighterContext highlighterContext) { int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize(); if (noMatchSize > 0) { - // Essentially we just request that a fragment is built from 0 to noMatchSize using the normal fragmentsBuilder + // Essentially we just request that a fragment is built from 0 to noMatchSize using + // the normal fragmentsBuilder FieldFragList fieldFragList = new SimpleFieldFragList(-1 /*ignored*/); fieldFragList.add(0, noMatchSize, Collections.emptyList()); - fragments = entry.fragmentsBuilder.createFragments(hitContext.reader(), hitContext.docId(), mapper.fieldType().name(), - fieldFragList, 1, field.fieldOptions().preTags(), field.fieldOptions().postTags(), encoder); + fragments = entry.fragmentsBuilder.createFragments(hitContext.reader(), hitContext.docId(), + mapper.fieldType().name(), fieldFragList, 1, field.fieldOptions().preTags(), + field.fieldOptions().postTags(), encoder); if (fragments != null && fragments.length > 0) { return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments)); } @@ -200,7 +206,8 @@ public HighlightField highlight(HighlighterContext highlighterContext) { return null; } catch (Exception e) { - throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e); + throw new FetchPhaseExecutionException(context, + "Failed to highlight field [" + highlighterContext.fieldName + "]", e); } } @@ -212,24 +219,31 @@ public boolean canHighlight(FieldMapper fieldMapper) { private static BoundaryScanner getBoundaryScanner(Field field) { final FieldOptions fieldOptions = field.fieldOptions(); - final Locale boundaryScannerLocale = fieldOptions.boundaryScannerLocale(); - switch(fieldOptions.boundaryScannerType()) { - case SENTENCE: - if (boundaryScannerLocale != null) { - return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(boundaryScannerLocale)); - } - return DEFAULT_SENTENCE_BOUNDARY_SCANNER; - case WORD: - if (boundaryScannerLocale != null) { - return new 
BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(boundaryScannerLocale)); - } - return DEFAULT_WORD_BOUNDARY_SCANNER; - default: - if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN + final Locale boundaryScannerLocale = + fieldOptions.boundaryScannerLocale() != null ? fieldOptions.boundaryScannerLocale() : + Locale.ROOT; + final HighlightBuilder.BoundaryScannerType type = + fieldOptions.boundaryScannerType() != null ? fieldOptions.boundaryScannerType() : + HighlightBuilder.BoundaryScannerType.CHARS; + switch(type) { + case SENTENCE: + if (boundaryScannerLocale != null) { + return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(boundaryScannerLocale)); + } + return DEFAULT_SENTENCE_BOUNDARY_SCANNER; + case WORD: + if (boundaryScannerLocale != null) { + return new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(boundaryScannerLocale)); + } + return DEFAULT_WORD_BOUNDARY_SCANNER; + case CHARS: + if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN || fieldOptions.boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) { - return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars()); - } - return DEFAULT_SIMPLE_BOUNDARY_SCANNER; + return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars()); + } + return DEFAULT_SIMPLE_BOUNDARY_SCANNER; + default: + throw new IllegalArgumentException("Invalid boundary scanner type: " + type.toString()); } } diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java index 45b8c612a7610..c7c9c547b511c 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java @@ -95,7 +95,7 @@ public class HighlightBuilder extends AbstractHighlighterBuilder 0); + field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, + field.fieldOptions().noMatchSize()); numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value } else { //using paragraph separator we make sure that each field value holds a discrete passage for highlighting - String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.PARAGRAPH_SEPARATOR); + String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR); + BreakIterator bi = getBreakIterator(field); highlighter = new CustomUnifiedHighlighter(searcher, analyzer, - mapperHighlighterEntry.passageFormatter, null, fieldValue, field.fieldOptions().noMatchSize() > 0); + mapperHighlighterEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), bi, + fieldValue, field.fieldOptions().noMatchSize()); numberOfFragments = field.fieldOptions().numberOfFragments(); } if (field.fieldOptions().requireFieldMatch()) { @@ -144,11 +151,34 @@ public HighlightField highlight(HighlighterContext highlighterContext) { return null; } - static class HighlighterEntry { + private BreakIterator getBreakIterator(SearchContextHighlight.Field field) { + final SearchContextHighlight.FieldOptions fieldOptions = field.fieldOptions(); + final Locale locale = + fieldOptions.boundaryScannerLocale() != null ? fieldOptions.boundaryScannerLocale() : + Locale.ROOT; + final HighlightBuilder.BoundaryScannerType type = + fieldOptions.boundaryScannerType() != null ? 
fieldOptions.boundaryScannerType() : + HighlightBuilder.BoundaryScannerType.SENTENCE; + int maxLen = fieldOptions.fragmentCharSize(); + switch (type) { + case SENTENCE: + if (maxLen > 0) { + return BoundedBreakIteratorScanner.getSentence(locale, maxLen); + } + return BreakIterator.getSentenceInstance(locale); + case WORD: + // ignore maxLen + return BreakIterator.getWordInstance(locale); + default: + throw new IllegalArgumentException("Invalid boundary scanner type: " + type.toString()); + } + } + + private static class HighlighterEntry { Map mappers = new HashMap<>(); } - static class MapperHighlighterEntry { + private static class MapperHighlighterEntry { final CustomPassageFormatter passageFormatter; private MapperHighlighterEntry(CustomPassageFormatter passageFormatter) { diff --git a/core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java b/core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java new file mode 100644 index 0000000000000..2c41eb3319fd2 --- /dev/null +++ b/core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java @@ -0,0 +1,111 @@ +package org.apache.lucene.search.uhighlight; + +import org.elasticsearch.test.ESTestCase; + +import java.text.BreakIterator; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; + +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; +import static org.hamcrest.Matchers.lessThanOrEqualTo; + +public class BoundedBreakIteratorScannerTests extends ESTestCase { + private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) { + // Generate a random set of unique terms with ascii character + int maxSize = randomIntBetween(5, 20); + String[] vocabulary = new String[maxSize]; + for (int i = 0; i < maxSize; i++) { + if (rarely()) { + vocabulary[i] = randomAsciiOfLengthBetween(50, 200); + } else { + vocabulary[i] = randomAsciiOfLengthBetween(1, 30); + } + } + + // Generate a random text made of random terms separated with word-boundaries and sentence-boundaries. + StringBuilder text = new StringBuilder(); + List offsetList = new ArrayList<> (); + List sizeList = new ArrayList<> (); + // the number of sentences to generate + int numSentences = randomIntBetween(10, 100); + int maxTermLen = 0; + for (int i = 0; i < numSentences; i++) { + // the number of terms in the sentence + int numTerms = randomIntBetween(5, 10); + for (int j = 0; j < numTerms; j++) { + int termId = randomIntBetween(0, vocabulary.length - 1); + String term = vocabulary[termId].toLowerCase(); + if (j == 0) { + // capitalize the first letter of the first term in the sentence + term = term.substring(0, 1).toUpperCase() + term.substring(1); + } else { + // word boundary + String sep = randomFrom(" ", " ", "\t", "#", "\n"); + text.append(sep); + } + maxTermLen = Math.max(term.length(), maxTermLen); + offsetList.add(text.length()); + sizeList.add(term.length()); + text.append(term); + } + // sentence boundary + String boundary = randomFrom("! ", "? ", ". 
", ".\n", ".\n\n"); + text.append(boundary); + } + + int[] sizes = sizeList.stream().mapToInt(i->i).toArray(); + int[] offsets = offsetList.stream().mapToInt(i->i).toArray(); + + bi.setText(text.toString()); + int currentPos = randomIntBetween(0, 20); + int lastEnd = -1; + int maxPassageLen = maxLen+(maxTermLen*2); + while (currentPos < offsets.length) { + // find the passage that contains the current term + int nextOffset = offsets[currentPos]; + int start = bi.preceding(nextOffset+1); + int end = bi.following(nextOffset); + + // check that the passage is valid + assertThat(start, greaterThanOrEqualTo(lastEnd)); + assertThat(end, greaterThan(start)); + assertThat(start, lessThanOrEqualTo(nextOffset)); + assertThat(end, greaterThanOrEqualTo(nextOffset)); + int passageLen = end-start; + assertThat(passageLen, lessThanOrEqualTo(maxPassageLen)); + + // checks that the start and end of the passage are on word boundaries. + int startPos = Arrays.binarySearch(offsets, start); + int endPos = Arrays.binarySearch(offsets, end); + if (startPos < 0) { + int lastWordEnd = offsets[Math.abs(startPos)-2] + sizes[Math.abs(startPos)-2]; + assertThat(start, greaterThanOrEqualTo(lastWordEnd)); + } + if (endPos < 0) { + if (Math.abs(endPos)-2 < offsets.length) { + int lastWordEnd = offsets[Math.abs(endPos) - 2] + sizes[Math.abs(endPos) - 2]; + assertThat(end, greaterThanOrEqualTo(lastWordEnd)); + } + // advance the position to the end of the current passage + currentPos = (Math.abs(endPos) - 1); + } else { + // advance the position to the end of the current passage + currentPos = endPos; + } + // randomly advance to the next term to highlight + currentPos += randomIntBetween(0, 20); + lastEnd = end; + } + } + + public void testBoundedSentence() { + for (int i = 0; i < 20; i++) { + int maxLen = randomIntBetween(10, 500); + testRandomAsciiTextCase(BoundedBreakIteratorScanner.getSentence(Locale.ROOT, maxLen), + maxLen); + } + } +} diff --git a/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java index 83b42750f92ce..23e867d2573ce 100644 --- a/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java +++ b/core/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java @@ -20,20 +20,22 @@ package org.apache.lucene.search.uhighlight; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.queries.CommonTermsQuery; import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; import org.apache.lucene.search.TermQuery; @@ -41,219 +43,167 @@ import org.apache.lucene.search.highlight.DefaultEncoder; import 
org.apache.lucene.search.highlight.Snippet; import org.apache.lucene.store.Directory; +import org.elasticsearch.common.Strings; import org.elasticsearch.common.lucene.all.AllTermQuery; import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery; -import org.elasticsearch.search.fetch.subphase.highlight.HighlightUtils; import org.elasticsearch.test.ESTestCase; -import java.io.IOException; +import java.text.BreakIterator; +import java.util.Locale; +import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR; import static org.hamcrest.CoreMatchers.equalTo; public class CustomUnifiedHighlighterTests extends ESTestCase { - public void testCustomUnifiedHighlighter() throws Exception { + private void assertHighlightOneDoc(String fieldName, String[] inputs, Analyzer analyzer, Query query, + Locale locale, BreakIterator breakIterator, + int noMatchSize, String[] expectedPassages) throws Exception { Directory dir = newDirectory(); - IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - offsetsType.setStoreTermVectorOffsets(true); - offsetsType.setStoreTermVectorPositions(true); - offsetsType.setStoreTermVectors(true); - - //good position but only one match - final String firstValue = "This is a test. Just a test1 highlighting from unified highlighter."; - Field body = new Field("body", "", offsetsType); - Document doc = new Document(); - doc.add(body); - body.setStringValue(firstValue); - - //two matches, not the best snippet due to its length though - final String secondValue = "This is the second highlighting value to perform highlighting on a longer text " + - "that gets scored lower."; - Field body2 = new Field("body", "", offsetsType); - doc.add(body2); - body2.setStringValue(secondValue); - - //two matches and short, will be scored highest - final String thirdValue = "This is highlighting the third short highlighting value."; - Field body3 = new Field("body", "", offsetsType); - doc.add(body3); - body3.setStringValue(thirdValue); - - //one match, same as first but at the end, will be scored lower due to its position - final String fourthValue = "Just a test4 highlighting from unified highlighter."; - Field body4 = new Field("body", "", offsetsType); - doc.add(body4); - body4.setStringValue(fourthValue); - - iw.addDocument(doc); - - IndexReader ir = iw.getReader(); - iw.close(); - - String firstHlValue = "Just a test1 highlighting from unified highlighter."; - String secondHlValue = "This is the second highlighting value to perform highlighting on a" + - " longer text that gets scored lower."; - String thirdHlValue = "This is highlighting the third short highlighting value."; - String fourthHlValue = "Just a test4 highlighting from unified highlighter."; - - IndexSearcher searcher = newSearcher(ir); - Query query = new TermQuery(new Term("body", "highlighting")); - - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - - int docId = topDocs.scoreDocs[0].doc; - - String fieldValue = firstValue + HighlightUtils.PARAGRAPH_SEPARATOR + secondValue + - HighlightUtils.PARAGRAPH_SEPARATOR + thirdValue + HighlightUtils.PARAGRAPH_SEPARATOR + fourthValue; - - CustomUnifiedHighlighter highlighter = new 
CustomUnifiedHighlighter(searcher, iwc.getAnalyzer(), - new CustomPassageFormatter("", "", new DefaultEncoder()), null, fieldValue, true); - Snippet[] snippets = highlighter.highlightField("body", query, docId, 5); - - assertThat(snippets.length, equalTo(4)); - - assertThat(snippets[0].getText(), equalTo(firstHlValue)); - assertThat(snippets[1].getText(), equalTo(secondHlValue)); - assertThat(snippets[2].getText(), equalTo(thirdHlValue)); - assertThat(snippets[3].getText(), equalTo(fourthHlValue)); - ir.close(); - dir.close(); - } - - public void testNoMatchSize() throws Exception { - Directory dir = newDirectory(); - Analyzer analyzer = new StandardAnalyzer(); IndexWriterConfig iwc = newIndexWriterConfig(analyzer); - iwc.setMergePolicy(newLogMergePolicy()); + iwc.setMergePolicy(newTieredMergePolicy(random())); RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType offsetsType = new FieldType(TextField.TYPE_STORED); - offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - offsetsType.setStoreTermVectorOffsets(true); - offsetsType.setStoreTermVectorPositions(true); - offsetsType.setStoreTermVectors(true); - Field body = new Field("body", "", offsetsType); - Field none = new Field("none", "", offsetsType); + FieldType ft = new FieldType(TextField.TYPE_STORED); + ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + ft.freeze(); Document doc = new Document(); - doc.add(body); - doc.add(none); - - String firstValue = "This is a test. Just a test highlighting from unified. Feel free to ignore."; - body.setStringValue(firstValue); - none.setStringValue(firstValue); + for (String input : inputs) { + Field field = new Field(fieldName, "", ft); + field.setStringValue(input); + doc.add(field); + } iw.addDocument(doc); - - IndexReader ir = iw.getReader(); + DirectoryReader reader = iw.getReader(); + IndexSearcher searcher = newSearcher(reader); iw.close(); - - Query query = new TermQuery(new Term("none", "highlighting")); - - IndexSearcher searcher = newSearcher(ir); - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); + TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER); assertThat(topDocs.totalHits, equalTo(1)); - int docId = topDocs.scoreDocs[0].doc; - - CustomPassageFormatter passageFormatter = new CustomPassageFormatter("", "", new DefaultEncoder()); - CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, passageFormatter, - null, firstValue, false); - Snippet[] snippets = highlighter.highlightField("body", query, docId, 5); - assertThat(snippets.length, equalTo(0)); - - highlighter = new CustomUnifiedHighlighter(searcher, analyzer, passageFormatter, null, firstValue, true); - snippets = highlighter.highlightField("body", query, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("This is a test.")); - ir.close(); + String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR)); + CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, + new CustomPassageFormatter("", "", new DefaultEncoder()), locale, breakIterator, rawValue, + noMatchSize); + highlighter.setFieldMatcher((name) -> "text".equals(name)); + final Snippet[] snippets = + highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length); + assertEquals(snippets.length, expectedPassages.length); + for (int i = 0; i < snippets.length; i++) { + 
assertEquals(snippets[i].getText(), expectedPassages[i]); + } + reader.close(); dir.close(); } + public void testSimple() throws Exception { + final String[] inputs = { + "This is a test. Just a test1 highlighting from unified highlighter.", + "This is the second highlighting value to perform highlighting on a longer text that gets scored lower.", + "This is highlighting the third short highlighting value.", + "Just a test4 highlighting from unified highlighter." + }; + + String[] expectedPassages = { + "Just a test1 highlighting from unified highlighter.", + "This is the second highlighting value to perform highlighting on a" + + " longer text that gets scored lower.", + "This is highlighting the third short highlighting value.", + "Just a test4 highlighting from unified highlighter." + }; + Query query = new TermQuery(new Term("text", "highlighting")); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), 0, expectedPassages); + } - private IndexReader indexOneDoc(Directory dir, String field, String value, Analyzer analyzer) throws IOException { - IndexWriterConfig iwc = newIndexWriterConfig(analyzer); - iwc.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); - - FieldType ft = new FieldType(TextField.TYPE_STORED); - ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - Field textField = new Field(field, "", ft); - Document doc = new Document(); - doc.add(textField); - - textField.setStringValue(value); - iw.addDocument(doc); - IndexReader ir = iw.getReader(); - iw.close(); - return ir; + public void testNoMatchSize() throws Exception { + final String[] inputs = { + "This is a test. Just a test highlighting from unified. Feel free to ignore." + }; + Query query = new TermQuery(new Term("body", "highlighting")); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), 100, inputs); } public void testMultiPhrasePrefixQuery() throws Exception { - Analyzer analyzer = new StandardAnalyzer(); - Directory dir = newDirectory(); - String value = "The quick brown fox."; - IndexReader ir = indexOneDoc(dir, "text", value, analyzer); + final String[] inputs = { + "The quick brown fox." + }; + final String[] outputs = { + "The quick brown fox." 
+ }; MultiPhrasePrefixQuery query = new MultiPhrasePrefixQuery(); query.add(new Term("text", "quick")); query.add(new Term("text", "brown")); query.add(new Term("text", "fo")); - IndexSearcher searcher = newSearcher(ir); - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - int docId = topDocs.scoreDocs[0].doc; - CustomPassageFormatter passageFormatter = new CustomPassageFormatter("", "", new DefaultEncoder()); - CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, - passageFormatter, null, value, false); - Snippet[] snippets = highlighter.highlightField("text", query, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("The quick brown fox.")); - ir.close(); - dir.close(); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs); } - public void testAllTermQuery() throws IOException { - Directory dir = newDirectory(); - String value = "The quick brown fox."; - Analyzer analyzer = new StandardAnalyzer(); - IndexReader ir = indexOneDoc(dir, "all", value, analyzer); - AllTermQuery query = new AllTermQuery(new Term("all", "fox")); - IndexSearcher searcher = newSearcher(ir); - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - int docId = topDocs.scoreDocs[0].doc; - CustomPassageFormatter passageFormatter = new CustomPassageFormatter("", "", new DefaultEncoder()); - CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, - passageFormatter, null, value, false); - Snippet[] snippets = highlighter.highlightField("all", query, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("The quick brown fox.")); - ir.close(); - dir.close(); + public void testAllTermQuery() throws Exception { + final String[] inputs = { + "The quick brown fox." + }; + final String[] outputs = { + "The quick brown fox." + }; + AllTermQuery query = new AllTermQuery(new Term("text", "fox")); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs); } - public void testCommonTermsQuery() throws IOException { - Directory dir = newDirectory(); - String value = "The quick brown fox."; - Analyzer analyzer = new StandardAnalyzer(); - IndexReader ir = indexOneDoc(dir, "text", value, analyzer); + public void testCommonTermsQuery() throws Exception { + final String[] inputs = { + "The quick brown fox." + }; + final String[] outputs = { + "The quick brown fox." 
+ }; CommonTermsQuery query = new CommonTermsQuery(BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, 128); query.add(new Term("text", "quick")); query.add(new Term("text", "brown")); query.add(new Term("text", "fox")); - IndexSearcher searcher = newSearcher(ir); - TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); - assertThat(topDocs.totalHits, equalTo(1)); - int docId = topDocs.scoreDocs[0].doc; - CustomPassageFormatter passageFormatter = new CustomPassageFormatter("", "", new DefaultEncoder()); - CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, - passageFormatter, null, value, false); - Snippet[] snippets = highlighter.highlightField("text", query, docId, 5); - assertThat(snippets.length, equalTo(1)); - assertThat(snippets[0].getText(), equalTo("The quick brown fox.")); - ir.close(); - dir.close(); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs); + } + + public void testSentenceBoundedBreakIterator() throws Exception { + final String[] inputs = { + "The quick brown fox in a long sentence with another quick brown fox. " + + "Another sentence with brown fox." + }; + final String[] outputs = { + "The quick brown", + "fox in a long", + "with another quick", + "brown fox.", + "sentence with brown", + "fox.", + }; + BooleanQuery query = new BooleanQuery.Builder() + .add(new TermQuery(new Term("text", "quick")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "brown")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "fox")), BooleanClause.Occur.SHOULD) + .build(); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs); + } + + public void testRepeat() throws Exception { + final String[] inputs = { + "Fun fun fun fun fun fun fun fun fun fun" + }; + final String[] outputs = { + "Fun fun fun", + "fun fun", + "fun fun fun", + "fun fun" + }; + Query query = new TermQuery(new Term("text", "fun")); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs); + + query = new PhraseQuery.Builder() + .add(new Term("text", "fun")) + .add(new Term("text", "fun")) + .build(); + assertHighlightOneDoc("text", inputs, new StandardAnalyzer(), query, Locale.ROOT, + BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 10), 0, outputs); } } diff --git a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java index 7db99ff3232ee..11496309d47a9 100644 --- a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java +++ b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java @@ -751,52 +751,69 @@ public void testFastVectorHighlighter() throws Exception { assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over")); } - public void testFastVectorHighlighterWithSentenceBoundaryScanner() throws Exception { + public void testHighlighterWithSentenceBoundaryScanner() throws Exception { assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); ensureGreen(); indexRandom(true, client().prepareIndex("test", "type1") .setSource("field1", "A sentence with few words. 
Another sentence with even more words.")); - logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); - SearchSourceBuilder source = searchSource() + for (String type : new String[] {"unified", "fvh"}) { + logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); + SearchSourceBuilder source = searchSource() .query(termQuery("field1", "sentence")) .highlighter(highlight() - .field("field1", 20, 2) - .order("score") - .preTags("").postTags("") - .boundaryScannerType(BoundaryScannerType.SENTENCE)); + .field("field1", 21, 2) + .highlighterType(type) + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.SENTENCE)); + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); - SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + assertHighlight(searchResponse, 0, "field1", 0, 2, anyOf( + equalTo("A sentence with few words"), + equalTo("A sentence with few words. ") + )); - assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A sentence with few words. ")); - assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another sentence with even more words. ")); + assertHighlight(searchResponse, 0, "field1", 1, 2, anyOf( + equalTo("Another sentence with"), + equalTo("Another sentence with even more words. ") + )); + } } - public void testFastVectorHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception { + public void testHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception { assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); ensureGreen(); indexRandom(true, client().prepareIndex("test", "type1") .setSource("field1", "A sentence with few words. Another sentence with even more words.")); - logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); - SearchSourceBuilder source = searchSource() + for (String type : new String[] {"fvh", "unified"}) { + logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); + SearchSourceBuilder source = searchSource() .query(termQuery("field1", "sentence")) .highlighter(highlight() - .field("field1", 20, 2) - .order("score") - .preTags("").postTags("") - .boundaryScannerType(BoundaryScannerType.SENTENCE) - .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); + .field("field1", 21, 2) + .highlighterType(type) + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.SENTENCE) + .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); - SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); - assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A sentence with few words. ")); - assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another sentence with even more words. ")); + assertHighlight(searchResponse, 0, "field1", 0, 2, anyOf( + equalTo("A sentence with few words"), + equalTo("A sentence with few words. ") + )); + + assertHighlight(searchResponse, 0, "field1", 1, 2, anyOf( + equalTo("Another sentence with"), + equalTo("Another sentence with even more words. 
") + )); + } } - public void testFastVectorHighlighterWithWordBoundaryScanner() throws Exception { + public void testHighlighterWithWordBoundaryScanner() throws Exception { assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); ensureGreen(); @@ -804,39 +821,48 @@ public void testFastVectorHighlighterWithWordBoundaryScanner() throws Exception .setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog")); logger.info("--> highlighting and searching on 'field' with word boundary_scanner"); - SearchSourceBuilder source = searchSource() - .query(termQuery("field1", "some")) - .highlighter(highlight() - .field("field1", 23, 1) - .order("score") - .preTags("").postTags("") - .boundaryScannerType(BoundaryScannerType.WORD)); + for (String type : new String[] {"unified", "fvh"}) { + SearchSourceBuilder source = searchSource() + .query(termQuery("field1", "some")) + .highlighter(highlight() + .field("field1", 23, 1) + .highlighterType(type) + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.WORD)); - SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); - assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("some quick and hairy brown")); + assertHighlight(searchResponse, 0, "field1", 0, 1, anyOf( + equalTo("some quick and hairy brown"), + equalTo("some") + )); + } } - public void testFastVectorHighlighterWithWordBoundaryScannerAndLocale() throws Exception { + public void testHighlighterWithWordBoundaryScannerAndLocale() throws Exception { assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); ensureGreen(); indexRandom(true, client().prepareIndex("test", "type1") .setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog")); - logger.info("--> highlighting and searching on 'field' with word boundary_scanner"); - SearchSourceBuilder source = searchSource() + for (String type : new String[] {"unified", "fvh"}) { + SearchSourceBuilder source = searchSource() .query(termQuery("field1", "some")) .highlighter(highlight() - .field("field1", 23, 1) - .order("score") - .preTags("").postTags("") - .boundaryScannerType(BoundaryScannerType.WORD) - .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); + .field("field1", 23, 1) + .highlighterType(type) + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.WORD) + .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); - SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); - assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("some quick and hairy brown")); + assertHighlight(searchResponse, 0, "field1", 0, 1, anyOf( + equalTo("some quick and hairy brown"), + equalTo("some") + )); + } } /** @@ -1841,13 +1867,13 @@ public void testHighlightNoMatchSize() throws IOException { response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); - // Postings hl also works but the fragment is the whole first sentence (size ignored) - field.highlighterType("postings"); + // Unified hl also works but the fragment is longer than the plain highlighter because of the boundary is the word + field.highlighterType("unified"); response = 
client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); + assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); - // Unified hl also works but the fragment is the whole first sentence (size ignored) - field.highlighterType("unified"); + // Postings hl also works but the fragment is the whole first sentence (size ignored) + field.highlighterType("postings"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); @@ -1860,13 +1886,12 @@ public void testHighlightNoMatchSize() throws IOException { response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo(text)); - //no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0) - field.highlighterType("postings"); + field.highlighterType("unified"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); + assertHighlight(response, 0, "text", 0, 1, equalTo(text)); - //no difference using unified hl as the noMatchSize is ignored (just needs to be greater than 0) - field.highlighterType("unified"); + //no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0) + field.highlighterType("postings"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); @@ -1879,13 +1904,13 @@ public void testHighlightNoMatchSize() throws IOException { response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo(text)); - //no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0) - field.highlighterType("postings"); + // unified hl returns the first sentence as the noMatchSize does not cross sentence boundary. 
+ field.highlighterType("unified"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); + assertHighlight(response, 0, "text", 0, 1, equalTo(text)); - //no difference using unified hl as the noMatchSize is ignored (just needs to be greater than 0) - field.highlighterType("unified"); + //no difference using postings hl as the noMatchSize is ignored (just needs to be greater than 0) + field.highlighterType("postings"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); @@ -1898,11 +1923,11 @@ public void testHighlightNoMatchSize() throws IOException { response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); - field.highlighterType("postings"); + field.highlighterType("unified"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); + assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); - field.highlighterType("unified"); + field.highlighterType("postings"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field).noMatchSize(21)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); @@ -1947,13 +1972,12 @@ public void testHighlightNoMatchSizeWithMultivaluedFields() throws IOException { response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); - // Postings hl also works but the fragment is the whole first sentence (size ignored) - field.highlighterType("postings"); + field.highlighterType("unified"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); - assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); + assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some")); - // Unified hl also works but the fragment is the whole first sentence (size ignored) - field.highlighterType("unified"); + // Postings hl also works but the fragment is the whole first sentence (size ignored) + field.highlighterType("postings"); response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get(); assertHighlight(response, 0, "text", 0, 1, equalTo("I am pretty long so some of me should get cut off.")); @@ -1980,11 +2004,12 @@ public void testHighlightNoMatchSizeWithMultivaluedFields() throws IOException { .highlighter(new HighlightBuilder().field(field)).get(); assertNotHighlighted(response, 0, "text"); + // except for the unified highlighter which starts from the first string with actual content field.highlighterType("unified"); response = client().prepareSearch("test") .setQuery(idsQueryBuilder) .highlighter(new HighlightBuilder().field(field)).get(); - assertNotHighlighted(response, 0, "text"); + assertHighlight(response, 0, "text", 0, 1, equalTo("I am short")); // But if the field was actually empty then you should get 
no highlighting field
         index("test", "type1", "3", "text", new String[] {});
 
@@ -2031,7 +2056,7 @@ public void testHighlightNoMatchSizeWithMultivaluedFields() throws IOException {
                 .highlighter(new HighlightBuilder().field(field)).get();
         assertNotHighlighted(response, 0, "text");
 
-        field.highlighterType("fvh");
+        field.highlighterType("unified");
         response = client().prepareSearch("test")
                 .setQuery(idsQueryBuilder)
                 .highlighter(new HighlightBuilder().field(field)).get();
@@ -2081,13 +2106,13 @@ public void testHighlightNoMatchSizeNumberOfFragments() throws IOException {
         response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
         assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence"));
 
-        // Postings hl also works but the fragment is the whole first sentence (size ignored)
-        field.highlighterType("postings");
+        field.highlighterType("unified");
         response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
-        assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence."));
+        assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence"));
 
-        // Unified hl also works but the fragment is the whole first sentence (size ignored)
-        field.highlighterType("unified");
+
+        // Postings hl also works but the fragment is the whole first sentence (size ignored)
+        field.highlighterType("postings");
         response = client().prepareSearch("test").highlighter(new HighlightBuilder().field(field)).get();
         assertHighlight(response, 0, "text", 0, 1, equalTo("This is the first sentence."));
 
diff --git a/docs/reference/search/request/highlighting.asciidoc b/docs/reference/search/request/highlighting.asciidoc
index 81f454bb15852..033ba01cfc4bb 100644
--- a/docs/reference/search/request/highlighting.asciidoc
+++ b/docs/reference/search/request/highlighting.asciidoc
@@ -140,6 +140,9 @@ It supports accurate phrase and multi-term (fuzzy, prefix, regex) highlighting a
 * `highlight_query`
 * `pre_tags` and `post_tags`
 * `require_field_match`
+* `boundary_scanner` (`sentence` (**default**) or `word`)
+* `max_fragment_length` (only for `sentence` scanner)
+* `no_match_size`
 
 ==== Force highlighter type
 
@@ -345,7 +348,7 @@ parameter to control the margin to start highlighting from.
 In the case where there is no matching fragment to highlight, the default is
 to not return anything. Instead, we can return a snippet of text from the
 beginning of the field by setting `no_match_size` (default `0`) to the length
-of the text that you want returned. The actual length may be shorter than
+of the text that you want returned. The actual length may be shorter or longer than
 specified as it tries to break on a word boundary.
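+
+For example, the following hypothetical request (index, field names and sizes are
+made up for illustration) returns roughly the first 100 characters of the `comment`
+field whenever the query produces no matching fragment for it:
+
+[source,js]
+--------------------------------------------------
+GET /_search
+{
+    "query" : {
+        "match" : { "title" : "quick brown fox" }
+    },
+    "highlight" : {
+        "fields" : {
+            "comment" : {
+                "fragment_size" : 150,
+                "no_match_size" : 100
+            }
+        }
+    }
+}
+--------------------------------------------------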
When using the postings highlighter it is not possible to control the actual size of
 the snippet, therefore the first sentence gets returned whenever `no_match_size` is
 
@@ -504,21 +507,26 @@ GET /_search
 
 [[boundary-scanners]]
 ==== Boundary Scanners
 
-When highlighting a field using the fast vector highlighter, you can specify
-how to break the highlighted fragments using `boundary_scanner`, which accepts
+When highlighting a field using the unified highlighter or the fast vector highlighter,
+you can specify how to break the highlighted fragments using `boundary_scanner`, which accepts
 the following values:
 
-* `chars` (default): allows to configure which characters (`boundary_chars`)
+* `chars` (default mode for the FVH): lets you configure which characters (`boundary_chars`)
 constitute a boundary for highlighting. It's a single string with each boundary
 character defined in it (defaults to `.,!? \t\n`). It also allows configuring the
 `boundary_max_scan` to control how far to look for boundary characters
-(defaults to `20`).
+(defaults to `20`). Works only with the Fast Vector Highlighter.
 
-* `word` and `sentence`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator]
-to break the highlighted fragments at the next _word_ or _sentence_ boundary.
+* `sentence` and `word`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator]
+to break the highlighted fragments at the next _sentence_ or _word_ boundary.
 You can further specify `boundary_scanner_locale` to control which Locale is used
 to search the text for these boundaries.
 
+[NOTE]
+When used with the `unified` highlighter, the `sentence` scanner splits sentences
+bigger than `fragment_size` at the first word boundary after `fragment_size` is reached.
+You can set `fragment_size` to 0 to never split any sentence.
+
 [[matched-fields]]
 ==== Matched Fields
 
 The Fast Vector Highlighter can combine matches on multiple fields to

From 3d447a039380fedaca7b62880d0afb7101767cd0 Mon Sep 17 00:00:00 2001
From: Jim Ferenczi
Date: Fri, 17 Mar 2017 16:59:01 +0100
Subject: [PATCH 2/2] Address review comments and fix styles

---
 .../BoundedBreakIteratorScanner.java          | 62 +++++++++++++------
 .../uhighlight/CustomFieldHighlighter.java    | 36 ++++++++---
 .../BoundedBreakIteratorScannerTests.java     | 51 +++++++++++----
 3 files changed, 109 insertions(+), 40 deletions(-)

diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java b/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
index 6aeb1b8ab7e20..1cd5fb9340d7f 100644
--- a/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
+++ b/core/src/main/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
@@ -23,12 +23,15 @@
 import java.util.Locale;
 
 /**
- * A custom break iterator that scans text to find break-delimited passages bounded by a provided {@code maxLen}.
- * This scanner uses two level of {@link BreakIterator} to bound passage size "close" to {@maxLen}.
- * This is useful to "bound" {@link BreakIterator}s like `sentence` that can create big outliers on
- * semi-structured text.
+ * A custom break iterator that scans text to find break-delimited passages bounded by
+ * a provided maximum length. This class delegates the boundary search to a first level
+ * break iterator. When this break iterator finds a passage greater than the maximum length,
+ * a secondary break iterator is used to re-split the passage at the first boundary after
+ * the maximum length.
+ * This is useful to split passages created by {@link BreakIterator}s like `sentence` that
+ * can create big outliers on semi-structured text.
  *
- * @warning This break iterator is designed to work with the {@link UnifiedHighlighter} only.
+ * WARNING: This break iterator is designed to work with the {@link UnifiedHighlighter}.
  **/
 public class BoundedBreakIteratorScanner extends BreakIterator {
     private final BreakIterator mainBreak;
@@ -39,9 +42,11 @@ public class BoundedBreakIteratorScanner extends BreakIterator {
     private int windowStart = -1;
     private int windowEnd = -1;
     private int innerStart = -1;
-    private int innerEnd = -1;
+    private int innerEnd = 0;
 
-    private BoundedBreakIteratorScanner(BreakIterator mainBreak, BreakIterator innerBreak, int maxLen) {
+    private BoundedBreakIteratorScanner(BreakIterator mainBreak,
+                                        BreakIterator innerBreak,
+                                        int maxLen) {
         this.mainBreak = mainBreak;
         this.innerBreak = innerBreak;
         this.maxLen = maxLen;
@@ -71,12 +76,18 @@ private void reset() {
         windowStart = -1;
         windowEnd = -1;
         innerStart = -1;
-        innerEnd = -1;
+        innerEnd = 0;
     }
 
+    /**
+     * Must be called with increasing offsets. See {@link FieldHighlighter} for usage.
+     */
     @Override
     public int preceding(int offset) {
-        assert(offset > lastPrecedingOffset);
+        if (offset < lastPrecedingOffset) {
+            throw new IllegalArgumentException("offset < lastPrecedingOffset: " +
+                "usage doesn't look like UnifiedHighlighter");
+        }
         if (offset > windowStart && offset < windowEnd) {
             innerStart = innerEnd;
             innerEnd = windowEnd;
@@ -89,26 +100,35 @@ public int preceding(int offset) {
             // the current split is too big,
             // so starting from the current term we try to find boundaries on the left first
             if (offset - maxLen > innerStart) {
-                innerStart = Math.max(innerStart, innerBreak.preceding(offset - maxLen));
+                innerStart = Math.max(innerStart,
+                    innerBreak.preceding(offset - maxLen));
             }
             // and then we try to expand the passage to the right with the remaining size
             int remaining = Math.max(0, maxLen - (offset - innerStart));
             if (offset + remaining < windowEnd) {
-                innerEnd = Math.min(windowEnd, innerBreak.following(offset + remaining));
+                innerEnd = Math.min(windowEnd,
+                    innerBreak.following(offset + remaining));
             }
         }
         lastPrecedingOffset = offset - 1;
         return innerStart;
     }
 
+    /**
+     * Can be invoked only after a call to preceding(offset+1).
+     * See {@link FieldHighlighter} for usage.
+     */
     @Override
     public int following(int offset) {
-        assert(offset == lastPrecedingOffset && innerEnd != -1);
+        if (offset != lastPrecedingOffset || innerEnd == -1) {
+            throw new IllegalArgumentException("offset != lastPrecedingOffset: " +
+                "usage doesn't look like UnifiedHighlighter");
+        }
        return innerEnd;
     }
 
     /**
-     * Returns a {@link BreakIterator#getSentenceInstance(Locale)} bounded to {@code maxLen}.
+     * Returns a {@link BreakIterator#getSentenceInstance(Locale)} bounded to maxLen.
      * Secondary boundaries are found using a {@link BreakIterator#getWordInstance(Locale)}.
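+     *
+     * A minimal usage sketch (the text and offsets are made up; as documented on
+     * following(), each following(offset) call must come right after a call to
+     * preceding(offset+1)):
+     * <pre>
+     *   BreakIterator bi = BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 100);
+     *   bi.setText(content);
+     *   int start = bi.preceding(matchOffset + 1); // start of the bounded passage
+     *   int end = bi.following(matchOffset);       // end of the same passage
+     * </pre>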
*/ public static BreakIterator getSentence(Locale locale, int maxLen) { @@ -117,14 +137,16 @@ public static BreakIterator getSentence(Locale locale, int maxLen) { return new BoundedBreakIteratorScanner(sBreak, wBreak, maxLen); } + @Override - public int first() { - return 0; + public int current() { + // Returns the last offset of the current split + return this.innerEnd; } @Override - public int current() { - return 0; + public int first() { + throw new IllegalStateException("first() should not be called in this context"); } @Override @@ -134,16 +156,16 @@ public int next() { @Override public int last() { - throw new IllegalStateException("last should not be called in this context"); + throw new IllegalStateException("last() should not be called in this context"); } @Override public int next(int n) { - throw new IllegalStateException("next(n) should not be call"); + throw new IllegalStateException("next(n) should not be called in this context"); } @Override public int previous() { - throw new IllegalStateException("previous should not be call"); + throw new IllegalStateException("previous() should not be called in this context"); } } diff --git a/core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java index 9515648355f04..915e7cc153128 100644 --- a/core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java +++ b/core/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java @@ -1,3 +1,22 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + package org.apache.lucene.search.uhighlight; import java.text.BreakIterator; @@ -16,11 +35,12 @@ class CustomFieldHighlighter extends FieldHighlighter { private final int noMatchSize; private final String fieldValue; - public CustomFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy, - Locale breakIteratorLocale, BreakIterator breakIterator, - PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages, - PassageFormatter passageFormatter, int noMatchSize, String fieldValue) { - super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages, maxNoHighlightPassages, passageFormatter); + CustomFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy, + Locale breakIteratorLocale, BreakIterator breakIterator, + PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages, + PassageFormatter passageFormatter, int noMatchSize, String fieldValue) { + super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages, + maxNoHighlightPassages, passageFormatter); this.breakIteratorLocale = breakIteratorLocale; this.noMatchSize = noMatchSize; this.fieldValue = fieldValue; @@ -38,18 +58,18 @@ protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) { if (end == -1) { end = fieldValue.length(); } - if (noMatchSize < end) { + if (noMatchSize+pos < end) { BreakIterator bi = BreakIterator.getWordInstance(breakIteratorLocale); bi.setText(fieldValue); // Finds the next word boundary **after** noMatchSize. - end = bi.following(noMatchSize); + end = bi.following(noMatchSize + pos); if (end == BreakIterator.DONE) { end = fieldValue.length(); } } Passage passage = new Passage(); passage.setScore(Float.NaN); - passage.setStartOffset(0); + passage.setStartOffset(pos); passage.setEndOffset(end); return new Passage[]{passage}; } diff --git a/core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java b/core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java index 2c41eb3319fd2..0cf62e8ce6c42 100644 --- a/core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java +++ b/core/src/test/java/org/apache/lucene/search/uhighlight/BoundedBreakIteratorScannerTests.java @@ -1,3 +1,22 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.lucene.search.uhighlight; import org.elasticsearch.test.ESTestCase; @@ -13,9 +32,14 @@ import static org.hamcrest.Matchers.lessThanOrEqualTo; public class BoundedBreakIteratorScannerTests extends ESTestCase { + private static final String[] WORD_BOUNDARIES = + new String[] { " ", " ", "\t", "#", "\n" }; + private static final String[] SENTENCE_BOUNDARIES = + new String[] { "! ", "? ", ". 
", ".\n", ".\n\n" }; + private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) { // Generate a random set of unique terms with ascii character - int maxSize = randomIntBetween(5, 20); + int maxSize = randomIntBetween(5, 100); String[] vocabulary = new String[maxSize]; for (int i = 0; i < maxSize; i++) { if (rarely()) { @@ -25,7 +49,8 @@ private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) { } } - // Generate a random text made of random terms separated with word-boundaries and sentence-boundaries. + // Generate a random text made of random terms separated with word-boundaries + // and sentence-boundaries. StringBuilder text = new StringBuilder(); List offsetList = new ArrayList<> (); List sizeList = new ArrayList<> (); @@ -37,13 +62,12 @@ private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) { int numTerms = randomIntBetween(5, 10); for (int j = 0; j < numTerms; j++) { int termId = randomIntBetween(0, vocabulary.length - 1); - String term = vocabulary[termId].toLowerCase(); + String term = vocabulary[termId].toLowerCase(Locale.ROOT); if (j == 0) { // capitalize the first letter of the first term in the sentence - term = term.substring(0, 1).toUpperCase() + term.substring(1); + term = term.substring(0, 1).toUpperCase(Locale.ROOT) + term.substring(1); } else { - // word boundary - String sep = randomFrom(" ", " ", "\t", "#", "\n"); + String sep = randomFrom(WORD_BOUNDARIES); text.append(sep); } maxTermLen = Math.max(term.length(), maxTermLen); @@ -51,8 +75,7 @@ private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) { sizeList.add(term.length()); text.append(term); } - // sentence boundary - String boundary = randomFrom("! ", "? ", ". ", ".\n", ".\n\n"); + String boundary = randomFrom(SENTENCE_BOUNDARIES); text.append(boundary); } @@ -81,12 +104,14 @@ private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) { int startPos = Arrays.binarySearch(offsets, start); int endPos = Arrays.binarySearch(offsets, end); if (startPos < 0) { - int lastWordEnd = offsets[Math.abs(startPos)-2] + sizes[Math.abs(startPos)-2]; + int lastWordEnd = + offsets[Math.abs(startPos)-2] + sizes[Math.abs(startPos)-2]; assertThat(start, greaterThanOrEqualTo(lastWordEnd)); } if (endPos < 0) { if (Math.abs(endPos)-2 < offsets.length) { - int lastWordEnd = offsets[Math.abs(endPos) - 2] + sizes[Math.abs(endPos) - 2]; + int lastWordEnd = + offsets[Math.abs(endPos) - 2] + sizes[Math.abs(endPos) - 2]; assertThat(end, greaterThanOrEqualTo(lastWordEnd)); } // advance the position to the end of the current passage @@ -104,8 +129,10 @@ private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) { public void testBoundedSentence() { for (int i = 0; i < 20; i++) { int maxLen = randomIntBetween(10, 500); - testRandomAsciiTextCase(BoundedBreakIteratorScanner.getSentence(Locale.ROOT, maxLen), - maxLen); + testRandomAsciiTextCase( + BoundedBreakIteratorScanner.getSentence(Locale.ROOT, maxLen), + maxLen + ); } } }