@@ -0,0 +1,171 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.search.uhighlight;

import java.text.BreakIterator;
import java.text.CharacterIterator;
import java.util.Locale;

/**
* A custom break iterator that scans text to find break-delimited passages bounded by
* a provided maximum length. This class delegates the boundary search to a first-level
* break iterator. When that break iterator finds a passage longer than the maximum length,
* a secondary break iterator is used to re-split the passage at the first boundary after
* the maximum length.
* This is useful for splitting passages created by {@link BreakIterator}s such as `sentence`,
* which can produce very large outliers on semi-structured text.
*
* WARNING: This break iterator is designed to work with the {@link UnifiedHighlighter}.
**/
public class BoundedBreakIteratorScanner extends BreakIterator {
private final BreakIterator mainBreak;
private final BreakIterator innerBreak;
private final int maxLen;

private int lastPrecedingOffset = -1;
private int windowStart = -1;
private int windowEnd = -1;
private int innerStart = -1;
private int innerEnd = 0;

private BoundedBreakIteratorScanner(BreakIterator mainBreak,
BreakIterator innerBreak,
int maxLen) {
this.mainBreak = mainBreak;
this.innerBreak = innerBreak;
this.maxLen = maxLen;
}

@Override
public CharacterIterator getText() {
return mainBreak.getText();
}

@Override
public void setText(CharacterIterator newText) {
reset();
mainBreak.setText(newText);
innerBreak.setText(newText);
}

@Override
public void setText(String newText) {
reset();
mainBreak.setText(newText);
innerBreak.setText(newText);
}

private void reset() {
lastPrecedingOffset = -1;
windowStart = -1;
windowEnd = -1;
innerStart = -1;
innerEnd = 0;
}

/**
* Must be called with increasing offset. See {@link FieldHighlighter} for usage.
*/
@Override
public int preceding(int offset) {
if (offset < lastPrecedingOffset) {
throw new IllegalArgumentException("offset < lastPrecedingOffset: " +
"usage doesn't look like UnifiedHighlighter");
}
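// The offset is still inside the current main (e.g. sentence) window: start the new
// inner passage where the previous one ended. Otherwise compute a new main window.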
if (offset > windowStart && offset < windowEnd) {
innerStart = innerEnd;
innerEnd = windowEnd;
} else {
windowStart = innerStart = mainBreak.preceding(offset);
windowEnd = innerEnd = mainBreak.following(offset-1);
}

if (innerEnd - innerStart > maxLen) {
// the current split is too big,
// so starting from the current term we try to find boundaries on the left first
if (offset - maxLen > innerStart) {
innerStart = Math.max(innerStart,
innerBreak.preceding(offset - maxLen));
}
// and then we try to expand the passage to the right with the remaining size
int remaining = Math.max(0, maxLen - (offset - innerStart));
if (offset + remaining < windowEnd) {
innerEnd = Math.min(windowEnd,
innerBreak.following(offset + remaining));
}
}
lastPrecedingOffset = offset - 1;
return innerStart;
}

/**
* Can be invoked only after a call to preceding(offset+1).
* See {@link FieldHighlighter} for usage.
*/
@Override
public int following(int offset) {
if (offset != lastPrecedingOffset || innerEnd == -1) {
throw new IllegalArgumentException("offset != lastPrecedingOffset: " +
"usage doesn't look like UnifiedHighlighter");
}
return innerEnd;
}

/**
* Returns a {@link BreakIterator#getSentenceInstance(Locale)} bounded to maxLen.
* Secondary boundaries are found using a {@link BreakIterator#getWordInstance(Locale)}.
*/
public static BreakIterator getSentence(Locale locale, int maxLen) {
final BreakIterator sBreak = BreakIterator.getSentenceInstance(locale);
final BreakIterator wBreak = BreakIterator.getWordInstance(locale);
return new BoundedBreakIteratorScanner(sBreak, wBreak, maxLen);
}


@Override
public int current() {
// Returns the end offset of the current split
return this.innerEnd;
}

@Override
public int first() {
throw new IllegalStateException("first() should not be called in this context");
}

@Override
public int next() {
throw new IllegalStateException("next() should not be called in this context");
}

@Override
public int last() {
throw new IllegalStateException("last() should not be called in this context");
}

@Override
public int next(int n) {
throw new IllegalStateException("next(n) should not be called in this context");
}

@Override
public int previous() {
throw new IllegalStateException("previous() should not be called in this context");
}
}
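For illustration only, a minimal sketch of how this scanner is meant to be driven; the sample text, the 30-character bound, and the match offset are assumptions and not part of this change. It follows the same call order the UnifiedHighlighter's FieldHighlighter uses: preceding(matchStart + 1) first, then following(matchStart).

import java.text.BreakIterator;
import java.util.Locale;

import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;

public class BoundedBreakIteratorScannerSketch {
    public static void main(String[] args) {
        // Hypothetical single-sentence value that is longer than the 30-char bound.
        String text = "Lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor";
        int matchStart = 39; // assumed start offset of the matched term "adipiscing"

        // Sentence passages, re-split at word boundaries once they exceed 30 chars.
        BreakIterator bi = BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 30);
        bi.setText(text);

        // Same call order as FieldHighlighter: preceding(start + 1), then following(start).
        int passageStart = bi.preceding(matchStart + 1);
        int passageEnd = bi.following(matchStart);

        // Prints a bounded passage around the match instead of the whole sentence,
        // e.g. "ipsum dolor sit amet consectetur adipiscing".
        System.out.println(text.substring(passageStart, passageEnd));
    }
}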
@@ -0,0 +1,79 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.lucene.search.uhighlight;

import java.text.BreakIterator;
import java.util.Locale;

import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;

/**
* Custom {@link FieldHighlighter} that creates a single passage bounded to {@code noMatchSize} when
* no highlights were found.
*/
class CustomFieldHighlighter extends FieldHighlighter {
private static final Passage[] EMPTY_PASSAGE = new Passage[0];

private final Locale breakIteratorLocale;
private final int noMatchSize;
private final String fieldValue;

CustomFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy,
Locale breakIteratorLocale, BreakIterator breakIterator,
PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages,
PassageFormatter passageFormatter, int noMatchSize, String fieldValue) {
super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages,
maxNoHighlightPassages, passageFormatter);
this.breakIteratorLocale = breakIteratorLocale;
this.noMatchSize = noMatchSize;
this.fieldValue = fieldValue;
}

@Override
protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) {
if (noMatchSize > 0) {
int pos = 0;
while (pos < fieldValue.length() && fieldValue.charAt(pos) == MULTIVAL_SEP_CHAR) {
pos ++;
}
if (pos < fieldValue.length()) {
int end = fieldValue.indexOf(MULTIVAL_SEP_CHAR, pos);
if (end == -1) {
end = fieldValue.length();
}
if (noMatchSize+pos < end) {
BreakIterator bi = BreakIterator.getWordInstance(breakIteratorLocale);
bi.setText(fieldValue);
// Finds the first word boundary after noMatchSize + pos so the snippet is not cut mid-word.
end = bi.following(noMatchSize + pos);
if (end == BreakIterator.DONE) {
end = fieldValue.length();
}
}
Passage passage = new Passage();
passage.setScore(Float.NaN);
passage.setStartOffset(pos);
passage.setEndOffset(end);
return new Passage[]{passage};
}
}
return EMPTY_PASSAGE;
}
}
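The key detail above is that the no-match passage is not cut in the middle of a word: the end offset is snapped to the first word boundary after noMatchSize. A standalone sketch of that snapping logic, with an assumed field value and an assumed noMatchSize of 20:

import java.text.BreakIterator;
import java.util.Locale;

public class NoMatchSnippetSketch {
    public static void main(String[] args) {
        String fieldValue = "The quick brown fox jumps over the lazy dog";
        int noMatchSize = 20; // assumed highlighter setting

        int end = fieldValue.length();
        if (noMatchSize < end) {
            BreakIterator bi = BreakIterator.getWordInstance(Locale.ROOT);
            bi.setText(fieldValue);
            // Snap the cut-off to the first word boundary after noMatchSize.
            end = bi.following(noMatchSize);
            if (end == BreakIterator.DONE) {
                end = fieldValue.length();
            }
        }
        // Prints "The quick brown fox jumps" (25 chars) rather than a mid-word cut at 20.
        System.out.println(fieldValue.substring(0, end));
    }
}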
@@ -33,6 +33,8 @@
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.lucene.all.AllTermQuery;
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
@@ -47,6 +49,7 @@
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

/**
* Subclass of the {@link UnifiedHighlighter} that works for a single field in a single document.
@@ -57,37 +60,41 @@
* Supports both returning empty snippets and non-highlighted snippets when no highlighting can be performed.
*/
public class CustomUnifiedHighlighter extends UnifiedHighlighter {
public static final char MULTIVAL_SEP_CHAR = (char) 0;
private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];

private final String fieldValue;
private final PassageFormatter passageFormatter;
private final BreakIterator breakIterator;
private final boolean returnNonHighlightedSnippets;
private final Locale breakIteratorLocale;
private final int noMatchSize;

/**
* Creates a new instance of {@link CustomUnifiedHighlighter}
*
* @param analyzer the analyzer used for the field at index time, used for multi term queries internally
* @param passageFormatter our own {@link CustomPassageFormatter}
which generates snippets in the form of {@link Snippet} objects
which generates snippets in the form of {@link Snippet} objects
* @param breakIteratorLocale the {@link Locale} to use for dividing text into passages.
* If null {@link Locale#ROOT} is used
* @param breakIterator the {@link BreakIterator} to use for dividing text into passages.
* If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
* @param fieldValue the original field values as constructor argument, loaded from the _source field or
* the relevant stored field.
* @param returnNonHighlightedSnippets whether non highlighted snippets should be
* returned rather than empty snippets when no highlighting can be performed
* If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
* @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR
* @param noMatchSize The size of the text that should be returned when no highlighting can be performed
*/
public CustomUnifiedHighlighter(IndexSearcher searcher,
Analyzer analyzer,
PassageFormatter passageFormatter,
@Nullable Locale breakIteratorLocale,
@Nullable BreakIterator breakIterator,
String fieldValue,
boolean returnNonHighlightedSnippets) {
int noMatchSize) {
super(searcher, analyzer);
this.breakIterator = breakIterator;
this.breakIteratorLocale = breakIteratorLocale == null ? Locale.ROOT : breakIteratorLocale;
this.passageFormatter = passageFormatter;
this.fieldValue = fieldValue;
this.returnNonHighlightedSnippets = returnNonHighlightedSnippets;
this.noMatchSize = noMatchSize;
}

/**
@@ -111,16 +118,13 @@ public Snippet[] highlightField(String field, Query query, int docId, int maxPas
@Override
protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
int cacheCharsThreshold) throws IOException {
//we only highlight one field, one document at a time
// we only highlight one field, one document at a time
return Collections.singletonList(new String[]{fieldValue});
}

@Override
protected BreakIterator getBreakIterator(String field) {
if (breakIterator != null) {
return breakIterator;
}
return super.getBreakIterator(field);
return breakIterator;
}

@Override
@@ -129,11 +133,18 @@ protected PassageFormatter getFormatter(String field) {
}

@Override
protected int getMaxNoHighlightPassages(String field) {
if (returnNonHighlightedSnippets) {
return 1;
}
return 0;
protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms);
Set<HighlightFlag> highlightFlags = getFlags(field);
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
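// Wrap the per-field break iterator so passages never span the multi-value separator.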
BreakIterator breakIterator = new SplittingBreakIterator(getBreakIterator(field),
UnifiedHighlighter.MULTIVAL_SEP_CHAR);
FieldOffsetStrategy strategy =
getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
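// A noMatchSize > 0 enables a single fallback passage when nothing can be highlighted.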
return new CustomFieldHighlighter(field, strategy, breakIteratorLocale, breakIterator,
getScorer(field), maxPassages, (noMatchSize > 0 ? 1 : 0), getFormatter(field), noMatchSize, fieldValue);
}

@Override
@@ -146,7 +157,6 @@ protected Collection<Query> preSpanQueryRewrite(Query query) {
return rewriteCustomQuery(query);
}


/**
* Translate custom queries in queries that are supported by the unified highlighter.
*/
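Taken together, a hedged sketch of how the new constructor and highlightField fit with BoundedBreakIteratorScanner and CustomFieldHighlighter. The field name, tags, sizes, and the CustomPassageFormatter/DefaultEncoder arguments are assumptions for illustration; only the signatures visible in this diff come from the change.

import java.io.IOException;
import java.util.Locale;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;
import org.apache.lucene.search.uhighlight.CustomPassageFormatter;
import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
import org.apache.lucene.search.uhighlight.Snippet;

class HighlightWiringSketch {
    // Joins the stored values with MULTIVAL_SEP_CHAR and highlights one field of one document.
    static Snippet[] highlight(IndexSearcher searcher, Analyzer analyzer, Query query,
                               int docId, String[] values) throws IOException {
        String fieldValue = String.join(
            String.valueOf(CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR), values);
        CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
            searcher, analyzer,
            new CustomPassageFormatter("<em>", "</em>", new DefaultEncoder()), // assumed formatter args
            Locale.ROOT,
            BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 100), // bounded sentence passages
            fieldValue,
            100 /* noMatchSize */);
        return highlighter.highlightField("body", query, docId, 5 /* maxPassages */);
    }
}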