Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.elasticsearch.Version;
import org.elasticsearch.action.support.ToXContentToBytes;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.ParsingException;
Expand All @@ -32,10 +33,12 @@
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryParseContext;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Order;

import java.io.IOException;
import java.util.Arrays;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.function.BiFunction;
Expand All @@ -57,8 +60,10 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB
public static final ParseField NUMBER_OF_FRAGMENTS_FIELD = new ParseField("number_of_fragments");
public static final ParseField ENCODER_FIELD = new ParseField("encoder");
public static final ParseField REQUIRE_FIELD_MATCH_FIELD = new ParseField("require_field_match");
public static final ParseField BOUNDARY_SCANNER_FIELD = new ParseField("boundary_scanner");
public static final ParseField BOUNDARY_MAX_SCAN_FIELD = new ParseField("boundary_max_scan");
public static final ParseField BOUNDARY_CHARS_FIELD = new ParseField("boundary_chars");
public static final ParseField BOUNDARY_SCANNER_LOCALE_FIELD = new ParseField("boundary_scanner_locale");
public static final ParseField TYPE_FIELD = new ParseField("type");
public static final ParseField FRAGMENTER_FIELD = new ParseField("fragmenter");
public static final ParseField NO_MATCH_SIZE_FIELD = new ParseField("no_match_size");
Expand Down Expand Up @@ -88,10 +93,14 @@ public abstract class AbstractHighlighterBuilder<HB extends AbstractHighlighterB

protected Boolean forceSource;

protected BoundaryScannerType boundaryScannerType;

protected Integer boundaryMaxScan;

protected char[] boundaryChars;

protected Locale boundaryScannerLocale;

protected Integer noMatchSize;

protected Integer phraseLimit;
Expand Down Expand Up @@ -119,10 +128,18 @@ protected AbstractHighlighterBuilder(StreamInput in) throws IOException {
order(in.readOptionalWriteable(Order::readFromStream));
highlightFilter(in.readOptionalBoolean());
forceSource(in.readOptionalBoolean());
if (in.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) {
boundaryScannerType(in.readOptionalWriteable(BoundaryScannerType::readFromStream));
}
boundaryMaxScan(in.readOptionalVInt());
if (in.readBoolean()) {
boundaryChars(in.readString().toCharArray());
}
if (in.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) {
if (in.readBoolean()) {
boundaryScannerLocale(in.readString());
}
}
noMatchSize(in.readOptionalVInt());
phraseLimit(in.readOptionalVInt());
if (in.readBoolean()) {
Expand Down Expand Up @@ -150,12 +167,22 @@ public final void writeTo(StreamOutput out) throws IOException {
out.writeOptionalWriteable(order);
out.writeOptionalBoolean(highlightFilter);
out.writeOptionalBoolean(forceSource);
if (out.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) {
out.writeOptionalWriteable(boundaryScannerType);
}
out.writeOptionalVInt(boundaryMaxScan);
boolean hasBounaryChars = boundaryChars != null;
out.writeBoolean(hasBounaryChars);
if (hasBounaryChars) {
out.writeString(String.valueOf(boundaryChars));
}
if (out.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) {
boolean hasBoundaryScannerLocale = boundaryScannerLocale != null;
out.writeBoolean(hasBoundaryScannerLocale);
if (hasBoundaryScannerLocale) {
out.writeString(boundaryScannerLocale.toLanguageTag());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be out.writeOptionalString(), since you use in.readOptionalString in readFrom?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, notice that since boundaryScannerLocale is a Locale, I first write a boolean, and then write the String if it's not null. If I wanted to use writeOptionalString I would need to do something like:

out.writeOptionalString(boundaryScannerLocale == null ? null : boundaryScannerLocale.toLanguageTag())

Do you think that's preferred?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

++

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just for reference @jimczi, I borrowed this pattern from the code just above those lines which handles boundaryChars:

        boolean hasBounaryChars = boundaryChars != null;
        out.writeBoolean(hasBounaryChars);
        if (hasBounaryChars) {
            out.writeString(String.valueOf(boundaryChars));
        }

So before I change the code I wrote, does this change your opinion? Not that I mind changing the code; I would just prefer it to be consistent with the rest.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't mind as long as we use writeString/readString and writeOptionalString/readOptionalString consistently.
So you can maybe just change the readFrom to explicitly use readBoolean.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK done.

}
}
out.writeOptionalVInt(noMatchSize);
out.writeOptionalVInt(phraseLimit);
boolean hasOptions = options != null;
Expand Down Expand Up @@ -331,6 +358,33 @@ public Boolean highlightFilter() {
return this.highlightFilter;
}

/**
 * When using the highlighterType <tt>fvh</tt> this setting selects, by name,
 * which scanner is used to locate fragment boundaries. The name is matched
 * case-insensitively against <tt>chars</tt>, <tt>word</tt> and <tt>sentence</tt>.
 *
 * @param boundaryScannerType name of the scanner type; must not be null
 * @return this builder, to allow call chaining
 * @throws IllegalArgumentException if the name does not match any {@link BoundaryScannerType}
 */
@SuppressWarnings("unchecked")
public HB boundaryScannerType(String boundaryScannerType) {
    // Resolve the name first so an invalid value leaves the builder untouched.
    final BoundaryScannerType resolved = BoundaryScannerType.fromString(boundaryScannerType);
    this.boundaryScannerType = resolved;
    return (HB) this;
}

/**
 * When using the highlighterType <tt>fvh</tt> this setting selects which
 * scanner is used to locate fragment boundaries: <tt>chars</tt>,
 * <tt>word</tt> or <tt>sentence</tt>.
 *
 * @param boundaryScannerType the scanner type to use; the value is stored as given
 * @return this builder, to allow call chaining
 */
@SuppressWarnings("unchecked")
public HB boundaryScannerType(BoundaryScannerType boundaryScannerType) {
    final HB self = (HB) this;
    this.boundaryScannerType = boundaryScannerType;
    return self;
}

/**
 * @return the boundary scanner type configured through
 * {@link #boundaryScannerType(String)} or {@link #boundaryScannerType(BoundaryScannerType)},
 * or <tt>null</tt> if none was set on this builder
 */
public BoundaryScannerType boundaryScannerType() {
    return boundaryScannerType;
}

/**
* When using the highlighterType <tt>fvh</tt> this setting
* controls how far to look for boundary characters, and defaults to 20.
Expand Down Expand Up @@ -366,6 +420,25 @@ public char[] boundaryChars() {
return this.boundaryChars;
}

/**
 * When using the highlighterType <tt>fvh</tt> with a boundaryScannerType of
 * <tt>sentence</tt> or <tt>word</tt>, this setting controls the locale handed
 * to the BreakIterator. The value is interpreted as an IETF BCP 47 language tag.
 *
 * @param boundaryScannerLocale language tag of the locale; a <tt>null</tt> value is
 *        ignored and leaves any previously configured locale in place
 * @return this builder, to allow call chaining
 */
@SuppressWarnings("unchecked")
public HB boundaryScannerLocale(String boundaryScannerLocale) {
    final HB self = (HB) this;
    if (boundaryScannerLocale == null) {
        // Nothing to apply; keep whatever locale is already configured.
        return self;
    }
    this.boundaryScannerLocale = Locale.forLanguageTag(boundaryScannerLocale);
    return self;
}

/**
 * @return the locale configured through {@link #boundaryScannerLocale(String)},
 * or <tt>null</tt> if none was set on this builder
 */
public Locale boundaryScannerLocale() {
    return boundaryScannerLocale;
}

/**
* Allows to set custom options for custom highlighters.
*/
Expand Down Expand Up @@ -491,12 +564,18 @@ void commonOptionsToXContent(XContentBuilder builder) throws IOException {
if (highlightFilter != null) {
builder.field(HIGHLIGHT_FILTER_FIELD.getPreferredName(), highlightFilter);
}
if (boundaryScannerType != null) {
builder.field(BOUNDARY_SCANNER_FIELD.getPreferredName(), boundaryScannerType.name());
}
if (boundaryMaxScan != null) {
builder.field(BOUNDARY_MAX_SCAN_FIELD.getPreferredName(), boundaryMaxScan);
}
if (boundaryChars != null) {
builder.field(BOUNDARY_CHARS_FIELD.getPreferredName(), new String(boundaryChars));
}
if (boundaryScannerLocale != null) {
builder.field(BOUNDARY_SCANNER_LOCALE_FIELD.getPreferredName(), boundaryScannerLocale.toLanguageTag());
}
if (options != null && options.size() > 0) {
builder.field(OPTIONS_FIELD.getPreferredName(), options);
}
Expand All @@ -523,8 +602,10 @@ static <HB extends AbstractHighlighterBuilder<HB>> BiFunction<QueryParseContext,
parser.declareInt(HB::fragmentSize, FRAGMENT_SIZE_FIELD);
parser.declareInt(HB::numOfFragments, NUMBER_OF_FRAGMENTS_FIELD);
parser.declareBoolean(HB::requireFieldMatch, REQUIRE_FIELD_MATCH_FIELD);
parser.declareString(HB::boundaryScannerType, BOUNDARY_SCANNER_FIELD);
parser.declareInt(HB::boundaryMaxScan, BOUNDARY_MAX_SCAN_FIELD);
parser.declareString((HB hb, String bc) -> hb.boundaryChars(bc.toCharArray()) , BOUNDARY_CHARS_FIELD);
parser.declareString(HB::boundaryScannerLocale, BOUNDARY_SCANNER_LOCALE_FIELD);
parser.declareString(HB::highlighterType, TYPE_FIELD);
parser.declareString(HB::fragmenter, FRAGMENTER_FIELD);
parser.declareInt(HB::noMatchSize, NO_MATCH_SIZE_FIELD);
Expand Down Expand Up @@ -562,8 +643,8 @@ static <HB extends AbstractHighlighterBuilder<HB>> BiFunction<QueryParseContext,
public final int hashCode() {
return Objects.hash(getClass(), Arrays.hashCode(preTags), Arrays.hashCode(postTags), fragmentSize,
numOfFragments, highlighterType, fragmenter, highlightQuery, order, highlightFilter,
forceSource, boundaryMaxScan, Arrays.hashCode(boundaryChars), noMatchSize,
phraseLimit, options, requireFieldMatch, doHashCode());
forceSource, boundaryScannerType, boundaryMaxScan, Arrays.hashCode(boundaryChars), boundaryScannerLocale,
noMatchSize, phraseLimit, options, requireFieldMatch, doHashCode());
}

/**
Expand Down Expand Up @@ -591,8 +672,10 @@ public final boolean equals(Object obj) {
Objects.equals(order, other.order) &&
Objects.equals(highlightFilter, other.highlightFilter) &&
Objects.equals(forceSource, other.forceSource) &&
Objects.equals(boundaryScannerType, other.boundaryScannerType) &&
Objects.equals(boundaryMaxScan, other.boundaryMaxScan) &&
Arrays.equals(boundaryChars, other.boundaryChars) &&
Objects.equals(boundaryScannerLocale, other.boundaryScannerLocale) &&
Objects.equals(noMatchSize, other.noMatchSize) &&
Objects.equals(phraseLimit, other.phraseLimit) &&
Objects.equals(options, other.options) &&
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.vectorhighlight.BaseFragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner;
import org.apache.lucene.search.vectorhighlight.CustomFieldQuery;
import org.apache.lucene.search.vectorhighlight.FieldFragList;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
Expand All @@ -38,15 +39,23 @@
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
import org.elasticsearch.search.fetch.FetchSubPhase;
import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.Field;
import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.FieldOptions;
import org.elasticsearch.search.internal.SearchContext;

import java.text.BreakIterator;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

public class FastVectorHighlighter implements Highlighter {

private static final SimpleBoundaryScanner DEFAULT_BOUNDARY_SCANNER = new SimpleBoundaryScanner();
private static final BoundaryScanner DEFAULT_SIMPLE_BOUNDARY_SCANNER = new SimpleBoundaryScanner();
private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
BreakIterator.getSentenceInstance(Locale.ROOT));
private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
BreakIterator.getWordInstance(Locale.ROOT));

public static final Setting<Boolean> SETTING_TV_HIGHLIGHT_MULTI_VALUE = Setting.boolSetting("search.highlight.term_vector_multi_value",
true, Setting.Property.NodeScope);
Expand Down Expand Up @@ -105,12 +114,7 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
FragListBuilder fragListBuilder;
BaseFragmentsBuilder fragmentsBuilder;

BoundaryScanner boundaryScanner = DEFAULT_BOUNDARY_SCANNER;
if (field.fieldOptions().boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN
|| field.fieldOptions().boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) {
boundaryScanner = new SimpleBoundaryScanner(field.fieldOptions().boundaryMaxScan(),
field.fieldOptions().boundaryChars());
}
final BoundaryScanner boundaryScanner = getBoundaryScanner(field);
boolean forceSource = context.highlight().forceSource(field);
if (field.fieldOptions().numberOfFragments() == 0) {
fragListBuilder = new SingleFragListBuilder();
Expand Down Expand Up @@ -206,6 +210,29 @@ public boolean canHighlight(FieldMapper fieldMapper) {
&& fieldMapper.fieldType().storeTermVectorPositions();
}

/**
 * Builds the boundary scanner to use for the given field from its highlight options.
 * <ul>
 * <li><tt>SENTENCE</tt> / <tt>WORD</tt>: a {@link BreakIteratorBoundaryScanner} over a sentence
 * or word {@link BreakIterator} for the configured locale, or {@link Locale#ROOT} when no
 * locale is configured.</li>
 * <li>any other type, or no type at all: a {@link SimpleBoundaryScanner}, customised only when
 * max scan or boundary chars differ from the defaults.</li>
 * </ul>
 *
 * @param field the field being highlighted
 * @return the boundary scanner to hand to the fragments builder, never <tt>null</tt>
 */
private static BoundaryScanner getBoundaryScanner(Field field) {
    final FieldOptions fieldOptions = field.fieldOptions();
    final Locale configuredLocale = fieldOptions.boundaryScannerLocale();
    final Locale locale = configuredLocale != null ? configuredLocale : Locale.ROOT;
    // Guard against an unset type: switching on null would throw a NullPointerException.
    if (fieldOptions.boundaryScannerType() != null) {
        switch (fieldOptions.boundaryScannerType()) {
            case SENTENCE:
                // A BreakIterator keeps mutable text/position state, so a fresh instance is
                // created per call instead of sharing one across concurrent requests.
                return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(locale));
            case WORD:
                return new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(locale));
            default:
                break;
        }
    }
    // Character-based scanning. The boundary chars comparison is by reference on purpose:
    // it only detects whether the shared default array was replaced.
    if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN
            || fieldOptions.boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) {
        return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars());
    }
    return DEFAULT_SIMPLE_BOUNDARY_SCANNER;
}

private class MapperHighlightEntry {
public FragListBuilder fragListBuilder;
public FragmentsBuilder fragmentsBuilder;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,9 @@ public class HighlightBuilder extends AbstractHighlighterBuilder<HighlightBuilde
.preTags(DEFAULT_PRE_TAGS).postTags(DEFAULT_POST_TAGS).scoreOrdered(DEFAULT_SCORE_ORDERED)
.highlightFilter(DEFAULT_HIGHLIGHT_FILTER).requireFieldMatch(DEFAULT_REQUIRE_FIELD_MATCH)
.forceSource(DEFAULT_FORCE_SOURCE).fragmentCharSize(DEFAULT_FRAGMENT_CHAR_SIZE)
.numberOfFragments(DEFAULT_NUMBER_OF_FRAGMENTS).encoder(DEFAULT_ENCODER)
.numberOfFragments(DEFAULT_NUMBER_OF_FRAGMENTS).encoder(DEFAULT_ENCODER).boundaryScannerType(BoundaryScannerType.CHARS)
.boundaryMaxScan(SimpleBoundaryScanner.DEFAULT_MAX_SCAN).boundaryChars(SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS)
.noMatchSize(DEFAULT_NO_MATCH_SIZE).phraseLimit(DEFAULT_PHRASE_LIMIT).build();
.boundaryScannerLocale(Locale.ROOT).noMatchSize(DEFAULT_NO_MATCH_SIZE).phraseLimit(DEFAULT_PHRASE_LIMIT).build();

private final List<Field> fields = new ArrayList<>();

Expand Down Expand Up @@ -327,12 +327,18 @@ private static void transferOptions(AbstractHighlighterBuilder highlighterBuilde
if (highlighterBuilder.requireFieldMatch != null) {
targetOptionsBuilder.requireFieldMatch(highlighterBuilder.requireFieldMatch);
}
if (highlighterBuilder.boundaryScannerType != null) {
targetOptionsBuilder.boundaryScannerType(highlighterBuilder.boundaryScannerType);
}
if (highlighterBuilder.boundaryMaxScan != null) {
targetOptionsBuilder.boundaryMaxScan(highlighterBuilder.boundaryMaxScan);
}
if (highlighterBuilder.boundaryChars != null) {
targetOptionsBuilder.boundaryChars(convertCharArray(highlighterBuilder.boundaryChars));
}
if (highlighterBuilder.boundaryScannerLocale != null) {
targetOptionsBuilder.boundaryScannerLocale(highlighterBuilder.boundaryScannerLocale);
}
if (highlighterBuilder.highlighterType != null) {
targetOptionsBuilder.highlighterType(highlighterBuilder.highlighterType);
}
Expand Down Expand Up @@ -522,4 +528,30 @@ public String toString() {
return name().toLowerCase(Locale.ROOT);
}
}

/**
 * The kinds of boundary scanner that can be selected for highlighting.
 * Instances are serialized by ordinal, and their string form is the
 * lower-cased constant name.
 */
public enum BoundaryScannerType implements Writeable {
    CHARS, WORD, SENTENCE;

    /**
     * Reads a type previously written by {@link #writeTo(StreamOutput)}.
     *
     * @throws IOException if the ordinal read from the stream does not map to a constant
     */
    public static BoundaryScannerType readFromStream(StreamInput in) throws IOException {
        final int ordinal = in.readVInt();
        final BoundaryScannerType[] constants = values();
        if (ordinal >= 0 && ordinal < constants.length) {
            return constants[ordinal];
        }
        throw new IOException("Unknown BoundaryScannerType ordinal [" + ordinal + "]");
    }

    /**
     * Writes this constant to the stream as its ordinal.
     */
    @Override
    public void writeTo(StreamOutput out) throws IOException {
        out.writeVInt(ordinal());
    }

    /**
     * Resolves a constant from its name, ignoring case.
     *
     * @throws IllegalArgumentException if no constant has the given name
     */
    public static BoundaryScannerType fromString(String boundaryScannerType) {
        final String constantName = boundaryScannerType.toUpperCase(Locale.ROOT);
        return BoundaryScannerType.valueOf(constantName);
    }

    /**
     * @return the lower-cased constant name
     */
    @Override
    public String toString() {
        final String constantName = name();
        return constantName.toLowerCase(Locale.ROOT);
    }
}
}
Loading