From 3399af630030043e2ead4fc722efcb290601909f Mon Sep 17 00:00:00 2001 From: Shai Erera Date: Sun, 19 Feb 2017 11:50:27 +0200 Subject: [PATCH 1/4] Add BreakIteratorBoundaryScanner support This commit adds a boundary_scanner property to the search highlight request (defaults to "simple", per current behavior) so the user can specify "break_iterator" and its type ("sentence", "word", "line" and "character"). Also adds "boundary_scanner_locale" to define which one should be used when highlighting the text. --- .../highlight/AbstractHighlighterBuilder.java | 76 ++++++++++++++++++- .../highlight/FastVectorHighlighter.java | 34 +++++++-- .../subphase/highlight/HighlightBuilder.java | 36 ++++++++- .../highlight/SearchContextHighlight.java | 27 +++++++ .../highlight/HighlightBuilderTests.java | 13 ++++ .../highlight/HighlighterSearchIT.java | 46 +++++++++++ 6 files changed, 221 insertions(+), 11 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java index e3a78227d9ce7..7616d515fd34a 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java @@ -32,10 +32,12 @@ import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryParseContext; +import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Order; import java.io.IOException; import java.util.Arrays; +import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.function.BiFunction; @@ -57,8 +59,10 @@ public abstract class AbstractHighlighterBuilderfvh this setting + * controls which scanner to use for fragment boundaries, and defaults to "simple". + */ + @SuppressWarnings("unchecked") + public HB boundaryScannerType(String boundaryScannerType) { + this.boundaryScannerType = BoundaryScannerType.fromString(boundaryScannerType); + return (HB) this; + } + + /** + * When using the highlighterType fvh this setting + * controls which scanner to use for fragment boundaries, and defaults to "simple". + */ + @SuppressWarnings("unchecked") + public HB boundaryScannerType(BoundaryScannerType boundaryScannerType) { + this.boundaryScannerType = boundaryScannerType; + return (HB) this; + } + + /** + * @return the value set by {@link #boundaryScannerType(String)} + */ + public BoundaryScannerType boundaryScannerType() { + return this.boundaryScannerType; + } + /** * When using the highlighterType fvh this setting * controls how far to look for boundary characters, and defaults to 20. @@ -366,6 +409,25 @@ public char[] boundaryChars() { return this.boundaryChars; } + /** + * When using the highlighterType fvh and boundaryScannerType break_iterator, this setting + * controls the locale to use by the BreakIterator, defaults to "root". + */ + @SuppressWarnings("unchecked") + public HB boundaryScannerLocale(String boundaryScannerLocale) { + if (boundaryScannerLocale != null) { + this.boundaryScannerLocale = Locale.forLanguageTag(boundaryScannerLocale); + } + return (HB) this; + } + + /** + * @return the value set by {@link #boundaryScannerLocale(String)} + */ + public Locale boundaryScannerLocale() { + return this.boundaryScannerLocale; + } + /** * Allows to set custom options for custom highlighters. */ @@ -491,12 +553,18 @@ void commonOptionsToXContent(XContentBuilder builder) throws IOException { if (highlightFilter != null) { builder.field(HIGHLIGHT_FILTER_FIELD.getPreferredName(), highlightFilter); } + if (boundaryScannerType != null) { + builder.field(BOUNDARY_SCANNER_FIELD.getPreferredName(), boundaryScannerType.name()); + } if (boundaryMaxScan != null) { builder.field(BOUNDARY_MAX_SCAN_FIELD.getPreferredName(), boundaryMaxScan); } if (boundaryChars != null) { builder.field(BOUNDARY_CHARS_FIELD.getPreferredName(), new String(boundaryChars)); } + if (boundaryScannerLocale != null && boundaryScannerLocale != Locale.ROOT) { + builder.field(BOUNDARY_SCANNER_LOCALE_FIELD.getPreferredName(), boundaryScannerLocale.toLanguageTag()); + } if (options != null && options.size() > 0) { builder.field(OPTIONS_FIELD.getPreferredName(), options); } @@ -523,8 +591,10 @@ static > BiFunction hb.boundaryChars(bc.toCharArray()) , BOUNDARY_CHARS_FIELD); + parser.declareString(HB::boundaryScannerLocale, BOUNDARY_SCANNER_LOCALE_FIELD); parser.declareString(HB::highlighterType, TYPE_FIELD); parser.declareString(HB::fragmenter, FRAGMENTER_FIELD); parser.declareInt(HB::noMatchSize, NO_MATCH_SIZE_FIELD); @@ -562,8 +632,8 @@ static > BiFunction SETTING_TV_HIGHLIGHT_MULTI_VALUE = Setting.boolSetting("search.highlight.term_vector_multi_value", true, Setting.Property.NodeScope); @@ -105,12 +112,7 @@ public HighlightField highlight(HighlighterContext highlighterContext) { FragListBuilder fragListBuilder; BaseFragmentsBuilder fragmentsBuilder; - BoundaryScanner boundaryScanner = DEFAULT_BOUNDARY_SCANNER; - if (field.fieldOptions().boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN - || field.fieldOptions().boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) { - boundaryScanner = new SimpleBoundaryScanner(field.fieldOptions().boundaryMaxScan(), - field.fieldOptions().boundaryChars()); - } + final BoundaryScanner boundaryScanner = getBoundaryScanner(field); boolean forceSource = context.highlight().forceSource(field); if (field.fieldOptions().numberOfFragments() == 0) { fragListBuilder = new SingleFragListBuilder(); @@ -206,6 +208,24 @@ public boolean canHighlight(FieldMapper fieldMapper) { && fieldMapper.fieldType().storeTermVectorPositions(); } + private static BoundaryScanner getBoundaryScanner(Field field) { + final FieldOptions fieldOptions = field.fieldOptions(); + switch(fieldOptions.boundaryScannerType()) { + case BREAK_ITERATOR: + Locale boundaryScannerLocale = fieldOptions.boundaryScannerLocale(); + if (boundaryScannerLocale != null) { + return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(boundaryScannerLocale)); + } + return DEFAULT_BREAK_ITERATOR_BOUNDARY_SCANNER; + default: + if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN + || fieldOptions.boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) { + return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars()); + } + return DEFAULT_SIMPLE_BOUNDARY_SCANNER; + } + } + private class MapperHighlightEntry { public FragListBuilder fragListBuilder; public FragmentsBuilder fragmentsBuilder; diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java index a063b2900d547..b06a4a86bae83 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java @@ -95,9 +95,9 @@ public class HighlightBuilder extends AbstractHighlighterBuilder fields = new ArrayList<>(); @@ -327,6 +327,9 @@ private static void transferOptions(AbstractHighlighterBuilder highlighterBuilde if (highlighterBuilder.requireFieldMatch != null) { targetOptionsBuilder.requireFieldMatch(highlighterBuilder.requireFieldMatch); } + if (highlighterBuilder.boundaryScannerType != null) { + targetOptionsBuilder.boundaryScannerType(highlighterBuilder.boundaryScannerType); + } if (highlighterBuilder.boundaryMaxScan != null) { targetOptionsBuilder.boundaryMaxScan(highlighterBuilder.boundaryMaxScan); } @@ -522,4 +525,33 @@ public String toString() { return name().toLowerCase(Locale.ROOT); } } + + public static enum BoundaryScannerType implements Writeable { + SIMPLE, BREAK_ITERATOR; + + public static BoundaryScannerType readFromStream(StreamInput in) throws IOException { + int ordinal = in.readVInt(); + if (ordinal < 0 || ordinal >= values().length) { + throw new IOException("Unknown BoundaryScannerType ordinal [" + ordinal + "]"); + } + return values()[ordinal]; + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeVInt(this.ordinal()); + } + + public static BoundaryScannerType fromString(String boundaryScannerType) { + if (boundaryScannerType.toUpperCase(Locale.ROOT).equals(BREAK_ITERATOR.name())) { + return BoundaryScannerType.BREAK_ITERATOR; + } + return SIMPLE; + } + + @Override + public String toString() { + return name().toLowerCase(Locale.ROOT); + } + } } diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/SearchContextHighlight.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/SearchContextHighlight.java index d4731718793a9..844c58f1ea358 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/SearchContextHighlight.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/SearchContextHighlight.java @@ -20,11 +20,13 @@ package org.elasticsearch.search.fetch.subphase.highlight; import org.apache.lucene.search.Query; +import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.LinkedHashMap; +import java.util.Locale; import java.util.Map; import java.util.Set; @@ -110,10 +112,14 @@ public static class FieldOptions { private String fragmenter; + private BoundaryScannerType boundaryScannerType; + private int boundaryMaxScan = -1; private Character[] boundaryChars = null; + private Locale boundaryScannerLocale; + private Query highlightQuery; private int noMatchSize = -1; @@ -168,6 +174,10 @@ public String fragmenter() { return fragmenter; } + public BoundaryScannerType boundaryScannerType() { + return boundaryScannerType; + } + public int boundaryMaxScan() { return boundaryMaxScan; } @@ -176,6 +186,10 @@ public Character[] boundaryChars() { return boundaryChars; } + public Locale boundaryScannerLocale() { + return boundaryScannerLocale; + } + public Query highlightQuery() { return highlightQuery; } @@ -260,6 +274,11 @@ Builder fragmenter(String fragmenter) { return this; } + Builder boundaryScannerType(BoundaryScannerType boundaryScanner) { + fieldOptions.boundaryScannerType = boundaryScanner; + return this; + } + Builder boundaryMaxScan(int boundaryMaxScan) { fieldOptions.boundaryMaxScan = boundaryMaxScan; return this; @@ -270,6 +289,11 @@ Builder boundaryChars(Character[] boundaryChars) { return this; } + Builder boundaryScannerLocale(Locale boundaryScannerLocale) { + fieldOptions.boundaryScannerLocale = boundaryScannerLocale; + return this; + } + Builder highlightQuery(Query highlightQuery) { fieldOptions.highlightQuery = highlightQuery; return this; @@ -324,6 +348,9 @@ Builder merge(FieldOptions globalOptions) { if (fieldOptions.requireFieldMatch == null) { fieldOptions.requireFieldMatch = globalOptions.requireFieldMatch; } + if (fieldOptions.boundaryScannerType == null) { + fieldOptions.boundaryScannerType = globalOptions.boundaryScannerType; + } if (fieldOptions.boundaryMaxScan == -1) { fieldOptions.boundaryMaxScan = globalOptions.boundaryMaxScan; } diff --git a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilderTests.java b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilderTests.java index 944427b7e174f..5078ed9ae706d 100644 --- a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilderTests.java +++ b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilderTests.java @@ -47,6 +47,7 @@ import org.elasticsearch.index.query.QueryShardContext; import org.elasticsearch.index.query.TermQueryBuilder; import org.elasticsearch.search.SearchModule; +import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Field; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Order; import org.elasticsearch.search.fetch.subphase.highlight.SearchContextHighlight.FieldOptions; @@ -288,6 +289,7 @@ public MappedFieldType fieldMapper(String name) { mergeBeforeChek(highlightBuilder, fieldBuilder, fieldOptions); checkSame.accept(AbstractHighlighterBuilder::boundaryChars, FieldOptions::boundaryChars); + checkSame.accept(AbstractHighlighterBuilder::boundaryScannerType, FieldOptions::boundaryScannerType); checkSame.accept(AbstractHighlighterBuilder::boundaryMaxScan, FieldOptions::boundaryMaxScan); checkSame.accept(AbstractHighlighterBuilder::fragmentSize, FieldOptions::fragmentCharSize); checkSame.accept(AbstractHighlighterBuilder::fragmenter, FieldOptions::fragmenter); @@ -557,12 +559,23 @@ private static void setRandomCommonOptions(AbstractHighlighterBuilder highlightB if (randomBoolean()) { highlightBuilder.forceSource(randomBoolean()); } + if (randomBoolean()) { + if (randomBoolean()) { + highlightBuilder.boundaryScannerType(randomFrom(BoundaryScannerType.values())); + } else { + // also test the string setter + highlightBuilder.boundaryScannerType(randomFrom(BoundaryScannerType.values().toString())); + } + } if (randomBoolean()) { highlightBuilder.boundaryMaxScan(randomIntBetween(0, 10)); } if (randomBoolean()) { highlightBuilder.boundaryChars(randomAsciiOfLengthBetween(1, 10).toCharArray()); } + if (randomBoolean()) { + highlightBuilder.boundaryScannerLocale(randomLocale(random()).toLanguageTag()); + } if (randomBoolean()) { highlightBuilder.noMatchSize(randomIntBetween(0, 10)); } diff --git a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java index 815998ad0936b..54c2cd699700b 100644 --- a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java +++ b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java @@ -44,6 +44,7 @@ import org.elasticsearch.rest.RestStatus; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.BoundaryScannerType; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Field; import org.elasticsearch.search.sort.SortOrder; import org.elasticsearch.test.ESIntegTestCase; @@ -57,6 +58,7 @@ import java.util.Collection; import java.util.Collections; import java.util.HashMap; +import java.util.Locale; import java.util.Map; import static org.elasticsearch.client.Requests.searchRequest; @@ -747,7 +749,51 @@ public void testFastVectorHighlighter() throws Exception { searchResponse = client().prepareSearch("test").setSource(source).get(); assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over")); + } + + public void testFastVectorHighlighterWithBreakIterator() throws Exception { + assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); + ensureGreen(); + + indexRandom(true, client().prepareIndex("test", "type1") + .setSource("field1", "A sentence with few words. Another sentence with even more words.")); + + logger.info("--> highlighting and searching on 'field' with breakIterator"); + SearchSourceBuilder source = searchSource() + .query(termQuery("field1", "sentence")) + .highlighter(highlight() + .field("field1", 20, 2) + .order("score") + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.BREAK_ITERATOR)); + + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + + assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A sentence with few words. ")); + assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another sentence with even more words. ")); + } + + public void testFastVectorHighlighterWithBreakIteratorAndLocale() throws Exception { + assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); + ensureGreen(); + + indexRandom(true, client().prepareIndex("test", "type1") + .setSource("field1", "A sentence with few words. Another sentence with even more words.")); + + logger.info("--> highlighting and searching on 'field' with breakIterator"); + SearchSourceBuilder source = searchSource() + .query(termQuery("field1", "sentence")) + .highlighter(highlight() + .field("field1", 20, 2) + .order("score") + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.BREAK_ITERATOR) + .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); + + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + assertHighlight(searchResponse, 0, "field1", 0, 2, equalTo("A sentence with few words. ")); + assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another sentence with even more words. ")); } /** From 4edf12c365c88c374647f561cb97749e17e9943b Mon Sep 17 00:00:00 2001 From: Shai Erera Date: Wed, 22 Feb 2017 21:25:55 +0200 Subject: [PATCH 2/4] Fix per review comments --- .../highlight/AbstractHighlighterBuilder.java | 23 +++++--- .../highlight/FastVectorHighlighter.java | 15 +++-- .../subphase/highlight/HighlightBuilder.java | 7 +-- .../highlight/HighlighterSearchIT.java | 55 +++++++++++++++++-- 4 files changed, 78 insertions(+), 22 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java index 7616d515fd34a..0b8799c0f1114 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java @@ -21,6 +21,7 @@ import org.apache.lucene.search.highlight.SimpleFragmenter; import org.apache.lucene.search.highlight.SimpleSpanFragmenter; +import org.elasticsearch.Version; import org.elasticsearch.action.support.ToXContentToBytes; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.ParsingException; @@ -127,12 +128,16 @@ protected AbstractHighlighterBuilder(StreamInput in) throws IOException { order(in.readOptionalWriteable(Order::readFromStream)); highlightFilter(in.readOptionalBoolean()); forceSource(in.readOptionalBoolean()); - boundaryScannerType(in.readOptionalWriteable(BoundaryScannerType::readFromStream)); + if (in.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) { + boundaryScannerType(in.readOptionalWriteable(BoundaryScannerType::readFromStream)); + } boundaryMaxScan(in.readOptionalVInt()); if (in.readBoolean()) { boundaryChars(in.readString().toCharArray()); } - boundaryScannerLocale(in.readOptionalString()); + if (in.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) { + boundaryScannerLocale(in.readOptionalString()); + } noMatchSize(in.readOptionalVInt()); phraseLimit(in.readOptionalVInt()); if (in.readBoolean()) { @@ -160,17 +165,21 @@ public final void writeTo(StreamOutput out) throws IOException { out.writeOptionalWriteable(order); out.writeOptionalBoolean(highlightFilter); out.writeOptionalBoolean(forceSource); - out.writeOptionalWriteable(boundaryScannerType); + if (out.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) { + out.writeOptionalWriteable(boundaryScannerType); + } out.writeOptionalVInt(boundaryMaxScan); boolean hasBounaryChars = boundaryChars != null; out.writeBoolean(hasBounaryChars); if (hasBounaryChars) { out.writeString(String.valueOf(boundaryChars)); } - boolean hasBoundaryScannerLocale = boundaryScannerLocale != null; - out.writeBoolean(hasBoundaryScannerLocale); - if (hasBoundaryScannerLocale) { - out.writeString(boundaryScannerLocale.toLanguageTag()); + if (out.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) { + boolean hasBoundaryScannerLocale = boundaryScannerLocale != null; + out.writeBoolean(hasBoundaryScannerLocale); + if (hasBoundaryScannerLocale) { + out.writeString(boundaryScannerLocale.toLanguageTag()); + } } out.writeOptionalVInt(noMatchSize); out.writeOptionalVInt(phraseLimit); diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java index e8a95c3430b2a..0c15f1106b19c 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FastVectorHighlighter.java @@ -52,8 +52,10 @@ public class FastVectorHighlighter implements Highlighter { private static final BoundaryScanner DEFAULT_SIMPLE_BOUNDARY_SCANNER = new SimpleBoundaryScanner(); - private static final BoundaryScanner DEFAULT_BREAK_ITERATOR_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner( + private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner( BreakIterator.getSentenceInstance(Locale.ROOT)); + private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner( + BreakIterator.getWordInstance(Locale.ROOT)); public static final Setting SETTING_TV_HIGHLIGHT_MULTI_VALUE = Setting.boolSetting("search.highlight.term_vector_multi_value", true, Setting.Property.NodeScope); @@ -210,13 +212,18 @@ public boolean canHighlight(FieldMapper fieldMapper) { private static BoundaryScanner getBoundaryScanner(Field field) { final FieldOptions fieldOptions = field.fieldOptions(); + final Locale boundaryScannerLocale = fieldOptions.boundaryScannerLocale(); switch(fieldOptions.boundaryScannerType()) { - case BREAK_ITERATOR: - Locale boundaryScannerLocale = fieldOptions.boundaryScannerLocale(); + case SENTENCE: if (boundaryScannerLocale != null) { return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(boundaryScannerLocale)); } - return DEFAULT_BREAK_ITERATOR_BOUNDARY_SCANNER; + return DEFAULT_SENTENCE_BOUNDARY_SCANNER; + case WORD: + if (boundaryScannerLocale != null) { + return new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(boundaryScannerLocale)); + } + return DEFAULT_WORD_BOUNDARY_SCANNER; default: if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN || fieldOptions.boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) { diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java index b06a4a86bae83..5f12a270b208d 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java @@ -527,7 +527,7 @@ public String toString() { } public static enum BoundaryScannerType implements Writeable { - SIMPLE, BREAK_ITERATOR; + SIMPLE, SENTENCE, WORD; public static BoundaryScannerType readFromStream(StreamInput in) throws IOException { int ordinal = in.readVInt(); @@ -543,10 +543,7 @@ public void writeTo(StreamOutput out) throws IOException { } public static BoundaryScannerType fromString(String boundaryScannerType) { - if (boundaryScannerType.toUpperCase(Locale.ROOT).equals(BREAK_ITERATOR.name())) { - return BoundaryScannerType.BREAK_ITERATOR; - } - return SIMPLE; + return valueOf(boundaryScannerType.toUpperCase(Locale.ROOT)); } @Override diff --git a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java index 54c2cd699700b..7db99ff3232ee 100644 --- a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java +++ b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java @@ -751,21 +751,21 @@ public void testFastVectorHighlighter() throws Exception { assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over")); } - public void testFastVectorHighlighterWithBreakIterator() throws Exception { + public void testFastVectorHighlighterWithSentenceBoundaryScanner() throws Exception { assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); ensureGreen(); indexRandom(true, client().prepareIndex("test", "type1") .setSource("field1", "A sentence with few words. Another sentence with even more words.")); - logger.info("--> highlighting and searching on 'field' with breakIterator"); + logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); SearchSourceBuilder source = searchSource() .query(termQuery("field1", "sentence")) .highlighter(highlight() .field("field1", 20, 2) .order("score") .preTags("").postTags("") - .boundaryScannerType(BoundaryScannerType.BREAK_ITERATOR)); + .boundaryScannerType(BoundaryScannerType.SENTENCE)); SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); @@ -773,21 +773,21 @@ public void testFastVectorHighlighterWithBreakIterator() throws Exception { assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another sentence with even more words. ")); } - public void testFastVectorHighlighterWithBreakIteratorAndLocale() throws Exception { + public void testFastVectorHighlighterWithSentenceBoundaryScannerAndLocale() throws Exception { assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); ensureGreen(); indexRandom(true, client().prepareIndex("test", "type1") .setSource("field1", "A sentence with few words. Another sentence with even more words.")); - logger.info("--> highlighting and searching on 'field' with breakIterator"); + logger.info("--> highlighting and searching on 'field' with sentence boundary_scanner"); SearchSourceBuilder source = searchSource() .query(termQuery("field1", "sentence")) .highlighter(highlight() .field("field1", 20, 2) .order("score") .preTags("").postTags("") - .boundaryScannerType(BoundaryScannerType.BREAK_ITERATOR) + .boundaryScannerType(BoundaryScannerType.SENTENCE) .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); @@ -796,6 +796,49 @@ public void testFastVectorHighlighterWithBreakIteratorAndLocale() throws Excepti assertHighlight(searchResponse, 0, "field1", 1, 2, equalTo("Another sentence with even more words. ")); } + public void testFastVectorHighlighterWithWordBoundaryScanner() throws Exception { + assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); + ensureGreen(); + + indexRandom(true, client().prepareIndex("test", "type1") + .setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog")); + + logger.info("--> highlighting and searching on 'field' with word boundary_scanner"); + SearchSourceBuilder source = searchSource() + .query(termQuery("field1", "some")) + .highlighter(highlight() + .field("field1", 23, 1) + .order("score") + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.WORD)); + + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + + assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("some quick and hairy brown")); + } + + public void testFastVectorHighlighterWithWordBoundaryScannerAndLocale() throws Exception { + assertAcked(prepareCreate("test").addMapping("type1", type1TermVectorMapping())); + ensureGreen(); + + indexRandom(true, client().prepareIndex("test", "type1") + .setSource("field1", "some quick and hairy brown:fox jumped over the lazy dog")); + + logger.info("--> highlighting and searching on 'field' with word boundary_scanner"); + SearchSourceBuilder source = searchSource() + .query(termQuery("field1", "some")) + .highlighter(highlight() + .field("field1", 23, 1) + .order("score") + .preTags("").postTags("") + .boundaryScannerType(BoundaryScannerType.WORD) + .boundaryScannerLocale(Locale.ENGLISH.toLanguageTag())); + + SearchResponse searchResponse = client().prepareSearch("test").setSource(source).get(); + + assertHighlight(searchResponse, 0, "field1", 0, 1, equalTo("some quick and hairy brown")); + } + /** * The FHV can spend a long time highlighting degenerate documents if * phraseLimit is not set. Its default is now reasonably low. From 54f84a853dc451bac7b3b70c277423d1fb7420d4 Mon Sep 17 00:00:00 2001 From: Shai Erera Date: Thu, 23 Feb 2017 15:36:10 +0200 Subject: [PATCH 3/4] Update highlighting.asciidoc and rename SIMPLE to CHARS --- .../subphase/highlight/HighlightBuilder.java | 4 +-- .../search/request/highlighting.asciidoc | 25 +++++++++++-------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java index 5f12a270b208d..23d4b7b085a6c 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightBuilder.java @@ -95,7 +95,7 @@ public class HighlightBuilder extends AbstractHighlighterBuilder `1MB`) -* Can be customized with `boundary_chars`, `boundary_max_scan`, and - `fragment_offset` (see <>) +* Can be customized with `boundary_scanner` (see <>) * Requires setting `term_vector` to `with_positions_offsets` which increases the size of the index * Can combine matches from multiple fields into one result. See @@ -502,17 +501,23 @@ GET /_search -------------------------------------------------- // CONSOLE -[[boundary-characters]] -==== Boundary Characters +[[boundary-scanners]] +==== Boundary Scanners -When highlighting a field using the fast vector highlighter, -`boundary_chars` can be configured to define what constitutes a boundary -for highlighting. It's a single string with each boundary character -defined in it. It defaults to `.,!? \t\n`. +When highlighting a field using the fast vector highlighter, you can specify +how to break the highlighted fragments using `boundary_scanner`, which accepts +the following values: -The `boundary_max_scan` allows to control how far to look for boundary -characters, and defaults to `20`. +* `chars` (default): allows to configure which characters (`boundary_chars`) +constitute a boundary for highlighting. It's a single string with each boundary +character defined in it (defaults to `.,!? \t\n`). It also allows configuring +the `boundary_max_scan` to control how far to look for boundary characters +(defaults to `20`). +* `word` and `sentence`: use Java's https://docs.oracle.com/javase/8/docs/api/java/text/BreakIterator.html[BreakIterator] +to break the highlighted fragments at the next _word_ or _sentence_ boundary. +You can further specify `boundary_scanner_locale` to control which Locale is used +to search the text for these boundaries. [[matched-fields]] ==== Matched Fields From 563ad728cfa8f446f09e36d33aa170e43330e2fd Mon Sep 17 00:00:00 2001 From: Shai Erera Date: Thu, 23 Feb 2017 23:06:20 +0200 Subject: [PATCH 4/4] Address more review comments and test failures --- .../subphase/highlight/AbstractHighlighterBuilder.java | 10 ++++++---- .../fetch/subphase/highlight/HighlightBuilder.java | 7 +++++-- .../subphase/highlight/SearchContextHighlight.java | 3 +++ .../subphase/highlight/HighlightBuilderTests.java | 2 +- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java index 0b8799c0f1114..3a3c1cfd66d57 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AbstractHighlighterBuilder.java @@ -136,7 +136,9 @@ protected AbstractHighlighterBuilder(StreamInput in) throws IOException { boundaryChars(in.readString().toCharArray()); } if (in.getVersion().onOrAfter(Version.V_5_4_0_UNRELEASED)) { - boundaryScannerLocale(in.readOptionalString()); + if (in.readBoolean()) { + boundaryScannerLocale(in.readString()); + } } noMatchSize(in.readOptionalVInt()); phraseLimit(in.readOptionalVInt()); @@ -571,7 +573,7 @@ void commonOptionsToXContent(XContentBuilder builder) throws IOException { if (boundaryChars != null) { builder.field(BOUNDARY_CHARS_FIELD.getPreferredName(), new String(boundaryChars)); } - if (boundaryScannerLocale != null && boundaryScannerLocale != Locale.ROOT) { + if (boundaryScannerLocale != null) { builder.field(BOUNDARY_SCANNER_LOCALE_FIELD.getPreferredName(), boundaryScannerLocale.toLanguageTag()); } if (options != null && options.size() > 0) { @@ -641,8 +643,8 @@ static > BiFunction