elastic · jimczi · Aug 9, 2023 · May 12, 2023 · May 12, 2023 · May 12, 2023
diff --git a/docs/changelog/96068.yaml b/docs/changelog/96068.yaml
@@ -0,0 +1,5 @@
+pr: 96068
+summary: Use the Weight#matches mode for highlighting by default
+area: Search
+type: enhancement
+issues: []
diff --git a/...common/src/test/java/org/elasticsearch/analysis/common/HighlighterWithAnalyzersTests.java b/...common/src/test/java/org/elasticsearch/analysis/common/HighlighterWithAnalyzersTests.java
@@ -258,7 +258,7 @@ public void testPhrasePrefix() throws IOException {
             .highlighter(highlight().field("field0").order("score").preTags("<x>").postTags("</x>"));
 
         searchResponse = client().search(new SearchRequest("first_test_index").source(source)).actionGet();
-        assertHighlight(searchResponse, 0, "field0", 0, 1, equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog"));
+        assertHighlight(searchResponse, 0, "field0", 0, 1, equalTo("The <x>quick brown</x> fox jumps over the lazy dog"));
 
         logger.info("--> highlighting and searching on field1");
         source = searchSource().query(
@@ -293,8 +293,8 @@ public void testPhrasePrefix() throws IOException {
             0,
             1,
             anyOf(
-                equalTo("The <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
-                equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog")
+                equalTo("The <x>quick browse</x> button is a fancy thing, right bro?"),
+                equalTo("The <x>quick brown</x> fox jumps over the lazy dog")
             )
         );
         assertHighlight(
@@ -304,8 +304,8 @@ public void testPhrasePrefix() throws IOException {
             0,
             1,
             anyOf(
-                equalTo("The <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
-                equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog")
+                equalTo("The <x>quick browse</x> button is a fancy thing, right bro?"),
+                equalTo("The <x>quick brown</x> fox jumps over the lazy dog")
             )
         );
 
@@ -343,7 +343,7 @@ public void testPhrasePrefix() throws IOException {
 
         searchResponse = client().search(new SearchRequest("second_test_index").source(source)).actionGet();
 
-        assertHighlight(searchResponse, 0, "field3", 0, 1, equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog"));
+        assertHighlight(searchResponse, 0, "field3", 0, 1, equalTo("The <x>quick brown</x> fox jumps over the lazy dog"));
 
         logger.info("--> highlighting and searching on field4");
         source = searchSource().postFilter(termQuery("type", "type2"))
@@ -358,8 +358,8 @@ public void testPhrasePrefix() throws IOException {
             0,
             1,
             anyOf(
-                equalTo("<x>The</x> <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
-                equalTo("<x>The</x> <x>quick</x> <x>brown</x> fox jumps over the lazy dog")
+                equalTo("<x>The quick browse</x> button is a fancy thing, right bro?"),
+                equalTo("<x>The quick brown</x> fox jumps over the lazy dog")
             )
         );
         assertHighlight(
@@ -369,8 +369,8 @@ public void testPhrasePrefix() throws IOException {
             0,
             1,
             anyOf(
-                equalTo("<x>The</x> <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
-                equalTo("<x>The</x> <x>quick</x> <x>brown</x> fox jumps over the lazy dog")
+                equalTo("<x>The quick browse</x> button is a fancy thing, right bro?"),
+                equalTo("<x>The quick brown</x> fox jumps over the lazy dog")
             )
         );
 

diff --git a/.../src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextHighlighter.java b/.../src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextHighlighter.java
@@ -17,14 +17,14 @@
 import org.elasticsearch.index.query.SearchExecutionContext;
 import org.elasticsearch.lucene.search.uhighlight.CustomUnifiedHighlighter;
 import org.elasticsearch.search.fetch.FetchSubPhase.HitContext;
+import org.elasticsearch.search.fetch.subphase.highlight.DefaultHighlighter;
 import org.elasticsearch.search.fetch.subphase.highlight.SearchHighlightContext;
-import org.elasticsearch.search.fetch.subphase.highlight.UnifiedHighlighter;
 
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 
-public class AnnotatedTextHighlighter extends UnifiedHighlighter {
+public class AnnotatedTextHighlighter extends DefaultHighlighter {
 
     public static final String NAME = "annotated";
 

diff --git a/...test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextHighlighterTests.java b/...test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextHighlighterTests.java
@@ -131,21 +131,23 @@ private void assertHighlightOneDoc(
                 TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
                 assertThat(topDocs.totalHits.value, equalTo(1L));
                 String rawValue = Strings.collectionToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));
+                UnifiedHighlighter.Builder builder = UnifiedHighlighter.builder(searcher, hiliteAnalyzer);
+                builder.withBreakIterator(() -> breakIterator);
+                builder.withFieldMatcher(name -> "text".equals(name));
+                builder.withFormatter(passageFormatter);
                 CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
-                    searcher,
-                    hiliteAnalyzer,
+                    builder,
                     UnifiedHighlighter.OffsetSource.ANALYSIS,
-                    passageFormatter,
                     locale,
-                    breakIterator,
                     "index",
                     "text",
                     query,
                     noMatchSize,
                     expectedPassages.length,
-                    name -> "text".equals(name),
                     maxAnalyzedOffset,
-                    queryMaxAnalyzedOffset
+                    queryMaxAnalyzedOffset,
+                    true,
+                    true
                 );
                 highlighter.setFieldMatcher((name) -> "text".equals(name));
                 final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
@@ -236,7 +238,7 @@ public void testAnnotatedTextSingleFieldWithBreakIterator() throws Exception {
 
     public void testAnnotatedTextSingleFieldWithPhraseQuery() throws Exception {
         final String[] markedUpInputs = { "[Donald Trump](Donald+Trump) visited Singapore", "Donald Jr was with Melania Trump" };
-        String[] expectedPassages = { "[Donald](_hit_term=donald) [Trump](_hit_term=trump) visited Singapore" };
+        String[] expectedPassages = { "[Donald Trump](_hit_term=donald+trump&Donald+Trump) visited Singapore" };
         Query query = new PhraseQuery("text", "donald", "trump");
         BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
         assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages);

diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.highlight/10_unified.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.highlight/10_unified.yml
@@ -23,6 +23,18 @@ setup:
   - do:
       indices.refresh: {}
 
+---
+teardown:
+  - skip:
+      version: " - 8.9.99"
+      reason: "setting added in 8.10.0"
+
+  - do:
+      indices.put_settings:
+        index: test
+        body:
+          index.highlight.weight_matches_mode.enabled: null
+
 ---
 "Basic multi_match query":
   - do:
@@ -49,3 +61,35 @@ setup:
   - match: {hits.hits.0.highlight.text.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is <em>brown</em>."}
   - match: {hits.hits.0.highlight.text\.fvh.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is <em>brown</em>."}
   - match: {hits.hits.0.highlight.text\.postings.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is <em>brown</em>."}
+
+---
+"Disable weight matches":
+  - skip:
+      version: " - 8.9.99"
+      reason: "support for weight matches was added in 8.10"
+
+  - do:
+      search:
+        body: {
+          "query": { "multi_match": { "query": "quick brown fox", "type": "phrase", "fields": [ "text*" ] } },
+          "highlight": { "type": "unified", "fields": { "*": { } } } }
+
+  - match: { hits.hits.0.highlight.text.0: "The <em>quick brown fox</em> is brown." }
+  - match: { hits.hits.0.highlight.text\.fvh.0: "The <em>quick brown fox</em> is brown." }
+  - match: { hits.hits.0.highlight.text\.postings.0: "The <em>quick brown fox</em> is brown." }
+
+  - do:
+      indices.put_settings:
+        index: test
+        body:
+          index.highlight.weight_matches_mode.enabled: "false"
+
+  - do:
+      search:
+        body: {
+          "query" : { "multi_match" : { "query" : "quick brown fox", "type": "phrase", "fields" : [ "text*"] } },
+          "highlight" : { "type" : "unified", "fields" : { "*" : {} } } }
+
+  - match: {hits.hits.0.highlight.text.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is brown."}
+  - match: {hits.hits.0.highlight.text\.fvh.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is brown."}
+  - match: {hits.hits.0.highlight.text\.postings.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is brown."}
diff --git a/...ec/src/yamlRestTest/resources/rest-api-spec/test/search.highlight/50_synthetic_source.yml b/...ec/src/yamlRestTest/resources/rest-api-spec/test/search.highlight/50_synthetic_source.yml
@@ -210,7 +210,7 @@ text single unified from reanalysis:
             fields:
               foo.analyze:
                 type: unified
-  - match: { hits.hits.0.highlight.foo\.analyze.0: <em>the</em> <em>quick</em> <em>brown</em> <em>fox</em> <em>jumped</em> <em>over</em> <em>the</em> <em>lazy</em> <em>dog</em> }
+  - match: { hits.hits.0.highlight.foo\.analyze.0: <em>the quick brown fox jumped over the lazy dog</em> }
 
 ---
 text multi unified from reanalysis:
@@ -225,7 +225,7 @@ text multi unified from reanalysis:
             fields:
               foo.analyze:
                 type: unified
-  - match: { hits.hits.0.highlight.foo\.analyze.0: <em>That</em> <em>makes</em> <em>calamity</em> <em>of</em> <em>so</em> <em>long</em> <em>life</em>. }
+  - match: { hits.hits.0.highlight.foo\.analyze.0: <em>That makes calamity of so long life</em>. }
 
 ---
 text single unified from positions:
@@ -240,7 +240,7 @@ text single unified from positions:
             fields:
               foo.positions:
                 type: unified
-  - match: { hits.hits.0.highlight.foo\.positions.0: <em>the</em> <em>quick</em> <em>brown</em> <em>fox</em> <em>jumped</em> <em>over</em> <em>the</em> <em>lazy</em> <em>dog</em> }
+  - match: { hits.hits.0.highlight.foo\.positions.0: <em>the quick brown fox jumped over the lazy dog</em> }
 
 ---
 text multi unified from positions:
@@ -255,7 +255,7 @@ text multi unified from positions:
             fields:
               foo.positions:
                 type: unified
-  - match: { hits.hits.0.highlight.foo\.positions.0: <em>That</em> <em>makes</em> <em>calamity</em> <em>of</em> <em>so</em> <em>long</em> <em>life</em>. }
+  - match: { hits.hits.0.highlight.foo\.positions.0: <em>That makes calamity of so long life</em>. }
 
 ---
 text single unified from vectors:
@@ -270,7 +270,7 @@ text single unified from vectors:
             fields:
               foo.vectors:
                 type: unified
-  - match: { hits.hits.0.highlight.foo\.vectors.0: <em>the</em> <em>quick</em> <em>brown</em> <em>fox</em> <em>jumped</em> <em>over</em> <em>the</em> <em>lazy</em> <em>dog</em> }
+  - match: { hits.hits.0.highlight.foo\.vectors.0: <em>the quick brown fox jumped over the lazy dog</em> }
 
 ---
 text multi unified from vectors:
@@ -285,7 +285,7 @@ text multi unified from vectors:
             fields:
               foo.vectors:
                 type: unified
-  - match: { hits.hits.0.highlight.foo\.vectors.0: <em>That</em> <em>makes</em> <em>calamity</em> <em>of</em> <em>so</em> <em>long</em> <em>life</em>. }
+  - match: { hits.hits.0.highlight.foo\.vectors.0: <em>That makes calamity of so long life</em>. }
 
 ---
 text single fvh source order:

diff --git a/...usterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/...usterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java
@@ -2217,14 +2217,7 @@ public void testPostingsHighlighter() throws Exception {
 
         searchResponse = client().search(new SearchRequest("test").source(source)).actionGet();
 
-        assertHighlight(
-            searchResponse,
-            0,
-            "field2",
-            0,
-            1,
-            equalTo("The <xxx>quick</xxx> <xxx>brown</xxx> fox jumps over the lazy quick dog")
-        );
+        assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The <xxx>quick brown</xxx> fox jumps over the lazy quick dog"));
 
         // lets fall back to the standard highlighter then, what people would do to highlight query matches
         logger.info("--> searching on field2, highlighting on field2, falling back to the plain highlighter");

diff --git a/server/src/main/java/org/elasticsearch/common/lucene/search/MultiPhrasePrefixQuery.java b/server/src/main/java/org/elasticsearch/common/lucene/search/MultiPhrasePrefixQuery.java
@@ -17,10 +17,12 @@
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.MultiPhraseQuery;
+import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.QueryVisitor;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -295,20 +297,12 @@ public void visit(QueryVisitor visitor) {
                     shouldVisitor.consumeTerms(this, termArrays.get(i));
                 }
             }
-            /* We don't report automata here because this breaks the unified highlighter,
-               which extracts automata separately from phrases. MPPQ gets rewritten to a
-               SpanMTQQuery by the PhraseHelper in any case, so highlighting is taken
-               care of there instead.  If we extract automata here then the trailing prefix
-               word will be highlighted wherever it appears in the document, instead of only
-               as part of a phrase. This can be re-instated once we switch to using Matches
-               to highlight.
             for (Term prefixTerm : termArrays.get(termArrays.size() - 1)) {
                 visitor.consumeTermsMatching(this, field, () -> {
                     CompiledAutomaton ca = new CompiledAutomaton(PrefixQuery.toAutomaton(prefixTerm.bytes()));
                     return ca.runAutomaton;
                 });
             }
-            */
         }
     }
 }
@@ -114,6 +114,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
         IndexSettings.MAX_SHINGLE_DIFF_SETTING,
         IndexSettings.MAX_RESCORE_WINDOW_SETTING,
         IndexSettings.MAX_ANALYZED_OFFSET_SETTING,
+        IndexSettings.WEIGHT_MATCHES_MODE_ENABLED_SETTING,
         IndexSettings.MAX_TERMS_COUNT_SETTING,
         IndexSettings.INDEX_TRANSLOG_SYNC_INTERVAL_SETTING,
         IndexSettings.DEFAULT_FIELD_SETTING,

diff --git a/server/src/main/java/org/elasticsearch/index/IndexSettings.java b/server/src/main/java/org/elasticsearch/index/IndexSettings.java
@@ -10,6 +10,7 @@
 import org.apache.logging.log4j.Logger;
 import org.apache.logging.log4j.util.Strings;
 import org.apache.lucene.index.MergePolicy;
+import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.routing.IndexRouting;
@@ -187,6 +188,17 @@ public final class IndexSettings {
         Property.IndexScope
     );
 
+    /**
+     * Index setting to enable/disable the {@link UnifiedHighlighter.HighlightFlag#WEIGHT_MATCHES}
+     * mode of the unified highlighter.
+     */
+    public static final Setting<Boolean> WEIGHT_MATCHES_MODE_ENABLED_SETTING = Setting.boolSetting(
+        "index.highlight.weight_matches_mode.enabled",
+        true,
+        Property.Dynamic,
+        Property.IndexScope
+    );
+
     /**
      * Index setting describing the maximum number of terms that can be used in Terms Query.
      * The default maximum of 65536 terms is defensive, as extra processing and memory is involved
@@ -730,6 +742,7 @@ private void setRetentionLeaseMillis(final TimeValue retentionLease) {
     private volatile int maxShingleDiff;
     private volatile TimeValue searchIdleAfter;
     private volatile int maxAnalyzedOffset;
+    private volatile boolean weightMatchesEnabled;
     private volatile int maxTermsCount;
     private volatile String defaultPipeline;
     private volatile String requiredPipeline;
@@ -871,6 +884,7 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti
         maxRefreshListeners = scopedSettings.get(MAX_REFRESH_LISTENERS_PER_SHARD);
         maxSlicesPerScroll = scopedSettings.get(MAX_SLICES_PER_SCROLL);
         maxAnalyzedOffset = scopedSettings.get(MAX_ANALYZED_OFFSET_SETTING);
+        weightMatchesEnabled = scopedSettings.get(WEIGHT_MATCHES_MODE_ENABLED_SETTING);
         maxTermsCount = scopedSettings.get(MAX_TERMS_COUNT_SETTING);
         maxRegexLength = scopedSettings.get(MAX_REGEX_LENGTH_SETTING);
         this.mergePolicyConfig = new MergePolicyConfig(logger, this);
@@ -946,6 +960,7 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti
         scopedSettings.addSettingsUpdateConsumer(INDEX_REFRESH_INTERVAL_SETTING, this::setRefreshInterval);
         scopedSettings.addSettingsUpdateConsumer(MAX_REFRESH_LISTENERS_PER_SHARD, this::setMaxRefreshListeners);
         scopedSettings.addSettingsUpdateConsumer(MAX_ANALYZED_OFFSET_SETTING, this::setHighlightMaxAnalyzedOffset);
+        scopedSettings.addSettingsUpdateConsumer(WEIGHT_MATCHES_MODE_ENABLED_SETTING, this::setWeightMatchesEnabled);
         scopedSettings.addSettingsUpdateConsumer(MAX_TERMS_COUNT_SETTING, this::setMaxTermsCount);
         scopedSettings.addSettingsUpdateConsumer(MAX_SLICES_PER_SCROLL, this::setMaxSlicesPerScroll);
         scopedSettings.addSettingsUpdateConsumer(DEFAULT_FIELD_SETTING, this::setDefaultFields);
@@ -1325,6 +1340,14 @@ private void setHighlightMaxAnalyzedOffset(int maxAnalyzedOffset) {
         this.maxAnalyzedOffset = maxAnalyzedOffset;
     }
 
+    public boolean isWeightMatchesEnabled() {
+        return this.weightMatchesEnabled;
+    }
+
+    private void setWeightMatchesEnabled(boolean value) {
+        this.weightMatchesEnabled = value;
+    }
+
     /**
      *  Returns the maximum number of terms that can be used in a Terms Query request
      */

diff --git a/...src/main/java/org/elasticsearch/lucene/search/uhighlight/BoundedBreakIteratorScanner.java b/...src/main/java/org/elasticsearch/lucene/search/uhighlight/BoundedBreakIteratorScanner.java
@@ -131,15 +131,16 @@ public int preceding(int offset) {
     }
 
     /**
-     * Can be invoked only after a call to preceding(offset+1).
+     * Can be invoked only after a call to preceding().
+     *
      * See {@link FieldHighlighter} for usage.
      */
     @Override
     public int following(int offset) {
-        if (offset != lastPrecedingOffset || innerEnd == -1) {
-            throw new IllegalArgumentException("offset != lastPrecedingOffset: " + "usage doesn't look like UnifiedHighlighter");
+        if (innerEnd == -1) {
+            throw new IllegalArgumentException("preceding should be called first, usage doesn't look like UnifiedHighlighter");
         }
-        return innerEnd;
+        return Math.max(offset, innerEnd);
     }
 
     /**