Skip to content

Commit 28a504d

Browse files
authored
Use the Weight#matches mode for highlighting by default (#96068)
This PR adapts the unified highlighter to use the Weight#matches mode by default when possible. This is the default mode in Lucene for some time now. For cases where the matches mode won't work (nested and parent-child queries), the matches mode is disabled automatically. I didn't expose an option to explicitly disable this mode because that should be seen as an internal implementation detail. With this change, matches that span multiple terms are highlighted together (something that users asked for years) and the clauses that don't match the document are ignored.
1 parent 2938673 commit 28a504d

File tree

20 files changed

+253
-235
lines changed

20 files changed

+253
-235
lines changed

docs/changelog/96068.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 96068
2+
summary: Use the Weight#matches mode for highlighting by default
3+
area: Search
4+
type: enhancement
5+
issues: []

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/HighlighterWithAnalyzersTests.java

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,7 @@ public void testPhrasePrefix() throws IOException {
258258
.highlighter(highlight().field("field0").order("score").preTags("<x>").postTags("</x>"));
259259

260260
searchResponse = client().search(new SearchRequest("first_test_index").source(source)).actionGet();
261-
assertHighlight(searchResponse, 0, "field0", 0, 1, equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog"));
261+
assertHighlight(searchResponse, 0, "field0", 0, 1, equalTo("The <x>quick brown</x> fox jumps over the lazy dog"));
262262

263263
logger.info("--> highlighting and searching on field1");
264264
source = searchSource().query(
@@ -293,8 +293,8 @@ public void testPhrasePrefix() throws IOException {
293293
0,
294294
1,
295295
anyOf(
296-
equalTo("The <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
297-
equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog")
296+
equalTo("The <x>quick browse</x> button is a fancy thing, right bro?"),
297+
equalTo("The <x>quick brown</x> fox jumps over the lazy dog")
298298
)
299299
);
300300
assertHighlight(
@@ -304,8 +304,8 @@ public void testPhrasePrefix() throws IOException {
304304
0,
305305
1,
306306
anyOf(
307-
equalTo("The <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
308-
equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog")
307+
equalTo("The <x>quick browse</x> button is a fancy thing, right bro?"),
308+
equalTo("The <x>quick brown</x> fox jumps over the lazy dog")
309309
)
310310
);
311311

@@ -343,7 +343,7 @@ public void testPhrasePrefix() throws IOException {
343343

344344
searchResponse = client().search(new SearchRequest("second_test_index").source(source)).actionGet();
345345

346-
assertHighlight(searchResponse, 0, "field3", 0, 1, equalTo("The <x>quick</x> <x>brown</x> fox jumps over the lazy dog"));
346+
assertHighlight(searchResponse, 0, "field3", 0, 1, equalTo("The <x>quick brown</x> fox jumps over the lazy dog"));
347347

348348
logger.info("--> highlighting and searching on field4");
349349
source = searchSource().postFilter(termQuery("type", "type2"))
@@ -358,8 +358,8 @@ public void testPhrasePrefix() throws IOException {
358358
0,
359359
1,
360360
anyOf(
361-
equalTo("<x>The</x> <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
362-
equalTo("<x>The</x> <x>quick</x> <x>brown</x> fox jumps over the lazy dog")
361+
equalTo("<x>The quick browse</x> button is a fancy thing, right bro?"),
362+
equalTo("<x>The quick brown</x> fox jumps over the lazy dog")
363363
)
364364
);
365365
assertHighlight(
@@ -369,8 +369,8 @@ public void testPhrasePrefix() throws IOException {
369369
0,
370370
1,
371371
anyOf(
372-
equalTo("<x>The</x> <x>quick</x> <x>browse</x> button is a fancy thing, right bro?"),
373-
equalTo("<x>The</x> <x>quick</x> <x>brown</x> fox jumps over the lazy dog")
372+
equalTo("<x>The quick browse</x> button is a fancy thing, right bro?"),
373+
equalTo("<x>The quick brown</x> fox jumps over the lazy dog")
374374
)
375375
);
376376

plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextHighlighter.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,14 @@
1717
import org.elasticsearch.index.query.SearchExecutionContext;
1818
import org.elasticsearch.lucene.search.uhighlight.CustomUnifiedHighlighter;
1919
import org.elasticsearch.search.fetch.FetchSubPhase.HitContext;
20+
import org.elasticsearch.search.fetch.subphase.highlight.DefaultHighlighter;
2021
import org.elasticsearch.search.fetch.subphase.highlight.SearchHighlightContext;
21-
import org.elasticsearch.search.fetch.subphase.highlight.UnifiedHighlighter;
2222

2323
import java.io.IOException;
2424
import java.util.ArrayList;
2525
import java.util.List;
2626

27-
public class AnnotatedTextHighlighter extends UnifiedHighlighter {
27+
public class AnnotatedTextHighlighter extends DefaultHighlighter {
2828

2929
public static final String NAME = "annotated";
3030

plugins/mapper-annotated-text/src/test/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextHighlighterTests.java

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -131,21 +131,23 @@ private void assertHighlightOneDoc(
131131
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
132132
assertThat(topDocs.totalHits.value, equalTo(1L));
133133
String rawValue = Strings.collectionToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));
134+
UnifiedHighlighter.Builder builder = UnifiedHighlighter.builder(searcher, hiliteAnalyzer);
135+
builder.withBreakIterator(() -> breakIterator);
136+
builder.withFieldMatcher(name -> "text".equals(name));
137+
builder.withFormatter(passageFormatter);
134138
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
135-
searcher,
136-
hiliteAnalyzer,
139+
builder,
137140
UnifiedHighlighter.OffsetSource.ANALYSIS,
138-
passageFormatter,
139141
locale,
140-
breakIterator,
141142
"index",
142143
"text",
143144
query,
144145
noMatchSize,
145146
expectedPassages.length,
146-
name -> "text".equals(name),
147147
maxAnalyzedOffset,
148-
queryMaxAnalyzedOffset
148+
queryMaxAnalyzedOffset,
149+
true,
150+
true
149151
);
150152
highlighter.setFieldMatcher((name) -> "text".equals(name));
151153
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
@@ -236,7 +238,7 @@ public void testAnnotatedTextSingleFieldWithBreakIterator() throws Exception {
236238

237239
public void testAnnotatedTextSingleFieldWithPhraseQuery() throws Exception {
238240
final String[] markedUpInputs = { "[Donald Trump](Donald+Trump) visited Singapore", "Donald Jr was with Melania Trump" };
239-
String[] expectedPassages = { "[Donald](_hit_term=donald) [Trump](_hit_term=trump) visited Singapore" };
241+
String[] expectedPassages = { "[Donald Trump](_hit_term=donald+trump&Donald+Trump) visited Singapore" };
240242
Query query = new PhraseQuery("text", "donald", "trump");
241243
BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
242244
assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages);

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.highlight/10_unified.yml

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,18 @@ setup:
2323
- do:
2424
indices.refresh: {}
2525

26+
---
27+
teardown:
28+
- skip:
29+
version: " - 8.9.99"
30+
reason: "setting added in 8.10.0"
31+
32+
- do:
33+
indices.put_settings:
34+
index: test
35+
body:
36+
index.highlight.weight_matches_mode.enabled: null
37+
2638
---
2739
"Basic multi_match query":
2840
- do:
@@ -49,3 +61,35 @@ setup:
4961
- match: {hits.hits.0.highlight.text.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is <em>brown</em>."}
5062
- match: {hits.hits.0.highlight.text\.fvh.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is <em>brown</em>."}
5163
- match: {hits.hits.0.highlight.text\.postings.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is <em>brown</em>."}
64+
65+
---
66+
"Disable weight matches":
67+
- skip:
68+
version: " - 8.9.99"
69+
reason: "support for weight matches was added in 8.10"
70+
71+
- do:
72+
search:
73+
body: {
74+
"query": { "multi_match": { "query": "quick brown fox", "type": "phrase", "fields": [ "text*" ] } },
75+
"highlight": { "type": "unified", "fields": { "*": { } } } }
76+
77+
- match: { hits.hits.0.highlight.text.0: "The <em>quick brown fox</em> is brown." }
78+
- match: { hits.hits.0.highlight.text\.fvh.0: "The <em>quick brown fox</em> is brown." }
79+
- match: { hits.hits.0.highlight.text\.postings.0: "The <em>quick brown fox</em> is brown." }
80+
81+
- do:
82+
indices.put_settings:
83+
index: test
84+
body:
85+
index.highlight.weight_matches_mode.enabled: "false"
86+
87+
- do:
88+
search:
89+
body: {
90+
"query" : { "multi_match" : { "query" : "quick brown fox", "type": "phrase", "fields" : [ "text*"] } },
91+
"highlight" : { "type" : "unified", "fields" : { "*" : {} } } }
92+
93+
- match: {hits.hits.0.highlight.text.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is brown."}
94+
- match: {hits.hits.0.highlight.text\.fvh.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is brown."}
95+
- match: {hits.hits.0.highlight.text\.postings.0: "The <em>quick</em> <em>brown</em> <em>fox</em> is brown."}

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.highlight/50_synthetic_source.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ text single unified from reanalysis:
210210
fields:
211211
foo.analyze:
212212
type: unified
213-
- match: { hits.hits.0.highlight.foo\.analyze.0: <em>the</em> <em>quick</em> <em>brown</em> <em>fox</em> <em>jumped</em> <em>over</em> <em>the</em> <em>lazy</em> <em>dog</em> }
213+
- match: { hits.hits.0.highlight.foo\.analyze.0: <em>the quick brown fox jumped over the lazy dog</em> }
214214

215215
---
216216
text multi unified from reanalysis:
@@ -225,7 +225,7 @@ text multi unified from reanalysis:
225225
fields:
226226
foo.analyze:
227227
type: unified
228-
- match: { hits.hits.0.highlight.foo\.analyze.0: <em>That</em> <em>makes</em> <em>calamity</em> <em>of</em> <em>so</em> <em>long</em> <em>life</em>. }
228+
- match: { hits.hits.0.highlight.foo\.analyze.0: <em>That makes calamity of so long life</em>. }
229229

230230
---
231231
text single unified from positions:
@@ -240,7 +240,7 @@ text single unified from positions:
240240
fields:
241241
foo.positions:
242242
type: unified
243-
- match: { hits.hits.0.highlight.foo\.positions.0: <em>the</em> <em>quick</em> <em>brown</em> <em>fox</em> <em>jumped</em> <em>over</em> <em>the</em> <em>lazy</em> <em>dog</em> }
243+
- match: { hits.hits.0.highlight.foo\.positions.0: <em>the quick brown fox jumped over the lazy dog</em> }
244244

245245
---
246246
text multi unified from positions:
@@ -255,7 +255,7 @@ text multi unified from positions:
255255
fields:
256256
foo.positions:
257257
type: unified
258-
- match: { hits.hits.0.highlight.foo\.positions.0: <em>That</em> <em>makes</em> <em>calamity</em> <em>of</em> <em>so</em> <em>long</em> <em>life</em>. }
258+
- match: { hits.hits.0.highlight.foo\.positions.0: <em>That makes calamity of so long life</em>. }
259259

260260
---
261261
text single unified from vectors:
@@ -270,7 +270,7 @@ text single unified from vectors:
270270
fields:
271271
foo.vectors:
272272
type: unified
273-
- match: { hits.hits.0.highlight.foo\.vectors.0: <em>the</em> <em>quick</em> <em>brown</em> <em>fox</em> <em>jumped</em> <em>over</em> <em>the</em> <em>lazy</em> <em>dog</em> }
273+
- match: { hits.hits.0.highlight.foo\.vectors.0: <em>the quick brown fox jumped over the lazy dog</em> }
274274

275275
---
276276
text multi unified from vectors:
@@ -285,7 +285,7 @@ text multi unified from vectors:
285285
fields:
286286
foo.vectors:
287287
type: unified
288-
- match: { hits.hits.0.highlight.foo\.vectors.0: <em>That</em> <em>makes</em> <em>calamity</em> <em>of</em> <em>so</em> <em>long</em> <em>life</em>. }
288+
- match: { hits.hits.0.highlight.foo\.vectors.0: <em>That makes calamity of so long life</em>. }
289289

290290
---
291291
text single fvh source order:

server/src/internalClusterTest/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2217,14 +2217,7 @@ public void testPostingsHighlighter() throws Exception {
22172217

22182218
searchResponse = client().search(new SearchRequest("test").source(source)).actionGet();
22192219

2220-
assertHighlight(
2221-
searchResponse,
2222-
0,
2223-
"field2",
2224-
0,
2225-
1,
2226-
equalTo("The <xxx>quick</xxx> <xxx>brown</xxx> fox jumps over the lazy quick dog")
2227-
);
2220+
assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The <xxx>quick brown</xxx> fox jumps over the lazy quick dog"));
22282221

22292222
// lets fall back to the standard highlighter then, what people would do to highlight query matches
22302223
logger.info("--> searching on field2, highlighting on field2, falling back to the plain highlighter");

server/src/main/java/org/elasticsearch/common/lucene/search/MultiPhrasePrefixQuery.java

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@
1717
import org.apache.lucene.search.IndexSearcher;
1818
import org.apache.lucene.search.MatchNoDocsQuery;
1919
import org.apache.lucene.search.MultiPhraseQuery;
20+
import org.apache.lucene.search.PrefixQuery;
2021
import org.apache.lucene.search.Query;
2122
import org.apache.lucene.search.QueryVisitor;
2223
import org.apache.lucene.util.BytesRef;
2324
import org.apache.lucene.util.StringHelper;
25+
import org.apache.lucene.util.automaton.CompiledAutomaton;
2426

2527
import java.io.IOException;
2628
import java.util.ArrayList;
@@ -295,20 +297,12 @@ public void visit(QueryVisitor visitor) {
295297
shouldVisitor.consumeTerms(this, termArrays.get(i));
296298
}
297299
}
298-
/* We don't report automata here because this breaks the unified highlighter,
299-
which extracts automata separately from phrases. MPPQ gets rewritten to a
300-
SpanMTQQuery by the PhraseHelper in any case, so highlighting is taken
301-
care of there instead. If we extract automata here then the trailing prefix
302-
word will be highlighted wherever it appears in the document, instead of only
303-
as part of a phrase. This can be re-instated once we switch to using Matches
304-
to highlight.
305300
for (Term prefixTerm : termArrays.get(termArrays.size() - 1)) {
306301
visitor.consumeTermsMatching(this, field, () -> {
307302
CompiledAutomaton ca = new CompiledAutomaton(PrefixQuery.toAutomaton(prefixTerm.bytes()));
308303
return ca.runAutomaton;
309304
});
310305
}
311-
*/
312306
}
313307
}
314308
}

server/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
114114
IndexSettings.MAX_SHINGLE_DIFF_SETTING,
115115
IndexSettings.MAX_RESCORE_WINDOW_SETTING,
116116
IndexSettings.MAX_ANALYZED_OFFSET_SETTING,
117+
IndexSettings.WEIGHT_MATCHES_MODE_ENABLED_SETTING,
117118
IndexSettings.MAX_TERMS_COUNT_SETTING,
118119
IndexSettings.INDEX_TRANSLOG_SYNC_INTERVAL_SETTING,
119120
IndexSettings.DEFAULT_FIELD_SETTING,

server/src/main/java/org/elasticsearch/index/IndexSettings.java

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import org.apache.logging.log4j.Logger;
1111
import org.apache.logging.log4j.util.Strings;
1212
import org.apache.lucene.index.MergePolicy;
13+
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
1314
import org.elasticsearch.cluster.metadata.IndexMetadata;
1415
import org.elasticsearch.cluster.node.DiscoveryNode;
1516
import org.elasticsearch.cluster.routing.IndexRouting;
@@ -187,6 +188,17 @@ public final class IndexSettings {
187188
Property.IndexScope
188189
);
189190

191+
/**
192+
* Index setting to enable/disable the {@link UnifiedHighlighter.HighlightFlag#WEIGHT_MATCHES}
193+
* mode of the unified highlighter.
194+
*/
195+
public static final Setting<Boolean> WEIGHT_MATCHES_MODE_ENABLED_SETTING = Setting.boolSetting(
196+
"index.highlight.weight_matches_mode.enabled",
197+
true,
198+
Property.Dynamic,
199+
Property.IndexScope
200+
);
201+
190202
/**
191203
* Index setting describing the maximum number of terms that can be used in Terms Query.
192204
* The default maximum of 65536 terms is defensive, as extra processing and memory is involved
@@ -730,6 +742,7 @@ private void setRetentionLeaseMillis(final TimeValue retentionLease) {
730742
private volatile int maxShingleDiff;
731743
private volatile TimeValue searchIdleAfter;
732744
private volatile int maxAnalyzedOffset;
745+
private volatile boolean weightMatchesEnabled;
733746
private volatile int maxTermsCount;
734747
private volatile String defaultPipeline;
735748
private volatile String requiredPipeline;
@@ -871,6 +884,7 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti
871884
maxRefreshListeners = scopedSettings.get(MAX_REFRESH_LISTENERS_PER_SHARD);
872885
maxSlicesPerScroll = scopedSettings.get(MAX_SLICES_PER_SCROLL);
873886
maxAnalyzedOffset = scopedSettings.get(MAX_ANALYZED_OFFSET_SETTING);
887+
weightMatchesEnabled = scopedSettings.get(WEIGHT_MATCHES_MODE_ENABLED_SETTING);
874888
maxTermsCount = scopedSettings.get(MAX_TERMS_COUNT_SETTING);
875889
maxRegexLength = scopedSettings.get(MAX_REGEX_LENGTH_SETTING);
876890
this.mergePolicyConfig = new MergePolicyConfig(logger, this);
@@ -946,6 +960,7 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti
946960
scopedSettings.addSettingsUpdateConsumer(INDEX_REFRESH_INTERVAL_SETTING, this::setRefreshInterval);
947961
scopedSettings.addSettingsUpdateConsumer(MAX_REFRESH_LISTENERS_PER_SHARD, this::setMaxRefreshListeners);
948962
scopedSettings.addSettingsUpdateConsumer(MAX_ANALYZED_OFFSET_SETTING, this::setHighlightMaxAnalyzedOffset);
963+
scopedSettings.addSettingsUpdateConsumer(WEIGHT_MATCHES_MODE_ENABLED_SETTING, this::setWeightMatchesEnabled);
949964
scopedSettings.addSettingsUpdateConsumer(MAX_TERMS_COUNT_SETTING, this::setMaxTermsCount);
950965
scopedSettings.addSettingsUpdateConsumer(MAX_SLICES_PER_SCROLL, this::setMaxSlicesPerScroll);
951966
scopedSettings.addSettingsUpdateConsumer(DEFAULT_FIELD_SETTING, this::setDefaultFields);
@@ -1325,6 +1340,14 @@ private void setHighlightMaxAnalyzedOffset(int maxAnalyzedOffset) {
13251340
this.maxAnalyzedOffset = maxAnalyzedOffset;
13261341
}
13271342

1343+
public boolean isWeightMatchesEnabled() {
1344+
return this.weightMatchesEnabled;
1345+
}
1346+
1347+
private void setWeightMatchesEnabled(boolean value) {
1348+
this.weightMatchesEnabled = value;
1349+
}
1350+
13281351
/**
13291352
* Returns the maximum number of terms that can be used in a Terms Query request
13301353
*/

0 commit comments

Comments
 (0)