diff --git a/docs/changelog/86110.yaml b/docs/changelog/86110.yaml new file mode 100644 index 0000000000000..376cadaa56602 --- /dev/null +++ b/docs/changelog/86110.yaml @@ -0,0 +1,6 @@ +pr: 86110 +summary: Add LimitedOffsetsEnum to Limited offset token +area: Search +type: enhancement +issues: + - 86109 diff --git a/server/src/main/java/org/elasticsearch/lucene/search/uhighlight/CustomFieldHighlighter.java b/server/src/main/java/org/elasticsearch/lucene/search/uhighlight/CustomFieldHighlighter.java index 640150323da51..cd781829dd08c 100644 --- a/server/src/main/java/org/elasticsearch/lucene/search/uhighlight/CustomFieldHighlighter.java +++ b/server/src/main/java/org/elasticsearch/lucene/search/uhighlight/CustomFieldHighlighter.java @@ -36,6 +36,7 @@ class CustomFieldHighlighter extends FieldHighlighter { private final Locale breakIteratorLocale; private final int noMatchSize; private String fieldValue; + private final Integer queryMaxAnalyzedOffset; CustomFieldHighlighter( String field, @@ -46,11 +47,13 @@ class CustomFieldHighlighter extends FieldHighlighter { int maxPassages, int maxNoHighlightPassages, PassageFormatter passageFormatter, - int noMatchSize + int noMatchSize, + Integer queryMaxAnalyzedOffset ) { super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages, maxNoHighlightPassages, passageFormatter); this.breakIteratorLocale = breakIteratorLocale; this.noMatchSize = noMatchSize; + this.queryMaxAnalyzedOffset = queryMaxAnalyzedOffset; } FieldOffsetStrategy getFieldOffsetStrategy() { @@ -106,6 +109,10 @@ protected Passage[] getSummaryPassagesNoHighlight(int maxPassages) { @Override protected Passage[] highlightOffsetsEnums(OffsetsEnum off) throws IOException { + if (queryMaxAnalyzedOffset != null) { + off = new LimitedOffsetsEnum(off, queryMaxAnalyzedOffset); + } + final int contentLength = this.breakIterator.getText().getEndIndex(); if (off.nextPosition() == false) { diff --git a/server/src/main/java/org/elasticsearch/lucene/search/uhighlight/CustomUnifiedHighlighter.java b/server/src/main/java/org/elasticsearch/lucene/search/uhighlight/CustomUnifiedHighlighter.java index 443520ac47d55..ca5d50ba10e89 100644 --- a/server/src/main/java/org/elasticsearch/lucene/search/uhighlight/CustomUnifiedHighlighter.java +++ b/server/src/main/java/org/elasticsearch/lucene/search/uhighlight/CustomUnifiedHighlighter.java @@ -193,7 +193,8 @@ protected CustomFieldHighlighter getFieldHighlighter(String field, Query query, maxPassages, (noMatchSize > 0 ? 1 : 0), getFormatter(field), - noMatchSize + noMatchSize, + queryMaxAnalyzedOffset ); } diff --git a/server/src/main/java/org/elasticsearch/lucene/search/uhighlight/LimitedOffsetsEnum.java b/server/src/main/java/org/elasticsearch/lucene/search/uhighlight/LimitedOffsetsEnum.java new file mode 100644 index 0000000000000..aebe135d4db53 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/lucene/search/uhighlight/LimitedOffsetsEnum.java @@ -0,0 +1,56 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.lucene.search.uhighlight; + +import org.apache.lucene.search.uhighlight.OffsetsEnum; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; + +public class LimitedOffsetsEnum extends OffsetsEnum { + private final OffsetsEnum delegate; + private final int maxOffset; + + public LimitedOffsetsEnum(OffsetsEnum delegate, int maxOffset) { + this.delegate = delegate; + this.maxOffset = maxOffset; + } + + @Override + public boolean nextPosition() throws IOException { + boolean next = delegate.nextPosition(); + if (next == false) { + return next; + } + if (delegate.startOffset() > maxOffset) { + return false; + } + return next; + } + + @Override + public int freq() throws IOException { + return delegate.freq(); + } + + @Override + public BytesRef getTerm() throws IOException { + return delegate.getTerm(); + } + + @Override + public int startOffset() throws IOException { + return delegate.startOffset(); + } + + @Override + public int endOffset() throws IOException { + return delegate.endOffset(); + } +} diff --git a/server/src/test/java/org/elasticsearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/server/src/test/java/org/elasticsearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java index e81f5834f60ae..74d70f179697b 100644 --- a/server/src/test/java/org/elasticsearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java +++ b/server/src/test/java/org/elasticsearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java @@ -9,6 +9,7 @@ package org.elasticsearch.lucene.search.uhighlight; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.custom.CustomAnalyzer; import org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory; import org.apache.lucene.analysis.standard.StandardAnalyzer; @@ -40,6 +41,8 @@ import java.text.BreakIterator; import java.util.Locale; +import java.util.Map; +import java.util.TreeMap; import static org.elasticsearch.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR; import static org.hamcrest.CoreMatchers.equalTo; @@ -82,6 +85,34 @@ private void assertHighlightOneDoc( String[] expectedPassages, int maxAnalyzedOffset, Integer queryMaxAnalyzedOffset + ) throws Exception { + assertHighlightOneDoc( + fieldName, + inputs, + analyzer, + query, + locale, + breakIterator, + noMatchSize, + expectedPassages, + maxAnalyzedOffset, + queryMaxAnalyzedOffset, + UnifiedHighlighter.OffsetSource.ANALYSIS + ); + } + + private void assertHighlightOneDoc( + String fieldName, + String[] inputs, + Analyzer analyzer, + Query query, + Locale locale, + BreakIterator breakIterator, + int noMatchSize, + String[] expectedPassages, + int maxAnalyzedOffset, + Integer queryMaxAnalyzedOffset, + UnifiedHighlighter.OffsetSource offsetSource ) throws Exception { try (Directory dir = newDirectory()) { IndexWriterConfig iwc = newIndexWriterConfig(analyzer); @@ -106,7 +137,7 @@ private void assertHighlightOneDoc( CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter( searcher, analyzer, - UnifiedHighlighter.OffsetSource.ANALYSIS, + offsetSource, new CustomPassageFormatter("", "", new DefaultEncoder()), locale, breakIterator, @@ -394,4 +425,72 @@ public void testExceedMaxAnalyzedOffset() throws Exception { 10 ); } + + public void testExceedMaxAnalyzedOffsetWithRepeatedWords() throws Exception { + + TermQuery query = new TermQuery(new Term("text", "Fun")); + Analyzer analyzer = new WhitespaceAnalyzer(); + assertHighlightOneDoc( + "text", + new String[] { "Testing Fun Testing Fun" }, + analyzer, + query, + Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), + 0, + new String[] { "Testing Fun Testing Fun" }, + 29, + 10, + UnifiedHighlighter.OffsetSource.ANALYSIS + ); + assertHighlightOneDoc( + "text", + new String[] { "Testing Fun Testing Fun" }, + analyzer, + query, + Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), + 0, + new String[] { "Testing Fun Testing Fun" }, + 29, + 10, + UnifiedHighlighter.OffsetSource.POSTINGS + ); + } + + public void testExceedMaxAnalyzedOffsetRandomOffset() throws Exception { + TermQuery query = new TermQuery(new Term("text", "fun")); + Analyzer analyzer = new WhitespaceAnalyzer(); + UnifiedHighlighter.OffsetSource offsetSource = randomBoolean() + ? UnifiedHighlighter.OffsetSource.ANALYSIS + : UnifiedHighlighter.OffsetSource.POSTINGS; + final String[] inputs = { "Fun fun fun fun fun" }; + TreeMap outputs = new TreeMap<>( + Map.of( + 7, + "Fun fun fun fun fun", + 11, + "Fun fun fun fun fun", + 15, + "Fun fun fun fun fun", + 19, + "Fun fun fun fun fun" + ) + ); + Integer randomOffset = between(7, 19); + String output = outputs.ceilingEntry(randomOffset).getValue(); + assertHighlightOneDoc( + "text", + inputs, + analyzer, + query, + Locale.ROOT, + BreakIterator.getSentenceInstance(Locale.ROOT), + 0, + new String[] { output }, + 47, + randomOffset, + offsetSource + ); + } }