Commit 63bdd01

Expose WordDelimiterGraphTokenFilter (#23327)
This change exposes the new Lucene graph-based word delimiter token filter in the analysis filters. Unlike `word_delimiter`, this token filter, named `word_delimiter_graph`, correctly handles multi-term expansion at query time.

Closes #23104
1 parent 0e80296 commit 63bdd01
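
To see what the graph part buys, here is a minimal standalone sketch (not part of this commit) that runs a single token through Lucene's `WordDelimiterGraphFilter` and prints the position attributes. It uses the same splitting flags as the pre-built filter registered below but adds `CATENATE_WORDS`, since without a catenate or preserve-original flag every token has position length 1 and the graph is trivial; the class name `WdgDemo` is invented for the example.

```java
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

public class WdgDemo {
    public static void main(String[] args) throws Exception {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("PowerShot2000"));

        // Default splitting flags plus CATENATE_WORDS, so the graph
        // structure becomes visible in the output.
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                | WordDelimiterGraphFilter.CATENATE_WORDS;
        TokenStream ts = new WordDelimiterGraphFilter(tokenizer, flags, null);

        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
        PositionLengthAttribute posLen = ts.addAttribute(PositionLengthAttribute.class);

        ts.reset();
        while (ts.incrementToken()) {
            // The catenated token "PowerShot" reports posLen=2: it spans the
            // same two positions as "Power" and "Shot".
            System.out.printf("%-10s posInc=%d posLen=%d%n",
                    term.toString(), posInc.getPositionIncrement(), posLen.getPositionLength());
        }
        ts.end();
        ts.close();
    }
}
```

For the input `PowerShot2000`, the catenated token `PowerShot` comes out with a position length of 2, spanning the positions of `Power` and `Shot`. Roughly speaking, it is this position-length information that the old `word_delimiter` filter fails to preserve, which is why it mishandles multi-term expansion at query time.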

File tree

8 files changed: +342 additions, -1180 deletions


core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java (2 additions, 0 deletions)

```diff
@@ -140,6 +140,7 @@
 import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory;
 import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
 import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
+import org.elasticsearch.index.analysis.WordDelimiterGraphTokenFilterFactory;
 import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
@@ -225,6 +226,7 @@ private NamedRegistry<AnalysisProvider<TokenFilterFactory>> setupTokenFilters(Li
         tokenFilters.register("snowball", SnowballTokenFilterFactory::new);
         tokenFilters.register("stemmer", StemmerTokenFilterFactory::new);
         tokenFilters.register("word_delimiter", WordDelimiterTokenFilterFactory::new);
+        tokenFilters.register("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
         tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
         tokenFilters.register("elision", ElisionTokenFilterFactory::new);
         tokenFilters.register("flatten_graph", FlattenGraphTokenFilterFactory::new);
```

core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java (13 additions, 0 deletions)

```diff
@@ -51,6 +51,7 @@
 import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
+import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
 import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
@@ -87,6 +88,18 @@ public TokenStream create(TokenStream tokenStream, Version version) {
         }
     },

+    WORD_DELIMITER_GRAPH(CachingStrategy.ONE) {
+        @Override
+        public TokenStream create(TokenStream tokenStream, Version version) {
+            return new WordDelimiterGraphFilter(tokenStream,
+                    WordDelimiterGraphFilter.GENERATE_WORD_PARTS |
+                    WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS |
+                    WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE |
+                    WordDelimiterGraphFilter.SPLIT_ON_NUMERICS |
+                    WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null);
+        }
+    },
+
     STOP(CachingStrategy.LUCENE) {
         @Override
         public TokenStream create(TokenStream tokenStream, Version version) {
```
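
Note that the pre-built variant enables only the splitting flags, mirroring the pre-built `word_delimiter` defaults. With no catenate or preserve-original flag, each output token occupies a single position, so the graph behavior only shows up in custom configurations such as the sketches above.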
