Skip to content

Commit ff90424

Browse files
committed
Fix PreConfiguredTokenFilters getSynonymFilter() implementations (#38839)
When we added support for TokenFilterFactories to specialise how they were used when parsing synonym files, PreConfiguredTokenFilters were set up to either apply themselves or be ignored. This behaviour is a leftover from an earlier iteration, and it also has an incorrect default. This commit makes preconfigured token filters usable in synonym file parsing by default, and brings those filters that should not be used into line with index-specific filter factories: for indexes created before version 7 we emit a deprecation warning, and for indexes created on or after version 7 we throw an error. Fixes #38793
1 parent dc0af84 commit ff90424

File tree

3 files changed

+115
-31
lines changed

3 files changed

+115
-31
lines changed

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
399399
filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
400400
filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
401401
filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
402-
filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
402+
filters.add(PreConfiguredTokenFilter.singleton("common_grams", false, false,
403403
input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
404404
filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
405405
filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
@@ -412,9 +412,9 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
412412
DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
413413
DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
414414
filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
415-
filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input ->
415+
filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, false, input ->
416416
new EdgeNGramTokenFilter(input, 1)));
417-
filters.add(PreConfiguredTokenFilter.singletonWithVersion("edgeNGram", false, (reader, version) -> {
417+
filters.add(PreConfiguredTokenFilter.singletonWithVersion("edgeNGram", false, false, (reader, version) -> {
418418
if (version.onOrAfter(org.elasticsearch.Version.V_6_4_0)) {
419419
deprecationLogger.deprecatedAndMaybeLog("edgeNGram_deprecation",
420420
"The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
@@ -437,8 +437,8 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
437437
new LimitTokenCountFilter(input,
438438
LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
439439
LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
440-
filters.add(PreConfiguredTokenFilter.singleton("ngram", false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
441-
filters.add(PreConfiguredTokenFilter.singletonWithVersion("nGram", false, (reader, version) -> {
440+
filters.add(PreConfiguredTokenFilter.singleton("ngram", false, false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
441+
filters.add(PreConfiguredTokenFilter.singletonWithVersion("nGram", false, false, (reader, version) -> {
442442
if (version.onOrAfter(org.elasticsearch.Version.V_6_4_0)) {
443443
deprecationLogger.deprecatedAndMaybeLog("nGram_deprecation",
444444
"The [nGram] token filter name is deprecated and will be removed in a future version. "
@@ -452,7 +452,7 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
452452
filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
453453
filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
454454
filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
455-
filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
455+
filters.add(PreConfiguredTokenFilter.singleton("shingle", false, false, input -> {
456456
TokenStream ts = new ShingleFilter(input);
457457
/**
458458
* We disable the graph analysis on this token stream
@@ -474,14 +474,14 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
474474
filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
475475
filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
476476
filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
477-
filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input ->
477+
filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, false, input ->
478478
new WordDelimiterFilter(input,
479479
WordDelimiterFilter.GENERATE_WORD_PARTS
480480
| WordDelimiterFilter.GENERATE_NUMBER_PARTS
481481
| WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
482482
| WordDelimiterFilter.SPLIT_ON_NUMERICS
483483
| WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
484-
filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, input ->
484+
filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, false, input ->
485485
new WordDelimiterGraphFilter(input,
486486
WordDelimiterGraphFilter.GENERATE_WORD_PARTS
487487
| WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java

Lines changed: 63 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.elasticsearch.env.Environment;
3131
import org.elasticsearch.index.IndexSettings;
3232
import org.elasticsearch.index.analysis.IndexAnalyzers;
33+
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
3334
import org.elasticsearch.index.analysis.TokenFilterFactory;
3435
import org.elasticsearch.index.analysis.TokenizerFactory;
3536
import org.elasticsearch.test.ESTestCase;
@@ -42,8 +43,11 @@
4243
import java.nio.file.Files;
4344
import java.nio.file.Path;
4445
import java.util.ArrayList;
46+
import java.util.Arrays;
4547
import java.util.Collections;
48+
import java.util.HashSet;
4649
import java.util.List;
50+
import java.util.Set;
4751

4852
import static org.hamcrest.Matchers.equalTo;
4953
import static org.hamcrest.Matchers.instanceOf;
@@ -163,23 +167,21 @@ public void testAsciiFoldingFilterForSynonyms() throws IOException {
163167
new int[]{ 1, 0 });
164168
}
165169

166-
public void testKeywordRepeatAndSynonyms() throws IOException {
170+
public void testPreconfigured() throws IOException {
167171
Settings settings = Settings.builder()
168172
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
169173
.put("path.home", createTempDir().toString())
170174
.put("index.analysis.filter.synonyms.type", "synonym")
171-
.putList("index.analysis.filter.synonyms.synonyms", "programmer, developer")
172-
.put("index.analysis.filter.my_english.type", "stemmer")
173-
.put("index.analysis.filter.my_english.language", "porter2")
174-
.put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard")
175-
.putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "keyword_repeat", "my_english", "synonyms")
175+
.putList("index.analysis.filter.synonyms.synonyms", "würst, sausage")
176+
.put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
177+
.putList("index.analysis.analyzer.my_analyzer.filter", "lowercase", "asciifolding", "synonyms")
176178
.build();
177179
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
178180
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
179181

180-
BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "programmers",
181-
new String[]{ "programmers", "programm", "develop" },
182-
new int[]{ 1, 0, 0 });
182+
BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("my_analyzer"), "würst",
183+
new String[]{ "wurst", "sausage"},
184+
new int[]{ 1, 0 });
183185
}
184186

185187
public void testChainedSynonymFilters() throws IOException {
@@ -248,6 +250,58 @@ public void testTokenFiltersBypassSynonymAnalysis() throws IOException {
248250

249251
}
250252

253+
public void testPreconfiguredTokenFilters() throws IOException {
254+
Set<String> disallowedFilters = new HashSet<>(Arrays.asList(
255+
"common_grams", "edge_ngram", "edgeNGram", "keyword_repeat", "ngram", "nGram",
256+
"shingle", "word_delimiter", "word_delimiter_graph"
257+
));
258+
259+
Settings settings = Settings.builder()
260+
.put(IndexMetaData.SETTING_VERSION_CREATED,
261+
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT))
262+
.put("path.home", createTempDir().toString())
263+
.build();
264+
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
265+
266+
CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
267+
268+
for (PreConfiguredTokenFilter tf : plugin.getPreConfiguredTokenFilters()) {
269+
if (disallowedFilters.contains(tf.getName())) {
270+
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
271+
"Expected exception for factory " + tf.getName(), () -> {
272+
tf.get(idxSettings, null, tf.getName(), settings).getSynonymFilter();
273+
});
274+
assertEquals(tf.getName(), "Token filter [" + tf.getName()
275+
+ "] cannot be used to parse synonyms",
276+
e.getMessage());
277+
}
278+
else {
279+
tf.get(idxSettings, null, tf.getName(), settings).getSynonymFilter();
280+
}
281+
}
282+
283+
Settings settings2 = Settings.builder()
284+
.put(IndexMetaData.SETTING_VERSION_CREATED,
285+
VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
286+
.put("path.home", createTempDir().toString())
287+
.putList("common_words", "a", "b")
288+
.put("output_unigrams", "true")
289+
.build();
290+
IndexSettings idxSettings2 = IndexSettingsModule.newIndexSettings("index", settings2);
291+
292+
List<String> expectedWarnings = new ArrayList<>();
293+
for (PreConfiguredTokenFilter tf : plugin.getPreConfiguredTokenFilters()) {
294+
if (disallowedFilters.contains(tf.getName())) {
295+
tf.get(idxSettings2, null, tf.getName(), settings2).getSynonymFilter();
296+
expectedWarnings.add("Token filter [" + tf.getName() + "] will not be usable to parse synonyms after v7.0");
297+
}
298+
else {
299+
tf.get(idxSettings2, null, tf.getName(), settings2).getSynonymFilter();
300+
}
301+
}
302+
assertWarnings(expectedWarnings.toArray(new String[0]));
303+
}
304+
251305
public void testDisallowedTokenFilters() throws IOException {
252306

253307
Settings settings = Settings.builder()

server/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java

Lines changed: 44 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@
1919

2020
package org.elasticsearch.index.analysis;
2121

22+
import org.apache.logging.log4j.LogManager;
2223
import org.apache.lucene.analysis.TokenFilter;
2324
import org.apache.lucene.analysis.TokenStream;
2425
import org.elasticsearch.Version;
26+
import org.elasticsearch.common.logging.DeprecationLogger;
2527
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
2628
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
2729

@@ -32,40 +34,54 @@
3234
* Provides pre-configured, shared {@link TokenFilter}s.
3335
*/
3436
public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisComponent<TokenFilterFactory> {
37+
38+
private static final DeprecationLogger DEPRECATION_LOGGER
39+
= new DeprecationLogger(LogManager.getLogger(PreConfiguredTokenFilter.class));
40+
3541
/**
3642
* Create a pre-configured token filter that may not vary at all.
3743
*/
3844
public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
3945
Function<TokenStream, TokenStream> create) {
40-
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE,
46+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.ONE,
4147
(tokenStream, version) -> create.apply(tokenStream));
4248
}
4349

4450
/**
4551
* Create a pre-configured token filter that may not vary at all.
4652
*/
4753
public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
48-
boolean useFilterForParsingSynonyms,
54+
boolean allowForSynonymParsing,
4955
Function<TokenStream, TokenStream> create) {
50-
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, useFilterForParsingSynonyms, CachingStrategy.ONE,
56+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, allowForSynonymParsing, CachingStrategy.ONE,
5157
(tokenStream, version) -> create.apply(tokenStream));
5258
}
5359

5460
/**
55-
* Create a pre-configured token filter that may not vary at all.
61+
* Create a pre-configured token filter that may vary based on the Elasticsearch version.
5662
*/
5763
public static PreConfiguredTokenFilter singletonWithVersion(String name, boolean useFilterForMultitermQueries,
5864
BiFunction<TokenStream, Version, TokenStream> create) {
59-
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE,
65+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.ONE,
6066
(tokenStream, version) -> create.apply(tokenStream, version));
6167
}
6268

69+
/**
70+
* Create a pre-configured token filter that may vary based on the Elasticsearch version.
71+
*/
72+
public static PreConfiguredTokenFilter singletonWithVersion(String name, boolean useFilterForMultitermQueries,
73+
boolean useFilterForParsingSynonyms,
74+
BiFunction<TokenStream, Version, TokenStream> create) {
75+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, useFilterForParsingSynonyms, CachingStrategy.ONE,
76+
(tokenStream, version) -> create.apply(tokenStream, version));
77+
}
78+
6379
/**
6480
* Create a pre-configured token filter that may vary based on the Lucene version.
6581
*/
6682
public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFilterForMultitermQueries,
6783
BiFunction<TokenStream, org.apache.lucene.util.Version, TokenStream> create) {
68-
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.LUCENE,
84+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.LUCENE,
6985
(tokenStream, version) -> create.apply(tokenStream, version.luceneVersion));
7086
}
7187

@@ -74,18 +90,18 @@ public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFil
7490
*/
7591
public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
7692
BiFunction<TokenStream, org.elasticsearch.Version, TokenStream> create) {
77-
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ELASTICSEARCH, create);
93+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, true, CachingStrategy.ELASTICSEARCH, create);
7894
}
7995

8096
private final boolean useFilterForMultitermQueries;
81-
private final boolean useFilterForParsingSynonyms;
97+
private final boolean allowForSynonymParsing;
8298
private final BiFunction<TokenStream, Version, TokenStream> create;
8399

84-
private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, boolean useFilterForParsingSynonyms,
100+
private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, boolean allowForSynonymParsing,
85101
PreBuiltCacheFactory.CachingStrategy cache, BiFunction<TokenStream, Version, TokenStream> create) {
86102
super(name, cache);
87103
this.useFilterForMultitermQueries = useFilterForMultitermQueries;
88-
this.useFilterForParsingSynonyms = useFilterForParsingSynonyms;
104+
this.allowForSynonymParsing = allowForSynonymParsing;
89105
this.create = create;
90106
}
91107

@@ -118,10 +134,17 @@ public TokenStream create(TokenStream tokenStream) {
118134

119135
@Override
120136
public TokenFilterFactory getSynonymFilter() {
121-
if (useFilterForParsingSynonyms) {
137+
if (allowForSynonymParsing) {
138+
return this;
139+
}
140+
if (version.onOrAfter(Version.V_7_0_0)) {
141+
throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
142+
}
143+
else {
144+
DEPRECATION_LOGGER.deprecatedAndMaybeLog(name(), "Token filter [" + name()
145+
+ "] will not be usable to parse synonyms after v7.0");
122146
return this;
123147
}
124-
return IDENTITY_FILTER;
125148
}
126149
};
127150
}
@@ -138,10 +161,17 @@ public TokenStream create(TokenStream tokenStream) {
138161

139162
@Override
140163
public TokenFilterFactory getSynonymFilter() {
141-
if (useFilterForParsingSynonyms) {
164+
if (allowForSynonymParsing) {
165+
return this;
166+
}
167+
if (version.onOrAfter(Version.V_7_0_0)) {
168+
throw new IllegalArgumentException("Token filter [" + name() + "] cannot be used to parse synonyms");
169+
}
170+
else {
171+
DEPRECATION_LOGGER.deprecatedAndMaybeLog(name(), "Token filter [" + name()
172+
+ "] will not be usable to parse synonyms after v7.0");
142173
return this;
143174
}
144-
return IDENTITY_FILTER;
145175
}
146176
};
147177
}

0 commit comments

Comments
 (0)