Skip to content

Commit e592bd1

Browse files
committed
Ensure TokenFilters only produce single tokens when parsing synonyms (#34331)
A number of token filters can produce multiple tokens at the same position. This is a problem when using token filter chains to parse synonym files, as the SynonymMap requires that there are no stacked tokens in its input. This commit ensures that, when used to parse synonyms, these token filters either produce a single version of their input token or throw an error when mappings are generated. In indexes created in Elasticsearch 6.x, deprecation warnings are emitted in place of the error.

* asciifolding and cjk_bigram produce only the folded or bigrammed token
* decompounders, synonyms and keyword_repeat are skipped
* n-grams, word-delimiter-filter, multiplexer, fingerprint and phonetic throw errors

Fixes #34298
1 parent dc50d8b commit e592bd1

File tree

20 files changed

+407
-16
lines changed

20 files changed

+407
-16
lines changed

docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,3 +175,15 @@ PUT /test_index
175175

176176
Using `synonyms_path` to define WordNet synonyms in a file is supported
177177
as well.
178+
179+
=== Parsing synonym files
180+
181+
Elasticsearch will use the token filters preceding the synonym filter
182+
in a tokenizer chain to parse the entries in a synonym file. So, for example, if a
183+
synonym filter is placed after a stemmer, then the stemmer will also be applied
184+
to the synonym entries. Because entries in the synonym map cannot have stacked
185+
positions, some token filters may cause issues here. Token filters that produce
186+
multiple versions of a token may choose which version of the token to emit when
187+
parsing synonyms, e.g. `asciifolding` will only produce the folded version of the
188+
token. Others, e.g. `multiplexer`, `word_delimiter_graph` or `ngram`, will throw an
189+
error.

docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,3 +163,16 @@ PUT /test_index
163163

164164
Using `synonyms_path` to define WordNet synonyms in a file is supported
165165
as well.
166+
167+
168+
=== Parsing synonym files
169+
170+
Elasticsearch will use the token filters preceding the synonym filter
171+
in a tokenizer chain to parse the entries in a synonym file. So, for example, if a
172+
synonym filter is placed after a stemmer, then the stemmer will also be applied
173+
to the synonym entries. Because entries in the synonym map cannot have stacked
174+
positions, some token filters may cause issues here. Token filters that produce
175+
multiple versions of a token may choose which version of the token to emit when
176+
parsing synonyms, e.g. `asciifolding` will only produce the folded version of the
177+
token. Others, e.g. `multiplexer`, `word_delimiter_graph` or `ngram`, will throw an
178+
error.

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactory.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@
3333
* Factory for ASCIIFoldingFilter.
3434
*/
3535
public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory
36-
implements MultiTermAwareComponent {
36+
implements MultiTermAwareComponent {
37+
3738
public static final ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
3839
public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;
3940

@@ -53,7 +54,7 @@ public TokenStream create(TokenStream tokenStream) {
5354
}
5455

5556
@Override
56-
public Object getMultiTermComponent() {
57+
public TokenFilterFactory getSynonymFilter() {
5758
if (preserveOriginal == false) {
5859
return this;
5960
} else {
@@ -70,4 +71,9 @@ public TokenStream create(TokenStream tokenStream) {
7071
};
7172
}
7273
}
74+
75+
@Override
76+
public Object getMultiTermComponent() {
77+
return getSynonymFilter();
78+
}
7379
}

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AbstractCompoundWordTokenFilterFactory.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import org.elasticsearch.index.IndexSettings;
2727
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
2828
import org.elasticsearch.index.analysis.Analysis;
29+
import org.elasticsearch.index.analysis.TokenFilterFactory;
2930

3031
/**
3132
* Contains the common configuration settings between subclasses of this class.
@@ -51,4 +52,9 @@ protected AbstractCompoundWordTokenFilterFactory(IndexSettings indexSettings, En
5152
throw new IllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly");
5253
}
5354
}
55+
56+
@Override
57+
public TokenFilterFactory getSynonymFilter() {
58+
return IDENTITY_FILTER; // don't decompound synonym file
59+
}
5460
}

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CJKBigramFilterFactory.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,16 @@
1919

2020
package org.elasticsearch.analysis.common;
2121

22+
import org.apache.logging.log4j.LogManager;
2223
import org.apache.lucene.analysis.TokenStream;
2324
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
2425
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
26+
import org.elasticsearch.common.logging.DeprecationLogger;
2527
import org.elasticsearch.common.settings.Settings;
2628
import org.elasticsearch.env.Environment;
2729
import org.elasticsearch.index.IndexSettings;
2830
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
31+
import org.elasticsearch.index.analysis.TokenFilterFactory;
2932

3033
import java.util.Arrays;
3134
import java.util.HashSet;
@@ -48,6 +51,9 @@
4851
*/
4952
public final class CJKBigramFilterFactory extends AbstractTokenFilterFactory {
5053

54+
private static final DeprecationLogger DEPRECATION_LOGGER
55+
= new DeprecationLogger(LogManager.getLogger(CJKBigramFilterFactory.class));
56+
5157
private final int flags;
5258
private final boolean outputUnigrams;
5359

@@ -90,4 +96,12 @@ public TokenStream create(TokenStream tokenStream) {
9096
return filter;
9197
}
9298

99+
@Override
100+
public TokenFilterFactory getSynonymFilter() {
101+
if (outputUnigrams) {
102+
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
103+
+ "] will not be usable to parse synonyms after v7.0");
104+
}
105+
return this;
106+
}
93107
}

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,7 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
425425
filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
426426
filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
427427
filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
428-
filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
428+
filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, false, KeywordRepeatFilter::new));
429429
filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
430430
filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
431431
new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonGramsTokenFilterFactory.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,24 @@
1919

2020
package org.elasticsearch.analysis.common;
2121

22+
import org.apache.logging.log4j.LogManager;
2223
import org.apache.lucene.analysis.CharArraySet;
2324
import org.apache.lucene.analysis.TokenStream;
2425
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
2526
import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
27+
import org.elasticsearch.common.logging.DeprecationLogger;
2628
import org.elasticsearch.common.settings.Settings;
2729
import org.elasticsearch.env.Environment;
2830
import org.elasticsearch.index.IndexSettings;
2931
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
3032
import org.elasticsearch.index.analysis.Analysis;
33+
import org.elasticsearch.index.analysis.TokenFilterFactory;
3134

3235
public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory {
3336

37+
private static final DeprecationLogger DEPRECATION_LOGGER
38+
= new DeprecationLogger(LogManager.getLogger(CommonGramsTokenFilterFactory.class));
39+
3440
private final CharArraySet words;
3541

3642
private final boolean ignoreCase;
@@ -60,5 +66,12 @@ public TokenStream create(TokenStream tokenStream) {
6066
return filter;
6167
}
6268
}
69+
70+
@Override
71+
public TokenFilterFactory getSynonymFilter() {
72+
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
73+
+ "] will not be usable to parse synonyms after v7.0");
74+
return this;
75+
}
6376
}
6477

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenFilterFactory.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,24 @@
1919

2020
package org.elasticsearch.analysis.common;
2121

22+
import org.apache.logging.log4j.LogManager;
2223
import org.apache.lucene.analysis.TokenStream;
2324
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
2425
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
2526
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
27+
import org.elasticsearch.common.logging.DeprecationLogger;
2628
import org.elasticsearch.common.settings.Settings;
2729
import org.elasticsearch.env.Environment;
2830
import org.elasticsearch.index.IndexSettings;
2931
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
32+
import org.elasticsearch.index.analysis.TokenFilterFactory;
3033

3134

3235
public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {
3336

37+
private static final DeprecationLogger DEPRECATION_LOGGER
38+
= new DeprecationLogger(LogManager.getLogger(EdgeNGramTokenFilterFactory.class));
39+
3440
private final int minGram;
3541

3642
private final int maxGram;
@@ -77,4 +83,11 @@ public TokenStream create(TokenStream tokenStream) {
7783
public boolean breaksFastVectorHighlighter() {
7884
return true;
7985
}
86+
87+
@Override
88+
public TokenFilterFactory getSynonymFilter() {
89+
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
90+
+ "] will not be usable to parse synonyms after v7.0");
91+
return this;
92+
}
8093
}

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FingerprintTokenFilterFactory.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,24 @@
1919

2020
package org.elasticsearch.analysis.common;
2121

22+
import org.apache.logging.log4j.LogManager;
2223
import org.apache.lucene.analysis.TokenStream;
2324
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
25+
import org.elasticsearch.common.logging.DeprecationLogger;
2426
import org.elasticsearch.common.settings.Settings;
2527
import org.elasticsearch.env.Environment;
2628
import org.elasticsearch.index.IndexSettings;
2729
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
30+
import org.elasticsearch.index.analysis.TokenFilterFactory;
2831

2932
import static org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.DEFAULT_MAX_OUTPUT_SIZE;
3033
import static org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.MAX_OUTPUT_SIZE;
3134

3235
public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory {
3336

37+
private static final DeprecationLogger DEPRECATION_LOGGER
38+
= new DeprecationLogger(LogManager.getLogger(FingerprintTokenFilterFactory.class));
39+
3440
private final char separator;
3541
private final int maxOutputSize;
3642

@@ -47,4 +53,11 @@ public TokenStream create(TokenStream tokenStream) {
4753
return result;
4854
}
4955

56+
@Override
57+
public TokenFilterFactory getSynonymFilter() {
58+
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
59+
+ "] will not be usable to parse synonyms after v7.0");
60+
return this;
61+
}
62+
5063
}

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,14 @@
1919

2020
package org.elasticsearch.analysis.common;
2121

22+
import org.apache.logging.log4j.LogManager;
2223
import org.apache.lucene.analysis.TokenFilter;
2324
import org.apache.lucene.analysis.TokenStream;
2425
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
2526
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
2627
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
2728
import org.elasticsearch.common.Strings;
29+
import org.elasticsearch.common.logging.DeprecationLogger;
2830
import org.elasticsearch.common.settings.Settings;
2931
import org.elasticsearch.env.Environment;
3032
import org.elasticsearch.index.IndexSettings;
@@ -40,6 +42,9 @@
4042

4143
public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory {
4244

45+
private static final DeprecationLogger DEPRECATION_LOGGER
46+
= new DeprecationLogger(LogManager.getLogger(MultiplexerTokenFilterFactory.class));
47+
4348
private List<String> filterNames;
4449
private final boolean preserveOriginal;
4550

@@ -54,6 +59,17 @@ public TokenStream create(TokenStream tokenStream) {
5459
throw new UnsupportedOperationException("TokenFilterFactory.getChainAwareTokenFilterFactory() must be called first");
5560
}
5661

62+
@Override
63+
public TokenFilterFactory getSynonymFilter() {
64+
if (preserveOriginal) {
65+
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
66+
+ "] will not be usable to parse synonyms after v7.0");
67+
return IDENTITY_FILTER;
68+
}
69+
throw new IllegalArgumentException("Token filter [" + name()
70+
+ "] cannot be used to parse synonyms unless [preserve_original] is [true]");
71+
}
72+
5773
@Override
5874
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
5975
List<TokenFilterFactory> previousTokenFilters,
@@ -98,7 +114,13 @@ public TokenStream create(TokenStream tokenStream) {
98114

99115
@Override
100116
public TokenFilterFactory getSynonymFilter() {
101-
return IDENTITY_FILTER;
117+
if (preserveOriginal) {
118+
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
119+
+ "] will not be usable to parse synonyms after v7.0");
120+
return IDENTITY_FILTER;
121+
}
122+
throw new IllegalArgumentException("Token filter [" + name()
123+
+ "] cannot be used to parse synonyms unless [preserve_original] is [true]");
102124
}
103125
};
104126
}

0 commit comments

Comments
 (0)