Skip to content

Commit 9a4357a

Browse files
author
Christoph Büscher
authored
Deprecate and remove camel-case nGram and edgeNGram tokenizers (#50862)
We already deprecated and removed the camel-case versions of the nGram and edgeNGram filters a while ago and we should do the same with the nGram and edgeNGram tokenizers. This PR deprecates the use of these names in favour of ngram and edge_ngram in 7 and disallows usage in new indices starting with 8. Closes #50561
1 parent e349c5e commit 9a4357a

File tree

4 files changed

+135
-5
lines changed

4 files changed

+135
-5
lines changed

docs/reference/migration/migrate_8_0/analysis.asciidoc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,12 @@
1616
The `nGram` and `edgeNGram` token filter names that have been deprecated since
1717
version 6.4 have been removed. Both token filters can only be used by their
1818
alternative names `ngram` and `edge_ngram` since version 7.0.
19+
20+
[float]
21+
[[nGram-edgeNGram-tokenizer-dreprecation]]
22+
==== Disallow use of the `nGram` and `edgeNGram` tokenizer names
23+
24+
The `nGram` and `edgeNGram` tokenizer names have been deprecated since 7.6 and are no longer
25+
supported on new indices. Mappings for indices created in 7.6 or a later 7.x version will continue to work but
26+
emit a deprecation warning. The tokenizer name should be changed to the fully equivalent
27+
`ngram` or `edge_ngram` names for new indices and in index templates.

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -337,9 +337,29 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
337337
tokenizers.put("simple_pattern", SimplePatternTokenizerFactory::new);
338338
tokenizers.put("simple_pattern_split", SimplePatternSplitTokenizerFactory::new);
339339
tokenizers.put("thai", ThaiTokenizerFactory::new);
340-
tokenizers.put("nGram", NGramTokenizerFactory::new);
340+
tokenizers.put("nGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
341+
if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_8_0_0)) {
342+
throw new IllegalArgumentException("The [nGram] tokenizer name was deprecated in 7.6. "
343+
+ "Please use the tokenizer name to [ngram] for indices created in versions 8 or higher instead.");
344+
} else if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
345+
deprecationLogger.deprecatedAndMaybeLog("nGram_tokenizer_deprecation",
346+
"The [nGram] tokenizer name is deprecated and will be removed in a future version. "
347+
+ "Please change the tokenizer name to [ngram] instead.");
348+
}
349+
return new NGramTokenizerFactory(indexSettings, environment, name, settings);
350+
});
341351
tokenizers.put("ngram", NGramTokenizerFactory::new);
342-
tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
352+
tokenizers.put("edgeNGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
353+
if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_8_0_0)) {
354+
throw new IllegalArgumentException("The [edgeNGram] tokenizer name was deprecated in 7.6. "
355+
+ "Please use the tokenizer name to [edge_ngram] for indices created in versions 8 or higher instead.");
356+
} else if (indexSettings.getIndexVersionCreated().onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
357+
deprecationLogger.deprecatedAndMaybeLog("edgeNGram_tokenizer_deprecation",
358+
"The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. "
359+
+ "Please change the tokenizer name to [edge_ngram] instead.");
360+
}
361+
return new EdgeNGramTokenizerFactory(indexSettings, environment, name, settings);
362+
});
343363
tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
344364
tokenizers.put("char_group", CharGroupTokenizerFactory::new);
345365
tokenizers.put("classic", ClassicTokenizerFactory::new);
@@ -522,8 +542,26 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
522542
tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new));
523543

524544
// Temporary shim for aliases. TODO deprecate after they are moved
525-
tokenizers.add(PreConfiguredTokenizer.singleton("nGram", NGramTokenizer::new));
545+
tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("nGram", (version) -> {
546+
if (version.onOrAfter(org.elasticsearch.Version.V_8_0_0)) {
547+
throw new IllegalArgumentException("The [nGram] tokenizer name was deprecated in 7.6. "
548+
+ "Please use the tokenizer name to [ngram] for indices created in versions 8 or higher instead.");
549+
} else if (version.onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
550+
deprecationLogger.deprecatedAndMaybeLog("nGram_tokenizer_deprecation",
551+
"The [nGram] tokenizer name is deprecated and will be removed in a future version. "
552+
+ "Please change the tokenizer name to [ngram] instead.");
553+
}
554+
return new NGramTokenizer();
555+
}));
526556
tokenizers.add(PreConfiguredTokenizer.elasticsearchVersion("edgeNGram", (version) -> {
557+
if (version.onOrAfter(org.elasticsearch.Version.V_8_0_0)) {
558+
throw new IllegalArgumentException("The [edgeNGram] tokenizer name was deprecated in 7.6. "
559+
+ "Please use the tokenizer name to [edge_ngram] for indices created in versions 8 or higher instead.");
560+
} else if (version.onOrAfter(org.elasticsearch.Version.V_7_6_0)) {
561+
deprecationLogger.deprecatedAndMaybeLog("edgeNGram_tokenizer_deprecation",
562+
"The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. "
563+
+ "Please change the tokenizer name to [edge_ngram] instead.");
564+
}
527565
if (version.onOrAfter(Version.V_7_3_0)) {
528566
return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
529567
}

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,18 @@
1919

2020
package org.elasticsearch.analysis.common;
2121

22+
import org.apache.lucene.analysis.Tokenizer;
2223
import org.elasticsearch.Version;
2324
import org.elasticsearch.cluster.metadata.IndexMetaData;
2425
import org.elasticsearch.common.settings.Settings;
2526
import org.elasticsearch.env.Environment;
27+
import org.elasticsearch.index.analysis.TokenizerFactory;
2628
import org.elasticsearch.test.ESTestCase;
2729
import org.elasticsearch.test.IndexSettingsModule;
2830
import org.elasticsearch.test.VersionUtils;
2931

3032
import java.io.IOException;
33+
import java.util.Map;
3134

3235
public class CommonAnalysisPluginTests extends ESTestCase {
3336

@@ -102,4 +105,82 @@ public void testEdgeNGramFilterInCustomAnalyzerDeprecationError() throws IOExcep
102105
+ "Please change the filter name to [edge_ngram] instead.");
103106
}
104107
}
108+
109+
/**
110+
* Check that we log a deprecation warning for "nGram" and "edgeNGram" tokenizer names with 7.6 and
111+
* disallow usages for indices created on or after 8.0
112+
*/
113+
public void testNGramTokenizerDeprecation() throws IOException {
114+
// tests for prebuilt tokenizer
115+
doTestPrebuiltTokenizerDeprecation("nGram", "ngram",
116+
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
117+
doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram",
118+
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
119+
doTestPrebuiltTokenizerDeprecation("nGram", "ngram",
120+
VersionUtils.randomVersionBetween(random(), Version.V_7_6_0,
121+
Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))),
122+
true);
123+
doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram",
124+
VersionUtils.randomVersionBetween(random(), Version.V_7_6_0,
125+
Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))), true);
126+
expectThrows(IllegalArgumentException.class, () -> doTestPrebuiltTokenizerDeprecation("nGram", "ngram",
127+
VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true));
128+
expectThrows(IllegalArgumentException.class, () -> doTestPrebuiltTokenizerDeprecation("edgeNGram", "edge_ngram",
129+
VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true));
130+
131+
// same batch of tests for custom tokenizer definition in the settings
132+
doTestCustomTokenizerDeprecation("nGram", "ngram",
133+
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
134+
doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram",
135+
VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.V_7_5_2), false);
136+
doTestCustomTokenizerDeprecation("nGram", "ngram",
137+
VersionUtils.randomVersionBetween(random(), Version.V_7_6_0,
138+
Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))),
139+
true);
140+
doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram",
141+
VersionUtils.randomVersionBetween(random(), Version.V_7_6_0,
142+
Version.max(Version.V_7_6_0, VersionUtils.getPreviousVersion(Version.V_8_0_0))), true);
143+
expectThrows(IllegalArgumentException.class, () -> doTestCustomTokenizerDeprecation("nGram", "ngram",
144+
VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true));
145+
expectThrows(IllegalArgumentException.class, () -> doTestCustomTokenizerDeprecation("edgeNGram", "edge_ngram",
146+
VersionUtils.randomVersionBetween(random(), Version.V_8_0_0, Version.CURRENT), true));
147+
}
148+
149+
public void doTestPrebuiltTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning)
150+
throws IOException {
151+
final Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
152+
.put(IndexMetaData.SETTING_VERSION_CREATED, version).build();
153+
154+
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
155+
Map<String, TokenizerFactory> tokenizers = createTestAnalysis(
156+
IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin).tokenizer;
157+
TokenizerFactory tokenizerFactory = tokenizers.get(deprecatedName);
158+
159+
Tokenizer tokenizer = tokenizerFactory.create();
160+
assertNotNull(tokenizer);
161+
if (expectWarning) {
162+
assertWarnings("The [" + deprecatedName + "] tokenizer name is deprecated and will be removed in a future version. "
163+
+ "Please change the tokenizer name to [" + replacement + "] instead.");
164+
}
165+
}
166+
}
167+
168+
public void doTestCustomTokenizerDeprecation(String deprecatedName, String replacement, Version version, boolean expectWarning)
169+
throws IOException {
170+
final Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
171+
.put(IndexMetaData.SETTING_VERSION_CREATED, version)
172+
.put("index.analysis.analyzer.custom_analyzer.type", "custom")
173+
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "my_tokenizer")
174+
.put("index.analysis.tokenizer.my_tokenizer.type", deprecatedName)
175+
.build();
176+
177+
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
178+
createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin);
179+
180+
if (expectWarning) {
181+
assertWarnings("The [" + deprecatedName + "] tokenizer name is deprecated and will be removed in a future version. "
182+
+ "Please change the tokenizer name to [" + replacement + "] instead.");
183+
}
184+
}
185+
}
105186
}

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,11 @@ public void testPreConfiguredTokenizer() throws IOException {
8686
}
8787
}
8888

89-
// Check deprecated name as well
89+
// Check deprecated name as well, needs version before 8.0 because throws IAE after that
9090
{
91-
try (IndexAnalyzers indexAnalyzers = buildAnalyzers(Version.CURRENT, "edgeNGram")) {
91+
try (IndexAnalyzers indexAnalyzers = buildAnalyzers(
92+
VersionUtils.randomVersionBetween(random(), Version.V_7_3_0, VersionUtils.getPreviousVersion(Version.V_8_0_0)),
93+
"edgeNGram")) {
9294
NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
9395
assertNotNull(analyzer);
9496
assertAnalyzesTo(analyzer, "test", new String[]{"t", "te"});

0 commit comments

Comments
 (0)