From e2f6b4430621f99b809995a578da109bf37b04bc Mon Sep 17 00:00:00 2001 From: Luca Cavanna Date: Tue, 17 Sep 2024 15:10:49 +0200 Subject: [PATCH 1/3] Remove deprecations and 7.x related code from analysis common The camel-case names edgeNGram and nGram for the edge_ngram and ngram tokenizers and token filters were deprecated. They have been rejected for indices created on or after 8.0, hence their support can be removed entirely from main. The version-related logic that picked the default min/max gram sizes of the pre-configured edge_ngram tokenizer can also be removed, as it only applies to 7.x indices, which we no longer need to support. Relates to #50376, #50862, #43568 --- .../analysis/common/CommonAnalysisPlugin.java | 132 +------------ .../common/CommonAnalysisPluginTests.java | 186 ------------------ .../common/EdgeNGramTokenizerTests.java | 3 +- 3 files changed, 7 insertions(+), 314 deletions(-) delete mode 100644 modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index dec2526db8515..a97154fd4d1ff 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -101,12 +101,7 @@ import org.apache.lucene.analysis.tr.TurkishAnalyzer; import org.apache.lucene.analysis.util.ElisionFilter; import org.apache.lucene.util.SetOnce; -import org.elasticsearch.common.logging.DeprecationCategory; -import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.regex.Regex; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.env.Environment; -import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.analysis.AnalyzerProvider; import 
org.elasticsearch.index.analysis.CharFilterFactory; @@ -139,8 +134,6 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, ScriptPlugin { - private static final DeprecationLogger deprecationLogger = DeprecationLogger.getLogger(CommonAnalysisPlugin.class); - private final SetOnce scriptServiceHolder = new SetOnce<>(); private final SetOnce synonymsManagementServiceHolder = new SetOnce<>(); @@ -231,28 +224,6 @@ public Map> getTokenFilters() { filters.put("dictionary_decompounder", requiresAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new)); filters.put("dutch_stem", DutchStemTokenFilterFactory::new); filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new); - filters.put("edgeNGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> { - return new EdgeNGramTokenFilterFactory(indexSettings, environment, name, settings) { - @Override - public TokenStream create(TokenStream tokenStream) { - if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.V_8_0_0)) { - throw new IllegalArgumentException( - "The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. " - + "Please change the filter name to [edge_ngram] instead." - ); - } else { - deprecationLogger.warn( - DeprecationCategory.ANALYSIS, - "edgeNGram_deprecation", - "The [edgeNGram] token filter name is deprecated and will be removed in a future version. " - + "Please change the filter name to [edge_ngram] instead." 
- ); - } - return super.create(tokenStream); - } - - }; - }); filters.put("elision", requiresAnalysisSettings(ElisionTokenFilterFactory::new)); filters.put("fingerprint", FingerprintTokenFilterFactory::new); filters.put("flatten_graph", FlattenGraphTokenFilterFactory::new); @@ -272,28 +243,6 @@ public TokenStream create(TokenStream tokenStream) { filters.put("min_hash", MinHashTokenFilterFactory::new); filters.put("multiplexer", MultiplexerTokenFilterFactory::new); filters.put("ngram", NGramTokenFilterFactory::new); - filters.put("nGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> { - return new NGramTokenFilterFactory(indexSettings, environment, name, settings) { - @Override - public TokenStream create(TokenStream tokenStream) { - if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.V_8_0_0)) { - throw new IllegalArgumentException( - "The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. " - + "Please change the filter name to [ngram] instead." - ); - } else { - deprecationLogger.warn( - DeprecationCategory.ANALYSIS, - "nGram_deprecation", - "The [nGram] token filter name is deprecated and will be removed in a future version. " - + "Please change the filter name to [ngram] instead." 
- ); - } - return super.create(tokenStream); - } - - }; - }); filters.put("pattern_capture", requiresAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new)); filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new)); filters.put("persian_normalization", PersianNormalizationFilterFactory::new); @@ -345,39 +294,7 @@ public Map> getTokenizers() { tokenizers.put("simple_pattern", SimplePatternTokenizerFactory::new); tokenizers.put("simple_pattern_split", SimplePatternSplitTokenizerFactory::new); tokenizers.put("thai", ThaiTokenizerFactory::new); - tokenizers.put("nGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> { - if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.V_8_0_0)) { - throw new IllegalArgumentException( - "The [nGram] tokenizer name was deprecated in 7.6. " - + "Please use the tokenizer name to [ngram] for indices created in versions 8 or higher instead." - ); - } else if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.V_7_6_0)) { - deprecationLogger.warn( - DeprecationCategory.ANALYSIS, - "nGram_tokenizer_deprecation", - "The [nGram] tokenizer name is deprecated and will be removed in a future version. " - + "Please change the tokenizer name to [ngram] instead." - ); - } - return new NGramTokenizerFactory(indexSettings, environment, name, settings); - }); tokenizers.put("ngram", NGramTokenizerFactory::new); - tokenizers.put("edgeNGram", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> { - if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.V_8_0_0)) { - throw new IllegalArgumentException( - "The [edgeNGram] tokenizer name was deprecated in 7.6. " - + "Please use the tokenizer name to [edge_nGram] for indices created in versions 8 or higher instead." 
- ); - } else if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.V_7_6_0)) { - deprecationLogger.warn( - DeprecationCategory.ANALYSIS, - "edgeNGram_tokenizer_deprecation", - "The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. " - + "Please change the tokenizer name to [edge_ngram] instead." - ); - } - return new EdgeNGramTokenizerFactory(indexSettings, environment, name, settings); - }); tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new); tokenizers.put("char_group", CharGroupTokenizerFactory::new); tokenizers.put("classic", ClassicTokenizerFactory::new); @@ -588,54 +505,17 @@ public List getPreConfiguredTokenizers() { tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new)); tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new)); tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new)); - tokenizers.add(PreConfiguredTokenizer.indexVersion("edge_ngram", (version) -> { - if (version.onOrAfter(IndexVersions.V_7_3_0)) { - return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE); - } - return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE); - })); + tokenizers.add( + PreConfiguredTokenizer.indexVersion( + "edge_ngram", + (version) -> new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE) + ) + ); tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1))); tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new)); // TODO deprecate and remove in API // This is already broken with normalization, so backwards compat isn't necessary? tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new)); - - // Temporary shim for aliases. 
TODO deprecate after they are moved - tokenizers.add(PreConfiguredTokenizer.indexVersion("nGram", (version) -> { - if (version.onOrAfter(IndexVersions.V_8_0_0)) { - throw new IllegalArgumentException( - "The [nGram] tokenizer name was deprecated in 7.6. " - + "Please use the tokenizer name to [ngram] for indices created in versions 8 or higher instead." - ); - } else if (version.onOrAfter(IndexVersions.V_7_6_0)) { - deprecationLogger.warn( - DeprecationCategory.ANALYSIS, - "nGram_tokenizer_deprecation", - "The [nGram] tokenizer name is deprecated and will be removed in a future version. " - + "Please change the tokenizer name to [ngram] instead." - ); - } - return new NGramTokenizer(); - })); - tokenizers.add(PreConfiguredTokenizer.indexVersion("edgeNGram", (version) -> { - if (version.onOrAfter(IndexVersions.V_8_0_0)) { - throw new IllegalArgumentException( - "The [edgeNGram] tokenizer name was deprecated in 7.6. " - + "Please use the tokenizer name to [edge_ngram] for indices created in versions 8 or higher instead." - ); - } else if (version.onOrAfter(IndexVersions.V_7_6_0)) { - deprecationLogger.warn( - DeprecationCategory.ANALYSIS, - "edgeNGram_tokenizer_deprecation", - "The [edgeNGram] tokenizer name is deprecated and will be removed in a future version. " - + "Please change the tokenizer name to [edge_ngram] instead." 
- ); - } - if (version.onOrAfter(IndexVersions.V_7_3_0)) { - return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE); - } - return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE); - })); tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new)); return tokenizers; diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java deleted file mode 100644 index 3263704d38e1d..0000000000000 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisPluginTests.java +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the "Elastic License - * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side - * Public License v 1"; you may not use this file except in compliance with, at - * your election, the "Elastic License 2.0", the "GNU Affero General Public - * License v3.0 only", or the "Server Side Public License, v 1". 
- */ - -package org.elasticsearch.analysis.common; - -import org.apache.lucene.analysis.Tokenizer; -import org.elasticsearch.cluster.metadata.IndexMetadata; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.env.Environment; -import org.elasticsearch.index.IndexVersion; -import org.elasticsearch.index.IndexVersions; -import org.elasticsearch.index.analysis.TokenizerFactory; -import org.elasticsearch.test.ESTestCase; -import org.elasticsearch.test.IndexSettingsModule; -import org.elasticsearch.test.index.IndexVersionUtils; - -import java.io.IOException; -import java.util.Map; - -public class CommonAnalysisPluginTests extends ESTestCase { - - /** - * Check that the deprecated "nGram" filter throws exception for indices created since 7.0.0 and - * logs a warning for earlier indices when the filter is used as a custom filter - */ - public void testNGramFilterInCustomAnalyzerDeprecationError() throws IOException { - final Settings settings = Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()) - .put( - IndexMetadata.SETTING_VERSION_CREATED, - IndexVersionUtils.randomVersionBetween(random(), IndexVersions.V_8_0_0, IndexVersion.current()) - ) - .put("index.analysis.analyzer.custom_analyzer.type", "custom") - .put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard") - .putList("index.analysis.analyzer.custom_analyzer.filter", "my_ngram") - .put("index.analysis.filter.my_ngram.type", "nGram") - .build(); - - try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) { - IllegalArgumentException ex = expectThrows( - IllegalArgumentException.class, - () -> createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin) - ); - assertEquals( - "The [nGram] token filter name was deprecated in 6.4 and cannot be used in new indices. 
" - + "Please change the filter name to [ngram] instead.", - ex.getMessage() - ); - } - } - - /** - * Check that the deprecated "edgeNGram" filter throws exception for indices created since 7.0.0 and - * logs a warning for earlier indices when the filter is used as a custom filter - */ - public void testEdgeNGramFilterInCustomAnalyzerDeprecationError() throws IOException { - final Settings settings = Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()) - .put( - IndexMetadata.SETTING_VERSION_CREATED, - IndexVersionUtils.randomVersionBetween(random(), IndexVersions.V_8_0_0, IndexVersion.current()) - ) - .put("index.analysis.analyzer.custom_analyzer.type", "custom") - .put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard") - .putList("index.analysis.analyzer.custom_analyzer.filter", "my_ngram") - .put("index.analysis.filter.my_ngram.type", "edgeNGram") - .build(); - - try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) { - IllegalArgumentException ex = expectThrows( - IllegalArgumentException.class, - () -> createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin) - ); - assertEquals( - "The [edgeNGram] token filter name was deprecated in 6.4 and cannot be used in new indices. 
" - + "Please change the filter name to [edge_ngram] instead.", - ex.getMessage() - ); - } - } - - /** - * Check that we log a deprecation warning for "nGram" and "edgeNGram" tokenizer names with 7.6 and - * disallow usages for indices created after 8.0 - */ - public void testNGramTokenizerDeprecation() throws IOException { - expectThrows( - IllegalArgumentException.class, - () -> doTestPrebuiltTokenizerDeprecation( - "nGram", - "ngram", - IndexVersionUtils.randomVersionBetween(random(), IndexVersions.V_8_0_0, IndexVersion.current()), - true - ) - ); - expectThrows( - IllegalArgumentException.class, - () -> doTestPrebuiltTokenizerDeprecation( - "edgeNGram", - "edge_ngram", - IndexVersionUtils.randomVersionBetween(random(), IndexVersions.V_8_0_0, IndexVersion.current()), - true - ) - ); - expectThrows( - IllegalArgumentException.class, - () -> doTestCustomTokenizerDeprecation( - "nGram", - "ngram", - IndexVersionUtils.randomVersionBetween(random(), IndexVersions.V_8_0_0, IndexVersion.current()), - true - ) - ); - expectThrows( - IllegalArgumentException.class, - () -> doTestCustomTokenizerDeprecation( - "edgeNGram", - "edge_ngram", - IndexVersionUtils.randomVersionBetween(random(), IndexVersions.V_8_0_0, IndexVersion.current()), - true - ) - ); - } - - public void doTestPrebuiltTokenizerDeprecation(String deprecatedName, String replacement, IndexVersion version, boolean expectWarning) - throws IOException { - final Settings settings = Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()) - .put(IndexMetadata.SETTING_VERSION_CREATED, version) - .build(); - - try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) { - Map tokenizers = createTestAnalysis( - IndexSettingsModule.newIndexSettings("index", settings), - settings, - commonAnalysisPlugin - ).tokenizer; - TokenizerFactory tokenizerFactory = tokenizers.get(deprecatedName); - - Tokenizer tokenizer = tokenizerFactory.create(); - assertNotNull(tokenizer); - if 
(expectWarning) { - assertWarnings( - "The [" - + deprecatedName - + "] tokenizer name is deprecated and will be removed in a future version. " - + "Please change the tokenizer name to [" - + replacement - + "] instead." - ); - } - } - } - - public void doTestCustomTokenizerDeprecation(String deprecatedName, String replacement, IndexVersion version, boolean expectWarning) - throws IOException { - final Settings settings = Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()) - .put(IndexMetadata.SETTING_VERSION_CREATED, version) - .put("index.analysis.analyzer.custom_analyzer.type", "custom") - .put("index.analysis.analyzer.custom_analyzer.tokenizer", "my_tokenizer") - .put("index.analysis.tokenizer.my_tokenizer.type", deprecatedName) - .build(); - - try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) { - createTestAnalysis(IndexSettingsModule.newIndexSettings("index", settings), settings, commonAnalysisPlugin); - - if (expectWarning) { - assertWarnings( - "The [" - + deprecatedName - + "] tokenizer name is deprecated and will be removed in a future version. " - + "Please change the tokenizer name to [" - + replacement - + "] instead." 
- ); - } - } - } -} diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java index 286e3f1f939d8..329318a096efb 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerTests.java @@ -31,7 +31,7 @@ public class EdgeNGramTokenizerTests extends ESTokenStreamTestCase { - private IndexAnalyzers buildAnalyzers(IndexVersion version, String tokenizer) throws IOException { + private static IndexAnalyzers buildAnalyzers(IndexVersion version, String tokenizer) throws IOException { Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(); Settings indexSettings = Settings.builder() .put(IndexMetadata.SETTING_VERSION_CREATED, version) @@ -51,7 +51,6 @@ public void testPreConfiguredTokenizer() throws IOException { assertNotNull(analyzer); assertAnalyzesTo(analyzer, "test", new String[] { "t", "te" }); } - } public void testCustomTokenChars() throws IOException { From b96fc708bec890b47a09537e4bbc9d36a2bdea71 Mon Sep 17 00:00:00 2001 From: Luca Cavanna Date: Tue, 17 Sep 2024 15:28:56 +0200 Subject: [PATCH 2/3] Update docs/changelog/113009.yaml --- docs/changelog/113009.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 docs/changelog/113009.yaml diff --git a/docs/changelog/113009.yaml b/docs/changelog/113009.yaml new file mode 100644 index 0000000000000..a069f060c8276 --- /dev/null +++ b/docs/changelog/113009.yaml @@ -0,0 +1,12 @@ +pr: 113009 +summary: Remove deprecations and 7.x related code from analysis common +area: Analysis +type: breaking +issues: [] +breaking: + title: Remove deprecations and 7.x related code from analysis common + area: Analysis + details: Please describe the 
details of this change for the release notes. You can + use asciidoc. + impact: Please describe the impact of this change to users + notable: false From 9d0f5604900a0d1547952122e234beb80db0320a Mon Sep 17 00:00:00 2001 From: Luca Cavanna Date: Tue, 17 Sep 2024 16:11:47 +0200 Subject: [PATCH 3/3] Delete docs/changelog/113009.yaml --- docs/changelog/113009.yaml | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 docs/changelog/113009.yaml diff --git a/docs/changelog/113009.yaml b/docs/changelog/113009.yaml deleted file mode 100644 index a069f060c8276..0000000000000 --- a/docs/changelog/113009.yaml +++ /dev/null @@ -1,12 +0,0 @@ -pr: 113009 -summary: Remove deprecations and 7.x related code from analysis common -area: Analysis -type: breaking -issues: [] -breaking: - title: Remove deprecations and 7.x related code from analysis common - area: Analysis - details: Please describe the details of this change for the release notes. You can - use asciidoc. - impact: Please describe the impact of this change to users - notable: false