diff --git a/docs/plugins/analysis-phonetic.asciidoc b/docs/plugins/analysis-phonetic.asciidoc
index a75c21fdac658..9d9df4827fd4e 100644
--- a/docs/plugins/analysis-phonetic.asciidoc
+++ b/docs/plugins/analysis-phonetic.asciidoc
@@ -38,7 +38,6 @@ PUT phonetic_sample
         "my_analyzer": {
           "tokenizer": "standard",
           "filter": [
-            "standard",
             "lowercase",
             "my_metaphone"
           ]
diff --git a/docs/reference/analysis/analyzers/standard-analyzer.asciidoc b/docs/reference/analysis/analyzers/standard-analyzer.asciidoc
index 20aa072066b5f..1a8a9ee3aa638 100644
--- a/docs/reference/analysis/analyzers/standard-analyzer.asciidoc
+++ b/docs/reference/analysis/analyzers/standard-analyzer.asciidoc
@@ -292,7 +292,6 @@ PUT /standard_example
         "rebuilt_standard": {
           "tokenizer": "standard",
           "filter": [
-            "standard",
             "lowercase"       <1>
           ]
         }
diff --git a/docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc
index 73d35549da8b6..bd22b013334a9 100644
--- a/docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc
@@ -15,7 +15,7 @@ PUT /asciifold_example
             "analyzer" : {
                 "default" : {
                     "tokenizer" : "standard",
-                    "filter" : ["standard", "asciifolding"]
+                    "filter" : ["asciifolding"]
                 }
             }
         }
@@ -37,7 +37,7 @@ PUT /asciifold_example
             "analyzer" : {
                 "default" : {
                     "tokenizer" : "standard",
-                    "filter" : ["standard", "my_ascii_folding"]
+                    "filter" : ["my_ascii_folding"]
                 }
             },
             "filter" : {
diff --git a/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc
index 956c5ad13d034..924903b9f65a8 100644
--- a/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc
@@ -16,7 +16,7 @@ PUT /elision_example
         "analyzer" : {
             "default" : {
                 "tokenizer" : "standard",
-                "filter" : ["standard", "elision"]
+                "filter" : ["elision"]
             }
         },
         "filter" : {
diff --git a/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc
index 05687f8669155..33a927c4b98bf 100644
--- a/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc
@@ -26,7 +26,7 @@ PUT /keep_types_example
         "analyzer" : {
             "my_analyzer" : {
                 "tokenizer" : "standard",
-                "filter" : ["standard", "lowercase", "extract_numbers"]
+                "filter" : ["lowercase", "extract_numbers"]
             }
         },
         "filter" : {
@@ -87,7 +87,7 @@ PUT /keep_types_exclude_example
         "analyzer" : {
             "my_analyzer" : {
                 "tokenizer" : "standard",
-                "filter" : ["standard", "lowercase", "remove_numbers"]
+                "filter" : ["lowercase", "remove_numbers"]
             }
         },
         "filter" : {
diff --git a/docs/reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc
index 50c74942a0101..b7385379be94b 100644
--- a/docs/reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc
@@ -27,11 +27,11 @@ PUT /keep_words_example
         "analyzer" : {
             "example_1" : {
                 "tokenizer" : "standard",
-                "filter" : ["standard", "lowercase", "words_till_three"]
+                "filter" : ["lowercase", "words_till_three"]
             },
             "example_2" : {
                 "tokenizer" : "standard",
-                "filter" : ["standard", "lowercase", "words_in_file"]
+                "filter" : ["lowercase", "words_in_file"]
             }
         },
         "filter" : {
diff --git a/docs/reference/analysis/tokenfilters/snowball-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/snowball-tokenfilter.asciidoc
index 93e1eed26b4b2..99ed03649ff93 100644
--- a/docs/reference/analysis/tokenfilters/snowball-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/snowball-tokenfilter.asciidoc
@@ -19,7 +19,7 @@ PUT /my_index
         "analyzer" : {
             "my_analyzer" : {
                 "tokenizer" : "standard",
-                "filter" : ["standard", "lowercase", "my_snow"]
+                "filter" : ["lowercase", "my_snow"]
             }
         },
         "filter" : {
diff --git a/docs/reference/analysis/tokenfilters/standard-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/standard-tokenfilter.asciidoc
index 0270bf71b4b3e..46f4cbbec2c19 100644
--- a/docs/reference/analysis/tokenfilters/standard-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/standard-tokenfilter.asciidoc
@@ -1,10 +1,11 @@
 [[analysis-standard-tokenfilter]]
 === Standard Token Filter
 
+deprecated[6.5.0, This filter is deprecated and will be removed in the next
+major version.]
+
 A token filter of type `standard` that normalizes tokens extracted with
-the
-<<analysis-standard-tokenizer,Standard Tokenizer>>.
+the <<analysis-standard-tokenizer,Standard Tokenizer>>.
 
 [TIP]
 ==================================================
diff --git a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc
index a13c6746d74be..f59e2f3f2cf88 100644
--- a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc
@@ -13,7 +13,7 @@ PUT /my_index
         "analyzer" : {
             "my_analyzer" : {
                 "tokenizer" : "standard",
-                "filter" : ["standard", "lowercase", "my_stemmer"]
+                "filter" : ["lowercase", "my_stemmer"]
             }
         },
         "filter" : {
diff --git a/docs/reference/mapping/types/percolator.asciidoc b/docs/reference/mapping/types/percolator.asciidoc
index b5226b53ba0c7..6c9f9732cd9c0 100644
--- a/docs/reference/mapping/types/percolator.asciidoc
+++ b/docs/reference/mapping/types/percolator.asciidoc
@@ -446,7 +446,6 @@ PUT my_queries1
           "type": "custom",
           "tokenizer": "standard",
           "filter": [
-            "standard",
             "lowercase",
             "wildcard_edge_ngram"
           ]
@@ -597,7 +596,6 @@ PUT my_queries2
           "type": "custom",
           "tokenizer": "standard",
           "filter": [
-            "standard",
             "lowercase",
             "reverse",
             "wildcard_edge_ngram"
@@ -607,7 +605,6 @@ PUT my_queries2
           "type": "custom",
           "tokenizer": "standard",
           "filter": [
-            "standard",
             "lowercase",
             "reverse"
           ]
diff --git a/docs/reference/migration/migrate_6_0/analysis.asciidoc b/docs/reference/migration/migrate_6_0/analysis.asciidoc
index 4316a342eba86..8746f8ea72aec 100644
--- a/docs/reference/migration/migrate_6_0/analysis.asciidoc
+++ b/docs/reference/migration/migrate_6_0/analysis.asciidoc
@@ -20,4 +20,8 @@ To protect against this, the maximum number of characters that to be analyzed wi
 limited to 1000000 in the next major Elastic version. For this version, by default the
 limit is not set. A deprecation warning will be issued when an analyzed text exceeds
 1000000. The limit can be set for a particular index with the index setting
-`index.highlight.max_analyzed_offset`.
\ No newline at end of file
+`index.highlight.max_analyzed_offset`.
+
+==== `standard` filter has been deprecated
+The `standard` token filter has been deprecated because it doesn't change anything in
+the stream. It will be removed in the next major version.
\ No newline at end of file
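The claim in the migration note above, that the `standard` filter leaves the token stream untouched, is easy to check by hand. A minimal `_analyze` sketch, illustrative only and not part of this change; the sample text is arbitrary:

GET _analyze
{
  "tokenizer": "standard",
  "filter": ["standard", "lowercase"],
  "text": "The QUICK brown fox"
}

Dropping "standard" from the filter array should return the same four tokens (the, quick, brown, fox); on a node carrying this change, the variant above would additionally emit the deprecation warning introduced further down in this diff.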
diff --git a/docs/reference/search/suggesters/phrase-suggest.asciidoc b/docs/reference/search/suggesters/phrase-suggest.asciidoc
index 92138e7ecdfe0..3364c0db108c6 100644
--- a/docs/reference/search/suggesters/phrase-suggest.asciidoc
+++ b/docs/reference/search/suggesters/phrase-suggest.asciidoc
@@ -33,12 +33,12 @@ PUT test
           "trigram": {
             "type": "custom",
             "tokenizer": "standard",
-            "filter": ["standard", "shingle"]
+            "filter": ["shingle"]
           },
           "reverse": {
             "type": "custom",
             "tokenizer": "standard",
-            "filter": ["standard", "reverse"]
+            "filter": ["reverse"]
           }
         },
         "filter": {
diff --git a/plugins/analysis-icu/src/test/resources/rest-api-spec/test/analysis_icu/20_search.yml b/plugins/analysis-icu/src/test/resources/rest-api-spec/test/analysis_icu/20_search.yml
index 67ff1dab98483..89ef510c72b02 100644
--- a/plugins/analysis-icu/src/test/resources/rest-api-spec/test/analysis_icu/20_search.yml
+++ b/plugins/analysis-icu/src/test/resources/rest-api-spec/test/analysis_icu/20_search.yml
@@ -12,7 +12,7 @@
             analyzer:
               my_analyzer:
                 tokenizer: standard
-                filter: ["standard", "lowercase", "my_collator"]
+                filter: ["lowercase", "my_collator"]
             filter:
               my_collator:
                 type: icu_collation
diff --git a/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/10_metaphone.yml b/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/10_metaphone.yml
index 1f326fe3776d1..1be0d8525a1c6 100644
--- a/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/10_metaphone.yml
+++ b/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/10_metaphone.yml
@@ -13,7 +13,7 @@
             analyzer:
               my_analyzer:
                 tokenizer: standard
-                filter: ["standard", "lowercase", "my_metaphone"]
+                filter: ["lowercase", "my_metaphone"]
             filter:
               my_metaphone:
                 type: phonetic
diff --git a/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/20_double_metaphone.yml b/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/20_double_metaphone.yml
index 5af9f48aa808e..84b0129414c8e 100644
--- a/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/20_double_metaphone.yml
+++ b/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/20_double_metaphone.yml
@@ -13,7 +13,7 @@
             analyzer:
               my_analyzer:
                 tokenizer: standard
-                filter: ["standard", "lowercase", "my_metaphone"]
+                filter: ["lowercase", "my_metaphone"]
             filter:
               my_metaphone:
                 type: phonetic
diff --git a/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/30_beider_morse.yml b/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/30_beider_morse.yml
index 259b0adea745d..bdd1ddef388df 100644
--- a/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/30_beider_morse.yml
+++ b/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/30_beider_morse.yml
@@ -13,7 +13,7 @@
             analyzer:
               my_analyzer:
                 tokenizer: standard
-                filter: ["standard", "lowercase", "beider_morse"]
+                filter: ["lowercase", "beider_morse"]
             filter:
               beider_morse:
                 type: phonetic
diff --git a/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/40_search.yml b/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/40_search.yml
index 75c672172391c..34a5bfa1da14c 100644
--- a/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/40_search.yml
+++ b/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/40_search.yml
@@ -12,7 +12,7 @@
             analyzer:
               my_analyzer:
                 tokenizer: standard
-                filter: ["standard", "lowercase", "my_metaphone"]
+                filter: ["lowercase", "my_metaphone"]
             filter:
               my_metaphone:
                 type: phonetic
diff --git a/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/50_daitch_mokotoff.yml b/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/50_daitch_mokotoff.yml
index c67b6892bc993..bee4c8bf5f432 100644
--- a/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/50_daitch_mokotoff.yml
+++ b/plugins/analysis-phonetic/src/test/resources/rest-api-spec/test/analysis_phonetic/50_daitch_mokotoff.yml
@@ -13,7 +13,7 @@
             analyzer:
               my_analyzer:
                 tokenizer: standard
-                filter: ["standard", "lowercase", "daitch_mokotoff"]
+                filter: ["lowercase", "daitch_mokotoff"]
             filter:
               daitch_mokotoff:
                 type: phonetic
diff --git a/server/src/main/java/org/elasticsearch/index/analysis/StandardTokenFilterFactory.java b/server/src/main/java/org/elasticsearch/index/analysis/StandardTokenFilterFactory.java
deleted file mode 100644
index 2339815b5582e..0000000000000
--- a/server/src/main/java/org/elasticsearch/index/analysis/StandardTokenFilterFactory.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.elasticsearch.index.analysis;
-
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.StandardFilter;
-import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.IndexSettings;
-
-
-public class StandardTokenFilterFactory extends AbstractTokenFilterFactory {
-
-    public StandardTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, name, settings);
-    }
-
-    @Override
-    public TokenStream create(TokenStream tokenStream) {
-        return new StandardFilter(tokenStream);
-    }
-}
\ No newline at end of file
diff --git a/server/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/server/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
index 1ecdc797073cf..d180704b795f8 100644
--- a/server/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
+++ b/server/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
@@ -19,14 +19,17 @@
 
 package org.elasticsearch.indices.analysis;
 
+import org.apache.logging.log4j.LogManager;
 import org.apache.lucene.analysis.LowerCaseFilter;
-import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.elasticsearch.Version;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.NamedRegistry;
+import org.elasticsearch.common.logging.DeprecationLogger;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
 import org.elasticsearch.index.analysis.AnalysisRegistry;
 import org.elasticsearch.index.analysis.AnalyzerProvider;
 import org.elasticsearch.index.analysis.CharFilterFactory;
@@ -39,7 +42,6 @@
 import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
 import org.elasticsearch.index.analysis.SimpleAnalyzerProvider;
 import org.elasticsearch.index.analysis.StandardAnalyzerProvider;
-import org.elasticsearch.index.analysis.StandardTokenFilterFactory;
 import org.elasticsearch.index.analysis.StandardTokenizerFactory;
 import org.elasticsearch.index.analysis.StopAnalyzerProvider;
 import org.elasticsearch.index.analysis.StopTokenFilterFactory;
@@ -69,6 +71,8 @@ public final class AnalysisModule {
 
     private static final IndexSettings NA_INDEX_SETTINGS;
 
+    private static final DeprecationLogger DEPRECATION_LOGGER = new DeprecationLogger(LogManager.getLogger(AnalysisModule.class));
+
     private final HunspellService hunspellService;
     private final AnalysisRegistry analysisRegistry;
 
@@ -116,7 +120,16 @@ private NamedRegistry<AnalysisProvider<TokenFilterFactory>> setupTokenFilters(Li
                                                                                       hunspellService) {
         NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = new NamedRegistry<>("token_filter");
         tokenFilters.register("stop", StopTokenFilterFactory::new);
-        tokenFilters.register("standard", StandardTokenFilterFactory::new);
+        tokenFilters.register("standard", (indexSettings, environment, name, settings) -> {
+            DEPRECATION_LOGGER.deprecatedAndMaybeLog("standard_deprecation",
+                "The [standard] token filter name is deprecated and will be removed in a future version.");
+            return new AbstractTokenFilterFactory(indexSettings, name, settings) {
+                @Override
+                public TokenStream create(TokenStream tokenStream) {
+                    return tokenStream;
+                }
+            };
+        });
         tokenFilters.register("shingle", ShingleTokenFilterFactory::new);
         tokenFilters.register("hunspell", requiresAnalysisSettings((indexSettings, env, name, settings) -> new HunspellTokenFilterFactory
             (indexSettings, name, settings, hunspellService)));
@@ -153,7 +166,12 @@ static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List
 
         // Add filters available in lucene-core
         preConfiguredTokenFilters.register("lowercase", PreConfiguredTokenFilter.singleton("lowercase", true, LowerCaseFilter::new));
-        preConfiguredTokenFilters.register("standard", PreConfiguredTokenFilter.singleton("standard", false, StandardFilter::new));
+        preConfiguredTokenFilters.register("standard",
+            PreConfiguredTokenFilter.singletonWithVersion("standard", false, (reader, version) -> {
+                DEPRECATION_LOGGER.deprecatedAndMaybeLog("standard_deprecation",
+                    "The [standard] token filter is deprecated and will be removed in a future version.");
+                return reader;
+            }));
         /* Note that "stop" is available in lucene-core but it's pre-built
          * version uses a set of English stop words that are in
         * lucene-analyzers-common so "stop" is defined in the analysis-common
diff --git a/server/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java b/server/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java
index 3e6b11f56a1b2..3950b780a91e8 100644
--- a/server/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java
+++ b/server/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java
@@ -435,6 +435,19 @@ public Map<String, Dictionary> getHunspellDictionaries() {
         assertSame(dictionary, module.getHunspellService().getDictionary("foo"));
     }
 
+    public void testStandardFilterDeprecation() throws IOException {
+        Version version = VersionUtils.randomVersionBetween(random(), Version.V_5_0_0, Version.CURRENT);
+        Settings settings = Settings.builder()
+            .put("index.analysis.analyzer.my_standard.tokenizer", "standard")
+            .put("index.analysis.analyzer.my_standard.filter", "standard")
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .put(IndexMetaData.SETTING_VERSION_CREATED, version)
+            .build();
+        IndexAnalyzers analyzers = getIndexAnalyzers(settings);
+        assertTokenStreamContents(analyzers.get("my_standard").tokenStream("", "test"), new String[]{"test"});
+        assertWarnings("The [standard] token filter is deprecated and will be removed in a future version.");
+    }
+
     // Simple char filter that appends text to the term
     public static class AppendCharFilter extends CharFilter {
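The test above asserts the warning via `assertWarnings`; at the REST layer the same deprecation should surface as a `Warning` response header. A hypothetical request that would trigger it, with placeholder index and analyzer names:

PUT /deprecation_check
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_standard": {
          "tokenizer": "standard",
          "filter": ["standard"]
        }
      }
    }
  }
}

The response headers should then include something along the lines of `Warning: 299 Elasticsearch-6.5.0 "The [standard] token filter is deprecated and will be removed in a future version."`; the exact header format varies by version. Note also that both call sites share the "standard_deprecation" key, so `deprecatedAndMaybeLog` should, as its name suggests, keep the server-side log from repeating the message for every analyzer instantiation.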
diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
index 5298c3995cec2..a85619bb62596 100644
--- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
@@ -29,7 +29,6 @@
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
-import org.elasticsearch.index.analysis.StandardTokenFilterFactory;
 import org.elasticsearch.index.analysis.StandardTokenizerFactory;
 import org.elasticsearch.index.analysis.StopTokenFilterFactory;
 import org.elasticsearch.index.analysis.SynonymGraphTokenFilterFactory;
@@ -167,7 +166,7 @@ private static String toCamelCase(String s) {
         .put("soraninormalization", MovedToAnalysisCommon.class)
         .put("soranistem", MovedToAnalysisCommon.class)
         .put("spanishlightstem", MovedToAnalysisCommon.class)
-        .put("standard", StandardTokenFilterFactory.class)
+        .put("standard", Deprecated.class)
         .put("stemmeroverride", MovedToAnalysisCommon.class)
         .put("stop", StopTokenFilterFactory.class)
         .put("swedishlightstem", MovedToAnalysisCommon.class)
diff --git a/test/framework/src/main/resources/org/elasticsearch/analysis/common/test1.json b/test/framework/src/main/resources/org/elasticsearch/analysis/common/test1.json
index 38937a9b5af93..e69c2db6ff400 100644
--- a/test/framework/src/main/resources/org/elasticsearch/analysis/common/test1.json
+++ b/test/framework/src/main/resources/org/elasticsearch/analysis/common/test1.json
@@ -42,7 +42,7 @@
         },
         "czechAnalyzerWithStemmer":{
             "tokenizer":"standard",
-            "filter":["standard", "lowercase", "stop", "czech_stem"]
+            "filter":["lowercase", "stop", "czech_stem"]
         },
         "decompoundingAnalyzer":{
             "tokenizer":"standard",
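For an existing index whose analyzer still references the deprecated filter, the reference can simply be removed, since the filter contributes nothing. A sketch of the sequence, assuming an index `my_index` with the `my_analyzer`/`my_snow` definitions from the snowball docs page above; analysis settings can only be updated while the index is closed:

POST /my_index/_close

PUT /my_index/_settings
{
  "analysis": {
    "analyzer": {
      "my_analyzer": {
        "tokenizer": "standard",
        "filter": ["lowercase", "my_snow"]
      }
    }
  }
}

POST /my_index/_open

No reindex is needed: because the filter was a no-op, documents indexed under the old analyzer definition already carry exactly the same tokens the new definition produces.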