diff --git a/docs/reference/analysis/normalizers.asciidoc b/docs/reference/analysis/normalizers.asciidoc index 94c99eec89910..eff9295857da4 100644 --- a/docs/reference/analysis/normalizers.asciidoc +++ b/docs/reference/analysis/normalizers.asciidoc @@ -13,11 +13,13 @@ following: `arabic_normalization`, `asciifolding`, `bengali_normalization`, `persian_normalization`, `scandinavian_folding`, `serbian_normalization`, `sorani_normalization`, `uppercase`. +Elasticsearch ships with a built-in `lowercase` normalizer. For other forms of +normalization, a custom configuration is required. + [float] === Custom normalizers -Elasticsearch does not ship with built-in normalizers so far, so the only way -to get one is by building a custom one. Custom normalizers take a list of char +Custom normalizers take a list of <> and a list of <>. diff --git a/docs/reference/mapping/params/normalizer.asciidoc b/docs/reference/mapping/params/normalizer.asciidoc index b218d311c7201..bbb4f7d9cc8d5 100644 --- a/docs/reference/mapping/params/normalizer.asciidoc +++ b/docs/reference/mapping/params/normalizer.asciidoc @@ -7,9 +7,13 @@ produces a single token. The `normalizer` is applied prior to indexing the keyword, as well as at search-time when the `keyword` field is searched via a query parser such as -the <> query or via a term-level query +the <> query or via a term-level query such as the <> query. +A simple normalizer called `lowercase` ships with Elasticsearch and can be used directly. +Custom normalizers can be defined as part of analysis settings as follows. 
+ + [source,console] -------------------------------- PUT index diff --git a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java index 197f2d862fcfa..659884490078b 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java @@ -293,7 +293,6 @@ private Map> buildAnalyzerFactories(IndexSettings in private Map> buildNormalizerFactories(IndexSettings indexSettings) throws IOException { final Map normalizersSettings = indexSettings.getSettings().getGroups("index.analysis.normalizer"); - // TODO: Have pre-built normalizers return buildMapping(Component.NORMALIZER, indexSettings, normalizersSettings, normalizers, Collections.emptyMap()); } diff --git a/server/src/main/java/org/elasticsearch/index/analysis/LowercaseNormalizer.java b/server/src/main/java/org/elasticsearch/index/analysis/LowercaseNormalizer.java new file mode 100644 index 0000000000000..f817810075641 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/analysis/LowercaseNormalizer.java @@ -0,0 +1,42 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizer; + +/** Normalizer that lowercases values by applying a {@link LowerCaseFilter} to a {@link KeywordTokenizer} stream */ +public final class LowercaseNormalizer extends Analyzer { + + @Override + protected TokenStreamComponents createComponents(String s) { + final Tokenizer tokenizer = new KeywordTokenizer(); + TokenStream stream = new LowerCaseFilter(tokenizer); + return new TokenStreamComponents(tokenizer, stream); + } + + @Override + protected TokenStream normalize(String fieldName, TokenStream in) { + return new LowerCaseFilter(in); + } +} diff --git a/server/src/main/java/org/elasticsearch/index/analysis/LowercaseNormalizerProvider.java b/server/src/main/java/org/elasticsearch/index/analysis/LowercaseNormalizerProvider.java new file mode 100644 index 0000000000000..0ba6e292cf827 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/analysis/LowercaseNormalizerProvider.java @@ -0,0 +1,44 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + + +package org.elasticsearch.index.analysis; + +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; + + +/** + * Provider for {@link LowercaseNormalizer}: creates one instance at construction and returns it from every {@link #get()} call. + */ +public class LowercaseNormalizerProvider extends AbstractIndexAnalyzerProvider { + + private final LowercaseNormalizer analyzer; + + public LowercaseNormalizerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { + super(indexSettings, name, settings); + this.analyzer = new LowercaseNormalizer(); + } + + @Override + public LowercaseNormalizer get() { + return analyzer; + } +} \ No newline at end of file diff --git a/server/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/server/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index 5133182fc7901..6c54b59d474d9 100644 --- a/server/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/server/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -35,6 +35,7 @@ import org.elasticsearch.index.analysis.CharFilterFactory; import org.elasticsearch.index.analysis.HunspellTokenFilterFactory; import org.elasticsearch.index.analysis.KeywordAnalyzerProvider; +import org.elasticsearch.index.analysis.LowercaseNormalizerProvider; import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory; import org.elasticsearch.index.analysis.PreConfiguredCharFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; @@ -250,7 +251,7 @@ private NamedRegistry>> setupAnalyzers(List private NamedRegistry>> setupNormalizers(List plugins) { NamedRegistry>> normalizers = new NamedRegistry<>("normalizer"); - // TODO: provide built-in normalizer providers? + normalizers.register("lowercase", LowercaseNormalizerProvider::new); // TODO: pluggability? 
return normalizers; } diff --git a/server/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java b/server/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java index aa8783f5cf70a..f1311fad0cf9a 100644 --- a/server/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java +++ b/server/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java @@ -20,11 +20,13 @@ package org.elasticsearch.index.analysis; import com.carrotsearch.randomizedtesting.generators.RandomPicks; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockTokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.en.EnglishAnalyzer; +import org.apache.lucene.analysis.reverse.ReverseStringFilter; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; @@ -39,12 +41,14 @@ import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; import org.elasticsearch.indices.analysis.PreBuiltAnalyzers; import org.elasticsearch.plugins.AnalysisPlugin; +import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.IndexSettingsModule; import org.elasticsearch.test.VersionUtils; import java.io.IOException; import java.util.Collections; +import java.util.List; import java.util.Map; import static java.util.Collections.emptyMap; @@ -57,6 +61,7 @@ public class AnalysisRegistryTests extends ESTestCase { private AnalysisRegistry emptyRegistry; + private AnalysisRegistry nonEmptyRegistry; private static AnalyzerProvider analyzerProvider(final String name) { return new PreBuiltAnalyzerProvider(name, AnalyzerScope.INDEX, new EnglishAnalyzer()); @@ -67,6 +72,16 @@ private static AnalysisRegistry emptyAnalysisRegistry(Settings settings) { emptyMap(), 
emptyMap(), emptyMap(), emptyMap()); } + /** + * Creates a reverse filter available for use in testNameClashNormalizer test + */ + public static class MockAnalysisPlugin extends Plugin implements AnalysisPlugin { + @Override + public List getPreConfiguredTokenFilters() { + return singletonList(PreConfiguredTokenFilter.singleton("reverse", true, ReverseStringFilter::new)); + } + } + private static IndexSettings indexSettingsOfCurrentVersion(Settings.Builder settings) { return IndexSettingsModule.newIndexSettings("index", settings .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT) @@ -76,9 +91,13 @@ private static IndexSettings indexSettingsOfCurrentVersion(Settings.Builder sett @Override public void setUp() throws Exception { super.setUp(); - emptyRegistry = emptyAnalysisRegistry(Settings.builder() + Settings settings = Settings.builder() .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .build()); + .build(); + emptyRegistry = emptyAnalysisRegistry(settings); + // Module loaded to register in-built normalizers for testing + AnalysisModule module = new AnalysisModule(TestEnvironment.newEnvironment(settings), singletonList(new MockAnalysisPlugin())); + nonEmptyRegistry = module.getAnalysisRegistry(); } public void testDefaultAnalyzers() throws IOException { @@ -134,7 +153,29 @@ public Tokenizer create() { emptyMap(), emptyMap(), emptyMap())); assertEquals("analyzer [default] contains filters [my_filter] that are not allowed to run in all mode.", ex.getMessage()); } + + + public void testNameClashNormalizer() throws IOException { + + // Test out-of-the-box normalizer works OK. 
+ IndexAnalyzers indexAnalyzers = nonEmptyRegistry.build(IndexSettingsModule.newIndexSettings("index", Settings.EMPTY)); + assertNotNull(indexAnalyzers.getNormalizer("lowercase")); + assertThat(indexAnalyzers.getNormalizer("lowercase").normalize("field", "AbC").utf8ToString(), equalTo("abc")); + + // Test that a name clash with a custom normalizer will favour the index's normalizer rather than the out-of-the-box + // one of the same name. (However this "feature" will be removed with https://github.com/elastic/elasticsearch/issues/22263 ) + Settings settings = Settings.builder() + // Deliberately bad choice of normalizer name for the job it does. + .put("index.analysis.normalizer.lowercase.type", "custom") + .putList("index.analysis.normalizer.lowercase.filter", "reverse") + .build(); + + indexAnalyzers = nonEmptyRegistry.build(IndexSettingsModule.newIndexSettings("index", settings)); + assertNotNull(indexAnalyzers.getNormalizer("lowercase")); + assertThat(indexAnalyzers.getNormalizer("lowercase").normalize("field","AbC").utf8ToString(), equalTo("CbA")); + } + public void testOverrideDefaultIndexAnalyzerIsUnsupported() { Version version = VersionUtils.randomIndexCompatibleVersion(random()); Settings settings = Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, version).build(); diff --git a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java index d5f0414708d64..9826b67c2bd13 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java @@ -344,10 +344,18 @@ public void testEnableNorms() throws IOException { assertEquals(0, fieldNamesFields.length); } - public void testNormalizer() throws IOException { + public void testCustomNormalizer() throws IOException { + checkLowercaseNormalizer("my_lowercase"); + } + + public void 
testInBuiltNormalizer() throws IOException { + checkLowercaseNormalizer("lowercase"); + } + + public void checkLowercaseNormalizer(String normalizerName) throws IOException { String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type") .startObject("properties").startObject("field") - .field("type", "keyword").field("normalizer", "my_lowercase").endObject().endObject() + .field("type", "keyword").field("normalizer", normalizerName).endObject().endObject() .endObject().endObject()); DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));