6 changes: 4 additions & 2 deletions docs/reference/analysis/normalizers.asciidoc
@@ -13,11 +13,13 @@ following: `arabic_normalization`, `asciifolding`, `bengali_normalization`,
`persian_normalization`, `scandinavian_folding`, `serbian_normalization`,
`sorani_normalization`, `uppercase`.

Elasticsearch ships with a built-in `lowercase` normalizer. For other forms of
normalization, a custom configuration is required.

[float]
=== Custom normalizers

Elasticsearch does not ship with built-in normalizers so far, so the only way
to get one is by building a custom one. Custom normalizers take a list of char
Custom normalizers take a list of
<<analysis-charfilters, character filters>> and a list of
<<analysis-tokenfilters,token filters>>.

6 changes: 5 additions & 1 deletion docs/reference/mapping/params/normalizer.asciidoc
@@ -7,9 +7,13 @@ produces a single token.

The `normalizer` is applied prior to indexing the keyword, as well as at
search-time when the `keyword` field is searched via a query parser such as
the <<query-dsl-match-query,`match`>> query or via a term-level query
such as the <<query-dsl-term-query,`term`>> query.

A simple normalizer called `lowercase` ships with Elasticsearch and can be used
directly. Custom normalizers can be defined as part of the analysis settings,
as follows.


[source,console]
--------------------------------
PUT index
@@ -293,7 +293,6 @@ private Map<String, AnalyzerProvider<?>> buildAnalyzerFactories(IndexSettings in

private Map<String, AnalyzerProvider<?>> buildNormalizerFactories(IndexSettings indexSettings) throws IOException {
final Map<String, Settings> normalizersSettings = indexSettings.getSettings().getGroups("index.analysis.normalizer");
// TODO: Have pre-built normalizers
return buildMapping(Component.NORMALIZER, indexSettings, normalizersSettings, normalizers, Collections.emptyMap());
}
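
The `index.analysis.normalizer` setting groups read here have the same shape as the custom normalizer definitions exercised by the tests further down. The following is a minimal sketch of such settings; the `my_normalizer` name and the filter choices are illustrative only, not part of this change:

import org.elasticsearch.common.settings.Settings;

public class NormalizerSettingsSketch {
    // Illustrative only: index settings declaring a custom normalizer named
    // "my_normalizer", built from the asciifolding and lowercase token filters.
    // buildNormalizerFactories picks such definitions up from the
    // "index.analysis.normalizer" group.
    public static Settings customNormalizerSettings() {
        return Settings.builder()
                .put("index.analysis.normalizer.my_normalizer.type", "custom")
                .putList("index.analysis.normalizer.my_normalizer.filter", "asciifolding", "lowercase")
                .build();
    }
}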

@@ -0,0 +1,42 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;

/** Normalizer used to lowercase values */
public final class LowercaseNormalizer extends Analyzer {

@Override
protected TokenStreamComponents createComponents(String s) {
final Tokenizer tokenizer = new KeywordTokenizer();
TokenStream stream = new LowerCaseFilter(tokenizer);
return new TokenStreamComponents(tokenizer, stream);
}

@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
return new LowerCaseFilter(in);
}
}
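
A rough usage sketch (not part of this change): the normalizer treats the whole value as a single lowercased token, which is what the registry tests below assert via `normalize("field", "AbC")`.

import org.elasticsearch.index.analysis.LowercaseNormalizer;

public class LowercaseNormalizerSketch {
    public static void main(String[] args) {
        // normalize() wraps the input in a LowerCaseFilter, so the whole value
        // comes back as one lowercased token.
        try (LowercaseNormalizer normalizer = new LowercaseNormalizer()) {
            System.out.println(normalizer.normalize("field", "AbC").utf8ToString()); // prints "abc"
        }
    }
}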
@@ -0,0 +1,44 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/


package org.elasticsearch.index.analysis;

import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;


/**
* Builds an analyzer for normalization that lowercases terms.
*/
public class LowercaseNormalizerProvider extends AbstractIndexAnalyzerProvider<LowercaseNormalizer> {

private final LowercaseNormalizer analyzer;

public LowercaseNormalizerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
this.analyzer = new LowercaseNormalizer();
}

@Override
public LowercaseNormalizer get() {
return analyzer;
}
}
@@ -35,6 +35,7 @@
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
import org.elasticsearch.index.analysis.KeywordAnalyzerProvider;
import org.elasticsearch.index.analysis.LowercaseNormalizerProvider;
import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory;
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
@@ -250,7 +251,7 @@ private NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> setupAnalyzers(List

private NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> setupNormalizers(List<AnalysisPlugin> plugins) {
NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> normalizers = new NamedRegistry<>("normalizer");
// TODO: provide built-in normalizer providers?
normalizers.register("lowercase", LowercaseNormalizerProvider::new);
// TODO: pluggability?
return normalizers;
}
@@ -20,11 +20,13 @@
package org.elasticsearch.index.analysis;

import com.carrotsearch.randomizedtesting.generators.RandomPicks;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -39,12 +41,14 @@
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.VersionUtils;

import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import static java.util.Collections.emptyMap;
@@ -57,6 +61,7 @@

public class AnalysisRegistryTests extends ESTestCase {
private AnalysisRegistry emptyRegistry;
private AnalysisRegistry nonEmptyRegistry;

private static AnalyzerProvider<?> analyzerProvider(final String name) {
return new PreBuiltAnalyzerProvider(name, AnalyzerScope.INDEX, new EnglishAnalyzer());
@@ -67,6 +72,16 @@ private static AnalysisRegistry emptyAnalysisRegistry(Settings settings) {
emptyMap(), emptyMap(), emptyMap(), emptyMap());
}

/**
* Creates a reverse filter available for use in testNameClashNormalizer test
*/
public static class MockAnalysisPlugin extends Plugin implements AnalysisPlugin {
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
return singletonList(PreConfiguredTokenFilter.singleton("reverse", true, ReverseStringFilter::new));
}
}

private static IndexSettings indexSettingsOfCurrentVersion(Settings.Builder settings) {
return IndexSettingsModule.newIndexSettings("index", settings
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
@@ -76,9 +91,13 @@ private static IndexSettings indexSettingsOfCurrentVersion(Settings.Builder sett
@Override
public void setUp() throws Exception {
super.setUp();
emptyRegistry = emptyAnalysisRegistry(Settings.builder()
Settings settings = Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build());
.build();
emptyRegistry = emptyAnalysisRegistry(settings);
// Module loaded to register in-built normalizers for testing
AnalysisModule module = new AnalysisModule(TestEnvironment.newEnvironment(settings), singletonList(new MockAnalysisPlugin()));
nonEmptyRegistry = module.getAnalysisRegistry();
}

public void testDefaultAnalyzers() throws IOException {
@@ -134,7 +153,29 @@ public Tokenizer create() {
emptyMap(), emptyMap(), emptyMap()));
assertEquals("analyzer [default] contains filters [my_filter] that are not allowed to run in all mode.", ex.getMessage());
}


public void testNameClashNormalizer() throws IOException {

// Test out-of-the-box normalizer works OK.
IndexAnalyzers indexAnalyzers = nonEmptyRegistry.build(IndexSettingsModule.newIndexSettings("index", Settings.EMPTY));
assertNotNull(indexAnalyzers.getNormalizer("lowercase"));
assertThat(indexAnalyzers.getNormalizer("lowercase").normalize("field", "AbC").utf8ToString(), equalTo("abc"));

// Test that a name clash with a custom normalizer will favour the index's normalizer rather than the out-of-the-box
// one of the same name. (However this "feature" will be removed with https://github.com/elastic/elasticsearch/issues/22263 )
Settings settings = Settings.builder()
// Deliberately bad choice of normalizer name for the job it does.
.put("index.analysis.normalizer.lowercase.type", "custom")
.putList("index.analysis.normalizer.lowercase.filter", "reverse")
.build();

indexAnalyzers = nonEmptyRegistry.build(IndexSettingsModule.newIndexSettings("index", settings));
assertNotNull(indexAnalyzers.getNormalizer("lowercase"));
assertThat(indexAnalyzers.getNormalizer("lowercase").normalize("field","AbC").utf8ToString(), equalTo("CbA"));
}


public void testOverrideDefaultIndexAnalyzerIsUnsupported() {
Version version = VersionUtils.randomIndexCompatibleVersion(random());
Settings settings = Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, version).build();
@@ -344,10 +344,18 @@ public void testEnableNorms() throws IOException {
assertEquals(0, fieldNamesFields.length);
}

public void testNormalizer() throws IOException {
public void testCustomNormalizer() throws IOException {
checkLowercaseNormalizer("my_lowercase");
}

public void testInBuiltNormalizer() throws IOException {
checkLowercaseNormalizer("lowercase");
}

public void checkLowercaseNormalizer(String normalizerName) throws IOException {
String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field")
.field("type", "keyword").field("normalizer", "my_lowercase").endObject().endObject()
.field("type", "keyword").field("normalizer", normalizerName).endObject().endObject()
.endObject().endObject());

DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));