diff --git a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java index 5d099267c79e8..4c17773d6df70 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java @@ -35,8 +35,6 @@ import org.elasticsearch.indices.analysis.AnalysisModule; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; import org.elasticsearch.indices.analysis.PreBuiltAnalyzers; -import org.elasticsearch.indices.analysis.PreBuiltCharFilters; -import org.elasticsearch.indices.analysis.PreBuiltTokenizers; import java.io.Closeable; import java.io.IOException; @@ -74,6 +72,7 @@ public AnalysisRegistry(Environment environment, Map> tokenizers, Map>> analyzers, Map>> normalizers, + Map preConfiguredCharFilters, Map preConfiguredTokenFilters, Map preConfiguredTokenizers) { this.environment = environment; @@ -82,7 +81,7 @@ public AnalysisRegistry(Environment environment, this.tokenizers = unmodifiableMap(tokenizers); this.analyzers = unmodifiableMap(analyzers); this.normalizers = unmodifiableMap(normalizers); - prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredTokenFilters, preConfiguredTokenizers); + prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredCharFilters, preConfiguredTokenFilters, preConfiguredTokenizers); } /** @@ -180,7 +179,7 @@ public Map buildTokenizerFactories(IndexSettings index public Map buildCharFilterFactories(IndexSettings indexSettings) throws IOException { final Map charFiltersSettings = indexSettings.getSettings().getGroups(INDEX_ANALYSIS_CHAR_FILTER); - return buildMapping(Component.CHAR_FILTER, indexSettings, charFiltersSettings, charFilters, prebuiltAnalysis.charFilterFactories); + return buildMapping(Component.CHAR_FILTER, indexSettings, charFiltersSettings, charFilters, prebuiltAnalysis.preConfiguredCharFilterFactories); } public Map> 
buildAnalyzerFactories(IndexSettings indexSettings) throws IOException { @@ -397,13 +396,13 @@ private static class PrebuiltAnalysis implements Closeable { final Map>> analyzerProviderFactories; final Map> preConfiguredTokenFilters; final Map> preConfiguredTokenizers; - final Map> charFilterFactories; + final Map> preConfiguredCharFilterFactories; private PrebuiltAnalysis( + Map preConfiguredCharFilters, Map preConfiguredTokenFilters, Map preConfiguredTokenizers) { Map analyzerProviderFactories = new HashMap<>(); - Map charFilterFactories = new HashMap<>(); // Analyzers for (PreBuiltAnalyzers preBuiltAnalyzerEnum : PreBuiltAnalyzers.values()) { @@ -411,22 +410,14 @@ private PrebuiltAnalysis( analyzerProviderFactories.put(name, new PreBuiltAnalyzerProviderFactory(name, AnalyzerScope.INDICES, preBuiltAnalyzerEnum.getAnalyzer(Version.CURRENT))); } - // Char Filters - for (PreBuiltCharFilters preBuiltCharFilter : PreBuiltCharFilters.values()) { - String name = preBuiltCharFilter.name().toLowerCase(Locale.ROOT); - charFilterFactories.put(name, new PreBuiltCharFilterFactoryFactory(preBuiltCharFilter.getCharFilterFactory(Version.CURRENT))); - } - // Char filter aliases - charFilterFactories.put("htmlStrip", new PreBuiltCharFilterFactoryFactory(PreBuiltCharFilters.HTML_STRIP.getCharFilterFactory(Version.CURRENT))); - this.analyzerProviderFactories = Collections.unmodifiableMap(analyzerProviderFactories); - this.charFilterFactories = Collections.unmodifiableMap(charFilterFactories); + this.preConfiguredCharFilterFactories = preConfiguredCharFilters; this.preConfiguredTokenFilters = preConfiguredTokenFilters; this.preConfiguredTokenizers = preConfiguredTokenizers; } public AnalysisModule.AnalysisProvider getCharFilterFactory(String name) { - return charFilterFactories.get(name); + return preConfiguredCharFilterFactories.get(name); } public AnalysisModule.AnalysisProvider getTokenFilterFactory(String name) { diff --git 
a/core/src/main/java/org/elasticsearch/index/analysis/PreBuiltCharFilterFactoryFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/PreBuiltCharFilterFactoryFactory.java deleted file mode 100644 index 62a8ff1ff3e76..0000000000000 --- a/core/src/main/java/org/elasticsearch/index/analysis/PreBuiltCharFilterFactoryFactory.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.elasticsearch.index.analysis; - -import org.elasticsearch.Version; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.env.Environment; -import org.elasticsearch.index.IndexSettings; -import org.elasticsearch.indices.analysis.AnalysisModule; -import org.elasticsearch.indices.analysis.PreBuiltCharFilters; - -import java.io.IOException; - -public class PreBuiltCharFilterFactoryFactory implements AnalysisModule.AnalysisProvider { - - private final CharFilterFactory charFilterFactory; - - public PreBuiltCharFilterFactoryFactory(CharFilterFactory charFilterFactory) { - this.charFilterFactory = charFilterFactory; - } - - @Override - public CharFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException { - Version indexVersion = Version.indexCreated(settings); - if (!Version.CURRENT.equals(indexVersion)) { - PreBuiltCharFilters preBuiltCharFilters = PreBuiltCharFilters.getOrDefault(name, null); - if (preBuiltCharFilters != null) { - return preBuiltCharFilters.getCharFilterFactory(indexVersion); - } - } - - return charFilterFactory; - } -} diff --git a/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredCharFilter.java b/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredCharFilter.java new file mode 100644 index 0000000000000..a979e9e34fe4e --- /dev/null +++ b/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredCharFilter.java @@ -0,0 +1,112 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.CharFilter;
+import org.apache.lucene.analysis.TokenFilter;
+import org.elasticsearch.Version;
+import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
+
+import java.io.Reader;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+
+/**
+ * Provides pre-configured, shared {@link CharFilter}s.
+ */
+public class PreConfiguredCharFilter extends PreConfiguredAnalysisComponent<CharFilterFactory> {
+    /**
+     * Create a pre-configured char filter that may not vary at all.
+     */
+    public static PreConfiguredCharFilter singleton(String name, boolean useFilterForMultitermQueries, Function<Reader, Reader> create) {
+        return new PreConfiguredCharFilter(name, CachingStrategy.ONE, useFilterForMultitermQueries,
+                (reader, version) -> create.apply(reader));
+    }
+
+    /**
+     * Create a pre-configured char filter that may vary based on the Lucene version.
+     */
+    public static PreConfiguredCharFilter luceneVersion(String name, boolean useFilterForMultitermQueries,
+            BiFunction<Reader, org.apache.lucene.util.Version, Reader> create) {
+        return new PreConfiguredCharFilter(name, CachingStrategy.LUCENE, useFilterForMultitermQueries,
+                (reader, version) -> create.apply(reader, version.luceneVersion));
+    }
+
+    /**
+     * Create a pre-configured char filter that may vary based on the Elasticsearch version. 
+     */
+    public static PreConfiguredCharFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
+            BiFunction<Reader, Version, Reader> create) {
+        return new PreConfiguredCharFilter(name, CachingStrategy.ELASTICSEARCH, useFilterForMultitermQueries, create);
+    }
+
+    private final boolean useFilterForMultitermQueries;
+    private final BiFunction<Reader, Version, Reader> create;
+
+    protected PreConfiguredCharFilter(String name, CachingStrategy cache, boolean useFilterForMultitermQueries,
+            BiFunction<Reader, Version, Reader> create) {
+        super(name, cache);
+        this.useFilterForMultitermQueries = useFilterForMultitermQueries;
+        this.create = create;
+    }
+
+    /**
+     * Can this {@link CharFilter} be used in multi-term queries?
+     */
+    public boolean shouldUseFilterForMultitermQueries() {
+        return useFilterForMultitermQueries;
+    }
+
+    private interface MultiTermAwareCharFilterFactory extends CharFilterFactory, MultiTermAwareComponent {}
+
+    @Override
+    protected CharFilterFactory create(Version version) {
+        if (useFilterForMultitermQueries) {
+            return new MultiTermAwareCharFilterFactory() {
+                @Override
+                public String name() {
+                    return getName();
+                }
+
+                @Override
+                public Reader create(Reader reader) {
+                    return create.apply(reader, version);
+                }
+
+                @Override
+                public Object getMultiTermComponent() {
+                    return this;
+                }
+            };
+        }
+        return new CharFilterFactory() {
+            @Override
+            public Reader create(Reader reader) {
+                return create.apply(reader, version);
+            }
+
+            @Override
+            public String name() {
+                return getName();
+            }
+        };
+    }
+
+}
diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
index 9e378f66793b2..3f26b722f41ce 100644
--- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
@@ -101,6 +101,7 @@ import org.elasticsearch.index.analysis.PersianAnalyzerProvider;
 import 
org.elasticsearch.index.analysis.PersianNormalizationFilterFactory; import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider; +import org.elasticsearch.index.analysis.PreConfiguredCharFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenizer; import org.elasticsearch.index.analysis.ReverseTokenFilterFactory; @@ -173,11 +174,14 @@ public AnalysisModule(Environment environment, List plugins) thr NamedRegistry>> analyzers = setupAnalyzers(plugins); NamedRegistry>> normalizers = setupNormalizers(plugins); + Map preConfiguredCharFilters = setupPreConfiguredCharFilters(plugins); Map preConfiguredTokenFilters = setupPreConfiguredTokenFilters(plugins); Map preConfiguredTokenizers = setupPreConfiguredTokenizers(plugins); - analysisRegistry = new AnalysisRegistry(environment, charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers - .getRegistry(), analyzers.getRegistry(), normalizers.getRegistry(), preConfiguredTokenFilters, preConfiguredTokenizers); + analysisRegistry = new AnalysisRegistry(environment, + charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers.getRegistry(), + analyzers.getRegistry(), normalizers.getRegistry(), + preConfiguredCharFilters, preConfiguredTokenFilters, preConfiguredTokenizers); } HunspellService getHunspellService() { @@ -261,6 +265,19 @@ private NamedRegistry> setupTokenFilters(Li return tokenFilters; } + static Map setupPreConfiguredCharFilters(List plugins) { + NamedRegistry preConfiguredCharFilters = new NamedRegistry<>("pre-configured char_filter"); + + // No char filter are available in lucene-core so none are built in to Elasticsearch core + + for (AnalysisPlugin plugin: plugins) { + for (PreConfiguredCharFilter filter : plugin.getPreConfiguredCharFilters()) { + preConfiguredCharFilters.register(filter.getName(), filter); + } + } + return unmodifiableMap(preConfiguredCharFilters.getRegistry()); + } + static Map 
setupPreConfiguredTokenFilters(List plugins) { NamedRegistry preConfiguredTokenFilters = new NamedRegistry<>("pre-configured token_filter"); diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCharFilters.java b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCharFilters.java deleted file mode 100644 index 063763006a00e..0000000000000 --- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCharFilters.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.elasticsearch.indices.analysis; - -import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; -import org.elasticsearch.Version; -import org.elasticsearch.index.analysis.CharFilterFactory; -import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; - -import java.io.Reader; -import java.util.Locale; - -public enum PreBuiltCharFilters { - - HTML_STRIP(CachingStrategy.ONE) { - @Override - public Reader create(Reader tokenStream, Version version) { - return new HTMLStripCharFilter(tokenStream); - } - }; - - public abstract Reader create(Reader tokenStream, Version version); - - protected final PreBuiltCacheFactory.PreBuiltCache cache; - - PreBuiltCharFilters(CachingStrategy cachingStrategy) { - cache = PreBuiltCacheFactory.getCache(cachingStrategy); - } - - public synchronized CharFilterFactory getCharFilterFactory(final Version version) { - CharFilterFactory charFilterFactory = cache.get(version); - if (charFilterFactory == null) { - final String finalName = name(); - - charFilterFactory = new CharFilterFactory() { - @Override - public String name() { - return finalName.toLowerCase(Locale.ROOT); - } - - @Override - public Reader create(Reader tokenStream) { - return valueOf(finalName).create(tokenStream, version); - } - }; - cache.put(version, charFilterFactory); - } - - return charFilterFactory; - } - - /** - * Get a pre built CharFilter by its name or fallback to the default one - * @param name CharFilter name - * @param defaultCharFilter default CharFilter if name not found - */ - public static PreBuiltCharFilters getOrDefault(String name, PreBuiltCharFilters defaultCharFilter) { - try { - return valueOf(name.toUpperCase(Locale.ROOT)); - } catch (IllegalArgumentException e) { - return defaultCharFilter; - } - } -} diff --git a/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java b/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java index 99b4117f112d7..cc04ed875d996 100644 --- 
a/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java +++ b/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java @@ -28,8 +28,9 @@ import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AnalyzerProvider; import org.elasticsearch.index.analysis.CharFilterFactory; -import org.elasticsearch.index.analysis.PreConfiguredTokenizer; +import org.elasticsearch.index.analysis.PreConfiguredCharFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; +import org.elasticsearch.index.analysis.PreConfiguredTokenizer; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; @@ -91,6 +92,13 @@ default Map>> getA return emptyMap(); } + /** + * Override to add additional pre-configured {@link CharFilter}s. + */ + default List getPreConfiguredCharFilters() { + return emptyList(); + } + /** * Override to add additional pre-configured {@link TokenFilter}s. 
*/ diff --git a/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java b/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java index 0e1414bdbefda..5dfcd102431a5 100644 --- a/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java +++ b/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java @@ -29,18 +29,24 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractCharFilterFactory; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import org.elasticsearch.index.analysis.AnalysisRegistry; +import org.elasticsearch.index.analysis.CharFilterFactory; import org.elasticsearch.index.analysis.IndexAnalyzers; +import org.elasticsearch.index.analysis.PreConfiguredCharFilter; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.index.mapper.AllFieldMapper; import org.elasticsearch.indices.analysis.AnalysisModule; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; +import org.elasticsearch.indices.analysis.AnalysisModuleTests.AppendCharFilter; import org.elasticsearch.plugins.AnalysisPlugin; +import static org.elasticsearch.plugins.AnalysisPlugin.requriesAnalysisSettings; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.IndexSettingsModule; import java.io.IOException; +import java.io.Reader; import java.util.List; import java.util.Map; @@ -81,10 +87,31 @@ public TokenStream create(TokenStream tokenStream) { } } + class AppendCharFilterFactory extends AbstractCharFilterFactory { + AppendCharFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + super(indexSettings, name); + } + + @Override + public Reader create(Reader reader) { + return new 
AppendCharFilter(reader, "bar"); + } + } + + @Override + public Map> getCharFilters() { + return singletonMap("append", AppendCharFilterFactory::new); + } + @Override public Map> getTokenFilters() { return singletonMap("mock", MockFactory::new); } + + @Override + public List getPreConfiguredCharFilters() { + return singletonList(PreConfiguredCharFilter.singleton("append_foo", false, reader -> new AppendCharFilter(reader, "foo"))); + } }; registry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry(); indexAnalyzers = registry.build(idxSettings); @@ -96,17 +123,17 @@ public Map> getTokenFilters() { public void testNoIndexAnalyzers() throws IOException { // Refer to an analyzer by its type so we get its default configuration AnalyzeRequest request = new AnalyzeRequest(); - request.analyzer("standard"); request.text("the quick brown fox"); + request.analyzer("standard"); AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, null, registry, environment); List tokens = analyze.getTokens(); assertEquals(4, tokens.size()); // Refer to a token filter by its type so we get its default configuration - request.analyzer(null); - request.tokenizer("whitespace"); - request.addTokenFilter("mock"); + request = new AnalyzeRequest(); request.text("the qu1ck brown fox"); + request.tokenizer("standard"); + request.addTokenFilter("mock"); analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, randomBoolean() ? 
indexAnalyzers : null, registry, environment); tokens = analyze.getTokens(); assertEquals(3, tokens.size()); @@ -114,18 +141,32 @@ public void testNoIndexAnalyzers() throws IOException { assertEquals("brown", tokens.get(1).getTerm()); assertEquals("fox", tokens.get(2).getTerm()); - // Refer to a char filter by its type so we get its default configuration - request.analyzer(null); - request.tokenizer("whitespace"); - request.addCharFilter("html_strip"); - request.addTokenFilter("mock"); - request.text("

the qu1ck brown fox

");
+        // We can refer to a pre-configured char filter by its name to get it
+        request = new AnalyzeRequest();
+        request.text("the qu1ck brown fox");
+        request.tokenizer("standard");
+        request.addCharFilter("append_foo");
         analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, randomBoolean() ?
             indexAnalyzers : null, registry, environment);
         tokens = analyze.getTokens();
-        assertEquals(3, tokens.size());
-        assertEquals("qu1ck", tokens.get(0).getTerm());
-        assertEquals("brown", tokens.get(1).getTerm());
-        assertEquals("fox", tokens.get(2).getTerm());
+        assertEquals(4, tokens.size());
+        assertEquals("the", tokens.get(0).getTerm());
+        assertEquals("qu1ck", tokens.get(1).getTerm());
+        assertEquals("brown", tokens.get(2).getTerm());
+        assertEquals("foxfoo", tokens.get(3).getTerm());
+
+        // We can refer to a char filter by its type to get its default configuration
+        request = new AnalyzeRequest();
+        request.text("the qu1ck brown fox");
+        request.tokenizer("standard");
+        request.addCharFilter("append");
+        request.text("the qu1ck brown fox");
+        analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, randomBoolean() ? 
indexAnalyzers : null, registry, environment); + tokens = analyze.getTokens(); + assertEquals(4, tokens.size()); + assertEquals("the", tokens.get(0).getTerm()); + assertEquals("qu1ck", tokens.get(1).getTerm()); + assertEquals("brown", tokens.get(2).getTerm()); + assertEquals("foxbar", tokens.get(3).getTerm()); } public void testFillsAttributes() throws IOException { diff --git a/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java b/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java index deb6b536e9ded..74436b3937853 100644 --- a/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java +++ b/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java @@ -120,7 +120,7 @@ public void setUp() throws Exception { index = indexSettings.getIndex(); environment = new Environment(settings); emptyAnalysisRegistry = new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), - emptyMap(), emptyMap()); + emptyMap(), emptyMap(), emptyMap()); threadPool = new TestThreadPool("test"); circuitBreakerService = new NoneCircuitBreakerService(); bigArrays = new BigArrays(settings, circuitBreakerService); diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java index 57ef842072acf..9303159c265b9 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java @@ -57,7 +57,7 @@ private static AnalyzerProvider analyzerProvider(final String name) { private static AnalysisRegistry emptyAnalysisRegistry(Settings settings) { return new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), - emptyMap()); + emptyMap(), emptyMap()); } private static IndexSettings indexSettingsOfCurrentVersion(Settings.Builder settings) { diff --git 
a/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java b/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java index a818d9c7178f0..66b28ec419a7f 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java @@ -32,6 +32,7 @@ import java.io.Reader; import java.util.List; import java.util.Map; +import java.util.function.Function; import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; @@ -101,12 +102,12 @@ public void testIllegalFilters() throws IOException { public void testIllegalCharFilters() throws IOException { Settings settings = Settings.builder() - .putArray("index.analysis.normalizer.my_normalizer.char_filter", "html_strip") + .putArray("index.analysis.normalizer.my_normalizer.char_filter", "mock_forbidden") .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings)); - assertEquals("Custom normalizer [my_normalizer] may not use char filter [html_strip]", e.getMessage()); + () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, MOCK_ANALYSIS_PLUGIN)); + assertEquals("Custom normalizer [my_normalizer] may not use char filter [mock_forbidden]", e.getMessage()); } private static class MockAnalysisPlugin implements AnalysisPlugin { @@ -115,6 +116,11 @@ public List getPreConfiguredTokenFilters() { return singletonList(PreConfiguredTokenFilter.singleton("mock_forbidden", false, MockLowerCaseFilter::new)); } + @Override + public List getPreConfiguredCharFilters() { + return singletonList(PreConfiguredCharFilter.singleton("mock_forbidden", false, Function.identity())); + } + @Override public Map> getCharFilters() { return singletonMap("mock_char_filter", (indexSettings, env, name, 
settings) -> { diff --git a/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java b/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java index f94c0c8fe746d..b3394d4f4fade 100644 --- a/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java +++ b/core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java @@ -20,6 +20,7 @@ package org.elasticsearch.indices.analysis; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharFilter; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -40,6 +41,7 @@ import org.elasticsearch.index.analysis.CustomAnalyzer; import org.elasticsearch.index.analysis.IndexAnalyzers; import org.elasticsearch.index.analysis.NamedAnalyzer; +import org.elasticsearch.index.analysis.PreConfiguredCharFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenizer; import org.elasticsearch.index.analysis.StandardTokenizerFactory; @@ -56,6 +58,7 @@ import java.io.BufferedWriter; import java.io.IOException; import java.io.InputStream; +import java.io.Reader; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; @@ -250,6 +253,50 @@ public void testUnderscoreInAnalyzerName() throws IOException { } } + /** + * Tests that plugins can register pre-configured char filters that vary in behavior based on Elasticsearch version, Lucene version, + * and that do not vary based on version at all. 
+ */ + public void testPluginPreConfiguredCharFilters() throws IOException { + boolean noVersionSupportsMultiTerm = randomBoolean(); + boolean luceneVersionSupportsMultiTerm = randomBoolean(); + boolean elasticsearchVersionSupportsMultiTerm = randomBoolean(); + AnalysisRegistry registry = new AnalysisModule(new Environment(emptyNodeSettings), singletonList(new AnalysisPlugin() { + @Override + public List getPreConfiguredCharFilters() { + return Arrays.asList( + PreConfiguredCharFilter.singleton("no_version", noVersionSupportsMultiTerm, + tokenStream -> new AppendCharFilter(tokenStream, "no_version")), + PreConfiguredCharFilter.luceneVersion("lucene_version", luceneVersionSupportsMultiTerm, + (tokenStream, luceneVersion) -> new AppendCharFilter(tokenStream, luceneVersion.toString())), + PreConfiguredCharFilter.elasticsearchVersion("elasticsearch_version", elasticsearchVersionSupportsMultiTerm, + (tokenStream, esVersion) -> new AppendCharFilter(tokenStream, esVersion.toString())) + ); + } + })).getAnalysisRegistry(); + + Version version = VersionUtils.randomVersion(random()); + IndexAnalyzers analyzers = getIndexAnalyzers(registry, Settings.builder() + .put("index.analysis.analyzer.no_version.tokenizer", "keyword") + .put("index.analysis.analyzer.no_version.char_filter", "no_version") + .put("index.analysis.analyzer.lucene_version.tokenizer", "keyword") + .put("index.analysis.analyzer.lucene_version.char_filter", "lucene_version") + .put("index.analysis.analyzer.elasticsearch_version.tokenizer", "keyword") + .put("index.analysis.analyzer.elasticsearch_version.char_filter", "elasticsearch_version") + .put(IndexMetaData.SETTING_VERSION_CREATED, version) + .build()); + assertTokenStreamContents(analyzers.get("no_version").tokenStream("", "test"), new String[] {"testno_version"}); + assertTokenStreamContents(analyzers.get("lucene_version").tokenStream("", "test"), new String[] {"test" + version.luceneVersion}); + 
assertTokenStreamContents(analyzers.get("elasticsearch_version").tokenStream("", "test"), new String[] {"test" + version});
+
+        assertEquals("test" + (noVersionSupportsMultiTerm ? "no_version" : ""),
+                analyzers.get("no_version").normalize("", "test").utf8ToString());
+        assertEquals("test" + (luceneVersionSupportsMultiTerm ? version.luceneVersion.toString() : ""),
+                analyzers.get("lucene_version").normalize("", "test").utf8ToString());
+        assertEquals("test" + (elasticsearchVersionSupportsMultiTerm ? version.toString() : ""),
+                analyzers.get("elasticsearch_version").normalize("", "test").utf8ToString());
+    }
+
     /**
      * Tests that plugins can register pre-configured token filters that vary in behavior based on Elasticsearch version, Lucene version,
      * and that do not vary based on version at all.
@@ -391,6 +438,48 @@ public Map<String, Dictionary> getHunspellDictionaries() {
         assertSame(dictionary, module.getHunspellService().getDictionary("foo"));
     }
 
+    // Simple char filter that appends text to the term
+    public static class AppendCharFilter extends CharFilter {
+        private final char[] appendMe;
+        private int offsetInAppendMe = -1;
+
+        public AppendCharFilter(Reader input, String appendMe) {
+            super(input);
+            this.appendMe = appendMe.toCharArray();
+        }
+
+        @Override
+        protected int correct(int currentOff) {
+            return currentOff;
+        }
+
+        @Override
+        public int read(char[] cbuf, int off, int len) throws IOException {
+            if (offsetInAppendMe < 0) {
+                int read = input.read(cbuf, off, len);
+                if (read == len) {
+                    return read;
+                }
+                if (read < 0) {
+                    read = 0;
+                }
+                off += read;
+                len -= read;
+                int allowedLen = Math.min(len, appendMe.length);
+                System.arraycopy(appendMe, 0, cbuf, off, allowedLen);
+                offsetInAppendMe = allowedLen;
+                return read + allowedLen;
+            }
+            if (offsetInAppendMe >= appendMe.length) {
+                return -1;
+            }
+            int allowedLen = Math.min(len, appendMe.length - offsetInAppendMe);
+            System.arraycopy(appendMe, offsetInAppendMe, cbuf, off, allowedLen);
+            offsetInAppendMe += allowedLen;
+            return allowedLen;
+        }
+    }
+
     // Simple token filter that appends text 
to the term private static class AppendTokenFilter extends TokenFilter { public static TokenFilterFactory factoryForSuffix(String suffix) { diff --git a/core/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java b/core/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java index 9ff301791aea9..ad51a5d694236 100644 --- a/core/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java +++ b/core/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java @@ -257,23 +257,18 @@ public void testDetailAnalyzeCustomAnalyzerWithNoIndex() throws Exception { assertThat(analyzeResponse.detail().analyzer().getTokens().length, equalTo(4)); //custom analyzer - analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A TEST") - .setExplain(true).addCharFilter("html_strip").setTokenizer("keyword").addTokenFilter("lowercase").get(); + analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A TEST") + .setExplain(true).setTokenizer("keyword").addTokenFilter("lowercase").get(); assertThat(analyzeResponse.detail().analyzer(), IsNull.nullValue()); - //charfilters - assertThat(analyzeResponse.detail().charfilters().length, equalTo(1)); - assertThat(analyzeResponse.detail().charfilters()[0].getName(), equalTo("html_strip")); - assertThat(analyzeResponse.detail().charfilters()[0].getTexts().length, equalTo(1)); - assertThat(analyzeResponse.detail().charfilters()[0].getTexts()[0], equalTo("\nTHIS IS A TEST\n")); //tokenizer assertThat(analyzeResponse.detail().tokenizer().getName(), equalTo("keyword")); assertThat(analyzeResponse.detail().tokenizer().getTokens().length, equalTo(1)); - assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getTerm(), equalTo("\nTHIS IS A TEST\n")); + assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getTerm(), equalTo("THIS IS A TEST")); //tokenfilters assertThat(analyzeResponse.detail().tokenfilters().length, equalTo(1)); 
assertThat(analyzeResponse.detail().tokenfilters()[0].getName(), equalTo("lowercase")); assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens().length, equalTo(1)); - assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[0].getTerm(), equalTo("\nthis is a test\n")); + assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[0].getTerm(), equalTo("this is a test")); //check other attributes analyzeResponse = client().admin().indices().prepareAnalyze("This is troubled") diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index 689803f323d48..c33023d1cb251 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -26,6 +26,7 @@ import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; import org.apache.lucene.analysis.ar.ArabicStemFilter; import org.apache.lucene.analysis.br.BrazilianStemFilter; +import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; import org.apache.lucene.analysis.cjk.CJKBigramFilter; import org.apache.lucene.analysis.cjk.CJKWidthFilter; import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter; @@ -68,6 +69,7 @@ import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory; import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory; import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory; +import org.elasticsearch.index.analysis.PreConfiguredCharFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenizer; import org.elasticsearch.index.analysis.TokenFilterFactory; @@ -106,6 +108,15 @@ public Map> getCharFilters() { return filters; } + @Override + public List 
getPreConfiguredCharFilters() { + List filters = new ArrayList<>(); + filters.add(PreConfiguredCharFilter.singleton("html_strip", false, HTMLStripCharFilter::new)); + // TODO deprecate htmlStrip + filters.add(PreConfiguredCharFilter.singleton("htmlStrip", false, HTMLStripCharFilter::new)); + return filters; + } + @Override public List getPreConfiguredTokenFilters() { List filters = new ArrayList<>(); diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java index 3282a023f6916..59164f7506504 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -19,6 +19,7 @@ package org.elasticsearch.analysis.common; +import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory; import org.apache.lucene.analysis.en.PorterStemFilterFactory; import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory; import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory; @@ -71,6 +72,14 @@ protected Map> getCharFilters() { return filters; } + @Override + public Map> getPreConfiguredCharFilters() { + Map> filters = new TreeMap<>(super.getPreConfiguredCharFilters()); + filters.put("html_strip", HTMLStripCharFilterFactory.class); + filters.put("htmlStrip", HTMLStripCharFilterFactory.class); + return filters; + } + @Override protected Map> getPreConfiguredTokenFilters() { Map> filters = new TreeMap<>(super.getPreConfiguredTokenFilters()); @@ -92,6 +101,7 @@ protected Map> getPreConfiguredTokenFilters() { filters.put("elision", null); filters.put("french_stem", SnowballPorterFilterFactory.class); filters.put("german_stem", null); + filters.put("german_normalization", null); filters.put("hindi_normalization", null); 
filters.put("indic_normalization", null); filters.put("keyword_repeat", null); @@ -123,8 +133,8 @@ protected Map> getPreConfiguredTokenFilters() { @Override protected Map> getPreConfiguredTokenizers() { - Map> filters = new TreeMap<>(super.getPreConfiguredTokenFilters()); - + Map> filters = new TreeMap<>(super.getPreConfiguredTokenizers()); + filters.put("lowercase", null); return filters; } diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index 35f79a26ac85f..fd8a5e7cd9aed 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -63,6 +63,7 @@ import org.elasticsearch.index.analysis.PatternReplaceTokenFilterFactory; import org.elasticsearch.index.analysis.PatternTokenizerFactory; import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory; +import org.elasticsearch.index.analysis.PreConfiguredCharFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenizer; import org.elasticsearch.index.analysis.ReverseTokenFilterFactory; @@ -100,7 +101,9 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import static java.util.Collections.emptyMap; import static java.util.Collections.singletonList; +import static org.hamcrest.Matchers.empty; import static org.hamcrest.Matchers.typeCompatibleWith; /** @@ -275,20 +278,6 @@ private static String toCamelCase(String s) { .put("persian", Void.class) .immutableMap(); - static final Map> PREBUILT_CHARFILTERS; - static { - PREBUILT_CHARFILTERS = new EnumMap<>(PreBuiltCharFilters.class); - for (PreBuiltCharFilters tokenizer : PreBuiltCharFilters.values()) { - Class luceneFactoryClazz; - switch (tokenizer) { - default: - 
luceneFactoryClazz = org.apache.lucene.analysis.util.CharFilterFactory.lookupClass( - toCamelCase(tokenizer.getCharFilterFactory(Version.CURRENT).name())); - } - PREBUILT_CHARFILTERS.put(tokenizer, luceneFactoryClazz); - } - } - /** * The plugin being tested. Core uses an "empty" plugin so we don't have to throw null checks all over the place. */ @@ -352,9 +341,17 @@ protected Map> getPreConfiguredTokenizers() { } tokenizers.put(tokenizer.name().toLowerCase(Locale.ROOT), luceneFactoryClazz); } + // TODO drop aliases once they are moved to module + tokenizers.put("nGram", tokenizers.get("ngram")); + tokenizers.put("edgeNGram", tokenizers.get("edge_ngram")); + tokenizers.put("PathHierarchy", tokenizers.get("path_hierarchy")); return tokenizers; } + public Map> getPreConfiguredCharFilters() { + return emptyMap(); + } + public void testTokenizers() { Set missing = new TreeSet(org.apache.lucene.analysis.util.TokenizerFactory.availableTokenizers()); missing.removeAll(getTokenizers().keySet()); @@ -430,10 +427,12 @@ public void testPreBuiltMultiTermAware() { Collection actual = new HashSet<>(); Map preConfiguredTokenFilters = - AnalysisModule.setupPreConfiguredTokenFilters(singletonList(plugin)); + new HashMap<>(AnalysisModule.setupPreConfiguredTokenFilters(singletonList(plugin))); for (Map.Entry> entry : getPreConfiguredTokenFilters().entrySet()) { String name = entry.getKey(); Class luceneFactory = entry.getValue(); + PreConfiguredTokenFilter filter = preConfiguredTokenFilters.remove(name); + assertNotNull("test claims pre built token filter [" + name + "] should be available but it wasn't", filter); if (luceneFactory == Void.class) { continue; } @@ -441,8 +440,6 @@ public void testPreBuiltMultiTermAware() { luceneFactory = TokenFilterFactory.lookupClass(toCamelCase(name)); } assertThat(luceneFactory, typeCompatibleWith(TokenFilterFactory.class)); - PreConfiguredTokenFilter filter = preConfiguredTokenFilters.get(name); - assertNotNull("test claims pre built token filter 
[" + name + "] should be available but it wasn't", filter); if (filter.shouldUseFilterForMultitermQueries()) { actual.add("token filter [" + name + "]"); } @@ -450,10 +447,15 @@ public void testPreBuiltMultiTermAware() { expected.add("token filter [" + name + "]"); } } - Map preConfiguredTokenizers = AnalysisModule.setupPreConfiguredTokenizers(singletonList(plugin)); + assertThat("pre configured token filter not registered with test", preConfiguredTokenFilters.keySet(), empty()); + + Map preConfiguredTokenizers = new HashMap<>( + AnalysisModule.setupPreConfiguredTokenizers(singletonList(plugin))); for (Map.Entry> entry : getPreConfiguredTokenizers().entrySet()) { String name = entry.getKey(); Class luceneFactory = entry.getValue(); + PreConfiguredTokenizer tokenizer = preConfiguredTokenizers.remove(name); + assertNotNull("test claims pre built tokenizer [" + name + "] should be available but it wasn't", tokenizer); if (luceneFactory == Void.class) { continue; } @@ -461,7 +463,6 @@ public void testPreBuiltMultiTermAware() { luceneFactory = TokenizerFactory.lookupClass(toCamelCase(name)); } assertThat(luceneFactory, typeCompatibleWith(TokenizerFactory.class)); - PreConfiguredTokenizer tokenizer = preConfiguredTokenizers.get(name); if (tokenizer.hasMultiTermComponent()) { actual.add(tokenizer); } @@ -469,20 +470,30 @@ public void testPreBuiltMultiTermAware() { expected.add(tokenizer); } } - for (Map.Entry> entry : PREBUILT_CHARFILTERS.entrySet()) { - PreBuiltCharFilters charFilter = entry.getKey(); + assertThat("pre configured tokenizer not registered with test", preConfiguredTokenizers.keySet(), empty()); + + Map preConfiguredCharFilters = new HashMap<>( + AnalysisModule.setupPreConfiguredCharFilters(singletonList(plugin))); + for (Map.Entry> entry : getPreConfiguredCharFilters().entrySet()) { + String name = entry.getKey(); Class luceneFactory = entry.getValue(); + PreConfiguredCharFilter filter = preConfiguredCharFilters.remove(name); + assertNotNull("test claims 
pre built char filter [" + name + "] should be available but it wasn't", filter); if (luceneFactory == Void.class) { continue; } - assertTrue(CharFilterFactory.class.isAssignableFrom(luceneFactory)); - if (charFilter.getCharFilterFactory(Version.CURRENT) instanceof MultiTermAwareComponent) { - actual.add(charFilter); + if (luceneFactory == null) { + luceneFactory = CharFilterFactory.lookupClass(toCamelCase(name)); + } + assertThat(luceneFactory, typeCompatibleWith(CharFilterFactory.class)); + if (filter.shouldUseFilterForMultitermQueries()) { + actual.add("char filter [" + name + "]"); + } if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(luceneFactory)) { - expected.add(charFilter); + expected.add("char filter [" + name + "]"); } } + assertThat("pre configured char filter not registered with test", preConfiguredCharFilters.keySet(), empty()); Set classesMissingMultiTermSupport = new HashSet<>(expected); classesMissingMultiTermSupport.removeAll(actual);