From ecf6b46ff9c9e78fe65b84d146caea28a7ba606f Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 20 Apr 2017 12:06:23 -0400 Subject: [PATCH 01/20] Begin --- .../index/analysis/AnalysisRegistry.java | 32 +++-- .../PreBuiltTokenFilterFactoryFactory.java | 50 -------- .../PreBuiltTokenFilterFactoryProvider.java | 95 +++++++++++++++ .../indices/analysis/AnalysisModule.java | 24 +++- .../analysis/PreBuiltCacheFactory.java | 2 +- .../analysis/PreBuiltTokenFilters.java | 49 -------- .../elasticsearch/plugins/AnalysisPlugin.java | 39 +++++++ .../elasticsearch/index/IndexModuleTests.java | 41 +++---- .../index/analysis/AnalysisRegistryTests.java | 74 +++++++----- ....java => BuiltInAnalysisFactoryTests.java} | 10 +- .../analysis/common/CommonAnalysisPlugin.java | 37 +++++- .../common/CommonAnalysisFactoryTests.java | 28 ++++- .../analysis/AnalysisICUFactoryTests.java | 6 +- .../AnalysisKuromojiFactoryTests.java | 6 +- .../AnalysisPhoneticFactoryTests.java | 6 +- .../AnalysisSmartChineseFactoryTests.java | 7 +- .../analysis/AnalysisPolishFactoryTests.java | 6 +- .../analysis}/AnalysisFactoryTestCase.java | 110 +++++++++++------- 18 files changed, 396 insertions(+), 226 deletions(-) delete mode 100644 core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactory.java create mode 100644 core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryProvider.java rename core/src/test/java/org/elasticsearch/index/analysis/{AnalysisFactoryTests.java => BuiltInAnalysisFactoryTests.java} (68%) rename test/framework/src/main/java/org/elasticsearch/{ => indices/analysis}/AnalysisFactoryTestCase.java (90%) diff --git a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java index 36357afe678b5..410b73cc3f2df 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java @@ -36,8 +36,8 @@ import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; import org.elasticsearch.indices.analysis.PreBuiltAnalyzers; import org.elasticsearch.indices.analysis.PreBuiltCharFilters; -import org.elasticsearch.indices.analysis.PreBuiltTokenFilters; import org.elasticsearch.indices.analysis.PreBuiltTokenizers; +import org.elasticsearch.plugins.AnalysisPlugin.PreBuiltTokenFilterSpec; import java.io.Closeable; import java.io.IOException; @@ -59,7 +59,7 @@ public final class AnalysisRegistry implements Closeable { public static final String INDEX_ANALYSIS_CHAR_FILTER = "index.analysis.char_filter"; public static final String INDEX_ANALYSIS_FILTER = "index.analysis.filter"; public static final String INDEX_ANALYSIS_TOKENIZER = "index.analysis.tokenizer"; - private final PrebuiltAnalysis prebuiltAnalysis = new PrebuiltAnalysis(); + private final PrebuiltAnalysis prebuiltAnalysis; private final Map cachedAnalyzer = new ConcurrentHashMap<>(); private final Environment environment; @@ -74,13 +74,15 @@ public AnalysisRegistry(Environment environment, Map> tokenFilters, Map> tokenizers, Map>> analyzers, - Map>> normalizers) { + Map>> normalizers, + Map preBuiltTokenFilters) { this.environment = environment; this.charFilters = unmodifiableMap(charFilters); this.tokenFilters = unmodifiableMap(tokenFilters); this.tokenizers = unmodifiableMap(tokenizers); this.analyzers = unmodifiableMap(analyzers); this.normalizers = unmodifiableMap(normalizers); + prebuiltAnalysis = new 
PrebuiltAnalysis(preBuiltTokenFilters); } /** @@ -393,13 +395,13 @@ private static class PrebuiltAnalysis implements Closeable { final Map>> analyzerProviderFactories; final Map> tokenizerFactories; - final Map> tokenFilterFactories; + final Map> tokenFilterFactories; final Map> charFilterFactories; - private PrebuiltAnalysis() { + private PrebuiltAnalysis(Map preBuiltTokenFilters) { Map analyzerProviderFactories = new HashMap<>(); Map tokenizerFactories = new HashMap<>(); - Map tokenFilterFactories = new HashMap<>(); + Map> tokenFilterFactories = new HashMap<>(); Map charFilterFactories = new HashMap<>(); // Analyzers for (PreBuiltAnalyzers preBuiltAnalyzerEnum : PreBuiltAnalyzers.values()) { @@ -420,14 +422,20 @@ private PrebuiltAnalysis() { // Token filters - for (PreBuiltTokenFilters preBuiltTokenFilter : PreBuiltTokenFilters.values()) { - String name = preBuiltTokenFilter.name().toLowerCase(Locale.ROOT); - tokenFilterFactories.put(name, new PreBuiltTokenFilterFactoryFactory(preBuiltTokenFilter.getTokenFilterFactory(Version.CURRENT))); + for (Map.Entry preBuiltTokenFilter : preBuiltTokenFilters.entrySet()) { + tokenFilterFactories.put(preBuiltTokenFilter.getKey(), + new PreBuiltTokenFilterFactoryProvider(preBuiltTokenFilter.getKey(), preBuiltTokenFilter.getValue())); } // Token filter aliases - tokenFilterFactories.put("nGram", new PreBuiltTokenFilterFactoryFactory(PreBuiltTokenFilters.NGRAM.getTokenFilterFactory(Version.CURRENT))); - tokenFilterFactories.put("edgeNGram", new PreBuiltTokenFilterFactoryFactory(PreBuiltTokenFilters.EDGE_NGRAM.getTokenFilterFactory(Version.CURRENT))); - + // NOCOMMIT move the aliases to the module and deprecate + AnalysisProvider toAlias = tokenFilterFactories.get("ngram"); + if (toAlias != null) { + tokenFilterFactories.put("nGram", toAlias); + } + toAlias = tokenFilterFactories.get("edge_ngram"); + if (toAlias != null) { + tokenFilterFactories.put("edgeNGram", toAlias); + } // Char Filters for (PreBuiltCharFilters preBuiltCharFilter : PreBuiltCharFilters.values()) { diff --git a/core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactory.java deleted file mode 100644 index 52c9f2851a29f..0000000000000 --- a/core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactory.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.elasticsearch.index.analysis; - -import org.elasticsearch.Version; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.env.Environment; -import org.elasticsearch.index.IndexSettings; -import org.elasticsearch.indices.analysis.AnalysisModule; -import org.elasticsearch.indices.analysis.PreBuiltTokenFilters; - -import java.io.IOException; - -public class PreBuiltTokenFilterFactoryFactory implements AnalysisModule.AnalysisProvider { - - private final TokenFilterFactory tokenFilterFactory; - - public PreBuiltTokenFilterFactoryFactory(TokenFilterFactory tokenFilterFactory) { - this.tokenFilterFactory = tokenFilterFactory; - } - - @Override - public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException { - Version indexVersion = Version.indexCreated(settings); - if (!Version.CURRENT.equals(indexVersion)) { - PreBuiltTokenFilters preBuiltTokenFilters = PreBuiltTokenFilters.getOrDefault(name, null); - if (preBuiltTokenFilters != null) { - return preBuiltTokenFilters.getTokenFilterFactory(indexVersion); - } - } - return tokenFilterFactory; - } -} diff --git a/core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryProvider.java b/core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryProvider.java new file mode 100644 index 0000000000000..cbac745e0789d --- /dev/null +++ b/core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryProvider.java @@ -0,0 +1,95 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.elasticsearch.Version; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.indices.analysis.AnalysisModule; +import org.elasticsearch.indices.analysis.PreBuiltCacheFactory; +import org.elasticsearch.plugins.AnalysisPlugin.PreBuiltTokenFilterSpec; + +import java.io.IOException; +import java.util.function.BiFunction; + +/** + * Resolves pre-built {@link TokenFilterFactory}s based on the specifications provided by plugins. 
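+ * <p>
+ * For example, given a spec registered under a hypothetical name ({@code LowerCaseFilter} here stands in for any
+ * single-argument Lucene filter), a sketch of what this provider resolves:
+ * <pre>
+ * new PreBuiltTokenFilterFactoryProvider("my_lowercase",
+ *         new PreBuiltTokenFilterSpec(true, CachingStrategy.LUCENE, (input, version) -&gt; new LowerCaseFilter(input)))
+ * </pre>
+ * yields one cached {@link TokenFilterFactory} per Lucene version, marked multiterm-aware because the first
+ * argument is {@code true}.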
+ */
+public class PreBuiltTokenFilterFactoryProvider implements AnalysisModule.AnalysisProvider<TokenFilterFactory> {
+    private final String name;
+    private final PreBuiltCacheFactory.PreBuiltCache<TokenFilterFactory> cache;
+    private final boolean useFilterForMultitermQueries;
+    private final BiFunction<TokenStream, Version, TokenStream> create;
+
+    public PreBuiltTokenFilterFactoryProvider(String name, PreBuiltTokenFilterSpec spec) {
+        this.name = name;
+        cache = PreBuiltCacheFactory.getCache(spec.getCachingStrategy());
+        this.useFilterForMultitermQueries = spec.shouldUseFilterForMultitermQueries();
+        this.create = spec.getCreate();
+    }
+
+    @Override
+    public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
+        return getTokenFilterFactory(Version.indexCreated(settings));
+    }
+
+    private interface MultiTermAwareTokenFilterFactory extends TokenFilterFactory, MultiTermAwareComponent {}
+
+    private synchronized TokenFilterFactory getTokenFilterFactory(final Version version) {
+        TokenFilterFactory factory = cache.get(version);
+        if (factory == null) {
+            if (useFilterForMultitermQueries) {
+                factory = new MultiTermAwareTokenFilterFactory() {
+                    @Override
+                    public String name() {
+                        return name;
+                    }
+
+                    @Override
+                    public TokenStream create(TokenStream tokenStream) {
+                        return create.apply(tokenStream, version);
+                    }
+
+                    @Override
+                    public Object getMultiTermComponent() {
+                        return this;
+                    }
+                };
+            } else {
+                factory = new TokenFilterFactory() {
+                    @Override
+                    public String name() {
+                        return name;
+                    }
+
+                    @Override
+                    public TokenStream create(TokenStream tokenStream) {
+                        return create.apply(tokenStream, version);
+                    }
+                };
+            }
+            cache.put(version, factory);
+        }
+
+        return factory;
+    }
+}
diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
index c494c4cae9c35..71757fe45e2e4 100644
--- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
@@ -19,6 +19,7 @@
 
 package org.elasticsearch.indices.analysis;
 
+import org.apache.lucene.analysis.standard.StandardFilter;
 import org.elasticsearch.Version;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.NamedRegistry;
@@ -141,10 +142,14 @@
 import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
 import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
+import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 import org.elasticsearch.plugins.AnalysisPlugin;
+import org.elasticsearch.plugins.AnalysisPlugin.PreBuiltTokenFilterSpec;
 
 import java.io.IOException;
 import java.util.List;
+import java.util.Locale;
+import java.util.Map;
 
 /**
  * Sets up {@link AnalysisRegistry}.
@@ -170,8 +175,11 @@ public AnalysisModule(Environment environment, List<AnalysisPlugin> plugins) thr
         NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = setupTokenizers(plugins);
         NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> analyzers = setupAnalyzers(plugins);
         NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> normalizers = setupNormalizers(plugins);
+
+        Map<String, PreBuiltTokenFilterSpec> preBuiltTokenFilters = setupPreBuiltTokenFilters(plugins);
+
         analysisRegistry = new AnalysisRegistry(environment, charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers
-            .getRegistry(), analyzers.getRegistry(), normalizers.getRegistry());
+            .getRegistry(), analyzers.getRegistry(), normalizers.getRegistry(), preBuiltTokenFilters);
     }
 
     HunspellService getHunspellService() {
@@ -262,6 +270,20 @@ private NamedRegistry<AnalysisProvider<TokenFilterFactory>> setupTokenFilters(Li
         return tokenFilters;
     }
 
+    static Map<String, PreBuiltTokenFilterSpec> setupPreBuiltTokenFilters(List<AnalysisPlugin> plugins) {
+        NamedRegistry<PreBuiltTokenFilterSpec> preBuiltTokenFilters = new NamedRegistry<>("pre built token_filter");
+        preBuiltTokenFilters.register("standard", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (inputs, version) ->
+                new StandardFilter(inputs)));
+        for (PreBuiltTokenFilters preBuilt : PreBuiltTokenFilters.values()) {
+            preBuiltTokenFilters.register(preBuilt.name().toLowerCase(Locale.ROOT), new PreBuiltTokenFilterSpec( // NOCOMMIT remove this shim
+                preBuilt.isMultiTermAware(),
+                CachingStrategy.ELASTICSEARCH, // This is the most granular/safest/whatever
+                preBuilt::create));
+        }
+        preBuiltTokenFilters.extractAndRegister(plugins, AnalysisPlugin::getPreBuiltTokenFilters);
+        return preBuiltTokenFilters.getRegistry();
+    }
+
     private NamedRegistry<AnalysisProvider<TokenizerFactory>> setupTokenizers(List<AnalysisPlugin> plugins) {
         NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = new NamedRegistry<>("tokenizer");
         tokenizers.register("standard", StandardTokenizerFactory::new);
diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCacheFactory.java b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCacheFactory.java
index 823152e6d9e9e..8636e04f20f10 100644
--- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCacheFactory.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCacheFactory.java
@@ -42,7 +42,7 @@ public interface PreBuiltCache<T> {
 
     private PreBuiltCacheFactory() {}
 
-    static <T> PreBuiltCache<T> getCache(CachingStrategy cachingStrategy) {
+    public static <T> PreBuiltCache<T> getCache(CachingStrategy cachingStrategy) {
         switch (cachingStrategy) {
             case ONE:
                 return new PreBuiltCacheStrategyOne<>();
diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java
index 6c58ab884db27..56f3b7809de4c 100644
--- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java
@@ -139,13 +139,6 @@ public TokenStream create(TokenStream tokenStream, Version version) {
         }
     },
 
-    COMMON_GRAMS(CachingStrategy.LUCENE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new CommonGramsFilter(tokenStream, CharArraySet.EMPTY_SET);
-        }
-    },
-
     LOWERCASE(CachingStrategy.LUCENE) {
         @Override
         public TokenStream create(TokenStream tokenStream, Version version) {
@@ -182,48 +175,6 @@ public TokenStream create(TokenStream tokenStream, Version version) {
         }
     },
 
-    STANDARD(CachingStrategy.LUCENE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new StandardFilter(tokenStream);
-        }
-    },
-
-    CLASSIC(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new ClassicFilter(tokenStream);
-        }
-    },
-
-    NGRAM(CachingStrategy.LUCENE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new NGramTokenFilter(tokenStream);
-        }
-    },
-
-    EDGE_NGRAM(CachingStrategy.LUCENE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new EdgeNGramTokenFilter(tokenStream, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
-        }
-    },
-
-    UNIQUE(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new UniqueTokenFilter(tokenStream);
-        }
-    },
-
-    TRUNCATE(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new TruncateTokenFilter(tokenStream, 10);
-        }
-    },
-
     // Extended Token Filters
     SNOWBALL(CachingStrategy.ONE) {
         @Override
diff --git a/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java b/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java
index 8c23e530e4998..a77ce0dfe4953 100644
--- a/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java
+++ b/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java
@@ -22,14 +22,18 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.elasticsearch.Version;
 import org.elasticsearch.index.analysis.AnalyzerProvider;
 import org.elasticsearch.index.analysis.CharFilterFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenizerFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
+import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
 
 import java.util.Map;
+import java.util.function.BiFunction;
 
 import static java.util.Collections.emptyMap;
 
@@ -65,6 +69,10 @@ default Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
         return emptyMap();
     }
 
+    default Map<String, PreBuiltTokenFilterSpec> getPreBuiltTokenFilters() {
+        return emptyMap();
+    }
+
     /**
      * Override to add additional {@link Tokenizer}s.
      */
@@ -85,4 +93,35 @@ default Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getA
     default Map<String, Dictionary> getHunspellDictionaries() {
         return emptyMap();
     }
+
+    class PreBuiltTokenFilterSpec {
+        private final boolean useFilterForMultitermQueries;
+        private final PreBuiltCacheFactory.CachingStrategy cachingStrategy;
+        private final BiFunction<TokenStream, Version, TokenStream> create;
+
+        /**
+         * Set up the spec.
+         * @param useFilterForMultitermQueries use the pre-built token filter for multiterm queries.
+ * @param cachingStrategy caching strategy the pre-built token filter should use + * @param create function to create the token filter + */ + public PreBuiltTokenFilterSpec(boolean useFilterForMultitermQueries, PreBuiltCacheFactory.CachingStrategy cachingStrategy, + BiFunction create) { + this.useFilterForMultitermQueries = useFilterForMultitermQueries; + this.cachingStrategy = cachingStrategy; + this.create = create; + } + + public boolean shouldUseFilterForMultitermQueries() { + return useFilterForMultitermQueries; + } + + public PreBuiltCacheFactory.CachingStrategy getCachingStrategy() { + return cachingStrategy; + } + + public BiFunction getCreate() { + return create; + } + } } diff --git a/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java b/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java index 6f80da2d639c7..2a8fa7f9c143f 100644 --- a/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java +++ b/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java @@ -93,6 +93,7 @@ public class IndexModuleTests extends ESTestCase { private Settings settings; private IndexSettings indexSettings; private Environment environment; + private AnalysisRegistry emptyAnalysisRegistry; private NodeEnvironment nodeEnvironment; private IndicesQueryCache indicesQueryCache; @@ -122,6 +123,7 @@ public void setUp() throws Exception { indexSettings = IndexSettingsModule.newIndexSettings("foo", settings); index = indexSettings.getIndex(); environment = new Environment(settings); + emptyAnalysisRegistry = new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()); threadPool = new TestThreadPool("test"); circuitBreakerService = new NoneCircuitBreakerService(); bigArrays = new BigArrays(settings, circuitBreakerService); @@ -149,8 +151,7 @@ private IndexService newIndexService(IndexModule module) throws IOException { } public void testWrapperIsBound() throws IOException { - IndexModule module = new IndexModule(indexSettings, - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(indexSettings, emptyAnalysisRegistry); module.setSearcherWrapper((s) -> new Wrapper()); module.engineFactory.set(new MockEngineFactory(AssertingDirectoryReader.class)); @@ -169,8 +170,7 @@ public void testRegisterIndexStore() throws IOException { .put(IndexModule.INDEX_STORE_TYPE_SETTING.getKey(), "foo_store") .build(); IndexSettings indexSettings = IndexSettingsModule.newIndexSettings(index, settings); - IndexModule module = new IndexModule(indexSettings, - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(indexSettings, emptyAnalysisRegistry); module.addIndexStore("foo_store", FooStore::new); try { module.addIndexStore("foo_store", FooStore::new); @@ -194,8 +194,7 @@ public void beforeIndexRemoved(IndexService indexService, IndexRemovalReason rea } }; IndexSettings indexSettings = IndexSettingsModule.newIndexSettings(index, settings); - IndexModule module = new IndexModule(indexSettings, - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(indexSettings, emptyAnalysisRegistry); module.addIndexEventListener(eventListener); IndexService indexService = newIndexService(module); IndexSettings x = indexService.getIndexSettings(); @@ -209,8 +208,7 @@ public void beforeIndexRemoved(IndexService indexService, 
IndexRemovalReason rea public void testListener() throws IOException { Setting booleanSetting = Setting.boolSetting("index.foo.bar", false, Property.Dynamic, Property.IndexScope); - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings, booleanSetting), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings, booleanSetting), emptyAnalysisRegistry); Setting booleanSetting2 = Setting.boolSetting("index.foo.bar.baz", false, Property.Dynamic, Property.IndexScope); AtomicBoolean atomicBoolean = new AtomicBoolean(false); module.addSettingsUpdateConsumer(booleanSetting, atomicBoolean::set); @@ -229,8 +227,7 @@ public void testListener() throws IOException { } public void testAddIndexOperationListener() throws IOException { - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings), emptyAnalysisRegistry); AtomicBoolean executed = new AtomicBoolean(false); IndexingOperationListener listener = new IndexingOperationListener() { @Override @@ -260,8 +257,7 @@ public Engine.Index preIndex(ShardId shardId, Engine.Index operation) { } public void testAddSearchOperationListener() throws IOException { - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings), emptyAnalysisRegistry); AtomicBoolean executed = new AtomicBoolean(false); SearchOperationListener listener = new SearchOperationListener() { @@ -294,8 +290,7 @@ public void testAddSimilarity() throws IOException { .put("index.similarity.my_similarity.key", "there is a key") .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), emptyAnalysisRegistry); module.addSimilarity("test_similarity", (string, providerSettings, indexLevelSettings) -> new SimilarityProvider() { @Override public String name() { @@ -318,8 +313,7 @@ public Similarity get() { } public void testFrozen() { - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings), emptyAnalysisRegistry); module.freeze(); String msg = "Can't modify IndexModule once the index service has been created"; assertEquals(msg, expectThrows(IllegalStateException.class, () -> module.addSearchOperationListener(null)).getMessage()); @@ -337,8 +331,7 @@ public void testSetupUnknownSimilarity() throws IOException { .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new 
AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), emptyAnalysisRegistry); Exception ex = expectThrows(IllegalArgumentException.class, () -> newIndexService(module)); assertEquals("Unknown Similarity type [test_similarity] for [my_similarity]", ex.getMessage()); } @@ -349,8 +342,7 @@ public void testSetupWithoutType() throws IOException { .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .build(); - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), emptyAnalysisRegistry); Exception ex = expectThrows(IllegalArgumentException.class, () -> newIndexService(module)); assertEquals("Similarity [my_similarity] must have an associated type", ex.getMessage()); } @@ -359,8 +351,7 @@ public void testForceCustomQueryCache() throws IOException { Settings indexSettings = Settings.builder() .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), emptyAnalysisRegistry); module.forceQueryCacheProvider((a, b) -> new CustomQueryCache()); expectThrows(AlreadySetException.class, () -> module.forceQueryCacheProvider((a, b) -> new CustomQueryCache())); IndexService indexService = newIndexService(module); @@ -372,8 +363,7 @@ public void testDefaultQueryCacheImplIsSelected() throws IOException { Settings indexSettings = Settings.builder() .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), emptyAnalysisRegistry); IndexService indexService = newIndexService(module); assertTrue(indexService.cache().query() instanceof IndexQueryCache); indexService.close("simon says", false); @@ -384,8 +374,7 @@ public void testDisableQueryCacheHasPrecedenceOverForceQueryCache() throws IOExc .put(IndexModule.INDEX_QUERY_CACHE_ENABLED_SETTING.getKey(), false) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), emptyAnalysisRegistry); module.forceQueryCacheProvider((a, b) -> new CustomQueryCache()); IndexService indexService = newIndexService(module); assertTrue(indexService.cache().query() instanceof DisabledQueryCache); diff --git 
a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java index 0edd2fbe2c08f..0370af7c0ef24 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java @@ -34,15 +34,16 @@ import org.elasticsearch.indices.analysis.AnalysisModule; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; import org.elasticsearch.indices.analysis.PreBuiltAnalyzers; +import org.elasticsearch.indices.analysis.PreBuiltCacheFactory; import org.elasticsearch.plugins.AnalysisPlugin; +import org.elasticsearch.plugins.AnalysisPlugin.PreBuiltTokenFilterSpec; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.IndexSettingsModule; import org.elasticsearch.test.VersionUtils; import java.io.IOException; -import java.util.Collections; -import java.util.HashMap; import java.util.Map; +import java.util.concurrent.atomic.AtomicBoolean; import static java.util.Collections.emptyMap; import static java.util.Collections.singletonList; @@ -52,7 +53,9 @@ public class AnalysisRegistryTests extends ESTestCase { - private AnalysisRegistry registry; + private Environment emptyEnvironment; + private AnalysisRegistry emptyRegistry; + private IndexSettings emptyIndexSettingsOfCurrentVersion; private static AnalyzerProvider analyzerProvider(final String name) { return new PreBuiltAnalyzerProvider(name, AnalyzerScope.INDEX, new EnglishAnalyzer()); @@ -61,12 +64,13 @@ private static AnalyzerProvider analyzerProvider(final String name) { @Override public void setUp() throws Exception { super.setUp(); - Settings settings = Settings - .builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .build(); - registry = new AnalysisRegistry(new Environment(settings), - emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()); + emptyEnvironment = new Environment(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build()); + emptyRegistry = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()); + emptyIndexSettingsOfCurrentVersion = IndexSettingsModule.newIndexSettings("index", Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .build()); } public void testDefaultAnalyzers() throws IOException { @@ -77,9 +81,7 @@ public void testDefaultAnalyzers() throws IOException { .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); - IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings), - emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) - .build(idxSettings); + IndexAnalyzers indexAnalyzers = emptyRegistry.build(idxSettings); assertThat(indexAnalyzers.getDefaultIndexAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class)); assertThat(indexAnalyzers.getDefaultSearchAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class)); assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class)); @@ -88,7 +90,7 @@ public void testDefaultAnalyzers() throws IOException { public void testOverrideDefaultAnalyzer() throws IOException { Version version = VersionUtils.randomVersion(random()); Settings settings = 
Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build(); - IndexAnalyzers indexAnalyzers = registry.build(IndexSettingsModule.newIndexSettings("index", settings), + IndexAnalyzers indexAnalyzers = emptyRegistry.build(IndexSettingsModule.newIndexSettings("index", settings), singletonMap("default", analyzerProvider("default")) , emptyMap(), emptyMap(), emptyMap(), emptyMap()); assertThat(indexAnalyzers.getDefaultIndexAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class)); @@ -101,7 +103,7 @@ public void testOverrideDefaultIndexAnalyzerIsUnsupported() { Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build(); AnalyzerProvider defaultIndex = new PreBuiltAnalyzerProvider("default_index", AnalyzerScope.INDEX, new EnglishAnalyzer()); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> registry.build(IndexSettingsModule.newIndexSettings("index", settings), + () -> emptyRegistry.build(IndexSettingsModule.newIndexSettings("index", settings), singletonMap("default_index", defaultIndex), emptyMap(), emptyMap(), emptyMap(), emptyMap())); assertTrue(e.getMessage().contains("[index.analysis.analyzer.default_index] is not supported")); } @@ -109,7 +111,7 @@ public void testOverrideDefaultIndexAnalyzerIsUnsupported() { public void testOverrideDefaultSearchAnalyzer() { Version version = VersionUtils.randomVersion(random()); Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build(); - IndexAnalyzers indexAnalyzers = registry.build(IndexSettingsModule.newIndexSettings("index", settings), + IndexAnalyzers indexAnalyzers = emptyRegistry.build(IndexSettingsModule.newIndexSettings("index", settings), singletonMap("default_search", analyzerProvider("default_search")), emptyMap(), emptyMap(), emptyMap(), emptyMap()); assertThat(indexAnalyzers.getDefaultIndexAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class)); assertThat(indexAnalyzers.getDefaultSearchAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class)); @@ -191,11 +193,12 @@ public void testBuiltInAnalyzersAreCached() throws IOException { Settings indexSettings = Settings.builder() .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); - IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings), - emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) - .build(idxSettings); - IndexAnalyzers otherIndexAnalyzers = new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), - emptyMap(), emptyMap()).build(idxSettings); + IndexAnalyzers indexAnalyzers = + new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) + .build(idxSettings); + IndexAnalyzers otherIndexAnalyzers = + new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) + .build(idxSettings); final int numIters = randomIntBetween(5, 20); for (int i = 0; i < numIters; i++) { PreBuiltAnalyzers preBuiltAnalyzers = RandomPicks.randomFrom(random(), PreBuiltAnalyzers.values()); @@ -203,6 +206,23 @@ public void testBuiltInAnalyzersAreCached() throws IOException { } } + public void testPreBuiltTokenFiltersAreCached() throws IOException { + AtomicBoolean built = new AtomicBoolean(false); + PreBuiltTokenFilterSpec assertsBuiltOnce = new PreBuiltTokenFilterSpec(false, 
PreBuiltCacheFactory.CachingStrategy.ONE, + (tokens, version) -> { + if (false == built.compareAndSet(false, true)) { + fail("Attempted to build the token filter twice when it should have been cached"); + } + return new MockTokenFilter(tokens, MockTokenFilter.EMPTY_STOPSET); + }); + try (AnalysisRegistry registryWithPreBuiltTokenFilter = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(), + emptyMap(), emptyMap(), singletonMap("asserts_built_once", assertsBuiltOnce))) { + IndexAnalyzers indexAnalyzers = registryWithPreBuiltTokenFilter.build(emptyIndexSettingsOfCurrentVersion); + IndexAnalyzers otherIndexAnalyzers = registryWithPreBuiltTokenFilter.build(emptyIndexSettingsOfCurrentVersion); + assertSame(indexAnalyzers.get("asserts_built_once"), otherIndexAnalyzers.get("asserts_built_once")); + } + } + public void testNoTypeOrTokenizerErrorMessage() throws IOException { Version version = VersionUtils.randomVersion(random()); Settings settings = Settings @@ -214,20 +234,14 @@ public void testNoTypeOrTokenizerErrorMessage() throws IOException { .build(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); - IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> new AnalysisRegistry(new Environment(settings), - emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()).build(idxSettings)); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> + new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) + .build(idxSettings)); assertThat(e.getMessage(), equalTo("analyzer [test_analyzer] must specify either an analyzer type, or a tokenizer")); } public void testCloseIndexAnalyzersMultipleTimes() throws IOException { - Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(); - Settings indexSettings = Settings.builder() - .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); - IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); - IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings), - emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) - .build(idxSettings); + IndexAnalyzers indexAnalyzers = emptyRegistry.build(emptyIndexSettingsOfCurrentVersion); indexAnalyzers.close(); indexAnalyzers.close(); } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/BuiltInAnalysisFactoryTests.java similarity index 68% rename from core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java rename to core/src/test/java/org/elasticsearch/index/analysis/BuiltInAnalysisFactoryTests.java index 0a62e8c491588..57ae76ac00bd4 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/BuiltInAnalysisFactoryTests.java @@ -19,8 +19,12 @@ package org.elasticsearch.index.analysis; -import org.elasticsearch.AnalysisFactoryTestCase; +import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; +import org.elasticsearch.plugins.AnalysisPlugin; -public class AnalysisFactoryTests extends AnalysisFactoryTestCase { - // tests are inherited and nothing needs to be defined here +public class BuiltInAnalysisFactoryTests extends AnalysisFactoryTestCase { + public BuiltInAnalysisFactoryTests() { + // Use an empty plugin 
that doesn't define anything so the test doesn't need a ton of null checks. + super(new AnalysisPlugin() {}); + } } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index bfd1bbdcc97b8..d1b060e023361 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -19,21 +19,54 @@ package org.elasticsearch.analysis.common; +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.commongrams.CommonGramsFilter; +import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter; +import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; +import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; +import org.apache.lucene.analysis.ngram.NGramTokenFilter; +import org.apache.lucene.analysis.standard.ClassicFilter; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; +import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.plugins.Plugin; -import java.util.HashMap; import java.util.Map; +import java.util.TreeMap; public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin { @Override public Map> getTokenFilters() { - Map> filters = new HashMap<>(); + Map> filters = new TreeMap<>(); filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new); filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new); filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new); return filters; } + + @Override + public Map getPreBuiltTokenFilters() { + Map filters = new TreeMap<>(); + filters.put("classic", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> + new ClassicFilter(input))); + filters.put("common_grams", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> + new CommonGramsFilter(input, CharArraySet.EMPTY_SET))); + filters.put("edge_ngram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> + new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE))); + // NOCOMMIT deprecate edgeNGram + filters.put("edgeNGram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> + new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE))); + filters.put("ngram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> + new NGramTokenFilter(input))); + // NOCOMMIT deprecate nGram + filters.put("nGram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> + new NGramTokenFilter(input))); + filters.put("truncate", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> + new TruncateTokenFilter(input, 10))); + filters.put("unique", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> + new UniqueTokenFilter(input))); + + return filters; + } } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java 
b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java index 886dad37b56f1..6b7832543dd90 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -19,25 +19,29 @@ package org.elasticsearch.analysis.common; -import org.elasticsearch.AnalysisFactoryTestCase; +import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; -import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.TreeMap; import static java.util.Collections.emptyList; import static java.util.stream.Collectors.toList; public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase { + public CommonAnalysisFactoryTests() { + super(new CommonAnalysisPlugin()); + } + @Override protected Map> getTokenizers() { - Map> tokenizers = new HashMap<>(super.getTokenizers()); + Map> tokenizers = new TreeMap<>(super.getTokenizers()); return tokenizers; } @Override protected Map> getTokenFilters() { - Map> filters = new HashMap<>(super.getTokenFilters()); + Map> filters = new TreeMap<>(super.getTokenFilters()); filters.put("asciifolding", ASCIIFoldingTokenFilterFactory.class); filters.put("worddelimiter", WordDelimiterTokenFilterFactory.class); filters.put("worddelimitergraph", WordDelimiterGraphTokenFilterFactory.class); @@ -46,7 +50,21 @@ protected Map> getTokenFilters() { @Override protected Map> getCharFilters() { - Map> filters = new HashMap<>(super.getCharFilters()); + Map> filters = new TreeMap<>(super.getCharFilters()); + return filters; + } + + @Override + protected Map> getPreBuiltTokenFilters() { + Map> filters = new TreeMap<>(super.getPreBuiltTokenFilters()); + filters.put("classic", null); + filters.put("common_grams", null); + filters.put("edge_ngram", null); + filters.put("edgeNGram", null); + filters.put("ngram", null); + filters.put("nGram", null); + filters.put("truncate", null); + filters.put("unique", Void.class); return filters; } diff --git a/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/AnalysisICUFactoryTests.java b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/AnalysisICUFactoryTests.java index 704ca61985aa6..d222189651e1c 100644 --- a/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/AnalysisICUFactoryTests.java +++ b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/AnalysisICUFactoryTests.java @@ -19,12 +19,16 @@ package org.elasticsearch.index.analysis; -import org.elasticsearch.AnalysisFactoryTestCase; +import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; +import org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin; import java.util.HashMap; import java.util.Map; public class AnalysisICUFactoryTests extends AnalysisFactoryTestCase { + public AnalysisICUFactoryTests() { + super(new AnalysisICUPlugin()); + } @Override protected Map> getTokenizers() { diff --git a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/AnalysisKuromojiFactoryTests.java b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/AnalysisKuromojiFactoryTests.java index 9db7def101ef8..dbdc5795b38f8 100644 --- a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/AnalysisKuromojiFactoryTests.java +++ b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/AnalysisKuromojiFactoryTests.java @@ -20,12 +20,16 
@@ package org.elasticsearch.index.analysis; import org.apache.lucene.analysis.ja.JapaneseTokenizerFactory; -import org.elasticsearch.AnalysisFactoryTestCase; +import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; +import org.elasticsearch.plugin.analysis.kuromoji.AnalysisKuromojiPlugin; import java.util.HashMap; import java.util.Map; public class AnalysisKuromojiFactoryTests extends AnalysisFactoryTestCase { + public AnalysisKuromojiFactoryTests() { + super(new AnalysisKuromojiPlugin()); + } @Override protected Map> getTokenizers() { diff --git a/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java b/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java index 0546fb468c924..8c551aee9190e 100644 --- a/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java +++ b/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java @@ -19,12 +19,16 @@ package org.elasticsearch.index.analysis; -import org.elasticsearch.AnalysisFactoryTestCase; +import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; +import org.elasticsearch.plugin.analysis.AnalysisPhoneticPlugin; import java.util.HashMap; import java.util.Map; public class AnalysisPhoneticFactoryTests extends AnalysisFactoryTestCase { + public AnalysisPhoneticFactoryTests() { + super(new AnalysisPhoneticPlugin()); + } @Override protected Map> getTokenFilters() { diff --git a/plugins/analysis-smartcn/src/test/java/org/elasticsearch/index/analysis/AnalysisSmartChineseFactoryTests.java b/plugins/analysis-smartcn/src/test/java/org/elasticsearch/index/analysis/AnalysisSmartChineseFactoryTests.java index d8aad322dcb93..53652c55f018a 100644 --- a/plugins/analysis-smartcn/src/test/java/org/elasticsearch/index/analysis/AnalysisSmartChineseFactoryTests.java +++ b/plugins/analysis-smartcn/src/test/java/org/elasticsearch/index/analysis/AnalysisSmartChineseFactoryTests.java @@ -19,13 +19,16 @@ package org.elasticsearch.index.analysis; -import org.elasticsearch.AnalysisFactoryTestCase; +import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; +import org.elasticsearch.plugin.analysis.smartcn.AnalysisSmartChinesePlugin; import java.util.HashMap; import java.util.Map; public class AnalysisSmartChineseFactoryTests extends AnalysisFactoryTestCase { - + public AnalysisSmartChineseFactoryTests() { + super(new AnalysisSmartChinesePlugin()); + } @Override protected Map> getTokenizers() { Map> tokenizers = new HashMap<>(super.getTokenizers()); diff --git a/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java b/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java index 8301529627670..ae78b9c01b3f8 100644 --- a/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java +++ b/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java @@ -23,7 +23,6 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.Tokenizer; -import org.elasticsearch.AnalysisFactoryTestCase; import org.elasticsearch.Version; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.UUIDs; @@ -31,12 +30,17 @@ import org.elasticsearch.env.Environment; import 
org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.pl.PolishStemTokenFilterFactory; +import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; +import org.elasticsearch.plugin.analysis.stempel.AnalysisStempelPlugin; import java.io.IOException; import java.util.HashMap; import java.util.Map; public class AnalysisPolishFactoryTests extends AnalysisFactoryTestCase { + public AnalysisPolishFactoryTests() { + super(new AnalysisStempelPlugin()); + } @Override protected Map> getTokenFilters() { diff --git a/test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java similarity index 90% rename from test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java rename to test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index 7f60058788a18..429c722a2ebf2 100644 --- a/test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -17,7 +17,7 @@ * under the License. */ -package org.elasticsearch; +package org.elasticsearch.indices.analysis; import org.apache.lucene.analysis.en.PorterStemFilterFactory; import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory; @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.util.CharFilterFactory; import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.analysis.util.TokenizerFactory; +import org.elasticsearch.Version; import org.elasticsearch.common.collect.MapBuilder; import org.elasticsearch.index.analysis.ApostropheFilterFactory; import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory; @@ -95,17 +96,23 @@ import org.elasticsearch.indices.analysis.PreBuiltCharFilters; import org.elasticsearch.indices.analysis.PreBuiltTokenFilters; import org.elasticsearch.indices.analysis.PreBuiltTokenizers; +import org.elasticsearch.plugins.AnalysisPlugin; +import org.elasticsearch.plugins.AnalysisPlugin.PreBuiltTokenFilterSpec; import org.elasticsearch.test.ESTestCase; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; +import java.util.Locale; import java.util.Map; +import java.util.Objects; import java.util.Set; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; +import static java.util.Collections.singletonList; + /** * Alerts us if new analysis components are added to Lucene, so we don't miss them. *

@@ -287,41 +294,6 @@ private static String toCamelCase(String s) {
         .immutableMap();
 
-    static final Map<PreBuiltTokenFilters, Class<?>> PREBUILT_TOKENFILTERS;
-    static {
-        PREBUILT_TOKENFILTERS = new HashMap<>();
-        for (PreBuiltTokenFilters tokenizer : PreBuiltTokenFilters.values()) {
-            Class<?> luceneFactoryClazz;
-            switch (tokenizer) {
-                case REVERSE:
-                    luceneFactoryClazz = ReverseStringFilterFactory.class;
-                    break;
-                case UNIQUE:
-                    luceneFactoryClazz = Void.class;
-                    break;
-                case SNOWBALL:
-                case DUTCH_STEM:
-                case FRENCH_STEM:
-                case RUSSIAN_STEM:
-                    luceneFactoryClazz = SnowballPorterFilterFactory.class;
-                    break;
-                case STEMMER:
-                    luceneFactoryClazz = PorterStemFilterFactory.class;
-                    break;
-                case DELIMITED_PAYLOAD_FILTER:
-                    luceneFactoryClazz = org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class;
-                    break;
-                case LIMIT:
-                    luceneFactoryClazz = org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory.class;
-                    break;
-                default:
-                    luceneFactoryClazz = org.apache.lucene.analysis.util.TokenFilterFactory.lookupClass(
-                            toCamelCase(tokenizer.getTokenFilterFactory(Version.CURRENT).name()));
-            }
-            PREBUILT_TOKENFILTERS.put(tokenizer, luceneFactoryClazz);
-        }
-    }
-
     static final Map<String, Class<?>> KNOWN_CHARFILTERS = new MapBuilder<String, Class<?>>()
         // exposed in ES
         .put("htmlstrip", HtmlStripCharFilterFactory.class)
@@ -347,6 +319,15 @@ private static String toCamelCase(String s) {
         }
     }
 
+    /**
+     * The plugin being tested. Core uses an "empty" plugin so we don't have to throw null checks all over the place.
+     */
+    private final AnalysisPlugin plugin;
+
+    public AnalysisFactoryTestCase(AnalysisPlugin plugin) {
+        this.plugin = Objects.requireNonNull(plugin, "plugin is required. use an empty plugin for core");
+    }
+
     protected Map<String, Class<?>> getTokenizers() {
         return KNOWN_TOKENIZERS;
     }
@@ -355,6 +336,47 @@ protected Map<String, Class<?>> getTokenFilters() {
         return KNOWN_TOKENFILTERS;
     }
 
+    /**
+     * Map containing pre-built token filters that should be available after installing this plugin. The map is from the name of the token
+     * filter to the class of the Lucene {@link TokenFilterFactory} that it is emulating. If the Lucene filter factory is {@code null} then
+     * the test will look it up for you using {@link TokenFilterFactory#lookupClass(String)}. If there is no Lucene
+     * {@linkplain TokenFilterFactory} then the right hand side should be {@link Void}.
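+     * <p>
+     * A sketch of the three kinds of entries, using names that appear elsewhere in this patch:
+     * <pre>
+     * filters.put("reverse", ReverseStringFilterFactory.class); // emulates a known Lucene factory
+     * filters.put("classic", null);                             // test looks the Lucene factory up by name
+     * filters.put("unique", Void.class);                        // no Lucene equivalent to compare against
+     * </pre>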
+ */ + protected Map> getPreBuiltTokenFilters() { + // NOCOMMIT use this: +// return singletonMap("standard", null); + // Temporary builtin list until I remove them all + Map> filters = new HashMap<>(); + for (PreBuiltTokenFilters tokenizer : PreBuiltTokenFilters.values()) { + Class luceneFactoryClass; + switch (tokenizer) { + case REVERSE: + luceneFactoryClass = ReverseStringFilterFactory.class; + break; + case SNOWBALL: + case DUTCH_STEM: + case FRENCH_STEM: + case RUSSIAN_STEM: + luceneFactoryClass = SnowballPorterFilterFactory.class; + break; + case STEMMER: + luceneFactoryClass = PorterStemFilterFactory.class; + break; + case DELIMITED_PAYLOAD_FILTER: + luceneFactoryClass = org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class; + break; + case LIMIT: + luceneFactoryClass = org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory.class; + break; + default: + luceneFactoryClass = org.apache.lucene.analysis.util.TokenFilterFactory.lookupClass( + toCamelCase(tokenizer.getTokenFilterFactory(Version.CURRENT).name())); + } + filters.put(tokenizer.name().toLowerCase(Locale.ROOT), luceneFactoryClass); + } + return filters; + } + protected Map> getCharFilters() { return KNOWN_CHARFILTERS; } @@ -447,18 +469,24 @@ public void testPreBuiltMultiTermAware() { expected.add(tokenizer); } } - for (Map.Entry> entry : PREBUILT_TOKENFILTERS.entrySet()) { - PreBuiltTokenFilters tokenFilter = entry.getKey(); + Map preBuiltTokenFilters = AnalysisModule.setupPreBuiltTokenFilters(singletonList(plugin)); + for (Map.Entry> entry : getPreBuiltTokenFilters().entrySet()) { + String name = entry.getKey(); Class luceneFactory = entry.getValue(); if (luceneFactory == Void.class) { continue; } + if (luceneFactory == null) { + luceneFactory = TokenFilterFactory.lookupClass(name); + } assertTrue(TokenFilterFactory.class.isAssignableFrom(luceneFactory)); - if (tokenFilter.getTokenFilterFactory(Version.CURRENT) instanceof MultiTermAwareComponent) { - actual.add(tokenFilter); + PreBuiltTokenFilterSpec spec = preBuiltTokenFilters.get(name); + assertNotNull("test claims pre built token filter [" + name + "] should be available but it wasn't", spec); + if (spec.shouldUseFilterForMultitermQueries()) { + actual.add("token filter [" + name + "]"); } if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(luceneFactory)) { - expected.add(tokenFilter); + expected.add("token filter [" + name + "]"); } } for (Map.Entry> entry : PREBUILT_CHARFILTERS.entrySet()) { From 1a110bd1dfb7faaeb40ddc70c097c4c912dbcace Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 20 Apr 2017 12:22:48 -0400 Subject: [PATCH 02/20] More --- .../analysis/PreBuiltTokenFilters.java | 25 ------------------- .../elasticsearch/plugins/AnalysisPlugin.java | 2 +- .../analysis/common/CommonAnalysisPlugin.java | 16 ++++++++++++ .../common/CommonAnalysisFactoryTests.java | 2 ++ .../analysis/AnalysisFactoryTestCase.java | 9 +++---- 5 files changed, 23 insertions(+), 31 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java index 56f3b7809de4c..670c4f3495ce4 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java @@ -75,31 +75,6 @@ import java.util.Locale; public enum PreBuiltTokenFilters { - - WORD_DELIMITER(CachingStrategy.ONE) { - @Override - public 
TokenStream create(TokenStream tokenStream, Version version) { - return new WordDelimiterFilter(tokenStream, - WordDelimiterFilter.GENERATE_WORD_PARTS | - WordDelimiterFilter.GENERATE_NUMBER_PARTS | - WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | - WordDelimiterFilter.SPLIT_ON_NUMERICS | - WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null); - } - }, - - WORD_DELIMITER_GRAPH(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new WordDelimiterGraphFilter(tokenStream, - WordDelimiterGraphFilter.GENERATE_WORD_PARTS | - WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS | - WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE | - WordDelimiterGraphFilter.SPLIT_ON_NUMERICS | - WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null); - } - }, - STOP(CachingStrategy.LUCENE) { @Override public TokenStream create(TokenStream tokenStream, Version version) { diff --git a/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java b/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java index a77ce0dfe4953..634698a676c04 100644 --- a/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java +++ b/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java @@ -96,7 +96,7 @@ default Map getHunspellD class PreBuiltTokenFilterSpec { private final boolean useFilterForMultitermQueries; - private final PreBuiltCacheFactory.CachingStrategy cachingStrategy; + private final PreBuiltCacheFactory.CachingStrategy cachingStrategy; private final BiFunction create; /** diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index d1b060e023361..0f467189cde26 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -23,6 +23,8 @@ import org.apache.lucene.analysis.commongrams.CommonGramsFilter; import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter; import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.ngram.NGramTokenFilter; import org.apache.lucene.analysis.standard.ClassicFilter; @@ -66,6 +68,20 @@ public Map getPreBuiltTokenFilters() { new TruncateTokenFilter(input, 10))); filters.put("unique", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> new UniqueTokenFilter(input))); + filters.put("word_delimiter", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> + new WordDelimiterFilter(input, + WordDelimiterFilter.GENERATE_WORD_PARTS + | WordDelimiterFilter.GENERATE_NUMBER_PARTS + | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE + | WordDelimiterFilter.SPLIT_ON_NUMERICS + | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null))); + filters.put("word_delimiter_graph", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> + new WordDelimiterGraphFilter(input, + WordDelimiterGraphFilter.GENERATE_WORD_PARTS + | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS + | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE + | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS + | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, 
null))); return filters; } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java index 6b7832543dd90..0b33b8194cf42 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -65,6 +65,8 @@ protected Map> getPreBuiltTokenFilters() { filters.put("nGram", null); filters.put("truncate", null); filters.put("unique", Void.class); + filters.put("word_delimiter", null); + filters.put("word_delimiter_graph", null); return filters; } diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index 429c722a2ebf2..4d5dc63c652b9 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -339,8 +339,8 @@ protected Map> getTokenFilters() { /** * Map containing pre-built token filters that should be available after installing this plugin. The map is from the name of the token * filter to the class of the Lucene {@link TokenFilterFactory} that it is emulating. If the Lucene filter factory is {@code null} then - * the test will look it up for you using {@link TokenFilterFactory#lookupClass(String)}. If there is no Lucene - * {@linkplain TokenFilterFactory} then the right hand side should be {@link Void}. + * the test will look it up for you from the name. If there is no Lucene {@linkplain TokenFilterFactory} then the right hand side should + * be {@link Void}. 
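+ * For example, a plugin might declare (an illustrative sketch; these mappings mirror the ones CommonAnalysisFactoryTests uses):
+ * <pre>
+ * filters.put("reverse", ReverseStringFilterFactory.class); // emulates a real Lucene factory
+ * filters.put("trim", null);                                // Lucene factory looked up from the name
+ * filters.put("unique", Void.class);                        // no Lucene factory to emulate
+ * </pre>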
*/ protected Map> getPreBuiltTokenFilters() { // NOCOMMIT use this: @@ -369,8 +369,7 @@ protected Map> getPreBuiltTokenFilters() { luceneFactoryClass = org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory.class; break; default: - luceneFactoryClass = org.apache.lucene.analysis.util.TokenFilterFactory.lookupClass( - toCamelCase(tokenizer.getTokenFilterFactory(Version.CURRENT).name())); + luceneFactoryClass = null; } filters.put(tokenizer.name().toLowerCase(Locale.ROOT), luceneFactoryClass); } @@ -477,7 +476,7 @@ public void testPreBuiltMultiTermAware() { continue; } if (luceneFactory == null) { - luceneFactory = TokenFilterFactory.lookupClass(name); + luceneFactory = TokenFilterFactory.lookupClass(toCamelCase(name)); } assertTrue(TokenFilterFactory.class.isAssignableFrom(luceneFactory)); PreBuiltTokenFilterSpec spec = preBuiltTokenFilters.get(name); From baaa06d294462622ffd10c0f2a08e328da63e760 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 20 Apr 2017 12:55:51 -0400 Subject: [PATCH 03/20] Further --- .../analysis/PreBuiltTokenFilters.java | 27 ------------------- .../indices/TransportAnalyzeActionTests.java | 4 +-- .../analysis/common/CommonAnalysisPlugin.java | 8 ++++++ .../analysis-common/40_token_filters.yaml | 12 +++++++++ 4 files changed, 22 insertions(+), 29 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java index 670c4f3495ce4..e583afcab13eb 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java @@ -18,9 +18,7 @@ */ package org.elasticsearch.indices.analysis; -import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.LowerCaseFilter; -import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; import org.apache.lucene.analysis.ar.ArabicStemFilter; @@ -28,9 +26,7 @@ import org.apache.lucene.analysis.cjk.CJKBigramFilter; import org.apache.lucene.analysis.cjk.CJKWidthFilter; import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter; -import org.apache.lucene.analysis.commongrams.CommonGramsFilter; import org.apache.lucene.analysis.core.DecimalDigitFilter; -import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.UpperCaseFilter; import org.apache.lucene.analysis.cz.CzechStemFilter; import org.apache.lucene.analysis.de.GermanNormalizationFilter; @@ -47,20 +43,11 @@ import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; import org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter; import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter; -import org.apache.lucene.analysis.miscellaneous.TrimFilter; -import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter; -import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; -import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; -import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter; -import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; -import org.apache.lucene.analysis.ngram.NGramTokenFilter; import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter; import org.apache.lucene.analysis.reverse.ReverseStringFilter; 
import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.analysis.snowball.SnowballFilter; -import org.apache.lucene.analysis.standard.ClassicFilter; -import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.tr.ApostropheFilter; import org.apache.lucene.analysis.util.ElisionFilter; import org.elasticsearch.Version; @@ -75,20 +62,6 @@ import java.util.Locale; public enum PreBuiltTokenFilters { - STOP(CachingStrategy.LUCENE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new StopFilter(tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET); - } - }, - - TRIM(CachingStrategy.LUCENE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new TrimFilter(tokenStream); - } - }, - REVERSE(CachingStrategy.LUCENE) { @Override public TokenStream create(TokenStream tokenStream, Version version) { diff --git a/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java b/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java index 57a83b2c68081..2f64069f38381 100644 --- a/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java +++ b/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java @@ -48,8 +48,8 @@ import static java.util.Collections.singletonMap; /** - * Tests for {@link TransportAnalyzeAction}. See the more "intense" version of this test in the - * {@code common-analysis} module. + * Tests for {@link TransportAnalyzeAction}. See the rest tests in the {@code analysis-common} module for places where this code gets a ton + * more exercise. */ public class TransportAnalyzeActionTests extends ESTestCase { diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index 0f467189cde26..d7a137af9322a 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -20,7 +20,10 @@ package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.commongrams.CommonGramsFilter; +import org.apache.lucene.analysis.core.StopAnalyzer; +import org.apache.lucene.analysis.miscellaneous.TrimFilter; import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter; import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; @@ -49,6 +52,7 @@ public Map> getTokenFilters() { @Override public Map getPreBuiltTokenFilters() { + // TODO we should revisit the caching strategies. 
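+ // For reference, PreBuiltCacheFactory's strategies: ONE caches a single instance of the filter, LUCENE caches one
+ // instance per Lucene version, and ELASTICSEARCH caches one instance per Elasticsearch version.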
Map filters = new TreeMap<>(); filters.put("classic", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> new ClassicFilter(input))); @@ -64,6 +68,10 @@ public Map getPreBuiltTokenFilters() { // NOCOMMIT deprecate nGram filters.put("nGram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> new NGramTokenFilter(input))); + filters.put("stop", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> + new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET))); + filters.put("trim", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> + new TrimFilter(input))); filters.put("truncate", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> new TruncateTokenFilter(input, 10))); filters.put("unique", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yaml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yaml index ac5bcb82e5783..5a636d91e7675 100644 --- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yaml +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yaml @@ -10,6 +10,18 @@ - length: { tokens: 1 } - match: { tokens.0.token: Musee d'Orsay } + - do: + indices.analyze: + body: + text: Musée d'Orsay + tokenizer: keyword + filter: + - type: asciifolding + preserve_original: true + - length: { tokens: 2 } + - match: { tokens.0.token: Musee d'Orsay } + - match: { tokens.1.token: Musée d'Orsay } + --- "lowercase": - do: From 9decf6aa5a12fecf7095b6a5bbf6c0bf9c9bc50e Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 20 Apr 2017 13:06:15 -0400 Subject: [PATCH 04/20] Fix more --- .../analysis/PreBuiltTokenFilters.java | 18 ----------- .../indices/TransportAnalyzeActionTests.java | 31 ++++++++++--------- .../analysis/common/CommonAnalysisPlugin.java | 6 ++++ .../common/CommonAnalysisFactoryTests.java | 5 +++ .../analysis/AnalysisFactoryTestCase.java | 3 -- 5 files changed, 27 insertions(+), 36 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java index e583afcab13eb..a1bbc9a9a5dab 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java @@ -62,24 +62,6 @@ import java.util.Locale; public enum PreBuiltTokenFilters { - REVERSE(CachingStrategy.LUCENE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new ReverseStringFilter(tokenStream); - } - }, - - ASCIIFOLDING(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new ASCIIFoldingFilter(tokenStream); - } - @Override - protected boolean isMultiTermAware() { - return true; - } - }, - LENGTH(CachingStrategy.LUCENE) { @Override public TokenStream create(TokenStream tokenStream, Version version) { diff --git a/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java b/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java index 2f64069f38381..0e1414bdbefda 100644 --- 
a/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java +++ b/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java @@ -90,7 +90,11 @@ public Map> getTokenFilters() { indexAnalyzers = registry.build(idxSettings); } + /** + * Test behavior when the named analysis component isn't defined on the index. In that case we should build with defaults. + */ public void testNoIndexAnalyzers() throws IOException { + // Refer to an analyzer by its type so we get its default configuration AnalyzeRequest request = new AnalyzeRequest(); request.analyzer("standard"); request.text("the quick brown fox"); @@ -98,33 +102,30 @@ public void testNoIndexAnalyzers() throws IOException { List tokens = analyze.getTokens(); assertEquals(4, tokens.size()); + // Refer to a token filter by its type so we get its default configuration request.analyzer(null); request.tokenizer("whitespace"); - request.addTokenFilter("lowercase"); - request.addTokenFilter("word_delimiter"); + request.addTokenFilter("mock"); request.text("the qu1ck brown fox"); analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, randomBoolean() ? indexAnalyzers : null, registry, environment); tokens = analyze.getTokens(); - assertEquals(6, tokens.size()); - assertEquals("qu", tokens.get(1).getTerm()); - assertEquals("1", tokens.get(2).getTerm()); - assertEquals("ck", tokens.get(3).getTerm()); + assertEquals(3, tokens.size()); + assertEquals("qu1ck", tokens.get(0).getTerm()); + assertEquals("brown", tokens.get(1).getTerm()); + assertEquals("fox", tokens.get(2).getTerm()); + // Refer to a char filter by its type so we get its default configuration request.analyzer(null); request.tokenizer("whitespace"); request.addCharFilter("html_strip"); - request.addTokenFilter("lowercase"); - request.addTokenFilter("word_delimiter"); + request.addTokenFilter("mock"); request.text("
<html>the qu1ck brown fox</html>
"); analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, randomBoolean() ? indexAnalyzers : null, registry, environment); tokens = analyze.getTokens(); - assertEquals(6, tokens.size()); - assertEquals("the", tokens.get(0).getTerm()); - assertEquals("qu", tokens.get(1).getTerm()); - assertEquals("1", tokens.get(2).getTerm()); - assertEquals("ck", tokens.get(3).getTerm()); - assertEquals("brown", tokens.get(4).getTerm()); - assertEquals("fox", tokens.get(5).getTerm()); + assertEquals(3, tokens.size()); + assertEquals("qu1ck", tokens.get(0).getTerm()); + assertEquals("brown", tokens.get(1).getTerm()); + assertEquals("fox", tokens.get(2).getTerm()); } public void testFillsAttributes() throws IOException { diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index d7a137af9322a..099ea6369d246 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.commongrams.CommonGramsFilter; import org.apache.lucene.analysis.core.StopAnalyzer; +import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; import org.apache.lucene.analysis.miscellaneous.TrimFilter; import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter; import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; @@ -30,6 +31,7 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.ngram.NGramTokenFilter; +import org.apache.lucene.analysis.reverse.ReverseStringFilter; import org.apache.lucene.analysis.standard.ClassicFilter; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; @@ -54,6 +56,8 @@ public Map> getTokenFilters() { public Map getPreBuiltTokenFilters() { // TODO we should revisit the caching strategies. 
Map filters = new TreeMap<>(); + filters.put("asciifolding", new PreBuiltTokenFilterSpec(true, CachingStrategy.ONE, (input, version) -> + new ASCIIFoldingFilter(input))); filters.put("classic", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> new ClassicFilter(input))); filters.put("common_grams", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> @@ -68,6 +72,8 @@ public Map getPreBuiltTokenFilters() { // NOCOMMIT deprecate nGram filters.put("nGram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> new NGramTokenFilter(input))); + filters.put("reverse", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> + new ReverseStringFilter(input))); filters.put("stop", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET))); filters.put("trim", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java index 0b33b8194cf42..f3f948de6b2fe 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -19,6 +19,7 @@ package org.elasticsearch.analysis.common; +import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory; import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; import java.util.List; @@ -57,12 +58,16 @@ protected Map> getCharFilters() { @Override protected Map> getPreBuiltTokenFilters() { Map> filters = new TreeMap<>(super.getPreBuiltTokenFilters()); + filters.put("asciifolding", null); filters.put("classic", null); filters.put("common_grams", null); filters.put("edge_ngram", null); filters.put("edgeNGram", null); filters.put("ngram", null); filters.put("nGram", null); + filters.put("reverse", ReverseStringFilterFactory.class); + filters.put("stop", null); + filters.put("trim", null); filters.put("truncate", null); filters.put("unique", Void.class); filters.put("word_delimiter", null); diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index 4d5dc63c652b9..260d0852cef91 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -350,9 +350,6 @@ protected Map> getPreBuiltTokenFilters() { for (PreBuiltTokenFilters tokenizer : PreBuiltTokenFilters.values()) { Class luceneFactoryClass; switch (tokenizer) { - case REVERSE: - luceneFactoryClass = ReverseStringFilterFactory.class; - break; case SNOWBALL: case DUTCH_STEM: case FRENCH_STEM: From 0b54ba2df305ccd059223ebdd20a90e508118259 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 20 Apr 2017 13:22:22 -0400 Subject: [PATCH 05/20] More tests --- .../index/analysis/CustomNormalizerTests.java | 30 ++++++++++++++----- .../index/mapper/KeywordFieldMapperTests.java | 26 ++++++++++++---- 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java 
b/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java index 3e71a60973724..90414dfd2a436 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java @@ -19,28 +19,44 @@ package org.elasticsearch.index.analysis; +import org.apache.lucene.analysis.MockLowerCaseFilter; import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; +import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; +import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTokenStreamTestCase; import java.io.IOException; +import java.util.Map; +import java.util.TreeMap; public class CustomNormalizerTests extends ESTokenStreamTestCase { + private static final AnalysisPlugin MOCK_ANALYSIS_PLUGIN = new AnalysisPlugin() { + @Override + public Map getPreBuiltTokenFilters() { + Map filters = new TreeMap<>(); + filters.put("mock_lowercase", new PreBuiltTokenFilterSpec(true, CachingStrategy.ONE, (input, version) -> + new MockLowerCaseFilter(input))); + filters.put("mock_forbidden", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> + new MockLowerCaseFilter(input))); + return filters; + } + }; public void testBasics() throws IOException { Settings settings = Settings.builder() - .putArray("index.analysis.normalizer.my_normalizer.filter", "lowercase", "asciifolding") + .putArray("index.analysis.normalizer.my_normalizer.filter", "mock_lowercase") .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, MOCK_ANALYSIS_PLUGIN); assertNull(analysis.indexAnalyzers.get("my_normalizer")); NamedAnalyzer normalizer = analysis.indexAnalyzers.getNormalizer("my_normalizer"); assertNotNull(normalizer); assertEquals("my_normalizer", normalizer.name()); - assertTokenStreamContents(normalizer.tokenStream("foo", "Cet été-là"), new String[] {"cet ete-la"}); - assertEquals(new BytesRef("cet ete-la"), normalizer.normalize("foo", "Cet été-là")); + assertTokenStreamContents(normalizer.tokenStream("foo", "Cet été-là"), new String[] {"cet été-là"}); + assertEquals(new BytesRef("cet été-là"), normalizer.normalize("foo", "Cet été-là")); } public void testUnknownType() { @@ -82,12 +98,12 @@ public void testCharFilters() throws IOException { public void testIllegalFilters() throws IOException { Settings settings = Settings.builder() - .putArray("index.analysis.normalizer.my_normalizer.filter", "porter_stem") + .putArray("index.analysis.normalizer.my_normalizer.filter", "mock_forbidden") .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings)); - assertEquals("Custom normalizer [my_normalizer] may not use filter [porter_stem]", e.getMessage()); + () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, MOCK_ANALYSIS_PLUGIN)); + assertEquals("Custom normalizer [my_normalizer] may not use filter [mock_forbidden]", e.getMessage()); } public void testIllegalCharFilters() throws IOException { diff --git 
a/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java b/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java index 2da44d57f00aa..ec90daca1ab81 100644 --- a/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java +++ b/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java @@ -19,6 +19,7 @@ package org.elasticsearch.index.mapper; +import org.apache.lucene.analysis.MockLowerCaseFilter; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexableField; @@ -30,6 +31,8 @@ import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.index.IndexService; import org.elasticsearch.index.mapper.MapperService.MergeReason; +import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; +import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESSingleNodeTestCase; import org.elasticsearch.test.InternalSettingsPlugin; @@ -38,15 +41,28 @@ import java.io.IOException; import java.util.Arrays; import java.util.Collection; +import java.util.Map; +import java.util.TreeMap; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; public class KeywordFieldMapperTests extends ESSingleNodeTestCase { + public static class MockAnalysisPlugin extends Plugin implements AnalysisPlugin { + @Override + public Map getPreBuiltTokenFilters() { + Map filters = new TreeMap<>(); + filters.put("mock_lowercase", new PreBuiltTokenFilterSpec(true, CachingStrategy.ONE, (input, version) -> + new MockLowerCaseFilter(input))); + filters.put("mock_other_lowercase", new PreBuiltTokenFilterSpec(true, CachingStrategy.ONE, (input, version) -> + new MockLowerCaseFilter(input))); + return filters; + } + }; @Override protected Collection> getPlugins() { - return pluginList(InternalSettingsPlugin.class); + return pluginList(InternalSettingsPlugin.class, MockAnalysisPlugin.class); } IndexService indexService; @@ -56,9 +72,9 @@ protected Collection> getPlugins() { public void setup() { indexService = createIndex("test", Settings.builder() .put("index.analysis.normalizer.my_lowercase.type", "custom") - .putArray("index.analysis.normalizer.my_lowercase.filter", "lowercase") - .put("index.analysis.normalizer.my_asciifolding.type", "custom") - .putArray("index.analysis.normalizer.my_asciifolding.filter", "asciifolding").build()); + .putArray("index.analysis.normalizer.my_lowercase.filter", "mock_lowercase") + .put("index.analysis.normalizer.my_other_lowercase.type", "custom") + .putArray("index.analysis.normalizer.my_other_lowercase.filter", "mock_other_lowercase").build()); parser = indexService.mapperService().documentMapperParser(); } @@ -348,7 +364,7 @@ public void testUpdateNormalizer() throws IOException { String mapping2 = XContentFactory.jsonBuilder().startObject().startObject("type") .startObject("properties").startObject("field") - .field("type", "keyword").field("normalizer", "my_asciifolding").endObject().endObject() + .field("type", "keyword").field("normalizer", "my_other_lowercase").endObject().endObject() .endObject().endObject().string(); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> indexService.mapperService().merge("type", From 2d095313099290b3fea0d69980965e8a6fd58c74 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 20 Apr 2017 13:38:56 -0400 Subject: [PATCH 06/20] More --- 
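Note for reviewers (below the break, so git am drops it): the extension point that the mock plugins in the previous patch implement boils down to the sketch here. The plugin class and filter name are invented for illustration; the spec's constructor arguments are the same ones the tests pass.

import static java.util.Collections.singletonMap;

import java.util.Map;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.AnalysisPlugin.PreBuiltTokenFilterSpec;
import org.elasticsearch.plugins.Plugin;

public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, PreBuiltTokenFilterSpec> getPreBuiltTokenFilters() {
        // true: the filter is safe for multi-term queries; ONE: cache a single instance of the filter.
        return singletonMap("my_lowercase",
                new PreBuiltTokenFilterSpec(true, CachingStrategy.ONE, (input, version) -> new LowerCaseFilter(input)));
    }
}

AnalysisModule.setupPreBuiltTokenFilters merges these specs with the filters it registers itself and with the remaining PreBuiltTokenFilters shim, and AnalysisRegistry wraps each spec in a PreBuiltTokenFilterFactoryProvider, so plugin-provided and built-in pre-built filters resolve through the same AnalysisProvider interface.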
.../indices/analysis/AnalysisModule.java | 20 ++++++++++++++----- .../analysis/PreBuiltTokenFilters.java | 10 ---------- .../analysis/common/CommonAnalysisPlugin.java | 4 ++++ .../common/CommonAnalysisFactoryTests.java | 1 + .../analysis/AnalysisFactoryTestCase.java | 13 ++++++------ 5 files changed, 26 insertions(+), 22 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index 71757fe45e2e4..49c52a44be141 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -19,6 +19,7 @@ package org.elasticsearch.indices.analysis; +import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.standard.StandardFilter; import org.elasticsearch.Version; import org.elasticsearch.cluster.metadata.IndexMetaData; @@ -272,13 +273,22 @@ private NamedRegistry> setupTokenFilters(Li static Map setupPreBuiltTokenFilters(List plugins) { NamedRegistry preBuiltTokenFilters = new NamedRegistry<>("pre built token_filter"); + preBuiltTokenFilters.register("lowercase", new PreBuiltTokenFilterSpec(true, CachingStrategy.LUCENE, (inputs, version) -> + new LowerCaseFilter(inputs))); preBuiltTokenFilters.register("standard", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (inputs, version) -> - new StandardFilter(inputs))); + new StandardFilter(inputs))); for (PreBuiltTokenFilters preBuilt : PreBuiltTokenFilters.values()) { - preBuiltTokenFilters.register(preBuilt.name().toLowerCase(Locale.ROOT), new PreBuiltTokenFilterSpec( // NOCOMMIT remove this shim - preBuilt.isMultiTermAware(), - CachingStrategy.ELASTICSEARCH, // This is the most granular/safest/whatever - preBuilt::create)); + // NOCOMMIT remove this shim + switch (preBuilt) { + case LOWERCASE: + // This has been migrated but has to stick around until PreBuiltAnalyzers is removed. 
+ continue; + default: + preBuiltTokenFilters.register(preBuilt.name().toLowerCase(Locale.ROOT), new PreBuiltTokenFilterSpec( + preBuilt.isMultiTermAware(), + CachingStrategy.ELASTICSEARCH, // This is the most granular/safest/whatever + preBuilt::create)); + } } preBuiltTokenFilters.extractAndRegister(plugins, AnalysisPlugin::getPreBuiltTokenFilters); return preBuiltTokenFilters.getRegistry(); diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java index a1bbc9a9a5dab..7e1077a4b4b42 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java @@ -37,15 +37,12 @@ import org.apache.lucene.analysis.fr.FrenchAnalyzer; import org.apache.lucene.analysis.hi.HindiNormalizationFilter; import org.apache.lucene.analysis.in.IndicNormalizationFilter; -import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter; -import org.apache.lucene.analysis.miscellaneous.LengthFilter; import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; import org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter; import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter; import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter; -import org.apache.lucene.analysis.reverse.ReverseStringFilter; import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.tr.ApostropheFilter; @@ -62,13 +59,6 @@ import java.util.Locale; public enum PreBuiltTokenFilters { - LENGTH(CachingStrategy.LUCENE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new LengthFilter(tokenStream, 0, Integer.MAX_VALUE); - } - }, - LOWERCASE(CachingStrategy.LUCENE) { @Override public TokenStream create(TokenStream tokenStream, Version version) { diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index 099ea6369d246..97222eba013c7 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -24,6 +24,7 @@ import org.apache.lucene.analysis.commongrams.CommonGramsFilter; import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; +import org.apache.lucene.analysis.miscellaneous.LengthFilter; import org.apache.lucene.analysis.miscellaneous.TrimFilter; import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter; import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; @@ -67,6 +68,8 @@ public Map getPreBuiltTokenFilters() { // NOCOMMIT deprecate edgeNGram filters.put("edgeNGram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE))); + filters.put("length", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> + new LengthFilter(input, 0, 
Integer.MAX_VALUE))); filters.put("ngram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> new NGramTokenFilter(input))); // NOCOMMIT deprecate nGram @@ -74,6 +77,7 @@ public Map getPreBuiltTokenFilters() { new NGramTokenFilter(input))); filters.put("reverse", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> new ReverseStringFilter(input))); + // The stop filter is in lucene-core but the english stop words set is in lucene-analyzers-common filters.put("stop", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET))); filters.put("trim", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java index f3f948de6b2fe..18c3d9da7ea48 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -63,6 +63,7 @@ protected Map> getPreBuiltTokenFilters() { filters.put("common_grams", null); filters.put("edge_ngram", null); filters.put("edgeNGram", null); + filters.put("length", null); filters.put("ngram", null); filters.put("nGram", null); filters.put("reverse", ReverseStringFilterFactory.class); diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index 260d0852cef91..80a62bc7455a8 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -20,7 +20,6 @@ package org.elasticsearch.indices.analysis; import org.apache.lucene.analysis.en.PorterStemFilterFactory; -import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory; import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory; import org.apache.lucene.analysis.util.CharFilterFactory; import org.apache.lucene.analysis.util.TokenFilterFactory; @@ -93,9 +92,6 @@ import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory; import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory; import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory; -import org.elasticsearch.indices.analysis.PreBuiltCharFilters; -import org.elasticsearch.indices.analysis.PreBuiltTokenFilters; -import org.elasticsearch.indices.analysis.PreBuiltTokenizers; import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.plugins.AnalysisPlugin.PreBuiltTokenFilterSpec; import org.elasticsearch.test.ESTestCase; @@ -343,13 +339,16 @@ protected Map> getTokenFilters() { * be {@link Void}. 
*/ protected Map> getPreBuiltTokenFilters() { - // NOCOMMIT use this: -// return singletonMap("standard", null); - // Temporary builtin list until I remove them all Map> filters = new HashMap<>(); + filters.put("standard", null); + filters.put("lowercase", null); + // NOCOMMIT drop this Temporary builtin list until I remove them all for (PreBuiltTokenFilters tokenizer : PreBuiltTokenFilters.values()) { Class luceneFactoryClass; switch (tokenizer) { + case LOWERCASE: + // This has been migrated but has to stick around until PreBuiltAnalyzers is removed. + continue; case SNOWBALL: case DUTCH_STEM: case FRENCH_STEM: From 3abc5b658ad67df5872af2694499b4285390a988 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 20 Apr 2017 13:59:27 -0400 Subject: [PATCH 07/20] More --- .../indices/analysis/AnalysisModule.java | 4 +- .../analysis/PreBuiltTokenFilters.java | 41 +------------------ .../index/analysis/CustomNormalizerTests.java | 11 ++--- .../analysis/common/CommonAnalysisPlugin.java | 9 ++++ .../common/CommonAnalysisFactoryTests.java | 3 ++ .../analysis/AnalysisFactoryTestCase.java | 2 +- 6 files changed, 20 insertions(+), 50 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index 49c52a44be141..d757b80822247 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -278,10 +278,10 @@ static Map setupPreBuiltTokenFilters(List new StandardFilter(inputs))); for (PreBuiltTokenFilters preBuilt : PreBuiltTokenFilters.values()) { - // NOCOMMIT remove this shim + // TODO remove this temporary shim when there are no more PreBuiltTokenFilters switch (preBuilt) { case LOWERCASE: - // This has been migrated but has to stick around until PreBuiltAnalyzers is removed. + // This has been migrated but has to stick around until PreBuiltTokenizers is removed. continue; default: preBuiltTokenFilters.register(preBuilt.name().toLowerCase(Locale.ROOT), new PreBuiltTokenFilterSpec( diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java index 7e1077a4b4b42..077d89119a905 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java @@ -27,11 +27,9 @@ import org.apache.lucene.analysis.cjk.CJKWidthFilter; import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter; import org.apache.lucene.analysis.core.DecimalDigitFilter; -import org.apache.lucene.analysis.core.UpperCaseFilter; import org.apache.lucene.analysis.cz.CzechStemFilter; import org.apache.lucene.analysis.de.GermanNormalizationFilter; import org.apache.lucene.analysis.de.GermanStemFilter; -import org.apache.lucene.analysis.en.KStemFilter; import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.fa.PersianNormalizationFilter; import org.apache.lucene.analysis.fr.FrenchAnalyzer; @@ -59,6 +57,7 @@ import java.util.Locale; public enum PreBuiltTokenFilters { + // TODO remove this entire class when PreBuiltTokenizers no longer needs it..... 
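+ // Of the constants below only LOWERCASE has already been replaced by a lucene-core registration in AnalysisModule;
+ // the shim there skips it, but the constant itself has to stay until PreBuiltTokenizers stops using this class.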
LOWERCASE(CachingStrategy.LUCENE) { @Override public TokenStream create(TokenStream tokenStream, Version version) { @@ -70,31 +69,6 @@ protected boolean isMultiTermAware() { } }, - UPPERCASE(CachingStrategy.LUCENE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new UpperCaseFilter(tokenStream); - } - @Override - protected boolean isMultiTermAware() { - return true; - } - }, - - KSTEM(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new KStemFilter(tokenStream); - } - }, - - PORTER_STEM(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new PorterStemFilter(tokenStream); - } - }, - // Extended Token Filters SNOWBALL(CachingStrategy.ONE) { @Override @@ -385,17 +359,4 @@ public TokenStream create(TokenStream tokenStream) { return factory; } - - /** - * Get a pre built TokenFilter by its name or fallback to the default one - * @param name TokenFilter name - * @param defaultTokenFilter default TokenFilter if name not found - */ - public static PreBuiltTokenFilters getOrDefault(String name, PreBuiltTokenFilters defaultTokenFilter) { - try { - return valueOf(name.toUpperCase(Locale.ROOT)); - } catch (IllegalArgumentException e) { - return defaultTokenFilter; - } - } } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java b/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java index 90414dfd2a436..717cdae2756de 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java @@ -30,24 +30,21 @@ import java.io.IOException; import java.util.Map; -import java.util.TreeMap; + +import static java.util.Collections.singletonMap; public class CustomNormalizerTests extends ESTokenStreamTestCase { private static final AnalysisPlugin MOCK_ANALYSIS_PLUGIN = new AnalysisPlugin() { @Override public Map getPreBuiltTokenFilters() { - Map filters = new TreeMap<>(); - filters.put("mock_lowercase", new PreBuiltTokenFilterSpec(true, CachingStrategy.ONE, (input, version) -> - new MockLowerCaseFilter(input))); - filters.put("mock_forbidden", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> + return singletonMap("mock_forbidden", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> new MockLowerCaseFilter(input))); - return filters; } }; public void testBasics() throws IOException { Settings settings = Settings.builder() - .putArray("index.analysis.normalizer.my_normalizer.filter", "mock_lowercase") + .putArray("index.analysis.normalizer.my_normalizer.filter", "lowercase") .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, MOCK_ANALYSIS_PLUGIN); diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index 97222eba013c7..a259751686a7e 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -23,6 +23,9 @@ import org.apache.lucene.analysis.StopFilter; import 
org.apache.lucene.analysis.commongrams.CommonGramsFilter; import org.apache.lucene.analysis.core.StopAnalyzer; +import org.apache.lucene.analysis.core.UpperCaseFilter; +import org.apache.lucene.analysis.en.KStemFilter; +import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; import org.apache.lucene.analysis.miscellaneous.LengthFilter; import org.apache.lucene.analysis.miscellaneous.TrimFilter; @@ -68,6 +71,8 @@ public Map getPreBuiltTokenFilters() { // NOCOMMIT deprecate edgeNGram filters.put("edgeNGram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE))); + filters.put("kstem", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> + new KStemFilter(input))); filters.put("length", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> new LengthFilter(input, 0, Integer.MAX_VALUE))); filters.put("ngram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> @@ -75,6 +80,8 @@ public Map getPreBuiltTokenFilters() { // NOCOMMIT deprecate nGram filters.put("nGram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> new NGramTokenFilter(input))); + filters.put("porter_stem", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> + new PorterStemFilter(input))); filters.put("reverse", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> new ReverseStringFilter(input))); // The stop filter is in lucene-core but the english stop words set is in lucene-analyzers-common @@ -86,6 +93,8 @@ public Map getPreBuiltTokenFilters() { new TruncateTokenFilter(input, 10))); filters.put("unique", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> new UniqueTokenFilter(input))); + filters.put("uppercase", new PreBuiltTokenFilterSpec(true, CachingStrategy.LUCENE, (input, version) -> + new UpperCaseFilter(input))); filters.put("word_delimiter", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> new WordDelimiterFilter(input, WordDelimiterFilter.GENERATE_WORD_PARTS diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java index 18c3d9da7ea48..800cfdc71b188 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -63,14 +63,17 @@ protected Map> getPreBuiltTokenFilters() { filters.put("common_grams", null); filters.put("edge_ngram", null); filters.put("edgeNGram", null); + filters.put("kstem", null); filters.put("length", null); filters.put("ngram", null); filters.put("nGram", null); + filters.put("porter_stem", null); filters.put("reverse", ReverseStringFilterFactory.class); filters.put("stop", null); filters.put("trim", null); filters.put("truncate", null); filters.put("unique", Void.class); + filters.put("uppercase", null); filters.put("word_delimiter", null); filters.put("word_delimiter_graph", null); return filters; diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java 
b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index 80a62bc7455a8..86106efee3b9e 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -347,7 +347,7 @@ protected Map> getPreBuiltTokenFilters() { Class luceneFactoryClass; switch (tokenizer) { case LOWERCASE: - // This has been migrated but has to stick around until PreBuiltAnalyzers is removed. + // This has been migrated but has to stick around until PreBuiltTokenizers is removed. continue; case SNOWBALL: case DUTCH_STEM: From 6a06eacbd0a17f3bebd9f0d302ff33570a086bf1 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 20 Apr 2017 15:07:45 -0400 Subject: [PATCH 08/20] Convert NOCOMMIT to TODO --- .../elasticsearch/analysis/common/CommonAnalysisPlugin.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index a259751686a7e..5746c647b2527 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -68,8 +68,8 @@ public Map getPreBuiltTokenFilters() { new CommonGramsFilter(input, CharArraySet.EMPTY_SET))); filters.put("edge_ngram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE))); - // NOCOMMIT deprecate edgeNGram - filters.put("edgeNGram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> + // TODO deprecate edgeNGram + filters.put("edgeNGram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE))); filters.put("kstem", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> new KStemFilter(input))); @@ -77,7 +77,7 @@ public Map getPreBuiltTokenFilters() { new LengthFilter(input, 0, Integer.MAX_VALUE))); filters.put("ngram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> new NGramTokenFilter(input))); - // NOCOMMIT deprecate nGram + // TODO deprecate nGram filters.put("nGram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> new NGramTokenFilter(input))); filters.put("porter_stem", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> From 1995bea980e30d2c2f8fdca1b3ee6514225f5726 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 20 Apr 2017 15:13:15 -0400 Subject: [PATCH 09/20] Convert another NOCOMMIT to TODO To keep the PR small I'm breaking it off here. 
--- .../elasticsearch/indices/analysis/AnalysisModule.java | 8 +++++++- .../analysis/common/CommonAnalysisPlugin.java | 2 +- .../indices/analysis/AnalysisFactoryTestCase.java | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index d757b80822247..d2d4500dfb81b 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -273,12 +273,17 @@ private NamedRegistry> setupTokenFilters(Li static Map setupPreBuiltTokenFilters(List plugins) { NamedRegistry preBuiltTokenFilters = new NamedRegistry<>("pre built token_filter"); + + // Add filters available in lucene-core preBuiltTokenFilters.register("lowercase", new PreBuiltTokenFilterSpec(true, CachingStrategy.LUCENE, (inputs, version) -> new LowerCaseFilter(inputs))); preBuiltTokenFilters.register("standard", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (inputs, version) -> new StandardFilter(inputs))); + /* Note that "stop" is available in lucene-core but its pre-built version uses a set of English stop words that are in + * lucene-analyzers-common so "stop" is defined in the analysis-common module. */ + + // Add token filters declared in PreBuiltTokenFilters until they have all been migrated for (PreBuiltTokenFilters preBuilt : PreBuiltTokenFilters.values()) { - // TODO remove this temporary shim when there are no more PreBuiltTokenFilters switch (preBuilt) { case LOWERCASE: // This has been migrated but has to stick around until PreBuiltTokenizers is removed. @@ -290,6 +295,7 @@ static Map setupPreBuiltTokenFilters(List getPreBuiltTokenFilters() { new PorterStemFilter(input))); filters.put("reverse", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> new ReverseStringFilter(input))); - // The stop filter is in lucene-core but the english stop words set is in lucene-analyzers-common + // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common filters.put("stop", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET))); filters.put("trim", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index 86106efee3b9e..c0aae876a2000 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -342,7 +342,7 @@ protected Map> getPreBuiltTokenFilters() { Map> filters = new HashMap<>(); filters.put("standard", null); filters.put("lowercase", null); - // NOCOMMIT drop this Temporary builtin list until I remove them all + // NOCOMMIT drop this Temporary builtin list once these are all migrated for (PreBuiltTokenFilters tokenizer : PreBuiltTokenFilters.values()) { Class luceneFactoryClass; switch (tokenizer) { From e43fc1f572027d2f51ca6c46073e005338e60c4c Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 20 Apr 2017 15:16:56 -0400 Subject: [PATCH 10/20] One more NOCOMMIT --- .../elasticsearch/indices/analysis/AnalysisFactoryTestCase.java | 2 +- 1 file
changed, 1 insertion(+), 1 deletion(-) diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index c0aae876a2000..46f29d3ea940c 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -342,7 +342,7 @@ protected Map> getPreBuiltTokenFilters() { Map> filters = new HashMap<>(); filters.put("standard", null); filters.put("lowercase", null); - // NOCOMMIT drop this Temporary builtin list once these are all migrated + // TODO remove the loop below once all the tokenizers are migrated out of PreBuiltTokenFilters for (PreBuiltTokenFilters tokenizer : PreBuiltTokenFilters.values()) { Class luceneFactoryClass; switch (tokenizer) { From fad8fb3a1c501435dd5d066d95647e62a7884d3b Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 20 Apr 2017 15:20:10 -0400 Subject: [PATCH 11/20] Yet another --- .../elasticsearch/index/analysis/AnalysisRegistry.java | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java index 410b73cc3f2df..9977001c5ec8a 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java @@ -426,16 +426,6 @@ private PrebuiltAnalysis(Map preBuiltTokenFilte tokenFilterFactories.put(preBuiltTokenFilter.getKey(), new PreBuiltTokenFilterFactoryProvider(preBuiltTokenFilter.getKey(), preBuiltTokenFilter.getValue())); } - // Token filter aliases - // NOCOMMIT move the aliases to the module and deprecate - AnalysisProvider toAlias = tokenFilterFactories.get("ngram"); - if (toAlias != null) { - tokenFilterFactories.put("nGram", toAlias); - } - toAlias = tokenFilterFactories.get("edge_ngram"); - if (toAlias != null) { - tokenFilterFactories.put("edgeNGram", toAlias); - } // Char Filters for (PreBuiltCharFilters preBuiltCharFilter : PreBuiltCharFilters.values()) { From c47cafa4dc20801c01868d88802b1658b054edb4 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 20 Apr 2017 15:24:54 -0400 Subject: [PATCH 12/20] Cleanup --- .../index/mapper/KeywordFieldMapperTests.java | 12 ++++++------ .../analysis/common/CommonAnalysisPlugin.java | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java b/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java index ec90daca1ab81..3b5fb191daa3a 100644 --- a/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java +++ b/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java @@ -44,19 +44,19 @@ import java.util.Map; import java.util.TreeMap; +import static java.util.Collections.singletonMap; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; public class KeywordFieldMapperTests extends ESSingleNodeTestCase { + /** + * Creates a copy of the lowercase token filter which we use for testing merge errors. 
+ */ public static class MockAnalysisPlugin extends Plugin implements AnalysisPlugin { @Override public Map getPreBuiltTokenFilters() { - Map filters = new TreeMap<>(); - filters.put("mock_lowercase", new PreBuiltTokenFilterSpec(true, CachingStrategy.ONE, (input, version) -> + return singletonMap("mock_other_lowercase", new PreBuiltTokenFilterSpec(true, CachingStrategy.ONE, (input, version) -> new MockLowerCaseFilter(input))); - filters.put("mock_other_lowercase", new PreBuiltTokenFilterSpec(true, CachingStrategy.ONE, (input, version) -> - new MockLowerCaseFilter(input))); - return filters; } }; @@ -72,7 +72,7 @@ protected Collection> getPlugins() { public void setup() { indexService = createIndex("test", Settings.builder() .put("index.analysis.normalizer.my_lowercase.type", "custom") - .putArray("index.analysis.normalizer.my_lowercase.filter", "mock_lowercase") + .putArray("index.analysis.normalizer.my_lowercase.filter", "lowercase") .put("index.analysis.normalizer.my_other_lowercase.type", "custom") .putArray("index.analysis.normalizer.my_other_lowercase.filter", "mock_other_lowercase").build()); parser = indexService.mapperService().documentMapperParser(); diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index fffc64306d0aa..701f9eb865e0e 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -104,11 +104,11 @@ public Map getPreBuiltTokenFilters() { | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null))); filters.put("word_delimiter_graph", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> new WordDelimiterGraphFilter(input, - WordDelimiterGraphFilter.GENERATE_WORD_PARTS - | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS - | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE - | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS - | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null))); + WordDelimiterGraphFilter.GENERATE_WORD_PARTS + | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS + | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE + | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS + | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null))); return filters; } From b54a8ce8a602e5528786a63408c03f0873d08b38 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 20 Apr 2017 15:46:06 -0400 Subject: [PATCH 13/20] Try and keep caching strategy --- .../org/elasticsearch/indices/analysis/AnalysisModule.java | 6 ++---- .../indices/analysis/PreBuiltTokenFilters.java | 6 ++++++ .../main/java/org/elasticsearch/plugins/AnalysisPlugin.java | 3 +++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index d2d4500dfb81b..e740444c05c06 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -289,10 +289,8 @@ static Map setupPreBuiltTokenFilters(List cache; + private final CachingStrategy cachingStrategy; PreBuiltTokenFilters(CachingStrategy cachingStrategy) { + this.cachingStrategy = cachingStrategy; cache = PreBuiltCacheFactory.getCache(cachingStrategy); } + public CachingStrategy getCachingStrategy() 
{ + return cachingStrategy; + } + private interface MultiTermAwareTokenFilterFactory extends TokenFilterFactory, MultiTermAwareComponent {} public synchronized TokenFilterFactory getTokenFilterFactory(final Version version) { diff --git a/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java b/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java index 634698a676c04..fe88f57d8f281 100644 --- a/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java +++ b/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java @@ -94,6 +94,9 @@ default Map getHunspellD return emptyMap(); } + /** + * Specification for a pre-built token filter that is shared between multiple indices. + */ class PreBuiltTokenFilterSpec { private final boolean useFilterForMultitermQueries; private final PreBuiltCacheFactory.CachingStrategy cachingStrategy; From 3cd577d95e7af78836100db140f8536288ba5ab7 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 27 Apr 2017 14:40:11 -0400 Subject: [PATCH 14/20] Start converting to PreConfiguredTokenFilter --- .../index/analysis/AnalysisRegistry.java | 10 ++-- ...der.java => PreConfiguredTokenFilter.java} | 44 ++++++++++++--- .../indices/analysis/AnalysisModule.java | 38 ++++++++----- .../elasticsearch/plugins/AnalysisPlugin.java | 16 ++++-- .../elasticsearch/index/IndexModuleTests.java | 2 +- .../index/analysis/AnalysisRegistryTests.java | 16 +++--- .../index/analysis/CustomNormalizerTests.java | 7 ++- .../index/mapper/KeywordFieldMapperTests.java | 11 ++-- .../analysis/common/CommonAnalysisPlugin.java | 55 ++++++++----------- .../analysis/AnalysisFactoryTestCase.java | 13 +++-- 10 files changed, 126 insertions(+), 86 deletions(-) rename core/src/main/java/org/elasticsearch/index/analysis/{PreBuiltTokenFilterFactoryProvider.java => PreConfiguredTokenFilter.java} (67%) diff --git a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java index 9977001c5ec8a..f854b52a1bc6f 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java @@ -43,6 +43,7 @@ import java.io.IOException; import java.util.Collections; import java.util.HashMap; +import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; @@ -75,7 +76,7 @@ public AnalysisRegistry(Environment environment, Map> tokenizers, Map>> analyzers, Map>> normalizers, - Map preBuiltTokenFilters) { + List preBuiltTokenFilters) { this.environment = environment; this.charFilters = unmodifiableMap(charFilters); this.tokenFilters = unmodifiableMap(tokenFilters); @@ -398,7 +399,7 @@ private static class PrebuiltAnalysis implements Closeable { final Map> tokenFilterFactories; final Map> charFilterFactories; - private PrebuiltAnalysis(Map preBuiltTokenFilters) { + private PrebuiltAnalysis(List preBuiltTokenFilters) { Map analyzerProviderFactories = new HashMap<>(); Map tokenizerFactories = new HashMap<>(); Map> tokenFilterFactories = new HashMap<>(); @@ -422,9 +423,8 @@ private PrebuiltAnalysis(Map preBuiltTokenFilte // Token filters - for (Map.Entry preBuiltTokenFilter : preBuiltTokenFilters.entrySet()) { - tokenFilterFactories.put(preBuiltTokenFilter.getKey(), - new PreBuiltTokenFilterFactoryProvider(preBuiltTokenFilter.getKey(), preBuiltTokenFilter.getValue())); + for (PreConfiguredTokenFilter preBuiltTokenFilter : preBuiltTokenFilters) { + 
tokenFilterFactories.put(preBuiltTokenFilter.getName(), preBuiltTokenFilter); } // Char Filters diff --git a/core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryProvider.java b/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java similarity index 67% rename from core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryProvider.java rename to core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java index cbac745e0789d..b410e8fb70e85 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryProvider.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java @@ -19,6 +19,7 @@ package org.elasticsearch.index.analysis; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.elasticsearch.Version; import org.elasticsearch.common.settings.Settings; @@ -26,25 +27,38 @@ import org.elasticsearch.index.IndexSettings; import org.elasticsearch.indices.analysis.AnalysisModule; import org.elasticsearch.indices.analysis.PreBuiltCacheFactory; -import org.elasticsearch.plugins.AnalysisPlugin.PreBuiltTokenFilterSpec; import java.io.IOException; import java.util.function.BiFunction; +import java.util.function.Function; /** - * Resolves pre-built {@link TokenFilterFactory}s based on the specifications provided by plugins. + * Provides pre-configured, shared {@link TokenFilter}s. */ -public class PreBuiltTokenFilterFactoryProvider implements AnalysisModule.AnalysisProvider { +public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisProvider { private final String name; - private final PreBuiltCacheFactory.PreBuiltCache cache; private final boolean useFilterForMultitermQueries; + private final PreBuiltCacheFactory.PreBuiltCache cache; private final BiFunction create; - public PreBuiltTokenFilterFactoryProvider(String name, PreBuiltTokenFilterSpec spec) { + /** + * Standard ctor with all the power. + */ + public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, + PreBuiltCacheFactory.CachingStrategy cachingStrategy, BiFunction create) { this.name = name; - cache = PreBuiltCacheFactory.getCache(spec.getCachingStrategy()); - this.useFilterForMultitermQueries = spec.shouldUseFilterForMultitermQueries(); - this.create = spec.getCreate(); + this.useFilterForMultitermQueries = useFilterForMultitermQueries; + cache = PreBuiltCacheFactory.getCache(cachingStrategy); + this.create = create; + } + + /** + * Convenience ctor for token streams that don't vary based on version. + */ + public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, + PreBuiltCacheFactory.CachingStrategy cachingStrategy, Function create) { + this(name, useFilterForMultitermQueries, cachingStrategy, (input, version) -> create.apply(input)); + // TODO why oh why aren't these all CachingStrategy.ONE? They *can't* vary based on version because they don't get it, right?! } @Override @@ -52,6 +66,20 @@ public TokenFilterFactory get(IndexSettings indexSettings, Environment environme return getTokenFilterFactory(Version.indexCreated(settings)); } + /** + * The name of the {@link TokenFilter} in the API. + */ + public String getName() { + return name; + } + + /** + * Can this {@link TokenFilter} be used in multi-term queries? 
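+ * If {@code true} the factory is exposed as a {@link MultiTermAwareComponent} so the filter also runs when analyzing multi-term queries such as prefix and wildcard queries.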
+ */ + public boolean shouldUseFilterForMultitermQueries() { + return useFilterForMultitermQueries; + } + private interface MultiTermAwareTokenFilterFactory extends TokenFilterFactory, MultiTermAwareComponent {} private synchronized TokenFilterFactory getTokenFilterFactory(final Version version) { diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index bf2cc8bb223b1..727e3b54c7a1a 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -103,6 +103,7 @@ import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory; import org.elasticsearch.index.analysis.PorterStemTokenFilterFactory; import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider; +import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.analysis.ReverseTokenFilterFactory; import org.elasticsearch.index.analysis.RomanianAnalyzerProvider; import org.elasticsearch.index.analysis.RussianAnalyzerProvider; @@ -142,12 +143,11 @@ import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory; import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; import org.elasticsearch.plugins.AnalysisPlugin; -import org.elasticsearch.plugins.AnalysisPlugin.PreBuiltTokenFilterSpec; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import java.util.Locale; -import java.util.Map; import static org.elasticsearch.plugins.AnalysisPlugin.requriesAnalysisSettings; @@ -176,7 +176,7 @@ public AnalysisModule(Environment environment, List plugins) thr NamedRegistry>> analyzers = setupAnalyzers(plugins); NamedRegistry>> normalizers = setupNormalizers(plugins); - Map preBuiltTokenFilters = setupPreBuiltTokenFilters(plugins); + List preBuiltTokenFilters = setupPreBuiltTokenFilters(plugins); analysisRegistry = new AnalysisRegistry(environment, charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers .getRegistry(), analyzers.getRegistry(), normalizers.getRegistry(), preBuiltTokenFilters); @@ -267,16 +267,19 @@ private NamedRegistry> setupTokenFilters(Li return tokenFilters; } - static Map setupPreBuiltTokenFilters(List plugins) { - NamedRegistry preBuiltTokenFilters = new NamedRegistry<>("pre built token_filter"); + static List setupPreBuiltTokenFilters(List plugins) { + // Use NamedRegistry for the duplicate detection + NamedRegistry preBuiltTokenFilters = new NamedRegistry<>("pre-configured token_filter"); // Add filters available in lucene-core - preBuiltTokenFilters.register("lowercase", new PreBuiltTokenFilterSpec(true, CachingStrategy.LUCENE, (inputs, version) -> - new LowerCaseFilter(inputs))); - preBuiltTokenFilters.register("standard", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (inputs, version) -> - new StandardFilter(inputs))); - /* Note that "stop" is available in lucene-core but it's pre-built version uses a set of English stop words that are in - * lucene-analyzers-common so "stop" is defined in the analysis-common module. 
*/ + preBuiltTokenFilters.register("lowercase", + new PreConfiguredTokenFilter("lowercase", true, CachingStrategy.LUCENE, LowerCaseFilter::new)); + preBuiltTokenFilters.register("standard", + new PreConfiguredTokenFilter("standard", false, CachingStrategy.LUCENE, StandardFilter::new)); + /* Note that "stop" is available in lucene-core but it's pre-built + * version uses a set of English stop words that are in + * lucene-analyzers-common so "stop" is defined in the analysis-common + * module. */ // Add token filers declared in PreBuiltTokenFilters until they have all been migrated for (PreBuiltTokenFilters preBuilt : PreBuiltTokenFilters.values()) { @@ -285,13 +288,18 @@ static Map setupPreBuiltTokenFilters(List(preBuiltTokenFilters.getRegistry().values()); } private NamedRegistry> setupTokenizers(List plugins) { diff --git a/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java b/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java index 8fdabbc15161d..98323371f2e31 100644 --- a/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java +++ b/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java @@ -30,15 +30,18 @@ import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AnalyzerProvider; import org.elasticsearch.index.analysis.CharFilterFactory; +import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; import org.elasticsearch.indices.analysis.PreBuiltCacheFactory; import java.io.IOException; +import java.util.List; import java.util.Map; import java.util.function.BiFunction; +import static java.util.Collections.emptyList; import static java.util.Collections.emptyMap; /** @@ -75,10 +78,6 @@ default Map> getTokenFilters() { return emptyMap(); } - default Map getPreBuiltTokenFilters() { - return emptyMap(); - } - /** * Override to add additional {@link Tokenizer}s. See {@link #requriesAnalysisSettings(AnalysisProvider)} * how to on get the configuration from the index. @@ -95,6 +94,13 @@ default Map>> getA return emptyMap(); } + /** + * Override to add additional pre-configured token filters. + */ + default List getPreConfiguredTokenFilters() { + return emptyList(); + } + /** * Override to add additional hunspell {@link org.apache.lucene.analysis.hunspell.Dictionary}s. */ @@ -105,7 +111,7 @@ default Map getHunspellD /** * Specification for a pre-built token filter that is shared between multiple indices. 
*/ - class PreBuiltTokenFilterSpec { + class PreBuiltTokenFilterSpec { // NOCOMMIT remove me private final boolean useFilterForMultitermQueries; private final PreBuiltCacheFactory.CachingStrategy cachingStrategy; private final BiFunction create; diff --git a/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java b/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java index 2a8fa7f9c143f..6cf3f072a2f41 100644 --- a/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java +++ b/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java @@ -123,7 +123,7 @@ public void setUp() throws Exception { indexSettings = IndexSettingsModule.newIndexSettings("foo", settings); index = indexSettings.getIndex(); environment = new Environment(settings); - emptyAnalysisRegistry = new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()); + emptyAnalysisRegistry = new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyList()); threadPool = new TestThreadPool("test"); circuitBreakerService = new NoneCircuitBreakerService(); bigArrays = new BigArrays(settings, circuitBreakerService); diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java index 0370af7c0ef24..6645c65c01584 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java @@ -36,7 +36,6 @@ import org.elasticsearch.indices.analysis.PreBuiltAnalyzers; import org.elasticsearch.indices.analysis.PreBuiltCacheFactory; import org.elasticsearch.plugins.AnalysisPlugin; -import org.elasticsearch.plugins.AnalysisPlugin.PreBuiltTokenFilterSpec; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.IndexSettingsModule; import org.elasticsearch.test.VersionUtils; @@ -45,6 +44,7 @@ import java.util.Map; import java.util.concurrent.atomic.AtomicBoolean; +import static java.util.Collections.emptyList; import static java.util.Collections.emptyMap; import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; @@ -67,7 +67,7 @@ public void setUp() throws Exception { emptyEnvironment = new Environment(Settings.builder() .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build()); - emptyRegistry = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()); + emptyRegistry = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyList()); emptyIndexSettingsOfCurrentVersion = IndexSettingsModule.newIndexSettings("index", Settings.builder() .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .build()); @@ -194,10 +194,10 @@ public void testBuiltInAnalyzersAreCached() throws IOException { .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); IndexAnalyzers indexAnalyzers = - new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) + new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyList()) .build(idxSettings); IndexAnalyzers otherIndexAnalyzers = - new AnalysisRegistry(new Environment(settings), emptyMap(), 
emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) + new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyList()) .build(idxSettings); final int numIters = randomIntBetween(5, 20); for (int i = 0; i < numIters; i++) { @@ -208,15 +208,15 @@ public void testBuiltInAnalyzersAreCached() throws IOException { public void testPreBuiltTokenFiltersAreCached() throws IOException { AtomicBoolean built = new AtomicBoolean(false); - PreBuiltTokenFilterSpec assertsBuiltOnce = new PreBuiltTokenFilterSpec(false, PreBuiltCacheFactory.CachingStrategy.ONE, - (tokens, version) -> { + PreConfiguredTokenFilter assertsBuiltOnce = new PreConfiguredTokenFilter("asserts_built_once", false, + PreBuiltCacheFactory.CachingStrategy.ONE, (tokens, version) -> { if (false == built.compareAndSet(false, true)) { fail("Attempted to build the token filter twice when it should have been cached"); } return new MockTokenFilter(tokens, MockTokenFilter.EMPTY_STOPSET); }); try (AnalysisRegistry registryWithPreBuiltTokenFilter = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(), - emptyMap(), emptyMap(), singletonMap("asserts_built_once", assertsBuiltOnce))) { + emptyMap(), emptyMap(), singletonList(assertsBuiltOnce))) { IndexAnalyzers indexAnalyzers = registryWithPreBuiltTokenFilter.build(emptyIndexSettingsOfCurrentVersion); IndexAnalyzers otherIndexAnalyzers = registryWithPreBuiltTokenFilter.build(emptyIndexSettingsOfCurrentVersion); assertSame(indexAnalyzers.get("asserts_built_once"), otherIndexAnalyzers.get("asserts_built_once")); @@ -235,7 +235,7 @@ public void testNoTypeOrTokenizerErrorMessage() throws IOException { IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> - new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) + new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyList()) .build(idxSettings)); assertThat(e.getMessage(), equalTo("analyzer [test_analyzer] must specify either an analyzer type, or a tokenizer")); } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java b/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java index 00892c9c65232..5cdc589405714 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java @@ -31,8 +31,10 @@ import java.io.IOException; import java.io.Reader; +import java.util.List; import java.util.Map; +import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; public class CustomNormalizerTests extends ESTokenStreamTestCase { @@ -110,9 +112,8 @@ public void testIllegalCharFilters() throws IOException { private static class MockAnalysisPlugin implements AnalysisPlugin { @Override - public Map getPreBuiltTokenFilters() { - return singletonMap("mock_forbidden", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> - new MockLowerCaseFilter(input))); + public List getPreConfiguredTokenFilters() { + return singletonList(new PreConfiguredTokenFilter("mock_forbidden", false, CachingStrategy.ONE, MockLowerCaseFilter::new)); } @Override diff --git a/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java 
b/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java index 3b5fb191daa3a..518f669f81f3f 100644 --- a/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java +++ b/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java @@ -30,6 +30,7 @@ import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.index.IndexService; +import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.mapper.MapperService.MergeReason; import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; import org.elasticsearch.plugins.AnalysisPlugin; @@ -41,10 +42,9 @@ import java.io.IOException; import java.util.Arrays; import java.util.Collection; -import java.util.Map; -import java.util.TreeMap; +import java.util.List; -import static java.util.Collections.singletonMap; +import static java.util.Collections.singletonList; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; @@ -54,9 +54,8 @@ public class KeywordFieldMapperTests extends ESSingleNodeTestCase { */ public static class MockAnalysisPlugin extends Plugin implements AnalysisPlugin { @Override - public Map getPreBuiltTokenFilters() { - return singletonMap("mock_other_lowercase", new PreBuiltTokenFilterSpec(true, CachingStrategy.ONE, (input, version) -> - new MockLowerCaseFilter(input))); + public List getPreConfiguredTokenFilters() { + return singletonList(new PreConfiguredTokenFilter("mock_other_lowercase", true, CachingStrategy.ONE, MockLowerCaseFilter::new)); } }; diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index 814cef5b66124..2cb94e6908719 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -39,12 +39,15 @@ import org.apache.lucene.analysis.standard.ClassicFilter; import org.elasticsearch.index.analysis.CharFilterFactory; import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory; +import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.plugins.Plugin; +import java.util.ArrayList; +import java.util.List; import java.util.Map; import java.util.TreeMap; @@ -69,52 +72,42 @@ public Map> getCharFilters() { } @Override - public Map getPreBuiltTokenFilters() { + public List getPreConfiguredTokenFilters() { // TODO we should revisit the caching strategies. 
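// For reference: CachingStrategy.ONE caches a single filter instance, CachingStrategy.LUCENE caches one instance per Lucene version, and CachingStrategy.ELASTICSEARCH one per Elasticsearch version.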
- Map filters = new TreeMap<>(); - filters.put("asciifolding", new PreBuiltTokenFilterSpec(true, CachingStrategy.ONE, (input, version) -> - new ASCIIFoldingFilter(input))); - filters.put("classic", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> - new ClassicFilter(input))); - filters.put("common_grams", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> + List filters = new ArrayList<>(); + filters.add(new PreConfiguredTokenFilter("asciifolding", true, CachingStrategy.ONE, ASCIIFoldingFilter::new)); + filters.add(new PreConfiguredTokenFilter("classic", false, CachingStrategy.ONE, ClassicFilter::new)); + filters.add(new PreConfiguredTokenFilter("common_grams", false, CachingStrategy.LUCENE, input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET))); - filters.put("edge_ngram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> + filters.add(new PreConfiguredTokenFilter("edge_ngram", false, CachingStrategy.LUCENE, input -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE))); // TODO deprecate edgeNGram - filters.put("edgeNGram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> + filters.add(new PreConfiguredTokenFilter("edgeNGram", false, CachingStrategy.LUCENE, input -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE))); - filters.put("kstem", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> - new KStemFilter(input))); - filters.put("length", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> - new LengthFilter(input, 0, Integer.MAX_VALUE))); - filters.put("ngram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> - new NGramTokenFilter(input))); + filters.add(new PreConfiguredTokenFilter("kstem", false, CachingStrategy.ONE, KStemFilter::new)); + filters.add(new PreConfiguredTokenFilter("length", false, CachingStrategy.LUCENE, input -> + new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless + filters.add(new PreConfiguredTokenFilter("ngram", false, CachingStrategy.LUCENE, NGramTokenFilter::new)); // TODO deprecate nGram - filters.put("nGram", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> - new NGramTokenFilter(input))); - filters.put("porter_stem", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> - new PorterStemFilter(input))); - filters.put("reverse", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> - new ReverseStringFilter(input))); + filters.add(new PreConfiguredTokenFilter("nGram", false, CachingStrategy.LUCENE, NGramTokenFilter::new)); + filters.add(new PreConfiguredTokenFilter("porter_stem", false, CachingStrategy.ONE, PorterStemFilter::new)); + filters.add(new PreConfiguredTokenFilter("reverse", false, CachingStrategy.LUCENE, ReverseStringFilter::new)); // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common - filters.put("stop", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> + filters.add(new PreConfiguredTokenFilter("stop", false, CachingStrategy.LUCENE, input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET))); - filters.put("trim", new PreBuiltTokenFilterSpec(false, CachingStrategy.LUCENE, (input, version) -> - new TrimFilter(input))); - 
filters.put("truncate", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> + filters.add(new PreConfiguredTokenFilter("trim", false, CachingStrategy.LUCENE, TrimFilter::new)); + filters.add(new PreConfiguredTokenFilter("truncate", false, CachingStrategy.ONE, input -> new TruncateTokenFilter(input, 10))); - filters.put("unique", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> - new UniqueTokenFilter(input))); - filters.put("uppercase", new PreBuiltTokenFilterSpec(true, CachingStrategy.LUCENE, (input, version) -> - new UpperCaseFilter(input))); - filters.put("word_delimiter", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> + filters.add(new PreConfiguredTokenFilter("unique", false, CachingStrategy.ONE, UniqueTokenFilter::new)); + filters.add(new PreConfiguredTokenFilter("uppercase", true, CachingStrategy.LUCENE, UpperCaseFilter::new)); + filters.add(new PreConfiguredTokenFilter("word_delimiter", false, CachingStrategy.ONE, input -> new WordDelimiterFilter(input, WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterFilter.SPLIT_ON_NUMERICS | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null))); - filters.put("word_delimiter_graph", new PreBuiltTokenFilterSpec(false, CachingStrategy.ONE, (input, version) -> + filters.add(new PreConfiguredTokenFilter("word_delimiter_graph", false, CachingStrategy.ONE, input -> new WordDelimiterGraphFilter(input, WordDelimiterGraphFilter.GENERATE_WORD_PARTS | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index b4e0ad00d0a34..074d23091730c 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -67,6 +67,7 @@ import org.elasticsearch.index.analysis.PatternTokenizerFactory; import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory; import org.elasticsearch.index.analysis.PorterStemTokenFilterFactory; +import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.analysis.ReverseTokenFilterFactory; import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory; import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory; @@ -102,8 +103,10 @@ import java.util.Objects; import java.util.Set; import java.util.TreeSet; +import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Collectors; import static java.util.Collections.singletonList; @@ -462,7 +465,9 @@ public void testPreBuiltMultiTermAware() { expected.add(tokenizer); } } - Map preBuiltTokenFilters = AnalysisModule.setupPreBuiltTokenFilters(singletonList(plugin)); + Map preBuiltTokenFilters = + AnalysisModule.setupPreBuiltTokenFilters(singletonList(plugin)).stream() + .collect(Collectors.toMap(PreConfiguredTokenFilter::getName, Function.identity())); for (Map.Entry> entry : getPreBuiltTokenFilters().entrySet()) { String name = entry.getKey(); Class luceneFactory = entry.getValue(); @@ -473,9 +478,9 @@ public void testPreBuiltMultiTermAware() { luceneFactory = TokenFilterFactory.lookupClass(toCamelCase(name)); } 
assertTrue(TokenFilterFactory.class.isAssignableFrom(luceneFactory)); - PreBuiltTokenFilterSpec spec = preBuiltTokenFilters.get(name); - assertNotNull("test claims pre built token filter [" + name + "] should be available but it wasn't", spec); - if (spec.shouldUseFilterForMultitermQueries()) { + PreConfiguredTokenFilter filter = preBuiltTokenFilters.get(name); + assertNotNull("test claims pre built token filter [" + name + "] should be available but it wasn't", filter); + if (filter.shouldUseFilterForMultitermQueries()) { actual.add("token filter [" + name + "]"); } if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(luceneFactory)) { From 605a8989043eba69359fa72939d8c56096c8d00b Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 27 Apr 2017 15:39:31 -0400 Subject: [PATCH 15/20] Cleanup --- .../index/analysis/AnalysisRegistry.java | 29 ++++++++----------- .../indices/analysis/AnalysisModule.java | 9 +++--- .../elasticsearch/index/IndexModuleTests.java | 2 +- .../index/analysis/AnalysisRegistryTests.java | 11 ++++--- .../analysis/AnalysisFactoryTestCase.java | 7 +---- 5 files changed, 24 insertions(+), 34 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java index f854b52a1bc6f..a1c13de971e3c 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java @@ -76,14 +76,14 @@ public AnalysisRegistry(Environment environment, Map> tokenizers, Map>> analyzers, Map>> normalizers, - List preBuiltTokenFilters) { + Map preConfiguredTokenFilters) { this.environment = environment; this.charFilters = unmodifiableMap(charFilters); this.tokenFilters = unmodifiableMap(tokenFilters); this.tokenizers = unmodifiableMap(tokenizers); this.analyzers = unmodifiableMap(analyzers); this.normalizers = unmodifiableMap(normalizers); - prebuiltAnalysis = new PrebuiltAnalysis(preBuiltTokenFilters); + prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredTokenFilters); } /** @@ -308,8 +308,8 @@ public String toString() { } private Map buildMapping(Component component, IndexSettings settings, Map settingsMap, - Map> providerMap, Map> defaultInstance) - throws IOException { + Map> providerMap, + Map> defaultInstance) throws IOException { Settings defaultSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, settings.getIndexVersionCreated()).build(); Map factories = new HashMap<>(); for (Map.Entry entry : settingsMap.entrySet()) { @@ -347,7 +347,7 @@ private Map buildMapping(Component component, IndexSettings setti } // go over the char filters in the bindings and register the ones that are not configured - for (Map.Entry> entry : providerMap.entrySet()) { + for (Map.Entry> entry : providerMap.entrySet()) { String name = entry.getKey(); AnalysisModule.AnalysisProvider provider = entry.getValue(); // we don't want to re-register one that already exists @@ -368,7 +368,7 @@ private Map buildMapping(Component component, IndexSettings setti factories.put(name, instance); } - for (Map.Entry> entry : defaultInstance.entrySet()) { + for (Map.Entry> entry : defaultInstance.entrySet()) { final String name = entry.getKey(); final AnalysisModule.AnalysisProvider provider = entry.getValue(); if (factories.containsKey(name) == false) { @@ -381,7 +381,8 @@ private Map buildMapping(Component component, IndexSettings setti return factories; } - private 
AnalysisProvider getAnalysisProvider(Component component, Map> providerMap, String name, String typeName) { + private AnalysisProvider getAnalysisProvider(Component component, Map> providerMap, + String name, String typeName) { if (typeName == null) { throw new IllegalArgumentException(component + " [" + name + "] must specify either an analyzer type, or a tokenizer"); } @@ -396,13 +397,12 @@ private static class PrebuiltAnalysis implements Closeable { final Map>> analyzerProviderFactories; final Map> tokenizerFactories; - final Map> tokenFilterFactories; + final Map> tokenFilterFactories; final Map> charFilterFactories; - private PrebuiltAnalysis(List preBuiltTokenFilters) { + private PrebuiltAnalysis(Map preBuiltTokenFilters) { Map analyzerProviderFactories = new HashMap<>(); Map tokenizerFactories = new HashMap<>(); - Map> tokenFilterFactories = new HashMap<>(); Map charFilterFactories = new HashMap<>(); // Analyzers for (PreBuiltAnalyzers preBuiltAnalyzerEnum : PreBuiltAnalyzers.values()) { @@ -421,12 +421,6 @@ private PrebuiltAnalysis(List preBuiltTokenFilters) { tokenizerFactories.put("edgeNGram", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.EDGE_NGRAM.getTokenizerFactory(Version.CURRENT))); tokenizerFactories.put("PathHierarchy", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.PATH_HIERARCHY.getTokenizerFactory(Version.CURRENT))); - - // Token filters - for (PreConfiguredTokenFilter preBuiltTokenFilter : preBuiltTokenFilters) { - tokenFilterFactories.put(preBuiltTokenFilter.getName(), preBuiltTokenFilter); - } - // Char Filters for (PreBuiltCharFilters preBuiltCharFilter : PreBuiltCharFilters.values()) { String name = preBuiltCharFilter.name().toLowerCase(Locale.ROOT); @@ -434,10 +428,11 @@ private PrebuiltAnalysis(List preBuiltTokenFilters) { } // Char filter aliases charFilterFactories.put("htmlStrip", new PreBuiltCharFilterFactoryFactory(PreBuiltCharFilters.HTML_STRIP.getCharFilterFactory(Version.CURRENT))); + this.analyzerProviderFactories = Collections.unmodifiableMap(analyzerProviderFactories); this.charFilterFactories = Collections.unmodifiableMap(charFilterFactories); - this.tokenFilterFactories = Collections.unmodifiableMap(tokenFilterFactories); this.tokenizerFactories = Collections.unmodifiableMap(tokenizerFactories); + tokenFilterFactories = preBuiltTokenFilters; } public AnalysisModule.AnalysisProvider getCharFilterFactory(String name) { diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index 727e3b54c7a1a..44135c194bd6e 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -148,7 +148,9 @@ import java.util.ArrayList; import java.util.List; import java.util.Locale; +import java.util.Map; +import static java.util.Collections.unmodifiableMap; import static org.elasticsearch.plugins.AnalysisPlugin.requriesAnalysisSettings; /** @@ -176,7 +178,7 @@ public AnalysisModule(Environment environment, List plugins) thr NamedRegistry>> analyzers = setupAnalyzers(plugins); NamedRegistry>> normalizers = setupNormalizers(plugins); - List preBuiltTokenFilters = setupPreBuiltTokenFilters(plugins); + Map preBuiltTokenFilters = setupPreBuiltTokenFilters(plugins); analysisRegistry = new AnalysisRegistry(environment, charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers .getRegistry(), analyzers.getRegistry(), normalizers.getRegistry(), 
preBuiltTokenFilters); @@ -267,8 +269,7 @@ private NamedRegistry> setupTokenFilters(Li return tokenFilters; } - static List setupPreBuiltTokenFilters(List plugins) { - // Use NamedRegistry for the duplicate detection + static Map setupPreBuiltTokenFilters(List plugins) { NamedRegistry preBuiltTokenFilters = new NamedRegistry<>("pre-configured token_filter"); // Add filters available in lucene-core @@ -299,7 +300,7 @@ static List setupPreBuiltTokenFilters(List(preBuiltTokenFilters.getRegistry().values()); + return unmodifiableMap(preBuiltTokenFilters.getRegistry()); } private NamedRegistry> setupTokenizers(List plugins) { diff --git a/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java b/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java index 6cf3f072a2f41..2a8fa7f9c143f 100644 --- a/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java +++ b/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java @@ -123,7 +123,7 @@ public void setUp() throws Exception { indexSettings = IndexSettingsModule.newIndexSettings("foo", settings); index = indexSettings.getIndex(); environment = new Environment(settings); - emptyAnalysisRegistry = new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyList()); + emptyAnalysisRegistry = new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()); threadPool = new TestThreadPool("test"); circuitBreakerService = new NoneCircuitBreakerService(); bigArrays = new BigArrays(settings, circuitBreakerService); diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java index 6645c65c01584..167f82632372e 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java @@ -44,7 +44,6 @@ import java.util.Map; import java.util.concurrent.atomic.AtomicBoolean; -import static java.util.Collections.emptyList; import static java.util.Collections.emptyMap; import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; @@ -67,7 +66,7 @@ public void setUp() throws Exception { emptyEnvironment = new Environment(Settings.builder() .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build()); - emptyRegistry = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyList()); + emptyRegistry = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()); emptyIndexSettingsOfCurrentVersion = IndexSettingsModule.newIndexSettings("index", Settings.builder() .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .build()); @@ -194,10 +193,10 @@ public void testBuiltInAnalyzersAreCached() throws IOException { .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); IndexAnalyzers indexAnalyzers = - new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyList()) + new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) .build(idxSettings); IndexAnalyzers otherIndexAnalyzers = - new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), 
emptyMap(), emptyMap(), emptyMap(), emptyList()) + new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) .build(idxSettings); final int numIters = randomIntBetween(5, 20); for (int i = 0; i < numIters; i++) { @@ -216,7 +215,7 @@ public void testPreBuiltTokenFiltersAreCached() throws IOException { return new MockTokenFilter(tokens, MockTokenFilter.EMPTY_STOPSET); }); try (AnalysisRegistry registryWithPreBuiltTokenFilter = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(), - emptyMap(), emptyMap(), singletonList(assertsBuiltOnce))) { + emptyMap(), emptyMap(), singletonMap("asserts_built_once", assertsBuiltOnce))) { IndexAnalyzers indexAnalyzers = registryWithPreBuiltTokenFilter.build(emptyIndexSettingsOfCurrentVersion); IndexAnalyzers otherIndexAnalyzers = registryWithPreBuiltTokenFilter.build(emptyIndexSettingsOfCurrentVersion); assertSame(indexAnalyzers.get("asserts_built_once"), otherIndexAnalyzers.get("asserts_built_once")); @@ -235,7 +234,7 @@ public void testNoTypeOrTokenizerErrorMessage() throws IOException { IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> - new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyList()) + new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) .build(idxSettings)); assertThat(e.getMessage(), equalTo("analyzer [test_analyzer] must specify either an analyzer type, or a tokenizer")); } diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index 074d23091730c..1cfb621e28632 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -91,7 +91,6 @@ import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory; import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory; import org.elasticsearch.plugins.AnalysisPlugin; -import org.elasticsearch.plugins.AnalysisPlugin.PreBuiltTokenFilterSpec; import org.elasticsearch.test.ESTestCase; import java.util.Collection; @@ -103,10 +102,8 @@ import java.util.Objects; import java.util.Set; import java.util.TreeSet; -import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.stream.Collectors; import static java.util.Collections.singletonList; @@ -465,9 +462,7 @@ public void testPreBuiltMultiTermAware() { expected.add(tokenizer); } } - Map preBuiltTokenFilters = - AnalysisModule.setupPreBuiltTokenFilters(singletonList(plugin)).stream() - .collect(Collectors.toMap(PreConfiguredTokenFilter::getName, Function.identity())); + Map preBuiltTokenFilters = AnalysisModule.setupPreBuiltTokenFilters(singletonList(plugin)); for (Map.Entry> entry : getPreBuiltTokenFilters().entrySet()) { String name = entry.getKey(); Class luceneFactory = entry.getValue(); From 025ffe6a3999eafb607825d3d87d87c510ebe7e2 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 27 Apr 2017 15:45:02 -0400 Subject: [PATCH 16/20] Eclipse likes these but javac doesn't --- .../elasticsearch/analysis/common/CommonAnalysisPlugin.java 
| 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index 2cb94e6908719..afe235ac8a5f9 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -75,7 +75,7 @@ public Map> getCharFilters() { public List getPreConfiguredTokenFilters() { // TODO we should revisit the caching strategies. List filters = new ArrayList<>(); - filters.add(new PreConfiguredTokenFilter("asciifolding", true, CachingStrategy.ONE, ASCIIFoldingFilter::new)); + filters.add(new PreConfiguredTokenFilter("asciifolding", true, CachingStrategy.ONE, input -> new ASCIIFoldingFilter(input))); filters.add(new PreConfiguredTokenFilter("classic", false, CachingStrategy.ONE, ClassicFilter::new)); filters.add(new PreConfiguredTokenFilter("common_grams", false, CachingStrategy.LUCENE, input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET))); @@ -91,14 +91,14 @@ public List getPreConfiguredTokenFilters() { // TODO deprecate nGram filters.add(new PreConfiguredTokenFilter("nGram", false, CachingStrategy.LUCENE, NGramTokenFilter::new)); filters.add(new PreConfiguredTokenFilter("porter_stem", false, CachingStrategy.ONE, PorterStemFilter::new)); - filters.add(new PreConfiguredTokenFilter("reverse", false, CachingStrategy.LUCENE, ReverseStringFilter::new)); + filters.add(new PreConfiguredTokenFilter("reverse", false, CachingStrategy.LUCENE, input -> new ReverseStringFilter(input))); // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common filters.add(new PreConfiguredTokenFilter("stop", false, CachingStrategy.LUCENE, input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET))); filters.add(new PreConfiguredTokenFilter("trim", false, CachingStrategy.LUCENE, TrimFilter::new)); filters.add(new PreConfiguredTokenFilter("truncate", false, CachingStrategy.ONE, input -> new TruncateTokenFilter(input, 10))); - filters.add(new PreConfiguredTokenFilter("unique", false, CachingStrategy.ONE, UniqueTokenFilter::new)); + filters.add(new PreConfiguredTokenFilter("unique", false, CachingStrategy.ONE, input -> new UniqueTokenFilter(input))); filters.add(new PreConfiguredTokenFilter("uppercase", true, CachingStrategy.LUCENE, UpperCaseFilter::new)); filters.add(new PreConfiguredTokenFilter("word_delimiter", false, CachingStrategy.ONE, input -> new WordDelimiterFilter(input, From 45746ea3be4819a33fdf62e9dc3b09068618a9f6 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 27 Apr 2017 15:52:08 -0400 Subject: [PATCH 17/20] More rename --- .../index/analysis/AnalysisRegistry.java | 6 ++---- .../indices/analysis/AnalysisModule.java | 19 +++++++++---------- .../common/CommonAnalysisFactoryTests.java | 4 ++-- .../analysis/AnalysisFactoryTestCase.java | 6 +++--- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java index a1c13de971e3c..b438cd5af4155 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java @@ -37,13 +37,11 @@ import 
org.elasticsearch.indices.analysis.PreBuiltAnalyzers; import org.elasticsearch.indices.analysis.PreBuiltCharFilters; import org.elasticsearch.indices.analysis.PreBuiltTokenizers; -import org.elasticsearch.plugins.AnalysisPlugin.PreBuiltTokenFilterSpec; import java.io.Closeable; import java.io.IOException; import java.util.Collections; import java.util.HashMap; -import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; @@ -400,7 +398,7 @@ private static class PrebuiltAnalysis implements Closeable { final Map> tokenFilterFactories; final Map> charFilterFactories; - private PrebuiltAnalysis(Map preBuiltTokenFilters) { + private PrebuiltAnalysis(Map preConfiguredTokenFilters) { Map analyzerProviderFactories = new HashMap<>(); Map tokenizerFactories = new HashMap<>(); Map charFilterFactories = new HashMap<>(); @@ -432,7 +430,7 @@ private PrebuiltAnalysis(Map preBuiltTokenFilt this.analyzerProviderFactories = Collections.unmodifiableMap(analyzerProviderFactories); this.charFilterFactories = Collections.unmodifiableMap(charFilterFactories); this.tokenizerFactories = Collections.unmodifiableMap(tokenizerFactories); - tokenFilterFactories = preBuiltTokenFilters; + tokenFilterFactories = preConfiguredTokenFilters; } public AnalysisModule.AnalysisProvider getCharFilterFactory(String name) { diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index 44135c194bd6e..7f514ffd981cd 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -145,7 +145,6 @@ import org.elasticsearch.plugins.AnalysisPlugin; import java.io.IOException; -import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.Map; @@ -178,10 +177,10 @@ public AnalysisModule(Environment environment, List plugins) thr NamedRegistry>> analyzers = setupAnalyzers(plugins); NamedRegistry>> normalizers = setupNormalizers(plugins); - Map preBuiltTokenFilters = setupPreBuiltTokenFilters(plugins); + Map preConfiguredTokenFilters = setupPreConfiguredTokenFilters(plugins); analysisRegistry = new AnalysisRegistry(environment, charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers - .getRegistry(), analyzers.getRegistry(), normalizers.getRegistry(), preBuiltTokenFilters); + .getRegistry(), analyzers.getRegistry(), normalizers.getRegistry(), preConfiguredTokenFilters); } HunspellService getHunspellService() { @@ -269,13 +268,13 @@ private NamedRegistry> setupTokenFilters(Li return tokenFilters; } - static Map setupPreBuiltTokenFilters(List plugins) { - NamedRegistry preBuiltTokenFilters = new NamedRegistry<>("pre-configured token_filter"); + static Map setupPreConfiguredTokenFilters(List plugins) { + NamedRegistry preConfiguredTokenFilters = new NamedRegistry<>("pre-configured token_filter"); // Add filters available in lucene-core - preBuiltTokenFilters.register("lowercase", + preConfiguredTokenFilters.register("lowercase", new PreConfiguredTokenFilter("lowercase", true, CachingStrategy.LUCENE, LowerCaseFilter::new)); - preBuiltTokenFilters.register("standard", + preConfiguredTokenFilters.register("standard", new PreConfiguredTokenFilter("standard", false, CachingStrategy.LUCENE, StandardFilter::new)); /* Note that "stop" is available in lucene-core but it's pre-built * version uses a set of English stop words that are in @@ -290,17 +289,17 @@ static Map 
setupPreBuiltTokenFilters(List> setupTokenizers(List plugins) { diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java index fa8170921dab9..73a6c3d273291 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -64,8 +64,8 @@ protected Map> getCharFilters() { } @Override - protected Map> getPreBuiltTokenFilters() { - Map> filters = new TreeMap<>(super.getPreBuiltTokenFilters()); + protected Map> getPreConfiguredTokenFilters() { + Map> filters = new TreeMap<>(super.getPreConfiguredTokenFilters()); filters.put("asciifolding", null); filters.put("classic", null); filters.put("common_grams", null); diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index 1cfb621e28632..5434b08cbe6e9 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -336,7 +336,7 @@ protected Map> getTokenFilters() { * the test will look it up for you from the name. If there is no Lucene {@linkplain TokenFilterFactory} then the right hand side should * be {@link Void}. */ - protected Map> getPreBuiltTokenFilters() { + protected Map> getPreConfiguredTokenFilters() { Map> filters = new HashMap<>(); filters.put("standard", null); filters.put("lowercase", null); @@ -462,8 +462,8 @@ public void testPreBuiltMultiTermAware() { expected.add(tokenizer); } } - Map preBuiltTokenFilters = AnalysisModule.setupPreBuiltTokenFilters(singletonList(plugin)); - for (Map.Entry> entry : getPreBuiltTokenFilters().entrySet()) { + Map preBuiltTokenFilters = AnalysisModule.setupPreConfiguredTokenFilters(singletonList(plugin)); + for (Map.Entry> entry : getPreConfiguredTokenFilters().entrySet()) { String name = entry.getKey(); Class luceneFactory = entry.getValue(); if (luceneFactory == Void.class) { From 34fb2cf4244c5451154b5f9bf1169f51eeea22c6 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 27 Apr 2017 15:57:12 -0400 Subject: [PATCH 18/20] More renames --- .../elasticsearch/plugins/AnalysisPlugin.java | 34 ------------------- .../index/analysis/AnalysisRegistryTests.java | 2 +- .../analysis/AnalysisFactoryTestCase.java | 9 +++-- 3 files changed, 7 insertions(+), 38 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java b/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java index 98323371f2e31..c248c706f2321 100644 --- a/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java +++ b/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java @@ -108,40 +108,6 @@ default Map getHunspellD return emptyMap(); } - /** - * Specification for a pre-built token filter that is shared between multiple indices. - */ - class PreBuiltTokenFilterSpec { // NOCOMMIT remove me - private final boolean useFilterForMultitermQueries; - private final PreBuiltCacheFactory.CachingStrategy cachingStrategy; - private final BiFunction create; - - /** - * Setup the spec. - * @param useFilterForMultitermQueries use the pre-built token filter for multiterm queries. 
-         * @param cachingStrategy caching strategy the pre-built token filter should use
-         * @param create function to create the token filter
-         */
-        public PreBuiltTokenFilterSpec(boolean useFilterForMultitermQueries, PreBuiltCacheFactory.CachingStrategy cachingStrategy,
-                BiFunction<TokenStream, Version, TokenStream> create) {
-            this.useFilterForMultitermQueries = useFilterForMultitermQueries;
-            this.cachingStrategy = cachingStrategy;
-            this.create = create;
-        }
-
-        public boolean shouldUseFilterForMultitermQueries() {
-            return useFilterForMultitermQueries;
-        }
-
-        public PreBuiltCacheFactory.CachingStrategy getCachingStrategy() {
-            return cachingStrategy;
-        }
-
-        public BiFunction<TokenStream, Version, TokenStream> getCreate() {
-            return create;
-        }
-    }
-
     /**
      * Mark an {@link AnalysisProvider} as requiring the index's settings.
      */
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java
index 167f82632372e..471d6f9cccc29 100644
--- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java
+++ b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java
@@ -205,7 +205,7 @@ public void testBuiltInAnalyzersAreCached() throws IOException {
         }
     }
 
-    public void testPreBuiltTokenFiltersAreCached() throws IOException {
+    public void testPreConfiguredTokenFiltersAreCached() throws IOException {
         AtomicBoolean built = new AtomicBoolean(false);
         PreConfiguredTokenFilter assertsBuiltOnce = new PreConfiguredTokenFilter("asserts_built_once", false,
                 PreBuiltCacheFactory.CachingStrategy.ONE, (tokens, version) -> {
diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
index 5434b08cbe6e9..534db0be39fb7 100644
--- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
@@ -331,9 +331,12 @@ protected Map<String, Class<?>> getTokenFilters() {
     }
 
     /**
-     * Map containing pre-built token filters that should be available after installing this plugin. The map is from the name of the token
-     * filter to the class of the Lucene {@link TokenFilterFactory} that it is emulating. If the Lucene filter factory is {@code null} then
-     * the test will look it up for you from the name. If there is no Lucene {@linkplain TokenFilterFactory} then the right hand side should
+     * Map containing pre-configured token filters that should be available
+     * after installing this plugin. The map is from the name of the token
+     * filter to the class of the Lucene {@link TokenFilterFactory} that it
+     * is emulating. If the Lucene filter factory is {@code null} then the
+     * test will look it up for you from the name. If there is no Lucene
+     * {@linkplain TokenFilterFactory} then the right hand side should
      * be {@link Void}.
      */
     protected Map<String, Class<?>> getPreConfiguredTokenFilters() {

From 3e516b943e6d2f6560777c9b21402600a8c925f7 Mon Sep 17 00:00:00 2001
From: Nik Everett
Date: Tue, 9 May 2017 12:27:53 -0400
Subject: [PATCH 19/20] Tests: Give stats tests longer to wait for listener

This test waited 10 seconds for a refresh listener to appear in the
stats. It turns out that in our NFS testing infrastructure this can
take a lot longer than 10 seconds. The error reported here:
https://elasticsearch-ci.elastic.co/job/elastic+elasticsearch+master+nfs/257/consoleFull
has it taking something like 15 seconds.
This bumps the timeout to a solid minute.

Closes #24417
---
 .../action/admin/indices/stats/IndicesStatsTests.java | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/core/src/test/java/org/elasticsearch/action/admin/indices/stats/IndicesStatsTests.java b/core/src/test/java/org/elasticsearch/action/admin/indices/stats/IndicesStatsTests.java
index 6c1f44ea69a14..be84a8880641f 100644
--- a/core/src/test/java/org/elasticsearch/action/admin/indices/stats/IndicesStatsTests.java
+++ b/core/src/test/java/org/elasticsearch/action/admin/indices/stats/IndicesStatsTests.java
@@ -118,7 +118,6 @@ public void testCommitStats() throws Exception {
         }
     }
 
-    @TestLogging("_root:debug")
     public void testRefreshListeners() throws Exception {
         // Create an index without automatic refreshes
         createIndex("test", Settings.builder().put("refresh_interval", -1).build());
@@ -127,8 +126,9 @@ public void testRefreshListeners() throws Exception {
         ActionFuture<IndexResponse> index = client().prepareIndex("test", "test", "test").setSource("test", "test")
                 .setRefreshPolicy(RefreshPolicy.WAIT_UNTIL).execute();
 
-        // Wait for the refresh listener to appear in the stats
-        long end = System.nanoTime() + TimeUnit.SECONDS.toNanos(10);
+        // Wait for the refresh listener to appear in the stats. Wait a long time because NFS tests can be quite slow!
+        logger.info("starting to wait");
+        long end = System.nanoTime() + TimeUnit.MINUTES.toNanos(1);
         while (true) {
             IndicesStatsResponse stats = client().admin().indices().prepareStats("test").clear().setRefresh(true).setDocs(true).get();
             CommonStats common = stats.getIndices().get("test").getTotal();
@@ -138,6 +138,7 @@ public void testRefreshListeners() throws Exception {
                 break;
             }
             if (end - System.nanoTime() < 0) {
+                logger.info("timed out");
                 fail("didn't get a refresh listener in time: " + Strings.toString(common));
             }
         }

From 324ac706a1fc5157b20a60beafe51455a0b4fffe Mon Sep 17 00:00:00 2001
From: Nik Everett
Date: Tue, 9 May 2017 13:29:03 -0400
Subject: [PATCH 20/20] Rename class to be clearer

And add some more comments
---
 .../indices/analysis/AnalysisModule.java              |  2 +-
 ...actoryTests.java => CoreAnalysisFactoryTests.java} | 11 +++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)
 rename core/src/test/java/org/elasticsearch/index/analysis/{BuiltInAnalysisFactoryTests.java => CoreAnalysisFactoryTests.java} (65%)

diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
index 7f514ffd981cd..06ef3e315c6ab 100644
--- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
@@ -281,7 +281,7 @@ static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List
          * lucene-analyzers-common so "stop" is defined in the analysis-common
          * module. */
 
-        // Add token filers declared in PreBuiltTokenFilters until they have all been migrated
+        // Add token filters declared in PreBuiltTokenFilters until they have all been migrated
         for (PreBuiltTokenFilters preBuilt : PreBuiltTokenFilters.values()) {
             switch (preBuilt) {
             case LOWERCASE:
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/BuiltInAnalysisFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/CoreAnalysisFactoryTests.java
similarity index 65%
rename from core/src/test/java/org/elasticsearch/index/analysis/BuiltInAnalysisFactoryTests.java
rename to core/src/test/java/org/elasticsearch/index/analysis/CoreAnalysisFactoryTests.java
index 57ae76ac00bd4..3b4897b588988 100644
--- a/core/src/test/java/org/elasticsearch/index/analysis/BuiltInAnalysisFactoryTests.java
+++ b/core/src/test/java/org/elasticsearch/index/analysis/CoreAnalysisFactoryTests.java
@@ -22,8 +22,15 @@
 import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase;
 import org.elasticsearch.plugins.AnalysisPlugin;
 
-public class BuiltInAnalysisFactoryTests extends AnalysisFactoryTestCase {
-    public BuiltInAnalysisFactoryTests() {
+/**
+ * Checks on the analysis components that are part of core to make sure that any that are added
+ * to lucene are either enabled or explicitly not enabled. During the migration of analysis
+ * components to the {@code analysis-common} module this test ignores many components that are
+ * available to es-core but mapped in {@code analysis-common}. When the migration is complete
+ * no such ignoring will be needed because the analysis components won't be available to core.
+ */
+public class CoreAnalysisFactoryTests extends AnalysisFactoryTestCase {
+    public CoreAnalysisFactoryTests() {
         // Use an empty plugin that doesn't define anything so the test doesn't need a ton of null checks.
         super(new AnalysisPlugin() {});
     }
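
Taken together, patches 17-20 replace the deleted PreBuiltTokenFilterSpec with PreConfiguredTokenFilter as the way analysis plugins ship pre-configured token filters. A minimal sketch of a consumer follows, using the four-argument PreConfiguredTokenFilter constructor exactly as AnalysisModule registers "lowercase" above. The Map-returning getPreConfiguredTokenFilters() override is an assumption inferred from what setupPreConfiguredTokenFilters consumes, the import paths match the final layout of this series and may differ at this exact commit, and MyAnalysisPlugin and the "my_lowercase" filter name are hypothetical:

    import java.util.Map;

    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
    import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
    import org.elasticsearch.plugins.AnalysisPlugin;
    import org.elasticsearch.plugins.Plugin;

    import static java.util.Collections.singletonMap;

    // Hypothetical plugin; the hook shape below assumes it mirrors the
    // Map<String, PreConfiguredTokenFilter> consumed by
    // AnalysisModule.setupPreConfiguredTokenFilters in patch 17.
    public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
        @Override
        public Map<String, PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
            // Arguments mirror the "lowercase" registration in AnalysisModule:
            // filter name, whether the filter is usable in multi-term queries,
            // the caching strategy, and a TokenStream -> TokenStream factory.
            // CachingStrategy.ONE caches a single filter instance and reuses it
            // for every index regardless of Lucene version.
            return singletonMap("my_lowercase", new PreConfiguredTokenFilter(
                    "my_lowercase", true, CachingStrategy.ONE, LowerCaseFilter::new));
        }
    }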