diff --git a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java index 36357afe678b5..b438cd5af4155 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java @@ -36,7 +36,6 @@ import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; import org.elasticsearch.indices.analysis.PreBuiltAnalyzers; import org.elasticsearch.indices.analysis.PreBuiltCharFilters; -import org.elasticsearch.indices.analysis.PreBuiltTokenFilters; import org.elasticsearch.indices.analysis.PreBuiltTokenizers; import java.io.Closeable; @@ -59,7 +58,7 @@ public final class AnalysisRegistry implements Closeable { public static final String INDEX_ANALYSIS_CHAR_FILTER = "index.analysis.char_filter"; public static final String INDEX_ANALYSIS_FILTER = "index.analysis.filter"; public static final String INDEX_ANALYSIS_TOKENIZER = "index.analysis.tokenizer"; - private final PrebuiltAnalysis prebuiltAnalysis = new PrebuiltAnalysis(); + private final PrebuiltAnalysis prebuiltAnalysis; private final Map cachedAnalyzer = new ConcurrentHashMap<>(); private final Environment environment; @@ -74,13 +73,15 @@ public AnalysisRegistry(Environment environment, Map> tokenFilters, Map> tokenizers, Map>> analyzers, - Map>> normalizers) { + Map>> normalizers, + Map preConfiguredTokenFilters) { this.environment = environment; this.charFilters = unmodifiableMap(charFilters); this.tokenFilters = unmodifiableMap(tokenFilters); this.tokenizers = unmodifiableMap(tokenizers); this.analyzers = unmodifiableMap(analyzers); this.normalizers = unmodifiableMap(normalizers); + prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredTokenFilters); } /** @@ -305,8 +306,8 @@ public String toString() { } private Map buildMapping(Component component, IndexSettings settings, Map settingsMap, - Map> providerMap, Map> defaultInstance) - throws IOException { + Map> providerMap, + Map> defaultInstance) throws IOException { Settings defaultSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, settings.getIndexVersionCreated()).build(); Map factories = new HashMap<>(); for (Map.Entry entry : settingsMap.entrySet()) { @@ -344,7 +345,7 @@ private Map buildMapping(Component component, IndexSettings setti } // go over the char filters in the bindings and register the ones that are not configured - for (Map.Entry> entry : providerMap.entrySet()) { + for (Map.Entry> entry : providerMap.entrySet()) { String name = entry.getKey(); AnalysisModule.AnalysisProvider provider = entry.getValue(); // we don't want to re-register one that already exists @@ -365,7 +366,7 @@ private Map buildMapping(Component component, IndexSettings setti factories.put(name, instance); } - for (Map.Entry> entry : defaultInstance.entrySet()) { + for (Map.Entry> entry : defaultInstance.entrySet()) { final String name = entry.getKey(); final AnalysisModule.AnalysisProvider provider = entry.getValue(); if (factories.containsKey(name) == false) { @@ -378,7 +379,8 @@ private Map buildMapping(Component component, IndexSettings setti return factories; } - private AnalysisProvider getAnalysisProvider(Component component, Map> providerMap, String name, String typeName) { + private AnalysisProvider getAnalysisProvider(Component component, Map> providerMap, + String name, String typeName) { if (typeName == null) { throw new IllegalArgumentException(component + " [" + name + "] 
must specify either an analyzer type, or a tokenizer"); } @@ -393,13 +395,12 @@ private static class PrebuiltAnalysis implements Closeable { final Map>> analyzerProviderFactories; final Map> tokenizerFactories; - final Map> tokenFilterFactories; + final Map> tokenFilterFactories; final Map> charFilterFactories; - private PrebuiltAnalysis() { + private PrebuiltAnalysis(Map preConfiguredTokenFilters) { Map analyzerProviderFactories = new HashMap<>(); Map tokenizerFactories = new HashMap<>(); - Map tokenFilterFactories = new HashMap<>(); Map charFilterFactories = new HashMap<>(); // Analyzers for (PreBuiltAnalyzers preBuiltAnalyzerEnum : PreBuiltAnalyzers.values()) { @@ -418,17 +419,6 @@ private PrebuiltAnalysis() { tokenizerFactories.put("edgeNGram", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.EDGE_NGRAM.getTokenizerFactory(Version.CURRENT))); tokenizerFactories.put("PathHierarchy", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.PATH_HIERARCHY.getTokenizerFactory(Version.CURRENT))); - - // Token filters - for (PreBuiltTokenFilters preBuiltTokenFilter : PreBuiltTokenFilters.values()) { - String name = preBuiltTokenFilter.name().toLowerCase(Locale.ROOT); - tokenFilterFactories.put(name, new PreBuiltTokenFilterFactoryFactory(preBuiltTokenFilter.getTokenFilterFactory(Version.CURRENT))); - } - // Token filter aliases - tokenFilterFactories.put("nGram", new PreBuiltTokenFilterFactoryFactory(PreBuiltTokenFilters.NGRAM.getTokenFilterFactory(Version.CURRENT))); - tokenFilterFactories.put("edgeNGram", new PreBuiltTokenFilterFactoryFactory(PreBuiltTokenFilters.EDGE_NGRAM.getTokenFilterFactory(Version.CURRENT))); - - // Char Filters for (PreBuiltCharFilters preBuiltCharFilter : PreBuiltCharFilters.values()) { String name = preBuiltCharFilter.name().toLowerCase(Locale.ROOT); @@ -436,10 +426,11 @@ private PrebuiltAnalysis() { } // Char filter aliases charFilterFactories.put("htmlStrip", new PreBuiltCharFilterFactoryFactory(PreBuiltCharFilters.HTML_STRIP.getCharFilterFactory(Version.CURRENT))); + this.analyzerProviderFactories = Collections.unmodifiableMap(analyzerProviderFactories); this.charFilterFactories = Collections.unmodifiableMap(charFilterFactories); - this.tokenFilterFactories = Collections.unmodifiableMap(tokenFilterFactories); this.tokenizerFactories = Collections.unmodifiableMap(tokenizerFactories); + tokenFilterFactories = preConfiguredTokenFilters; } public AnalysisModule.AnalysisProvider getCharFilterFactory(String name) { diff --git a/core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactory.java deleted file mode 100644 index 52c9f2851a29f..0000000000000 --- a/core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactory.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.elasticsearch.index.analysis; - -import org.elasticsearch.Version; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.env.Environment; -import org.elasticsearch.index.IndexSettings; -import org.elasticsearch.indices.analysis.AnalysisModule; -import org.elasticsearch.indices.analysis.PreBuiltTokenFilters; - -import java.io.IOException; - -public class PreBuiltTokenFilterFactoryFactory implements AnalysisModule.AnalysisProvider { - - private final TokenFilterFactory tokenFilterFactory; - - public PreBuiltTokenFilterFactoryFactory(TokenFilterFactory tokenFilterFactory) { - this.tokenFilterFactory = tokenFilterFactory; - } - - @Override - public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException { - Version indexVersion = Version.indexCreated(settings); - if (!Version.CURRENT.equals(indexVersion)) { - PreBuiltTokenFilters preBuiltTokenFilters = PreBuiltTokenFilters.getOrDefault(name, null); - if (preBuiltTokenFilters != null) { - return preBuiltTokenFilters.getTokenFilterFactory(indexVersion); - } - } - return tokenFilterFactory; - } -} diff --git a/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java b/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java new file mode 100644 index 0000000000000..b410e8fb70e85 --- /dev/null +++ b/core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java @@ -0,0 +1,123 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.elasticsearch.Version; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.indices.analysis.AnalysisModule; +import org.elasticsearch.indices.analysis.PreBuiltCacheFactory; + +import java.io.IOException; +import java.util.function.BiFunction; +import java.util.function.Function; + +/** + * Provides pre-configured, shared {@link TokenFilter}s. 
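+ * <p>
+ * Instances are registered by name in {@link AnalysisModule} and cached according to the
+ * {@link PreBuiltCacheFactory.CachingStrategy} they are built with, so each filter is only
+ * constructed once per version it applies to. Plugins can contribute their own via
+ * {@code AnalysisPlugin#getPreConfiguredTokenFilters()}; a minimal sketch (the filter
+ * name below is purely illustrative, not a filter this PR adds):
+ * <pre>{@code
+ * public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
+ *     // "reverse_example" is a hypothetical name used only for illustration
+ *     return singletonList(new PreConfiguredTokenFilter("reverse_example", false,
+ *             CachingStrategy.ONE, ReverseStringFilter::new));
+ * }
+ * }</pre>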
+ */ +public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisProvider { + private final String name; + private final boolean useFilterForMultitermQueries; + private final PreBuiltCacheFactory.PreBuiltCache cache; + private final BiFunction create; + + /** + * Standard ctor with all the power. + */ + public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, + PreBuiltCacheFactory.CachingStrategy cachingStrategy, BiFunction create) { + this.name = name; + this.useFilterForMultitermQueries = useFilterForMultitermQueries; + cache = PreBuiltCacheFactory.getCache(cachingStrategy); + this.create = create; + } + + /** + * Convenience ctor for token streams that don't vary based on version. + */ + public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, + PreBuiltCacheFactory.CachingStrategy cachingStrategy, Function create) { + this(name, useFilterForMultitermQueries, cachingStrategy, (input, version) -> create.apply(input)); + // TODO why oh why aren't these all CachingStrategy.ONE? They *can't* vary based on version because they don't get it, right?! + } + + @Override + public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException { + return getTokenFilterFactory(Version.indexCreated(settings)); + } + + /** + * The name of the {@link TokenFilter} in the API. + */ + public String getName() { + return name; + } + + /** + * Can this {@link TokenFilter} be used in multi-term queries? + */ + public boolean shouldUseFilterForMultitermQueries() { + return useFilterForMultitermQueries; + } + + private interface MultiTermAwareTokenFilterFactory extends TokenFilterFactory, MultiTermAwareComponent {} + + private synchronized TokenFilterFactory getTokenFilterFactory(final Version version) { + TokenFilterFactory factory = cache.get(version); + if (factory == null) { + if (useFilterForMultitermQueries) { + factory = new MultiTermAwareTokenFilterFactory() { + @Override + public String name() { + return name; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return create.apply(tokenStream, version); + } + + @Override + public Object getMultiTermComponent() { + return this; + } + }; + } else { + factory = new TokenFilterFactory() { + @Override + public String name() { + return name; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return create.apply(tokenStream, version); + } + }; + } + cache.put(version, factory); + } + + return factory; + } +} diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index 26a4e4c1c5c3a..06ef3e315c6ab 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -19,6 +19,8 @@ package org.elasticsearch.indices.analysis; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.standard.StandardFilter; import org.elasticsearch.Version; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.NamedRegistry; @@ -101,6 +103,7 @@ import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory; import org.elasticsearch.index.analysis.PorterStemTokenFilterFactory; import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider; +import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import 
org.elasticsearch.index.analysis.ReverseTokenFilterFactory; import org.elasticsearch.index.analysis.RomanianAnalyzerProvider; import org.elasticsearch.index.analysis.RussianAnalyzerProvider; @@ -138,11 +141,15 @@ import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory; import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory; import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory; +import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; import org.elasticsearch.plugins.AnalysisPlugin; import java.io.IOException; import java.util.List; +import java.util.Locale; +import java.util.Map; +import static java.util.Collections.unmodifiableMap; import static org.elasticsearch.plugins.AnalysisPlugin.requriesAnalysisSettings; /** @@ -169,8 +176,11 @@ public AnalysisModule(Environment environment, List plugins) thr NamedRegistry> tokenizers = setupTokenizers(plugins); NamedRegistry>> analyzers = setupAnalyzers(plugins); NamedRegistry>> normalizers = setupNormalizers(plugins); + + Map preConfiguredTokenFilters = setupPreConfiguredTokenFilters(plugins); + analysisRegistry = new AnalysisRegistry(environment, charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers - .getRegistry(), analyzers.getRegistry(), normalizers.getRegistry()); + .getRegistry(), analyzers.getRegistry(), normalizers.getRegistry(), preConfiguredTokenFilters); } HunspellService getHunspellService() { @@ -258,6 +268,40 @@ private NamedRegistry> setupTokenFilters(Li return tokenFilters; } + static Map setupPreConfiguredTokenFilters(List plugins) { + NamedRegistry preConfiguredTokenFilters = new NamedRegistry<>("pre-configured token_filter"); + + // Add filters available in lucene-core + preConfiguredTokenFilters.register("lowercase", + new PreConfiguredTokenFilter("lowercase", true, CachingStrategy.LUCENE, LowerCaseFilter::new)); + preConfiguredTokenFilters.register("standard", + new PreConfiguredTokenFilter("standard", false, CachingStrategy.LUCENE, StandardFilter::new)); + /* Note that "stop" is available in lucene-core but it's pre-built + * version uses a set of English stop words that are in + * lucene-analyzers-common so "stop" is defined in the analysis-common + * module. */ + + // Add token filters declared in PreBuiltTokenFilters until they have all been migrated + for (PreBuiltTokenFilters preBuilt : PreBuiltTokenFilters.values()) { + switch (preBuilt) { + case LOWERCASE: + // This has been migrated but has to stick around until PreBuiltTokenizers is removed. 
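+ // ("lowercase" is instead registered above, built from lucene-core's LowerCaseFilter)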
+ continue; + default: + String name = preBuilt.name().toLowerCase(Locale.ROOT); + preConfiguredTokenFilters.register(name, + new PreConfiguredTokenFilter(name, preBuilt.isMultiTermAware(), preBuilt.getCachingStrategy(), preBuilt::create)); + } + } + + for (AnalysisPlugin plugin: plugins) { + for (PreConfiguredTokenFilter filter : plugin.getPreConfiguredTokenFilters()) { + preConfiguredTokenFilters.register(filter.getName(), filter); + } + } + return unmodifiableMap(preConfiguredTokenFilters.getRegistry()); + } + private NamedRegistry> setupTokenizers(List plugins) { NamedRegistry> tokenizers = new NamedRegistry<>("tokenizer"); tokenizers.register("standard", StandardTokenizerFactory::new); diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCacheFactory.java b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCacheFactory.java index 823152e6d9e9e..8636e04f20f10 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCacheFactory.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCacheFactory.java @@ -42,7 +42,7 @@ public interface PreBuiltCache { private PreBuiltCacheFactory() {} - static PreBuiltCache getCache(CachingStrategy cachingStrategy) { + public static PreBuiltCache getCache(CachingStrategy cachingStrategy) { switch (cachingStrategy) { case ONE: return new PreBuiltCacheStrategyOne<>(); diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java index 6c58ab884db27..02f6d8aadc5e0 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java @@ -18,9 +18,7 @@ */ package org.elasticsearch.indices.analysis; -import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.LowerCaseFilter; -import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; import org.apache.lucene.analysis.ar.ArabicStemFilter; @@ -28,39 +26,23 @@ import org.apache.lucene.analysis.cjk.CJKBigramFilter; import org.apache.lucene.analysis.cjk.CJKWidthFilter; import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter; -import org.apache.lucene.analysis.commongrams.CommonGramsFilter; import org.apache.lucene.analysis.core.DecimalDigitFilter; -import org.apache.lucene.analysis.core.StopAnalyzer; -import org.apache.lucene.analysis.core.UpperCaseFilter; import org.apache.lucene.analysis.cz.CzechStemFilter; import org.apache.lucene.analysis.de.GermanNormalizationFilter; import org.apache.lucene.analysis.de.GermanStemFilter; -import org.apache.lucene.analysis.en.KStemFilter; import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.fa.PersianNormalizationFilter; import org.apache.lucene.analysis.fr.FrenchAnalyzer; import org.apache.lucene.analysis.hi.HindiNormalizationFilter; import org.apache.lucene.analysis.in.IndicNormalizationFilter; -import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter; -import org.apache.lucene.analysis.miscellaneous.LengthFilter; import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; import org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter; import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter; -import 
org.apache.lucene.analysis.miscellaneous.TrimFilter; -import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter; -import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; -import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; -import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter; -import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; -import org.apache.lucene.analysis.ngram.NGramTokenFilter; import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter; -import org.apache.lucene.analysis.reverse.ReverseStringFilter; import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.analysis.snowball.SnowballFilter; -import org.apache.lucene.analysis.standard.ClassicFilter; -import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.tr.ApostropheFilter; import org.apache.lucene.analysis.util.ElisionFilter; import org.elasticsearch.Version; @@ -75,77 +57,7 @@ import java.util.Locale; public enum PreBuiltTokenFilters { - - WORD_DELIMITER(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new WordDelimiterFilter(tokenStream, - WordDelimiterFilter.GENERATE_WORD_PARTS | - WordDelimiterFilter.GENERATE_NUMBER_PARTS | - WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | - WordDelimiterFilter.SPLIT_ON_NUMERICS | - WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null); - } - }, - - WORD_DELIMITER_GRAPH(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new WordDelimiterGraphFilter(tokenStream, - WordDelimiterGraphFilter.GENERATE_WORD_PARTS | - WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS | - WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE | - WordDelimiterGraphFilter.SPLIT_ON_NUMERICS | - WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null); - } - }, - - STOP(CachingStrategy.LUCENE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new StopFilter(tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET); - } - }, - - TRIM(CachingStrategy.LUCENE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new TrimFilter(tokenStream); - } - }, - - REVERSE(CachingStrategy.LUCENE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new ReverseStringFilter(tokenStream); - } - }, - - ASCIIFOLDING(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new ASCIIFoldingFilter(tokenStream); - } - @Override - protected boolean isMultiTermAware() { - return true; - } - }, - - LENGTH(CachingStrategy.LUCENE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new LengthFilter(tokenStream, 0, Integer.MAX_VALUE); - } - }, - - COMMON_GRAMS(CachingStrategy.LUCENE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new CommonGramsFilter(tokenStream, CharArraySet.EMPTY_SET); - } - }, - + // TODO remove this entire class when PreBuiltTokenizers no longer needs it..... 
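+// (the filters deleted from this enum have been migrated to PreConfiguredTokenFilter registrations)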
LOWERCASE(CachingStrategy.LUCENE) { @Override public TokenStream create(TokenStream tokenStream, Version version) { @@ -157,73 +69,6 @@ protected boolean isMultiTermAware() { } }, - UPPERCASE(CachingStrategy.LUCENE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new UpperCaseFilter(tokenStream); - } - @Override - protected boolean isMultiTermAware() { - return true; - } - }, - - KSTEM(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new KStemFilter(tokenStream); - } - }, - - PORTER_STEM(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new PorterStemFilter(tokenStream); - } - }, - - STANDARD(CachingStrategy.LUCENE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new StandardFilter(tokenStream); - } - }, - - CLASSIC(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new ClassicFilter(tokenStream); - } - }, - - NGRAM(CachingStrategy.LUCENE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new NGramTokenFilter(tokenStream); - } - }, - - EDGE_NGRAM(CachingStrategy.LUCENE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new EdgeNGramTokenFilter(tokenStream, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE); - } - }, - - UNIQUE(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new UniqueTokenFilter(tokenStream); - } - }, - - TRUNCATE(CachingStrategy.ONE) { - @Override - public TokenStream create(TokenStream tokenStream, Version version) { - return new TruncateTokenFilter(tokenStream, 10); - } - }, - // Extended Token Filters SNOWBALL(CachingStrategy.ONE) { @Override @@ -469,10 +314,16 @@ protected boolean isMultiTermAware() { protected final PreBuiltCacheFactory.PreBuiltCache cache; + private final CachingStrategy cachingStrategy; PreBuiltTokenFilters(CachingStrategy cachingStrategy) { + this.cachingStrategy = cachingStrategy; cache = PreBuiltCacheFactory.getCache(cachingStrategy); } + public CachingStrategy getCachingStrategy() { + return cachingStrategy; + } + private interface MultiTermAwareTokenFilterFactory extends TokenFilterFactory, MultiTermAwareComponent {} public synchronized TokenFilterFactory getTokenFilterFactory(final Version version) { @@ -514,17 +365,4 @@ public TokenStream create(TokenStream tokenStream) { return factory; } - - /** - * Get a pre built TokenFilter by its name or fallback to the default one - * @param name TokenFilter name - * @param defaultTokenFilter default TokenFilter if name not found - */ - public static PreBuiltTokenFilters getOrDefault(String name, PreBuiltTokenFilters defaultTokenFilter) { - try { - return valueOf(name.toUpperCase(Locale.ROOT)); - } catch (IllegalArgumentException e) { - return defaultTokenFilter; - } - } } diff --git a/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java b/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java index 5e7e1053add58..c248c706f2321 100644 --- a/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java +++ b/core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java @@ -22,19 +22,26 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharFilter; import 
org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; +import org.elasticsearch.Version; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AnalyzerProvider; import org.elasticsearch.index.analysis.CharFilterFactory; +import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; +import org.elasticsearch.indices.analysis.PreBuiltCacheFactory; import java.io.IOException; +import java.util.List; import java.util.Map; +import java.util.function.BiFunction; +import static java.util.Collections.emptyList; import static java.util.Collections.emptyMap; /** @@ -87,6 +94,13 @@ default Map>> getA return emptyMap(); } + /** + * Override to add additional pre-configured token filters. + */ + default List getPreConfiguredTokenFilters() { + return emptyList(); + } + /** * Override to add additional hunspell {@link org.apache.lucene.analysis.hunspell.Dictionary}s. */ diff --git a/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java b/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java index 57a83b2c68081..0e1414bdbefda 100644 --- a/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java +++ b/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java @@ -48,8 +48,8 @@ import static java.util.Collections.singletonMap; /** - * Tests for {@link TransportAnalyzeAction}. See the more "intense" version of this test in the - * {@code common-analysis} module. + * Tests for {@link TransportAnalyzeAction}. See the rest tests in the {@code analysis-common} module for places where this code gets a ton + * more exercise. */ public class TransportAnalyzeActionTests extends ESTestCase { @@ -90,7 +90,11 @@ public Map> getTokenFilters() { indexAnalyzers = registry.build(idxSettings); } + /** + * Test behavior when the named analysis component isn't defined on the index. In that case we should build with defaults. + */ public void testNoIndexAnalyzers() throws IOException { + // Refer to an analyzer by its type so we get its default configuration AnalyzeRequest request = new AnalyzeRequest(); request.analyzer("standard"); request.text("the quick brown fox"); @@ -98,33 +102,30 @@ public void testNoIndexAnalyzers() throws IOException { List tokens = analyze.getTokens(); assertEquals(4, tokens.size()); + // Refer to a token filter by its type so we get its default configuration request.analyzer(null); request.tokenizer("whitespace"); - request.addTokenFilter("lowercase"); - request.addTokenFilter("word_delimiter"); + request.addTokenFilter("mock"); request.text("the qu1ck brown fox"); analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, randomBoolean() ? 
indexAnalyzers : null, registry, environment); tokens = analyze.getTokens(); - assertEquals(6, tokens.size()); - assertEquals("qu", tokens.get(1).getTerm()); - assertEquals("1", tokens.get(2).getTerm()); - assertEquals("ck", tokens.get(3).getTerm()); + assertEquals(3, tokens.size()); + assertEquals("qu1ck", tokens.get(0).getTerm()); + assertEquals("brown", tokens.get(1).getTerm()); + assertEquals("fox", tokens.get(2).getTerm()); + // Refer to a char filter by its type so we get its default configuration request.analyzer(null); request.tokenizer("whitespace"); request.addCharFilter("html_strip"); - request.addTokenFilter("lowercase"); - request.addTokenFilter("word_delimiter"); + request.addTokenFilter("mock"); request.text("
<p>the qu1ck brown fox</p>
"); analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, randomBoolean() ? indexAnalyzers : null, registry, environment); tokens = analyze.getTokens(); - assertEquals(6, tokens.size()); - assertEquals("the", tokens.get(0).getTerm()); - assertEquals("qu", tokens.get(1).getTerm()); - assertEquals("1", tokens.get(2).getTerm()); - assertEquals("ck", tokens.get(3).getTerm()); - assertEquals("brown", tokens.get(4).getTerm()); - assertEquals("fox", tokens.get(5).getTerm()); + assertEquals(3, tokens.size()); + assertEquals("qu1ck", tokens.get(0).getTerm()); + assertEquals("brown", tokens.get(1).getTerm()); + assertEquals("fox", tokens.get(2).getTerm()); } public void testFillsAttributes() throws IOException { diff --git a/core/src/test/java/org/elasticsearch/action/admin/indices/stats/IndicesStatsTests.java b/core/src/test/java/org/elasticsearch/action/admin/indices/stats/IndicesStatsTests.java index 6c1f44ea69a14..be84a8880641f 100644 --- a/core/src/test/java/org/elasticsearch/action/admin/indices/stats/IndicesStatsTests.java +++ b/core/src/test/java/org/elasticsearch/action/admin/indices/stats/IndicesStatsTests.java @@ -118,7 +118,6 @@ public void testCommitStats() throws Exception { } } - @TestLogging("_root:debug") public void testRefreshListeners() throws Exception { // Create an index without automatic refreshes createIndex("test", Settings.builder().put("refresh_interval", -1).build()); @@ -127,8 +126,9 @@ public void testRefreshListeners() throws Exception { ActionFuture index = client().prepareIndex("test", "test", "test").setSource("test", "test") .setRefreshPolicy(RefreshPolicy.WAIT_UNTIL).execute(); - // Wait for the refresh listener to appear in the stats - long end = System.nanoTime() + TimeUnit.SECONDS.toNanos(10); + // Wait for the refresh listener to appear in the stats. Wait a long time because NFS tests can be quite slow! 
+ logger.info("starting to wait"); + long end = System.nanoTime() + TimeUnit.MINUTES.toNanos(1); while (true) { IndicesStatsResponse stats = client().admin().indices().prepareStats("test").clear().setRefresh(true).setDocs(true).get(); CommonStats common = stats.getIndices().get("test").getTotal(); @@ -138,6 +138,7 @@ public void testRefreshListeners() throws Exception { break; } if (end - System.nanoTime() < 0) { + logger.info("timed out"); fail("didn't get a refresh listener in time: " + Strings.toString(common)); } } diff --git a/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java b/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java index 1ae125cecd19e..209bd1648b66e 100644 --- a/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java +++ b/core/src/test/java/org/elasticsearch/index/IndexModuleTests.java @@ -94,6 +94,7 @@ public class IndexModuleTests extends ESTestCase { private Settings settings; private IndexSettings indexSettings; private Environment environment; + private AnalysisRegistry emptyAnalysisRegistry; private NodeEnvironment nodeEnvironment; private IndicesQueryCache indicesQueryCache; @@ -123,6 +124,7 @@ public void setUp() throws Exception { indexSettings = IndexSettingsModule.newIndexSettings("foo", settings); index = indexSettings.getIndex(); environment = new Environment(settings); + emptyAnalysisRegistry = new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()); threadPool = new TestThreadPool("test"); circuitBreakerService = new NoneCircuitBreakerService(); bigArrays = new BigArrays(settings, circuitBreakerService); @@ -150,8 +152,7 @@ private IndexService newIndexService(IndexModule module) throws IOException { } public void testWrapperIsBound() throws IOException { - IndexModule module = new IndexModule(indexSettings, - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(indexSettings, emptyAnalysisRegistry); module.setSearcherWrapper((s) -> new Wrapper()); module.engineFactory.set(new MockEngineFactory(AssertingDirectoryReader.class)); @@ -170,8 +171,7 @@ public void testRegisterIndexStore() throws IOException { .put(IndexModule.INDEX_STORE_TYPE_SETTING.getKey(), "foo_store") .build(); IndexSettings indexSettings = IndexSettingsModule.newIndexSettings(index, settings); - IndexModule module = new IndexModule(indexSettings, - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(indexSettings, emptyAnalysisRegistry); module.addIndexStore("foo_store", FooStore::new); try { module.addIndexStore("foo_store", FooStore::new); @@ -195,8 +195,7 @@ public void beforeIndexRemoved(IndexService indexService, IndexRemovalReason rea } }; IndexSettings indexSettings = IndexSettingsModule.newIndexSettings(index, settings); - IndexModule module = new IndexModule(indexSettings, - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(indexSettings, emptyAnalysisRegistry); module.addIndexEventListener(eventListener); IndexService indexService = newIndexService(module); IndexSettings x = indexService.getIndexSettings(); @@ -210,8 +209,7 @@ public void beforeIndexRemoved(IndexService indexService, IndexRemovalReason rea public void testListener() throws IOException { Setting booleanSetting = Setting.boolSetting("index.foo.bar", false, Property.Dynamic, 
Property.IndexScope); - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings, booleanSetting), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings, booleanSetting), emptyAnalysisRegistry); Setting booleanSetting2 = Setting.boolSetting("index.foo.bar.baz", false, Property.Dynamic, Property.IndexScope); AtomicBoolean atomicBoolean = new AtomicBoolean(false); module.addSettingsUpdateConsumer(booleanSetting, atomicBoolean::set); @@ -230,8 +228,7 @@ public void testListener() throws IOException { } public void testAddIndexOperationListener() throws IOException { - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings), emptyAnalysisRegistry); AtomicBoolean executed = new AtomicBoolean(false); IndexingOperationListener listener = new IndexingOperationListener() { @Override @@ -261,8 +258,7 @@ public Engine.Index preIndex(ShardId shardId, Engine.Index operation) { } public void testAddSearchOperationListener() throws IOException { - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings), emptyAnalysisRegistry); AtomicBoolean executed = new AtomicBoolean(false); SearchOperationListener listener = new SearchOperationListener() { @@ -295,8 +291,7 @@ public void testAddSimilarity() throws IOException { .put("index.similarity.my_similarity.key", "there is a key") .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), emptyAnalysisRegistry); module.addSimilarity("test_similarity", (string, providerSettings, indexLevelSettings) -> new SimilarityProvider() { @Override public String name() { @@ -319,8 +314,7 @@ public Similarity get() { } public void testFrozen() { - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings(index, settings), emptyAnalysisRegistry); module.freeze(); String msg = "Can't modify IndexModule once the index service has been created"; assertEquals(msg, expectThrows(IllegalStateException.class, () -> module.addSearchOperationListener(null)).getMessage()); @@ -338,8 +332,7 @@ public void testSetupUnknownSimilarity() throws IOException { .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new 
IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), emptyAnalysisRegistry); Exception ex = expectThrows(IllegalArgumentException.class, () -> newIndexService(module)); assertEquals("Unknown Similarity type [test_similarity] for [my_similarity]", ex.getMessage()); } @@ -350,8 +343,7 @@ public void testSetupWithoutType() throws IOException { .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .build(); - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), emptyAnalysisRegistry); Exception ex = expectThrows(IllegalArgumentException.class, () -> newIndexService(module)); assertEquals("Similarity [my_similarity] must have an associated type", ex.getMessage()); } @@ -360,8 +352,7 @@ public void testForceCustomQueryCache() throws IOException { Settings indexSettings = Settings.builder() .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), emptyAnalysisRegistry); module.forceQueryCacheProvider((a, b) -> new CustomQueryCache()); expectThrows(AlreadySetException.class, () -> module.forceQueryCacheProvider((a, b) -> new CustomQueryCache())); IndexService indexService = newIndexService(module); @@ -373,8 +364,7 @@ public void testDefaultQueryCacheImplIsSelected() throws IOException { Settings indexSettings = Settings.builder() .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), emptyAnalysisRegistry); IndexService indexService = newIndexService(module); assertTrue(indexService.cache().query() instanceof IndexQueryCache); indexService.close("simon says", false); @@ -385,8 +375,7 @@ public void testDisableQueryCacheHasPrecedenceOverForceQueryCache() throws IOExc .put(IndexModule.INDEX_QUERY_CACHE_ENABLED_SETTING.getKey(), false) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); - IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), - new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap())); + IndexModule module = new IndexModule(IndexSettingsModule.newIndexSettings("foo", indexSettings), emptyAnalysisRegistry); module.forceQueryCacheProvider((a, b) -> new CustomQueryCache()); IndexService indexService = newIndexService(module); assertTrue(indexService.cache().query() instanceof DisabledQueryCache); diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java 
b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java deleted file mode 100644 index 0a62e8c491588..0000000000000 --- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.elasticsearch.index.analysis; - -import org.elasticsearch.AnalysisFactoryTestCase; - -public class AnalysisFactoryTests extends AnalysisFactoryTestCase { - // tests are inherited and nothing needs to be defined here -} diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java index 6033186c81289..471d6f9cccc29 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java @@ -34,6 +34,7 @@ import org.elasticsearch.indices.analysis.AnalysisModule; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; import org.elasticsearch.indices.analysis.PreBuiltAnalyzers; +import org.elasticsearch.indices.analysis.PreBuiltCacheFactory; import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.IndexSettingsModule; @@ -41,6 +42,7 @@ import java.io.IOException; import java.util.Map; +import java.util.concurrent.atomic.AtomicBoolean; import static java.util.Collections.emptyMap; import static java.util.Collections.singletonList; @@ -50,7 +52,9 @@ public class AnalysisRegistryTests extends ESTestCase { - private AnalysisRegistry registry; + private Environment emptyEnvironment; + private AnalysisRegistry emptyRegistry; + private IndexSettings emptyIndexSettingsOfCurrentVersion; private static AnalyzerProvider analyzerProvider(final String name) { return new PreBuiltAnalyzerProvider(name, AnalyzerScope.INDEX, new EnglishAnalyzer()); @@ -59,12 +63,13 @@ private static AnalyzerProvider analyzerProvider(final String name) { @Override public void setUp() throws Exception { super.setUp(); - Settings settings = Settings - .builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .build(); - registry = new AnalysisRegistry(new Environment(settings), - emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()); + emptyEnvironment = new Environment(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build()); + emptyRegistry = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()); + emptyIndexSettingsOfCurrentVersion = IndexSettingsModule.newIndexSettings("index", Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, 
Version.CURRENT) + .build()); } public void testDefaultAnalyzers() throws IOException { @@ -75,9 +80,7 @@ public void testDefaultAnalyzers() throws IOException { .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); - IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings), - emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) - .build(idxSettings); + IndexAnalyzers indexAnalyzers = emptyRegistry.build(idxSettings); assertThat(indexAnalyzers.getDefaultIndexAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class)); assertThat(indexAnalyzers.getDefaultSearchAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class)); assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class)); @@ -86,7 +89,7 @@ public void testDefaultAnalyzers() throws IOException { public void testOverrideDefaultAnalyzer() throws IOException { Version version = VersionUtils.randomVersion(random()); Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build(); - IndexAnalyzers indexAnalyzers = registry.build(IndexSettingsModule.newIndexSettings("index", settings), + IndexAnalyzers indexAnalyzers = emptyRegistry.build(IndexSettingsModule.newIndexSettings("index", settings), singletonMap("default", analyzerProvider("default")) , emptyMap(), emptyMap(), emptyMap(), emptyMap()); assertThat(indexAnalyzers.getDefaultIndexAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class)); @@ -99,7 +102,7 @@ public void testOverrideDefaultIndexAnalyzerIsUnsupported() { Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build(); AnalyzerProvider defaultIndex = new PreBuiltAnalyzerProvider("default_index", AnalyzerScope.INDEX, new EnglishAnalyzer()); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> registry.build(IndexSettingsModule.newIndexSettings("index", settings), + () -> emptyRegistry.build(IndexSettingsModule.newIndexSettings("index", settings), singletonMap("default_index", defaultIndex), emptyMap(), emptyMap(), emptyMap(), emptyMap())); assertTrue(e.getMessage().contains("[index.analysis.analyzer.default_index] is not supported")); } @@ -107,7 +110,7 @@ public void testOverrideDefaultIndexAnalyzerIsUnsupported() { public void testOverrideDefaultSearchAnalyzer() { Version version = VersionUtils.randomVersion(random()); Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build(); - IndexAnalyzers indexAnalyzers = registry.build(IndexSettingsModule.newIndexSettings("index", settings), + IndexAnalyzers indexAnalyzers = emptyRegistry.build(IndexSettingsModule.newIndexSettings("index", settings), singletonMap("default_search", analyzerProvider("default_search")), emptyMap(), emptyMap(), emptyMap(), emptyMap()); assertThat(indexAnalyzers.getDefaultIndexAnalyzer().analyzer(), instanceOf(StandardAnalyzer.class)); assertThat(indexAnalyzers.getDefaultSearchAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class)); @@ -189,11 +192,12 @@ public void testBuiltInAnalyzersAreCached() throws IOException { Settings indexSettings = Settings.builder() .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); - IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings), - emptyMap(), emptyMap(), 
emptyMap(), emptyMap(), emptyMap()) - .build(idxSettings); - IndexAnalyzers otherIndexAnalyzers = new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), - emptyMap(), emptyMap()).build(idxSettings); + IndexAnalyzers indexAnalyzers = + new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) + .build(idxSettings); + IndexAnalyzers otherIndexAnalyzers = + new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) + .build(idxSettings); final int numIters = randomIntBetween(5, 20); for (int i = 0; i < numIters; i++) { PreBuiltAnalyzers preBuiltAnalyzers = RandomPicks.randomFrom(random(), PreBuiltAnalyzers.values()); @@ -201,6 +205,23 @@ public void testBuiltInAnalyzersAreCached() throws IOException { } } + public void testPreConfiguredTokenFiltersAreCached() throws IOException { + AtomicBoolean built = new AtomicBoolean(false); + PreConfiguredTokenFilter assertsBuiltOnce = new PreConfiguredTokenFilter("asserts_built_once", false, + PreBuiltCacheFactory.CachingStrategy.ONE, (tokens, version) -> { + if (false == built.compareAndSet(false, true)) { + fail("Attempted to build the token filter twice when it should have been cached"); + } + return new MockTokenFilter(tokens, MockTokenFilter.EMPTY_STOPSET); + }); + try (AnalysisRegistry registryWithPreBuiltTokenFilter = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(), + emptyMap(), emptyMap(), singletonMap("asserts_built_once", assertsBuiltOnce))) { + IndexAnalyzers indexAnalyzers = registryWithPreBuiltTokenFilter.build(emptyIndexSettingsOfCurrentVersion); + IndexAnalyzers otherIndexAnalyzers = registryWithPreBuiltTokenFilter.build(emptyIndexSettingsOfCurrentVersion); + assertSame(indexAnalyzers.get("asserts_built_once"), otherIndexAnalyzers.get("asserts_built_once")); + } + } + public void testNoTypeOrTokenizerErrorMessage() throws IOException { Version version = VersionUtils.randomVersion(random()); Settings settings = Settings @@ -212,20 +233,14 @@ public void testNoTypeOrTokenizerErrorMessage() throws IOException { .build(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); - IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> new AnalysisRegistry(new Environment(settings), - emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()).build(idxSettings)); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> + new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) + .build(idxSettings)); assertThat(e.getMessage(), equalTo("analyzer [test_analyzer] must specify either an analyzer type, or a tokenizer")); } public void testCloseIndexAnalyzersMultipleTimes() throws IOException { - Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(); - Settings indexSettings = Settings.builder() - .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); - IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); - IndexAnalyzers indexAnalyzers = new AnalysisRegistry(new Environment(settings), - emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()) - .build(idxSettings); + IndexAnalyzers indexAnalyzers = emptyRegistry.build(emptyIndexSettingsOfCurrentVersion); indexAnalyzers.close(); indexAnalyzers.close(); } diff --git 
a/core/src/test/java/org/elasticsearch/index/analysis/CoreAnalysisFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/CoreAnalysisFactoryTests.java new file mode 100644 index 0000000000000..3b4897b588988 --- /dev/null +++ b/core/src/test/java/org/elasticsearch/index/analysis/CoreAnalysisFactoryTests.java @@ -0,0 +1,37 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; +import org.elasticsearch.plugins.AnalysisPlugin; + +/** + * Checks on the analysis components that are part of core to make sure that any that are added + * to lucene are either enabled or explicitly not enabled. During the migration of analysis + * components to the {@code analysis-common} module this test ignores many components that are + * available to es-core but mapping in {@code analysis-common}. When the migration is complete + * no such ignoring will be needed because the analysis components won't be available to core. + */ +public class CoreAnalysisFactoryTests extends AnalysisFactoryTestCase { + public CoreAnalysisFactoryTests() { + // Use an empty plugin that doesn't define anything so the test doesn't need a ton of null checks. 
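+ // (every AnalysisPlugin method has an empty default, so an anonymous instance defines nothing)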
+ super(new AnalysisPlugin() {}); + } +} diff --git a/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java b/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java index c6b5806099699..5cdc589405714 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java @@ -19,33 +19,39 @@ package org.elasticsearch.index.analysis; +import org.apache.lucene.analysis.MockLowerCaseFilter; import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; +import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTokenStreamTestCase; import java.io.IOException; import java.io.Reader; +import java.util.List; import java.util.Map; +import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; public class CustomNormalizerTests extends ESTokenStreamTestCase { + private static final AnalysisPlugin MOCK_ANALYSIS_PLUGIN = new MockAnalysisPlugin(); + public void testBasics() throws IOException { Settings settings = Settings.builder() - .putArray("index.analysis.normalizer.my_normalizer.filter", "lowercase", "asciifolding") + .putArray("index.analysis.normalizer.my_normalizer.filter", "lowercase") .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, MOCK_ANALYSIS_PLUGIN); assertNull(analysis.indexAnalyzers.get("my_normalizer")); NamedAnalyzer normalizer = analysis.indexAnalyzers.getNormalizer("my_normalizer"); assertNotNull(normalizer); assertEquals("my_normalizer", normalizer.name()); - assertTokenStreamContents(normalizer.tokenStream("foo", "Cet été-là"), new String[] {"cet ete-la"}); - assertEquals(new BytesRef("cet ete-la"), normalizer.normalize("foo", "Cet été-là")); + assertTokenStreamContents(normalizer.tokenStream("foo", "Cet été-là"), new String[] {"cet été-là"}); + assertEquals(new BytesRef("cet été-là"), normalizer.normalize("foo", "Cet été-là")); } public void testUnknownType() { @@ -75,7 +81,7 @@ public void testCharFilters() throws IOException { .putArray("index.analysis.normalizer.my_normalizer.char_filter", "my_mapping") .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new MockCharFilterPlugin()); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, MOCK_ANALYSIS_PLUGIN); assertNull(analysis.indexAnalyzers.get("my_normalizer")); NamedAnalyzer normalizer = analysis.indexAnalyzers.getNormalizer("my_normalizer"); assertNotNull(normalizer); @@ -86,12 +92,12 @@ public void testCharFilters() throws IOException { public void testIllegalFilters() throws IOException { Settings settings = Settings.builder() - .putArray("index.analysis.normalizer.my_normalizer.filter", "porter_stem") + .putArray("index.analysis.normalizer.my_normalizer.filter", "mock_forbidden") .put(Environment.PATH_HOME_SETTING.getKey(), 
createTempDir().toString()) .build(); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings)); - assertEquals("Custom normalizer [my_normalizer] may not use filter [porter_stem]", e.getMessage()); + () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, MOCK_ANALYSIS_PLUGIN)); + assertEquals("Custom normalizer [my_normalizer] may not use filter [mock_forbidden]", e.getMessage()); } public void testIllegalCharFilters() throws IOException { @@ -104,7 +110,12 @@ public void testIllegalCharFilters() throws IOException { assertEquals("Custom normalizer [my_normalizer] may not use char filter [html_strip]", e.getMessage()); } - private class MockCharFilterPlugin implements AnalysisPlugin { + private static class MockAnalysisPlugin implements AnalysisPlugin { + @Override + public List getPreConfiguredTokenFilters() { + return singletonList(new PreConfiguredTokenFilter("mock_forbidden", false, CachingStrategy.ONE, MockLowerCaseFilter::new)); + } + @Override public Map> getCharFilters() { return singletonMap("mock_char_filter", (indexSettings, env, name, settings) -> { @@ -116,22 +127,21 @@ public String name() { @Override public Reader create(Reader reader) { return new Reader() { + @Override + public int read(char[] cbuf, int off, int len) throws IOException { + int result = reader.read(cbuf, off, len); + for (int i = off; i < result; i++) { + if (cbuf[i] == 'a') { + cbuf[i] = 'z'; + } + } + return result; + } - @Override - public int read(char[] cbuf, int off, int len) throws IOException { - int result = reader.read(cbuf, off, len); - for (int i = off; i < result; i++) { - if (cbuf[i] == 'a') { - cbuf[i] = 'z'; - } - } - return result; - } - - @Override - public void close() throws IOException { - reader.close(); - } + @Override + public void close() throws IOException { + reader.close(); + } }; } @Override diff --git a/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java b/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java index 2da44d57f00aa..518f669f81f3f 100644 --- a/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java +++ b/core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java @@ -19,6 +19,7 @@ package org.elasticsearch.index.mapper; +import org.apache.lucene.analysis.MockLowerCaseFilter; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexableField; @@ -29,7 +30,10 @@ import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.index.IndexService; +import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.mapper.MapperService.MergeReason; +import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; +import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESSingleNodeTestCase; import org.elasticsearch.test.InternalSettingsPlugin; @@ -38,15 +42,26 @@ import java.io.IOException; import java.util.Arrays; import java.util.Collection; +import java.util.List; +import static java.util.Collections.singletonList; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; public class KeywordFieldMapperTests extends ESSingleNodeTestCase { + /** + * Creates a copy of the lowercase token filter 
which we use for testing merge errors. + */ + public static class MockAnalysisPlugin extends Plugin implements AnalysisPlugin { + @Override + public List getPreConfiguredTokenFilters() { + return singletonList(new PreConfiguredTokenFilter("mock_other_lowercase", true, CachingStrategy.ONE, MockLowerCaseFilter::new)); + } + }; @Override protected Collection> getPlugins() { - return pluginList(InternalSettingsPlugin.class); + return pluginList(InternalSettingsPlugin.class, MockAnalysisPlugin.class); } IndexService indexService; @@ -57,8 +72,8 @@ public void setup() { indexService = createIndex("test", Settings.builder() .put("index.analysis.normalizer.my_lowercase.type", "custom") .putArray("index.analysis.normalizer.my_lowercase.filter", "lowercase") - .put("index.analysis.normalizer.my_asciifolding.type", "custom") - .putArray("index.analysis.normalizer.my_asciifolding.filter", "asciifolding").build()); + .put("index.analysis.normalizer.my_other_lowercase.type", "custom") + .putArray("index.analysis.normalizer.my_other_lowercase.filter", "mock_other_lowercase").build()); parser = indexService.mapperService().documentMapperParser(); } @@ -348,7 +363,7 @@ public void testUpdateNormalizer() throws IOException { String mapping2 = XContentFactory.jsonBuilder().startObject().startObject("type") .startObject("properties").startObject("field") - .field("type", "keyword").field("normalizer", "my_asciifolding").endObject().endObject() + .field("type", "keyword").field("normalizer", "my_other_lowercase").endObject().endObject() .endObject().endObject().string(); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> indexService.mapperService().merge("type", diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index e17df4b44631e..afe235ac8a5f9 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -19,14 +19,35 @@ package org.elasticsearch.analysis.common; +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.commongrams.CommonGramsFilter; +import org.apache.lucene.analysis.core.StopAnalyzer; +import org.apache.lucene.analysis.core.UpperCaseFilter; +import org.apache.lucene.analysis.en.KStemFilter; +import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; +import org.apache.lucene.analysis.miscellaneous.LengthFilter; +import org.apache.lucene.analysis.miscellaneous.TrimFilter; +import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter; +import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter; +import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; +import org.apache.lucene.analysis.ngram.NGramTokenFilter; +import org.apache.lucene.analysis.reverse.ReverseStringFilter; +import org.apache.lucene.analysis.standard.ClassicFilter; import org.elasticsearch.index.analysis.CharFilterFactory; import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory; +import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import 
org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; +import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.plugins.Plugin; -import java.util.HashMap; +import java.util.ArrayList; +import java.util.List; import java.util.Map; import java.util.TreeMap; @@ -35,14 +56,13 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin { @Override public Map> getTokenFilters() { - Map> filters = new HashMap<>(); + Map> filters = new TreeMap<>(); filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new); filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new); filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new); return filters; } - @Override public Map> getCharFilters() { Map> filters = new TreeMap<>(); filters.put("html_strip", HtmlStripCharFilterFactory::new); @@ -50,4 +70,50 @@ public Map> getCharFilters() { filters.put("mapping", requriesAnalysisSettings(MappingCharFilterFactory::new)); return filters; } + + @Override + public List getPreConfiguredTokenFilters() { + // TODO we should revisit the caching strategies. + List filters = new ArrayList<>(); + filters.add(new PreConfiguredTokenFilter("asciifolding", true, CachingStrategy.ONE, input -> new ASCIIFoldingFilter(input))); + filters.add(new PreConfiguredTokenFilter("classic", false, CachingStrategy.ONE, ClassicFilter::new)); + filters.add(new PreConfiguredTokenFilter("common_grams", false, CachingStrategy.LUCENE, input -> + new CommonGramsFilter(input, CharArraySet.EMPTY_SET))); + filters.add(new PreConfiguredTokenFilter("edge_ngram", false, CachingStrategy.LUCENE, input -> + new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE))); + // TODO deprecate edgeNGram + filters.add(new PreConfiguredTokenFilter("edgeNGram", false, CachingStrategy.LUCENE, input -> + new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE))); + filters.add(new PreConfiguredTokenFilter("kstem", false, CachingStrategy.ONE, KStemFilter::new)); + filters.add(new PreConfiguredTokenFilter("length", false, CachingStrategy.LUCENE, input -> + new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless + filters.add(new PreConfiguredTokenFilter("ngram", false, CachingStrategy.LUCENE, NGramTokenFilter::new)); + // TODO deprecate nGram + filters.add(new PreConfiguredTokenFilter("nGram", false, CachingStrategy.LUCENE, NGramTokenFilter::new)); + filters.add(new PreConfiguredTokenFilter("porter_stem", false, CachingStrategy.ONE, PorterStemFilter::new)); + filters.add(new PreConfiguredTokenFilter("reverse", false, CachingStrategy.LUCENE, input -> new ReverseStringFilter(input))); + // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common + filters.add(new PreConfiguredTokenFilter("stop", false, CachingStrategy.LUCENE, input -> + new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET))); + filters.add(new PreConfiguredTokenFilter("trim", false, CachingStrategy.LUCENE, TrimFilter::new)); + filters.add(new PreConfiguredTokenFilter("truncate", false, CachingStrategy.ONE, input -> + new TruncateTokenFilter(input, 10))); + filters.add(new PreConfiguredTokenFilter("unique", false, CachingStrategy.ONE, input -> new UniqueTokenFilter(input))); + filters.add(new 
PreConfiguredTokenFilter("uppercase", true, CachingStrategy.LUCENE, UpperCaseFilter::new)); + filters.add(new PreConfiguredTokenFilter("word_delimiter", false, CachingStrategy.ONE, input -> + new WordDelimiterFilter(input, + WordDelimiterFilter.GENERATE_WORD_PARTS + | WordDelimiterFilter.GENERATE_NUMBER_PARTS + | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE + | WordDelimiterFilter.SPLIT_ON_NUMERICS + | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null))); + filters.add(new PreConfiguredTokenFilter("word_delimiter_graph", false, CachingStrategy.ONE, input -> + new WordDelimiterGraphFilter(input, + WordDelimiterGraphFilter.GENERATE_WORD_PARTS + | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS + | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE + | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS + | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null))); + return filters; + } } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java index 78522f3b6f3fd..73a6c3d273291 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -19,10 +19,10 @@ package org.elasticsearch.analysis.common; -import org.elasticsearch.AnalysisFactoryTestCase; +import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory; import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory; +import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; -import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.TreeMap; @@ -31,15 +31,19 @@ import static java.util.stream.Collectors.toList; public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase { + public CommonAnalysisFactoryTests() { + super(new CommonAnalysisPlugin()); + } + @Override protected Map> getTokenizers() { - Map> tokenizers = new HashMap<>(super.getTokenizers()); + Map> tokenizers = new TreeMap<>(super.getTokenizers()); return tokenizers; } @Override protected Map> getTokenFilters() { - Map> filters = new HashMap<>(super.getTokenFilters()); + Map> filters = new TreeMap<>(super.getTokenFilters()); filters.put("asciifolding", ASCIIFoldingTokenFilterFactory.class); filters.put("worddelimiter", WordDelimiterTokenFilterFactory.class); filters.put("worddelimitergraph", WordDelimiterGraphTokenFilterFactory.class); @@ -59,6 +63,30 @@ protected Map> getCharFilters() { return filters; } + @Override + protected Map> getPreConfiguredTokenFilters() { + Map> filters = new TreeMap<>(super.getPreConfiguredTokenFilters()); + filters.put("asciifolding", null); + filters.put("classic", null); + filters.put("common_grams", null); + filters.put("edge_ngram", null); + filters.put("edgeNGram", null); + filters.put("kstem", null); + filters.put("length", null); + filters.put("ngram", null); + filters.put("nGram", null); + filters.put("porter_stem", null); + filters.put("reverse", ReverseStringFilterFactory.class); + filters.put("stop", null); + filters.put("trim", null); + filters.put("truncate", null); + filters.put("unique", Void.class); + filters.put("uppercase", null); + filters.put("word_delimiter", null); + filters.put("word_delimiter_graph", null); + return filters; + } + /** * Fails if a tokenizer is marked in the superclass with {@link MovedToAnalysisCommon} but * hasn't been marked in this 
class with its proper factory. diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yaml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yaml index 0666a31623b10..39d55c15acec0 100644 --- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yaml +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yaml @@ -10,6 +10,18 @@ - length: { tokens: 1 } - match: { tokens.0.token: Musee d'Orsay } + - do: + indices.analyze: + body: + text: Musée d'Orsay + tokenizer: keyword + filter: + - type: asciifolding + preserve_original: true + - length: { tokens: 2 } + - match: { tokens.0.token: Musee d'Orsay } + - match: { tokens.1.token: Musée d'Orsay } + --- "lowercase": - do: diff --git a/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/AnalysisICUFactoryTests.java b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/AnalysisICUFactoryTests.java index 704ca61985aa6..d222189651e1c 100644 --- a/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/AnalysisICUFactoryTests.java +++ b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/AnalysisICUFactoryTests.java @@ -19,12 +19,16 @@ package org.elasticsearch.index.analysis; -import org.elasticsearch.AnalysisFactoryTestCase; +import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; +import org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin; import java.util.HashMap; import java.util.Map; public class AnalysisICUFactoryTests extends AnalysisFactoryTestCase { + public AnalysisICUFactoryTests() { + super(new AnalysisICUPlugin()); + } @Override protected Map> getTokenizers() { diff --git a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/AnalysisKuromojiFactoryTests.java b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/AnalysisKuromojiFactoryTests.java index 9db7def101ef8..dbdc5795b38f8 100644 --- a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/AnalysisKuromojiFactoryTests.java +++ b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/AnalysisKuromojiFactoryTests.java @@ -20,12 +20,16 @@ package org.elasticsearch.index.analysis; import org.apache.lucene.analysis.ja.JapaneseTokenizerFactory; -import org.elasticsearch.AnalysisFactoryTestCase; +import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; +import org.elasticsearch.plugin.analysis.kuromoji.AnalysisKuromojiPlugin; import java.util.HashMap; import java.util.Map; public class AnalysisKuromojiFactoryTests extends AnalysisFactoryTestCase { + public AnalysisKuromojiFactoryTests() { + super(new AnalysisKuromojiPlugin()); + } @Override protected Map> getTokenizers() { diff --git a/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java b/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java index 0546fb468c924..8c551aee9190e 100644 --- a/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java +++ b/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java @@ -19,12 +19,16 @@ package org.elasticsearch.index.analysis; -import org.elasticsearch.AnalysisFactoryTestCase; +import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; 
+import org.elasticsearch.plugin.analysis.AnalysisPhoneticPlugin; import java.util.HashMap; import java.util.Map; public class AnalysisPhoneticFactoryTests extends AnalysisFactoryTestCase { + public AnalysisPhoneticFactoryTests() { + super(new AnalysisPhoneticPlugin()); + } @Override protected Map> getTokenFilters() { diff --git a/plugins/analysis-smartcn/src/test/java/org/elasticsearch/index/analysis/AnalysisSmartChineseFactoryTests.java b/plugins/analysis-smartcn/src/test/java/org/elasticsearch/index/analysis/AnalysisSmartChineseFactoryTests.java index d8aad322dcb93..53652c55f018a 100644 --- a/plugins/analysis-smartcn/src/test/java/org/elasticsearch/index/analysis/AnalysisSmartChineseFactoryTests.java +++ b/plugins/analysis-smartcn/src/test/java/org/elasticsearch/index/analysis/AnalysisSmartChineseFactoryTests.java @@ -19,13 +19,16 @@ package org.elasticsearch.index.analysis; -import org.elasticsearch.AnalysisFactoryTestCase; +import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; +import org.elasticsearch.plugin.analysis.smartcn.AnalysisSmartChinesePlugin; import java.util.HashMap; import java.util.Map; public class AnalysisSmartChineseFactoryTests extends AnalysisFactoryTestCase { - + public AnalysisSmartChineseFactoryTests() { + super(new AnalysisSmartChinesePlugin()); + } @Override protected Map> getTokenizers() { Map> tokenizers = new HashMap<>(super.getTokenizers()); diff --git a/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java b/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java index 8301529627670..ae78b9c01b3f8 100644 --- a/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java +++ b/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java @@ -23,7 +23,6 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.Tokenizer; -import org.elasticsearch.AnalysisFactoryTestCase; import org.elasticsearch.Version; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.UUIDs; @@ -31,12 +30,17 @@ import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.pl.PolishStemTokenFilterFactory; +import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; +import org.elasticsearch.plugin.analysis.stempel.AnalysisStempelPlugin; import java.io.IOException; import java.util.HashMap; import java.util.Map; public class AnalysisPolishFactoryTests extends AnalysisFactoryTestCase { + public AnalysisPolishFactoryTests() { + super(new AnalysisStempelPlugin()); + } @Override protected Map> getTokenFilters() { diff --git a/test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java similarity index 89% rename from test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java rename to test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index d14f81c61dfeb..534db0be39fb7 100644 --- a/test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -17,14 +17,14 @@ * under the License. 
*/ -package org.elasticsearch; +package org.elasticsearch.indices.analysis; import org.apache.lucene.analysis.en.PorterStemFilterFactory; -import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory; import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory; import org.apache.lucene.analysis.util.CharFilterFactory; import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.analysis.util.TokenizerFactory; +import org.elasticsearch.Version; import org.elasticsearch.common.collect.MapBuilder; import org.elasticsearch.index.analysis.ApostropheFilterFactory; import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory; @@ -67,6 +67,7 @@ import org.elasticsearch.index.analysis.PatternTokenizerFactory; import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory; import org.elasticsearch.index.analysis.PorterStemTokenFilterFactory; +import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.analysis.ReverseTokenFilterFactory; import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory; import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory; @@ -89,21 +90,23 @@ import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory; import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory; import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory; -import org.elasticsearch.indices.analysis.PreBuiltCharFilters; -import org.elasticsearch.indices.analysis.PreBuiltTokenFilters; -import org.elasticsearch.indices.analysis.PreBuiltTokenizers; +import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.test.ESTestCase; import java.util.Collection; import java.util.EnumMap; import java.util.HashMap; import java.util.HashSet; +import java.util.Locale; import java.util.Map; +import java.util.Objects; import java.util.Set; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; +import static java.util.Collections.singletonList; + /** * Alerts us if new analysis components are added to Lucene, so we don't miss them. *

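(Aside: every plugin change in this diff funnels through the same hook. A plugin overrides getPreConfiguredTokenFilters() and returns PreConfiguredTokenFilter instances built from a name, a flag saying whether the filter may be used in multi-term queries (and therefore in normalizers), a caching strategy, and a factory function, exactly as in the CommonAnalysisPlugin hunk above. A minimal sketch of a third-party plugin using the hook; the plugin class and the "my_uppercase" name are invented for illustration, and the constructor arguments follow the signatures visible in this diff:

import java.util.List;

import org.apache.lucene.analysis.core.UpperCaseFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import static java.util.Collections.singletonList;

// Hypothetical plugin, for illustration only.
public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
        // Arguments: filter name, whether the filter is safe for multi-term
        // queries (normalizers), caching strategy, TokenStream factory.
        return singletonList(
            new PreConfiguredTokenFilter("my_uppercase", true, CachingStrategy.ONE, UpperCaseFilter::new));
    }
}

The boolean replaces the old instanceof MultiTermAwareComponent check for pre-built filters, which is what the rewritten testPreBuiltMultiTermAware below verifies via shouldUseFilterForMultitermQueries().)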
@@ -285,41 +288,6 @@ private static String toCamelCase(String s) { .immutableMap(); - static final Map> PREBUILT_TOKENFILTERS; - static { - PREBUILT_TOKENFILTERS = new EnumMap<>(PreBuiltTokenFilters.class); - for (PreBuiltTokenFilters tokenizer : PreBuiltTokenFilters.values()) { - Class luceneFactoryClazz; - switch (tokenizer) { - case REVERSE: - luceneFactoryClazz = ReverseStringFilterFactory.class; - break; - case UNIQUE: - luceneFactoryClazz = Void.class; - break; - case SNOWBALL: - case DUTCH_STEM: - case FRENCH_STEM: - case RUSSIAN_STEM: - luceneFactoryClazz = SnowballPorterFilterFactory.class; - break; - case STEMMER: - luceneFactoryClazz = PorterStemFilterFactory.class; - break; - case DELIMITED_PAYLOAD_FILTER: - luceneFactoryClazz = org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class; - break; - case LIMIT: - luceneFactoryClazz = org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory.class; - break; - default: - luceneFactoryClazz = org.apache.lucene.analysis.util.TokenFilterFactory.lookupClass( - toCamelCase(tokenizer.getTokenFilterFactory(Version.CURRENT).name())); - } - PREBUILT_TOKENFILTERS.put(tokenizer, luceneFactoryClazz); - } - } - static final Map> KNOWN_CHARFILTERS = new MapBuilder>() // exposed in ES .put("htmlstrip", MovedToAnalysisCommon.class) @@ -345,6 +313,15 @@ private static String toCamelCase(String s) { } } + /** + * The plugin being tested. Core uses an "empty" plugin so we don't have to add null checks all over the place. + */ + private final AnalysisPlugin plugin; + + public AnalysisFactoryTestCase(AnalysisPlugin plugin) { + this.plugin = Objects.requireNonNull(plugin, "plugin is required. use an empty plugin for core"); + } + protected Map> getTokenizers() { return KNOWN_TOKENIZERS; } @@ -353,6 +330,49 @@ protected Map> getTokenFilters() { return KNOWN_TOKENFILTERS; } + /** + * Map containing pre-configured token filters that should be available + * after installing this plugin. The map is from the name of the token + * filter to the class of the Lucene {@link TokenFilterFactory} that it + * is emulating. If the Lucene filter factory is {@code null} then the + * test will look it up for you from the name. If there is no Lucene + * {@linkplain TokenFilterFactory} then the right hand side should + * be {@link Void}. + */ + protected Map> getPreConfiguredTokenFilters() { + Map> filters = new HashMap<>(); + filters.put("standard", null); + filters.put("lowercase", null); + // TODO remove the loop below once all the token filters are migrated out of PreBuiltTokenFilters + for (PreBuiltTokenFilters tokenizer : PreBuiltTokenFilters.values()) { + Class luceneFactoryClass; + switch (tokenizer) { + case LOWERCASE: + // This has been migrated but has to stick around until PreBuiltTokenFilters is removed.
+ continue; + case SNOWBALL: + case DUTCH_STEM: + case FRENCH_STEM: + case RUSSIAN_STEM: + luceneFactoryClass = SnowballPorterFilterFactory.class; + break; + case STEMMER: + luceneFactoryClass = PorterStemFilterFactory.class; + break; + case DELIMITED_PAYLOAD_FILTER: + luceneFactoryClass = org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class; + break; + case LIMIT: + luceneFactoryClass = org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory.class; + break; + default: + luceneFactoryClass = null; + } + filters.put(tokenizer.name().toLowerCase(Locale.ROOT), luceneFactoryClass); + } + return filters; + } + protected Map> getCharFilters() { return KNOWN_CHARFILTERS; } @@ -445,18 +465,24 @@ public void testPreBuiltMultiTermAware() { expected.add(tokenizer); } } - for (Map.Entry> entry : PREBUILT_TOKENFILTERS.entrySet()) { - PreBuiltTokenFilters tokenFilter = entry.getKey(); + Map preBuiltTokenFilters = AnalysisModule.setupPreConfiguredTokenFilters(singletonList(plugin)); + for (Map.Entry> entry : getPreConfiguredTokenFilters().entrySet()) { + String name = entry.getKey(); Class luceneFactory = entry.getValue(); if (luceneFactory == Void.class) { continue; } + if (luceneFactory == null) { + luceneFactory = TokenFilterFactory.lookupClass(toCamelCase(name)); + } assertTrue(TokenFilterFactory.class.isAssignableFrom(luceneFactory)); - if (tokenFilter.getTokenFilterFactory(Version.CURRENT) instanceof MultiTermAwareComponent) { - actual.add(tokenFilter); + PreConfiguredTokenFilter filter = preBuiltTokenFilters.get(name); + assertNotNull("test claims pre built token filter [" + name + "] should be available but it wasn't", filter); + if (filter.shouldUseFilterForMultitermQueries()) { + actual.add("token filter [" + name + "]"); } if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(luceneFactory)) { - expected.add(tokenFilter); + expected.add("token filter [" + name + "]"); } } for (Map.Entry> entry : PREBUILT_CHARFILTERS.entrySet()) {
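For completeness, here is a sketch of how a downstream factory test would pair with such a plugin under the conventions this diff establishes (compare CommonAnalysisFactoryTests above): a null value tells the test to look the Lucene factory up from the camel-cased filter name, an explicit class pins a Lucene factory whose name does not match, and Void marks a filter with no Lucene counterpart. The class below is hypothetical and pairs with the MyAnalysisPlugin sketch earlier:

import java.util.Map;
import java.util.TreeMap;

import org.apache.lucene.analysis.core.UpperCaseFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase;

// Hypothetical test, for illustration only.
public class MyAnalysisFactoryTests extends AnalysisFactoryTestCase {
    public MyAnalysisFactoryTests() {
        super(new MyAnalysisPlugin());
    }

    @Override
    protected Map<String, Class<?>> getPreConfiguredTokenFilters() {
        Map<String, Class<?>> filters = new TreeMap<>(super.getPreConfiguredTokenFilters());
        // Explicit factory class: "my_uppercase" does not camel-case to the
        // Lucene factory name, so the null (lookup-by-name) convention would
        // not find UpperCaseFilterFactory on its own.
        filters.put("my_uppercase", UpperCaseFilterFactory.class);
        return filters;
    }
}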