AnalysisRegistry.java
@@ -36,7 +36,6 @@
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
import org.elasticsearch.indices.analysis.PreBuiltCharFilters;
import org.elasticsearch.indices.analysis.PreBuiltTokenFilters;
import org.elasticsearch.indices.analysis.PreBuiltTokenizers;

import java.io.Closeable;
@@ -59,7 +58,7 @@ public final class AnalysisRegistry implements Closeable {
public static final String INDEX_ANALYSIS_CHAR_FILTER = "index.analysis.char_filter";
public static final String INDEX_ANALYSIS_FILTER = "index.analysis.filter";
public static final String INDEX_ANALYSIS_TOKENIZER = "index.analysis.tokenizer";
private final PrebuiltAnalysis prebuiltAnalysis = new PrebuiltAnalysis();
private final PrebuiltAnalysis prebuiltAnalysis;
private final Map<String, Analyzer> cachedAnalyzer = new ConcurrentHashMap<>();

private final Environment environment;
@@ -74,13 +73,15 @@ public AnalysisRegistry(Environment environment,
Map<String, AnalysisProvider<TokenFilterFactory>> tokenFilters,
Map<String, AnalysisProvider<TokenizerFactory>> tokenizers,
Map<String, AnalysisProvider<AnalyzerProvider<?>>> analyzers,
Map<String, AnalysisProvider<AnalyzerProvider<?>>> normalizers) {
Map<String, AnalysisProvider<AnalyzerProvider<?>>> normalizers,
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters) {
this.environment = environment;
this.charFilters = unmodifiableMap(charFilters);
this.tokenFilters = unmodifiableMap(tokenFilters);
this.tokenizers = unmodifiableMap(tokenizers);
this.analyzers = unmodifiableMap(analyzers);
this.normalizers = unmodifiableMap(normalizers);
prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredTokenFilters);
}

/**
@@ -305,8 +306,8 @@ public String toString() {
}

private <T> Map<String, T> buildMapping(Component component, IndexSettings settings, Map<String, Settings> settingsMap,
Map<String, AnalysisModule.AnalysisProvider<T>> providerMap, Map<String, AnalysisModule.AnalysisProvider<T>> defaultInstance)
throws IOException {
Map<String, ? extends AnalysisModule.AnalysisProvider<T>> providerMap,
Map<String, ? extends AnalysisModule.AnalysisProvider<T>> defaultInstance) throws IOException {
Settings defaultSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, settings.getIndexVersionCreated()).build();
Map<String, T> factories = new HashMap<>();
for (Map.Entry<String, Settings> entry : settingsMap.entrySet()) {
@@ -344,7 +345,7 @@ private <T> Map<String, T> buildMapping(Component component, IndexSettings setti

}
// go over the char filters in the bindings and register the ones that are not configured
for (Map.Entry<String, AnalysisModule.AnalysisProvider<T>> entry : providerMap.entrySet()) {
for (Map.Entry<String, ? extends AnalysisModule.AnalysisProvider<T>> entry : providerMap.entrySet()) {
String name = entry.getKey();
AnalysisModule.AnalysisProvider<T> provider = entry.getValue();
// we don't want to re-register one that already exists
@@ -365,7 +366,7 @@ private <T> Map<String, T> buildMapping(Component component, IndexSettings setti
factories.put(name, instance);
}

for (Map.Entry<String, AnalysisModule.AnalysisProvider<T>> entry : defaultInstance.entrySet()) {
for (Map.Entry<String, ? extends AnalysisModule.AnalysisProvider<T>> entry : defaultInstance.entrySet()) {
final String name = entry.getKey();
final AnalysisModule.AnalysisProvider<T> provider = entry.getValue();
if (factories.containsKey(name) == false) {
@@ -378,7 +379,8 @@ private <T> Map<String, T> buildMapping(Component component, IndexSettings setti
return factories;
}

private <T> AnalysisProvider<T> getAnalysisProvider(Component component, Map<String, AnalysisProvider<T>> providerMap, String name, String typeName) {
private <T> AnalysisProvider<T> getAnalysisProvider(Component component, Map<String, ? extends AnalysisProvider<T>> providerMap,
String name, String typeName) {
if (typeName == null) {
throw new IllegalArgumentException(component + " [" + name + "] must specify either an analyzer type, or a tokenizer");
}
@@ -393,13 +395,12 @@ private static class PrebuiltAnalysis implements Closeable {

final Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<?>>> analyzerProviderFactories;
final Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> tokenizerFactories;
final Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> tokenFilterFactories;
final Map<String, ? extends AnalysisProvider<TokenFilterFactory>> tokenFilterFactories;
final Map<String, AnalysisModule.AnalysisProvider<CharFilterFactory>> charFilterFactories;

private PrebuiltAnalysis() {
private PrebuiltAnalysis(Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters) {
Map<String, PreBuiltAnalyzerProviderFactory> analyzerProviderFactories = new HashMap<>();
Map<String, PreBuiltTokenizerFactoryFactory> tokenizerFactories = new HashMap<>();
Map<String, PreBuiltTokenFilterFactoryFactory> tokenFilterFactories = new HashMap<>();
Map<String, PreBuiltCharFilterFactoryFactory> charFilterFactories = new HashMap<>();
// Analyzers
for (PreBuiltAnalyzers preBuiltAnalyzerEnum : PreBuiltAnalyzers.values()) {
@@ -418,28 +419,18 @@ private PrebuiltAnalysis() {
tokenizerFactories.put("edgeNGram", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.EDGE_NGRAM.getTokenizerFactory(Version.CURRENT)));
tokenizerFactories.put("PathHierarchy", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.PATH_HIERARCHY.getTokenizerFactory(Version.CURRENT)));


// Token filters
for (PreBuiltTokenFilters preBuiltTokenFilter : PreBuiltTokenFilters.values()) {
String name = preBuiltTokenFilter.name().toLowerCase(Locale.ROOT);
tokenFilterFactories.put(name, new PreBuiltTokenFilterFactoryFactory(preBuiltTokenFilter.getTokenFilterFactory(Version.CURRENT)));
}
// Token filter aliases
tokenFilterFactories.put("nGram", new PreBuiltTokenFilterFactoryFactory(PreBuiltTokenFilters.NGRAM.getTokenFilterFactory(Version.CURRENT)));
tokenFilterFactories.put("edgeNGram", new PreBuiltTokenFilterFactoryFactory(PreBuiltTokenFilters.EDGE_NGRAM.getTokenFilterFactory(Version.CURRENT)));


// Char Filters
for (PreBuiltCharFilters preBuiltCharFilter : PreBuiltCharFilters.values()) {
String name = preBuiltCharFilter.name().toLowerCase(Locale.ROOT);
charFilterFactories.put(name, new PreBuiltCharFilterFactoryFactory(preBuiltCharFilter.getCharFilterFactory(Version.CURRENT)));
}
// Char filter aliases
charFilterFactories.put("htmlStrip", new PreBuiltCharFilterFactoryFactory(PreBuiltCharFilters.HTML_STRIP.getCharFilterFactory(Version.CURRENT)));

this.analyzerProviderFactories = Collections.unmodifiableMap(analyzerProviderFactories);
this.charFilterFactories = Collections.unmodifiableMap(charFilterFactories);
this.tokenFilterFactories = Collections.unmodifiableMap(tokenFilterFactories);
this.tokenizerFactories = Collections.unmodifiableMap(tokenizerFactories);
tokenFilterFactories = preConfiguredTokenFilters;
Member: Maybe later (followup, as cleanup after these are all done) these members could be renamed to preConfigured*

}

public AnalysisModule.AnalysisProvider<CharFilterFactory> getCharFilterFactory(String name) {

This file was deleted.

PreConfiguredTokenFilter.java (new file)
@@ -0,0 +1,123 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.Version;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;

import java.io.IOException;
import java.util.function.BiFunction;
import java.util.function.Function;

/**
* Provides pre-configured, shared {@link TokenFilter}s.
*/
public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisProvider<TokenFilterFactory> {
private final String name;
private final boolean useFilterForMultitermQueries;
private final PreBuiltCacheFactory.PreBuiltCache<TokenFilterFactory> cache;
private final BiFunction<TokenStream, Version, TokenStream> create;

/**
* Standard ctor with all the power.
*/
public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
PreBuiltCacheFactory.CachingStrategy cachingStrategy, BiFunction<TokenStream, Version, TokenStream> create) {
this.name = name;
this.useFilterForMultitermQueries = useFilterForMultitermQueries;
cache = PreBuiltCacheFactory.getCache(cachingStrategy);
this.create = create;
}

/**
* Convenience ctor for token streams that don't vary based on version.
*/
public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
PreBuiltCacheFactory.CachingStrategy cachingStrategy, Function<TokenStream, TokenStream> create) {
this(name, useFilterForMultitermQueries, cachingStrategy, (input, version) -> create.apply(input));
// TODO why oh why aren't these all CachingStrategy.ONE? They *can't* vary based on version because they don't get it, right?!
Contributor: Sounds correct to me, let's hardcode CachingStrategy.ONE?

Member (author): I'd been thinking of doing it in a follow-up. Any objections?

Contributor: Not at all, this PR is large already.

}

@Override
public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
return getTokenFilterFactory(Version.indexCreated(settings));
}

/**
* The name of the {@link TokenFilter} in the API.
*/
public String getName() {
return name;
}

/**
* Can this {@link TokenFilter} be used in multi-term queries?
*/
public boolean shouldUseFilterForMultitermQueries() {
return useFilterForMultitermQueries;
}

private interface MultiTermAwareTokenFilterFactory extends TokenFilterFactory, MultiTermAwareComponent {}

private synchronized TokenFilterFactory getTokenFilterFactory(final Version version) {
TokenFilterFactory factory = cache.get(version);
if (factory == null) {
if (useFilterForMultitermQueries) {
factory = new MultiTermAwareTokenFilterFactory() {
@Override
public String name() {
return name;
}

@Override
public TokenStream create(TokenStream tokenStream) {
return create.apply(tokenStream, version);
}

@Override
public Object getMultiTermComponent() {
return this;
}
};
} else {
factory = new TokenFilterFactory() {
@Override
public String name() {
return name;
}

@Override
public TokenStream create(TokenStream tokenStream) {
return create.apply(tokenStream, version);
}
};
}
cache.put(version, factory);
}

return factory;
}
}
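For orientation, a minimal usage sketch (the holder class below is hypothetical; the "lowercase" arguments mirror the registration added in AnalysisModule later in this PR). A pre-configured filter is a named, optionally multi-term-aware wrapper around a TokenStream-to-TokenStream function, cached according to the chosen strategy:

import org.apache.lucene.analysis.LowerCaseFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;

class PreConfiguredTokenFilterSketch {
    // Multi-term aware (true), one cached factory per Lucene version, built
    // from lucene-core's LowerCaseFilter via the version-independent ctor.
    static final PreConfiguredTokenFilter LOWERCASE = new PreConfiguredTokenFilter(
            "lowercase", true, CachingStrategy.LUCENE, LowerCaseFilter::new);
}

The review thread above settles on hardcoding CachingStrategy.ONE in the version-independent ctor as a follow-up; that change would presumably reduce the ctor to something like:

    public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
            Function<TokenStream, TokenStream> create) {
        // ONE suffices: the wrapped function never sees the version.
        this(name, useFilterForMultitermQueries, PreBuiltCacheFactory.CachingStrategy.ONE,
                (input, version) -> create.apply(input));
    }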
AnalysisModule.java
@@ -19,6 +19,8 @@

package org.elasticsearch.indices.analysis;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.NamedRegistry;
@@ -101,6 +103,7 @@
import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
import org.elasticsearch.index.analysis.PorterStemTokenFilterFactory;
import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
import org.elasticsearch.index.analysis.RomanianAnalyzerProvider;
import org.elasticsearch.index.analysis.RussianAnalyzerProvider;
@@ -138,11 +141,15 @@
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import org.elasticsearch.plugins.AnalysisPlugin;

import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import static java.util.Collections.unmodifiableMap;
import static org.elasticsearch.plugins.AnalysisPlugin.requriesAnalysisSettings;

/**
@@ -169,8 +176,11 @@ public AnalysisModule(Environment environment, List<AnalysisPlugin> plugins) thr
NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = setupTokenizers(plugins);
NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> analyzers = setupAnalyzers(plugins);
NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> normalizers = setupNormalizers(plugins);

Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters = setupPreConfiguredTokenFilters(plugins);

analysisRegistry = new AnalysisRegistry(environment, charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers
.getRegistry(), analyzers.getRegistry(), normalizers.getRegistry());
.getRegistry(), analyzers.getRegistry(), normalizers.getRegistry(), preConfiguredTokenFilters);
}

HunspellService getHunspellService() {
@@ -258,6 +268,40 @@ private NamedRegistry<AnalysisProvider<TokenFilterFactory>> setupTokenFilters(Li
return tokenFilters;
}

static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List<AnalysisPlugin> plugins) {
NamedRegistry<PreConfiguredTokenFilter> preConfiguredTokenFilters = new NamedRegistry<>("pre-configured token_filter");

// Add filters available in lucene-core
preConfiguredTokenFilters.register("lowercase",
new PreConfiguredTokenFilter("lowercase", true, CachingStrategy.LUCENE, LowerCaseFilter::new));
preConfiguredTokenFilters.register("standard",
new PreConfiguredTokenFilter("standard", false, CachingStrategy.LUCENE, StandardFilter::new));
/* Note that "stop" is available in lucene-core but its pre-built
 * version uses a set of English stop words that are in
 * lucene-analyzers-common so "stop" is defined in the analysis-common
 * module. */

// Add token filters declared in PreBuiltTokenFilters until they have all been migrated
for (PreBuiltTokenFilters preBuilt : PreBuiltTokenFilters.values()) {
switch (preBuilt) {
case LOWERCASE:
// This has been migrated but has to stick around until PreBuiltTokenizers is removed.
continue;
default:
String name = preBuilt.name().toLowerCase(Locale.ROOT);
preConfiguredTokenFilters.register(name,
new PreConfiguredTokenFilter(name, preBuilt.isMultiTermAware(), preBuilt.getCachingStrategy(), preBuilt::create));
}
}

for (AnalysisPlugin plugin: plugins) {
for (PreConfiguredTokenFilter filter : plugin.getPreConfiguredTokenFilters()) {
preConfiguredTokenFilters.register(filter.getName(), filter);
}
}
return unmodifiableMap(preConfiguredTokenFilters.getRegistry());
}

private NamedRegistry<AnalysisProvider<TokenizerFactory>> setupTokenizers(List<AnalysisPlugin> plugins) {
NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = new NamedRegistry<>("tokenizer");
tokenizers.register("standard", StandardTokenizerFactory::new);
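A sketch of the plugin side of the registration loop in setupPreConfiguredTokenFilters above. Only the getPreConfiguredTokenFilters hook is taken from this PR; the plugin class, the "my_uppercase" name, the List return type, and the use of Lucene's UpperCaseFilter are illustrative assumptions:

import java.util.List;

import org.apache.lucene.analysis.core.UpperCaseFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import static java.util.Collections.singletonList;

public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
        // "my_uppercase" is a hypothetical name; false = not multi-term aware.
        // ONE is a safe strategy because the output cannot vary by version.
        return singletonList(new PreConfiguredTokenFilter(
                "my_uppercase", false, CachingStrategy.ONE, UpperCaseFilter::new));
    }
}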
PreBuiltCacheFactory.java
@@ -42,7 +42,7 @@ public interface PreBuiltCache<T> {

private PreBuiltCacheFactory() {}

static <T> PreBuiltCache<T> getCache(CachingStrategy cachingStrategy) {
public static <T> PreBuiltCache<T> getCache(CachingStrategy cachingStrategy) {
switch (cachingStrategy) {
case ONE:
return new PreBuiltCacheStrategyOne<>();
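A note on this visibility change: getCache becomes public because its new caller, PreConfiguredTokenFilter, lives in org.elasticsearch.index.analysis, outside this factory's org.elasticsearch.indices.analysis package. A minimal sketch of the call as the new class makes it (the holder class is hypothetical):

import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;

class CacheSketch {
    // LUCENE keeps one cached factory per Lucene version; ONE keeps a single
    // instance regardless of version.
    final PreBuiltCacheFactory.PreBuiltCache<TokenFilterFactory> cache =
            PreBuiltCacheFactory.getCache(CachingStrategy.LUCENE);
}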