@@ -27,6 +27,7 @@
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
+import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;

 import java.io.IOException;
 import java.util.function.BiFunction;
@@ -36,31 +37,46 @@
  * Provides pre-configured, shared {@link TokenFilter}s.
  */
 public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisProvider<TokenFilterFactory> {
+    /**
+     * Create a pre-configured token filter that may not vary at all.
+     */
+    public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
+            Function<TokenStream, TokenStream> create) {
+        return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ONE,
+                (tokenStream, version) -> create.apply(tokenStream));
+    }
+
+    /**
+     * Create a pre-configured token filter that may vary based on the Lucene version.
+     */
+    public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFilterForMultitermQueries,
+            BiFunction<TokenStream, org.apache.lucene.util.Version, TokenStream> create) {
+        return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.LUCENE,
+                (tokenStream, version) -> create.apply(tokenStream, version.luceneVersion));
+    }
+
+    /**
+     * Create a pre-configured token filter that may vary based on the Elasticsearch version.
+     */
+    public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
+            BiFunction<TokenStream, org.elasticsearch.Version, TokenStream> create) {
+        return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH,
+                (tokenStream, version) -> create.apply(tokenStream, version));
+    }
+
     private final String name;
     private final boolean useFilterForMultitermQueries;
     private final PreBuiltCacheFactory.PreBuiltCache<TokenFilterFactory> cache;
     private final BiFunction<TokenStream, Version, TokenStream> create;

     /**
      * Standard ctor with all the power.
      */
-    public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
-            PreBuiltCacheFactory.CachingStrategy cachingStrategy, BiFunction<TokenStream, Version, TokenStream> create) {
+    private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
+            PreBuiltCacheFactory.CachingStrategy cache, BiFunction<TokenStream, Version, TokenStream> create) {
         this.name = name;
         this.useFilterForMultitermQueries = useFilterForMultitermQueries;
-        cache = PreBuiltCacheFactory.getCache(cachingStrategy);
+        this.cache = PreBuiltCacheFactory.getCache(cache);
         this.create = create;
     }

-    /**
-     * Convenience ctor for token streams that don't vary based on version.
-     */
-    public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
-            PreBuiltCacheFactory.CachingStrategy cachingStrategy, Function<TokenStream, TokenStream> create) {
-        this(name, useFilterForMultitermQueries, cachingStrategy, (input, version) -> create.apply(input));
-        // TODO why oh why aren't these all CachingStrategy.ONE? They *can't* vary based on version because they don't get it, right?!
-    }
-
     @Override
     public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
         return getTokenFilterFactory(Version.indexCreated(settings));
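With both constructors now private, these three factories are the only way to build a `PreConfiguredTokenFilter`, and each one pins the caching strategy that matches how much the filter's output can vary. A minimal sketch of how a plugin would register filters through them; the plugin class and filter names are hypothetical, but the pattern mirrors the AnalysisModuleTests case added at the end of this change:

```java
import java.util.Arrays;
import java.util.List;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

// Hypothetical plugin; only the factory calls themselves come from this change.
public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
        return Arrays.asList(
                // Output never varies, so a single instance is cached (CachingStrategy.ONE).
                PreConfiguredTokenFilter.singleton("my_lowercase", true, LowerCaseFilter::new),
                // Output depends on the Lucene version the index was created with,
                // so one instance is cached per Lucene version (CachingStrategy.LUCENE).
                PreConfiguredTokenFilter.luceneVersion("my_snowball", false,
                        (tokenStream, luceneVersion) -> new SnowballFilter(tokenStream, "English")));
    }
}
```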
@@ -272,10 +272,8 @@ static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List
         NamedRegistry<PreConfiguredTokenFilter> preConfiguredTokenFilters = new NamedRegistry<>("pre-configured token_filter");

         // Add filters available in lucene-core
-        preConfiguredTokenFilters.register("lowercase",
-                new PreConfiguredTokenFilter("lowercase", true, CachingStrategy.LUCENE, LowerCaseFilter::new));
-        preConfiguredTokenFilters.register("standard",
-                new PreConfiguredTokenFilter("standard", false, CachingStrategy.LUCENE, StandardFilter::new));
+        preConfiguredTokenFilters.register("lowercase", PreConfiguredTokenFilter.singleton("lowercase", true, LowerCaseFilter::new));
+        preConfiguredTokenFilters.register("standard", PreConfiguredTokenFilter.singleton("standard", false, StandardFilter::new));
         /* Note that "stop" is available in lucene-core but its pre-built
          * version uses a set of English stop words that are in
          * lucene-analyzers-common so "stop" is defined in the analysis-common
@@ -288,9 +286,12 @@ static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List
                 // This has been migrated but has to stick around until PreBuiltTokenizers is removed.
                 continue;
             default:
+                if (CachingStrategy.ONE != preBuilt.getCachingStrategy()) {
+                    throw new UnsupportedOperationException("shim not available for " + preBuilt.getCachingStrategy());
+                }
                 String name = preBuilt.name().toLowerCase(Locale.ROOT);
-                preConfiguredTokenFilters.register(name,
-                        new PreConfiguredTokenFilter(name, preBuilt.isMultiTermAware(), preBuilt.getCachingStrategy(), preBuilt::create));
+                preConfiguredTokenFilters.register(name, PreConfiguredTokenFilter.singleton(name, preBuilt.isMultiTermAware(),
+                        tokenStream -> preBuilt.create(tokenStream, Version.CURRENT)));
             }
         }

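The new guard makes the shim's assumption explicit: `PreConfiguredTokenFilter.singleton` always caches with `CachingStrategy.ONE`, so wrapping a pre-built filter whose output legitimately varies by version would silently serve one version's output to every index; throwing `UnsupportedOperationException` fails fast instead. As a rough illustration of what each strategy keys its cache on (a hypothetical helper, not `PreBuiltCacheFactory`'s actual code):

```java
import org.elasticsearch.Version;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;

// Hypothetical sketch of the cache keying each strategy implies; the real
// PreBuiltCacheFactory caches differ in detail.
final class CachingStrategyIllustration {
    static Object cacheKey(CachingStrategy strategy, Version version) {
        switch (strategy) {
            case ONE:
                return CachingStrategy.ONE;   // one shared instance for every version
            case LUCENE:
                return version.luceneVersion; // one instance per Lucene version
            case ELASTICSEARCH:
                return version;               // one instance per Elasticsearch version
            default:
                throw new AssertionError("unknown caching strategy: " + strategy);
        }
    }
}
```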
@@ -30,7 +30,6 @@
 import org.apache.lucene.analysis.cz.CzechStemFilter;
 import org.apache.lucene.analysis.de.GermanNormalizationFilter;
 import org.apache.lucene.analysis.de.GermanStemFilter;
-import org.apache.lucene.analysis.en.PorterStemFilter;
 import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
 import org.apache.lucene.analysis.fr.FrenchAnalyzer;
 import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
@@ -70,20 +69,6 @@ protected boolean isMultiTermAware() {
     },

     // Extended Token Filters
-    SNOWBALL(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new SnowballFilter(tokenStream, "English");
-        }
-    },
-
-    STEMMER(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new PorterStemFilter(tokenStream);
-        }
-    },
-
     ELISION(CachingStrategy.ONE) {
         @Override
         public TokenStream create(TokenStream tokenStream, Version version) {
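SNOWBALL and STEMMER were already `CachingStrategy.ONE`, so they map cleanly onto the new `singleton` factory; presumably their pre-configured versions now live in the analysis-common module, following the pattern the "stop" comment above describes. A hedged sketch of the equivalent registrations, reusing the `SnowballFilter` and `PorterStemFilter` calls from the removed entries (the `filters` registry variable and the destination file are assumptions, and the `false` multiterm flag mirrors the removed entries, which did not override `isMultiTermAware()`):

```java
// Sketch only: assumes a NamedRegistry<PreConfiguredTokenFilter> named "filters"
// in whichever module now owns these pre-configured filters.
filters.register("snowball", PreConfiguredTokenFilter.singleton("snowball", false,
        tokenStream -> new SnowballFilter(tokenStream, "English")));
filters.register("stemmer", PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
```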
@@ -34,7 +34,6 @@
 import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
-import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.IndexSettingsModule;
@@ -207,12 +206,11 @@ public void testBuiltInAnalyzersAreCached() throws IOException {

     public void testPreConfiguredTokenFiltersAreCached() throws IOException {
         AtomicBoolean built = new AtomicBoolean(false);
-        PreConfiguredTokenFilter assertsBuiltOnce = new PreConfiguredTokenFilter("asserts_built_once", false,
-                PreBuiltCacheFactory.CachingStrategy.ONE, (tokens, version) -> {
+        PreConfiguredTokenFilter assertsBuiltOnce = PreConfiguredTokenFilter.singleton("asserts_built_once", false, tokenStream -> {
             if (false == built.compareAndSet(false, true)) {
                 fail("Attempted to build the token filter twice when it should have been cached");
             }
-            return new MockTokenFilter(tokens, MockTokenFilter.EMPTY_STOPSET);
+            return new MockTokenFilter(tokenStream, MockTokenFilter.EMPTY_STOPSET);
         });
         try (AnalysisRegistry registryWithPreBuiltTokenFilter = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(),
                 emptyMap(), emptyMap(), singletonMap("asserts_built_once", assertsBuiltOnce))) {
@@ -24,7 +24,6 @@
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
-import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.ESTokenStreamTestCase;
@@ -113,7 +112,7 @@ public void testIllegalCharFilters() throws IOException {
     private static class MockAnalysisPlugin implements AnalysisPlugin {
         @Override
         public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
-            return singletonList(new PreConfiguredTokenFilter("mock_forbidden", false, CachingStrategy.ONE, MockLowerCaseFilter::new));
+            return singletonList(PreConfiguredTokenFilter.singleton("mock_forbidden", false, MockLowerCaseFilter::new));
         }

         @Override
@@ -32,7 +32,6 @@
 import org.elasticsearch.index.IndexService;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.mapper.MapperService.MergeReason;
-import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.plugins.Plugin;
 import org.elasticsearch.test.ESSingleNodeTestCase;
@@ -55,7 +54,7 @@ public class KeywordFieldMapperTests extends ESSingleNodeTestCase {
     public static class MockAnalysisPlugin extends Plugin implements AnalysisPlugin {
         @Override
         public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
-            return singletonList(new PreConfiguredTokenFilter("mock_other_lowercase", true, CachingStrategy.ONE, MockLowerCaseFilter::new));
+            return singletonList(PreConfiguredTokenFilter.singleton("mock_other_lowercase", true, MockLowerCaseFilter::new));
         }
     };

@@ -20,6 +20,7 @@
 package org.elasticsearch.indices.analysis;

 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
@@ -28,6 +29,7 @@
 import org.apache.lucene.analysis.hunspell.Dictionary;
 import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.SimpleFSDirectory;
 import org.elasticsearch.Version;
@@ -43,6 +45,7 @@
 import org.elasticsearch.index.analysis.CustomAnalyzer;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.StandardTokenizerFactory;
 import org.elasticsearch.index.analysis.StopTokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
@@ -61,17 +64,23 @@
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;

 import static java.util.Collections.singletonList;
 import static java.util.Collections.singletonMap;
+import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
 import static org.hamcrest.Matchers.either;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;
 import static org.hamcrest.Matchers.is;

 public class AnalysisModuleTests extends ESTestCase {
+    private final Settings emptyNodeSettings = Settings.builder()
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();

     public IndexAnalyzers getIndexAnalyzers(Settings settings) throws IOException {
         return getIndexAnalyzers(getNewRegistry(settings), settings);
@@ -264,6 +273,71 @@ public void testUnderscoreInAnalyzerName() throws IOException {
         }
     }

+    /**
+     * Tests that plugins can register pre-configured token filters that vary in behavior based on Elasticsearch version, Lucene version,
+     * and that do not vary based on version at all.
+     */
+    public void testPluginPreConfiguredTokenFilters() throws IOException {
+        // Simple token filter that appends text to the term
+        final class AppendTokenFilter extends TokenFilter {
+            private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
+            private final char[] appendMe;
+
+            protected AppendTokenFilter(TokenStream input, String appendMe) {
+                super(input);
+                this.appendMe = appendMe.toCharArray();
+            }
+
+            @Override
+            public boolean incrementToken() throws IOException {
+                if (false == input.incrementToken()) {
+                    return false;
+                }
+                term.resizeBuffer(term.length() + appendMe.length);
+                System.arraycopy(appendMe, 0, term.buffer(), term.length(), appendMe.length);
+                term.setLength(term.length() + appendMe.length);
+                return true;
+            }
+        }
+        boolean noVersionSupportsMultiTerm = randomBoolean();
+        boolean luceneVersionSupportsMultiTerm = randomBoolean();
+        boolean elasticsearchVersionSupportsMultiTerm = randomBoolean();
+        AnalysisRegistry registry = new AnalysisModule(new Environment(emptyNodeSettings), singletonList(new AnalysisPlugin() {
+            @Override
+            public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
+                return Arrays.asList(
+                        PreConfiguredTokenFilter.singleton("no_version", noVersionSupportsMultiTerm,
+                                tokenStream -> new AppendTokenFilter(tokenStream, "no_version")),
+                        PreConfiguredTokenFilter.luceneVersion("lucene_version", luceneVersionSupportsMultiTerm,
+                                (tokenStream, luceneVersion) -> new AppendTokenFilter(tokenStream, luceneVersion.toString())),
+                        PreConfiguredTokenFilter.elasticsearchVersion("elasticsearch_version", elasticsearchVersionSupportsMultiTerm,
+                                (tokenStream, esVersion) -> new AppendTokenFilter(tokenStream, esVersion.toString()))
+                );
+            }
+        })).getAnalysisRegistry();
+
+        Version version = VersionUtils.randomVersion(random());
+        IndexAnalyzers analyzers = getIndexAnalyzers(registry, Settings.builder()
+                .put("index.analysis.analyzer.no_version.tokenizer", "keyword")
+                .put("index.analysis.analyzer.no_version.filter", "no_version")
+                .put("index.analysis.analyzer.lucene_version.tokenizer", "keyword")
+                .put("index.analysis.analyzer.lucene_version.filter", "lucene_version")
+                .put("index.analysis.analyzer.elasticsearch_version.tokenizer", "keyword")
+                .put("index.analysis.analyzer.elasticsearch_version.filter", "elasticsearch_version")
+                .put(IndexMetaData.SETTING_VERSION_CREATED, version)
+                .build());
+        assertTokenStreamContents(analyzers.get("no_version").tokenStream("", "test"), new String[] {"testno_version"});
+        assertTokenStreamContents(analyzers.get("lucene_version").tokenStream("", "test"), new String[] {"test" + version.luceneVersion});
+        assertTokenStreamContents(analyzers.get("elasticsearch_version").tokenStream("", "test"), new String[] {"test" + version});
+
+        assertEquals("test" + (noVersionSupportsMultiTerm ? "no_version" : ""),
+                analyzers.get("no_version").normalize("", "test").utf8ToString());
+        assertEquals("test" + (luceneVersionSupportsMultiTerm ? version.luceneVersion.toString() : ""),
+                analyzers.get("lucene_version").normalize("", "test").utf8ToString());
+        assertEquals("test" + (elasticsearchVersionSupportsMultiTerm ? version.toString() : ""),
+                analyzers.get("elasticsearch_version").normalize("", "test").utf8ToString());
+    }
+
     public void testRegisterHunspellDictionary() throws Exception {
         Settings settings = Settings.builder()
                 .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())