Skip to content

Commit 65f2717

Browse files
authored
Make PreConfiguredTokenFilter harder to misuse (#24572)
There are now three public static methods to build instances of PreConfiguredTokenFilter, and the ctor is private. I chose static methods instead of constructors because those allow us to change out the implementation returned if we so desire. Relates to #23658
1 parent d447b79 commit 65f2717

File tree

10 files changed

+142
-71
lines changed

10 files changed

+142
-71
lines changed

core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.elasticsearch.index.IndexSettings;
2828
import org.elasticsearch.indices.analysis.AnalysisModule;
2929
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
30+
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
3031

3132
import java.io.IOException;
3233
import java.util.function.BiFunction;
@@ -36,31 +37,46 @@
3637
* Provides pre-configured, shared {@link TokenFilter}s.
3738
*/
3839
public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisProvider<TokenFilterFactory> {
40+
/**
41+
* Create a pre-configured token filter that may not vary at all.
42+
*/
43+
public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
44+
Function<TokenStream, TokenStream> create) {
45+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ONE,
46+
(tokenStream, version) -> create.apply(tokenStream));
47+
}
48+
49+
/**
50+
* Create a pre-configured token filter that may vary based on the Lucene version.
51+
*/
52+
public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFilterForMultitermQueries,
53+
BiFunction<TokenStream, org.apache.lucene.util.Version, TokenStream> create) {
54+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.LUCENE,
55+
(tokenStream, version) -> create.apply(tokenStream, version.luceneVersion));
56+
}
57+
58+
/**
59+
* Create a pre-configured token filter that may vary based on the Elasticsearch version.
60+
*/
61+
public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
62+
BiFunction<TokenStream, org.elasticsearch.Version, TokenStream> create) {
63+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH,
64+
(tokenStream, version) -> create.apply(tokenStream, version));
65+
}
66+
3967
private final String name;
4068
private final boolean useFilterForMultitermQueries;
4169
private final PreBuiltCacheFactory.PreBuiltCache<TokenFilterFactory> cache;
4270
private final BiFunction<TokenStream, Version, TokenStream> create;
4371

44-
/**
45-
* Standard ctor with all the power.
46-
*/
47-
public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
48-
PreBuiltCacheFactory.CachingStrategy cachingStrategy, BiFunction<TokenStream, Version, TokenStream> create) {
72+
private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
73+
PreBuiltCacheFactory.CachingStrategy cache, BiFunction<TokenStream, Version, TokenStream> create) {
4974
this.name = name;
5075
this.useFilterForMultitermQueries = useFilterForMultitermQueries;
51-
cache = PreBuiltCacheFactory.getCache(cachingStrategy);
76+
this.cache = PreBuiltCacheFactory.getCache(cache);
5277
this.create = create;
5378
}
5479

55-
/**
56-
* Convenience ctor for token streams that don't vary based on version.
57-
*/
58-
public PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
59-
PreBuiltCacheFactory.CachingStrategy cachingStrategy, Function<TokenStream, TokenStream> create) {
60-
this(name, useFilterForMultitermQueries, cachingStrategy, (input, version) -> create.apply(input));
61-
// TODO why oh why aren't these all CachingStrategy.ONE? They *can't* vary based on version because they don't get it, right?!
62-
}
63-
6480
@Override
6581
public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
6682
return getTokenFilterFactory(Version.indexCreated(settings));

core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -272,10 +272,8 @@ static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List
272272
NamedRegistry<PreConfiguredTokenFilter> preConfiguredTokenFilters = new NamedRegistry<>("pre-configured token_filter");
273273

274274
// Add filters available in lucene-core
275-
preConfiguredTokenFilters.register("lowercase",
276-
new PreConfiguredTokenFilter("lowercase", true, CachingStrategy.LUCENE, LowerCaseFilter::new));
277-
preConfiguredTokenFilters.register("standard",
278-
new PreConfiguredTokenFilter("standard", false, CachingStrategy.LUCENE, StandardFilter::new));
275+
preConfiguredTokenFilters.register("lowercase", PreConfiguredTokenFilter.singleton("lowercase", true, LowerCaseFilter::new));
276+
preConfiguredTokenFilters.register("standard", PreConfiguredTokenFilter.singleton("standard", false, StandardFilter::new));
279277
/* Note that "stop" is available in lucene-core but it's pre-built
280278
* version uses a set of English stop words that are in
281279
* lucene-analyzers-common so "stop" is defined in the analysis-common
@@ -288,9 +286,12 @@ static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List
288286
// This has been migrated but has to stick around until PreBuiltTokenizers is removed.
289287
continue;
290288
default:
289+
if (CachingStrategy.ONE != preBuilt.getCachingStrategy()) {
290+
throw new UnsupportedOperationException("shim not available for " + preBuilt.getCachingStrategy());
291+
}
291292
String name = preBuilt.name().toLowerCase(Locale.ROOT);
292-
preConfiguredTokenFilters.register(name,
293-
new PreConfiguredTokenFilter(name, preBuilt.isMultiTermAware(), preBuilt.getCachingStrategy(), preBuilt::create));
293+
preConfiguredTokenFilters.register(name, PreConfiguredTokenFilter.singleton(name, preBuilt.isMultiTermAware(),
294+
tokenStream -> preBuilt.create(tokenStream, Version.CURRENT)));
294295
}
295296
}
296297

core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
import org.apache.lucene.analysis.cz.CzechStemFilter;
3131
import org.apache.lucene.analysis.de.GermanNormalizationFilter;
3232
import org.apache.lucene.analysis.de.GermanStemFilter;
33-
import org.apache.lucene.analysis.en.PorterStemFilter;
3433
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
3534
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
3635
import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
@@ -70,20 +69,6 @@ protected boolean isMultiTermAware() {
7069
},
7170

7271
// Extended Token Filters
73-
SNOWBALL(CachingStrategy.ONE) {
74-
@Override
75-
public TokenStream create(TokenStream tokenStream, Version version) {
76-
return new SnowballFilter(tokenStream, "English");
77-
}
78-
},
79-
80-
STEMMER(CachingStrategy.ONE) {
81-
@Override
82-
public TokenStream create(TokenStream tokenStream, Version version) {
83-
return new PorterStemFilter(tokenStream);
84-
}
85-
},
86-
8772
ELISION(CachingStrategy.ONE) {
8873
@Override
8974
public TokenStream create(TokenStream tokenStream, Version version) {

core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
import org.elasticsearch.indices.analysis.AnalysisModule;
3535
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
3636
import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
37-
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
3837
import org.elasticsearch.plugins.AnalysisPlugin;
3938
import org.elasticsearch.test.ESTestCase;
4039
import org.elasticsearch.test.IndexSettingsModule;
@@ -207,12 +206,11 @@ public void testBuiltInAnalyzersAreCached() throws IOException {
207206

208207
public void testPreConfiguredTokenFiltersAreCached() throws IOException {
209208
AtomicBoolean built = new AtomicBoolean(false);
210-
PreConfiguredTokenFilter assertsBuiltOnce = new PreConfiguredTokenFilter("asserts_built_once", false,
211-
PreBuiltCacheFactory.CachingStrategy.ONE, (tokens, version) -> {
209+
PreConfiguredTokenFilter assertsBuiltOnce = PreConfiguredTokenFilter.singleton("asserts_built_once", false, tokenStream -> {
212210
if (false == built.compareAndSet(false, true)) {
213211
fail("Attempted to build the token filter twice when it should have been cached");
214212
}
215-
return new MockTokenFilter(tokens, MockTokenFilter.EMPTY_STOPSET);
213+
return new MockTokenFilter(tokenStream, MockTokenFilter.EMPTY_STOPSET);
216214
});
217215
try (AnalysisRegistry registryWithPreBuiltTokenFilter = new AnalysisRegistry(emptyEnvironment, emptyMap(), emptyMap(), emptyMap(),
218216
emptyMap(), emptyMap(), singletonMap("asserts_built_once", assertsBuiltOnce))) {

core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
import org.elasticsearch.common.settings.Settings;
2525
import org.elasticsearch.env.Environment;
2626
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
27-
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
2827
import org.elasticsearch.plugins.AnalysisPlugin;
2928
import org.elasticsearch.test.ESTestCase;
3029
import org.elasticsearch.test.ESTokenStreamTestCase;
@@ -113,7 +112,7 @@ public void testIllegalCharFilters() throws IOException {
113112
private static class MockAnalysisPlugin implements AnalysisPlugin {
114113
@Override
115114
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
116-
return singletonList(new PreConfiguredTokenFilter("mock_forbidden", false, CachingStrategy.ONE, MockLowerCaseFilter::new));
115+
return singletonList(PreConfiguredTokenFilter.singleton("mock_forbidden", false, MockLowerCaseFilter::new));
117116
}
118117

119118
@Override

core/src/test/java/org/elasticsearch/index/mapper/KeywordFieldMapperTests.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
import org.elasticsearch.index.IndexService;
3333
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
3434
import org.elasticsearch.index.mapper.MapperService.MergeReason;
35-
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
3635
import org.elasticsearch.plugins.AnalysisPlugin;
3736
import org.elasticsearch.plugins.Plugin;
3837
import org.elasticsearch.test.ESSingleNodeTestCase;
@@ -55,7 +54,7 @@ public class KeywordFieldMapperTests extends ESSingleNodeTestCase {
5554
public static class MockAnalysisPlugin extends Plugin implements AnalysisPlugin {
5655
@Override
5756
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
58-
return singletonList(new PreConfiguredTokenFilter("mock_other_lowercase", true, CachingStrategy.ONE, MockLowerCaseFilter::new));
57+
return singletonList(PreConfiguredTokenFilter.singleton("mock_other_lowercase", true, MockLowerCaseFilter::new));
5958
}
6059
};
6160

core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
package org.elasticsearch.indices.analysis;
2121

2222
import org.apache.lucene.analysis.Analyzer;
23+
import org.apache.lucene.analysis.TokenFilter;
2324
import org.apache.lucene.analysis.TokenStream;
2425
import org.apache.lucene.analysis.Tokenizer;
2526
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
@@ -28,6 +29,7 @@
2829
import org.apache.lucene.analysis.hunspell.Dictionary;
2930
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
3031
import org.apache.lucene.analysis.standard.StandardAnalyzer;
32+
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
3133
import org.apache.lucene.store.Directory;
3234
import org.apache.lucene.store.SimpleFSDirectory;
3335
import org.elasticsearch.Version;
@@ -43,6 +45,7 @@
4345
import org.elasticsearch.index.analysis.CustomAnalyzer;
4446
import org.elasticsearch.index.analysis.IndexAnalyzers;
4547
import org.elasticsearch.index.analysis.NamedAnalyzer;
48+
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
4649
import org.elasticsearch.index.analysis.StandardTokenizerFactory;
4750
import org.elasticsearch.index.analysis.StopTokenFilterFactory;
4851
import org.elasticsearch.index.analysis.TokenFilterFactory;
@@ -61,17 +64,23 @@
6164
import java.nio.charset.StandardCharsets;
6265
import java.nio.file.Files;
6366
import java.nio.file.Path;
67+
import java.util.Arrays;
68+
import java.util.List;
6469
import java.util.Map;
6570
import java.util.Set;
6671

6772
import static java.util.Collections.singletonList;
6873
import static java.util.Collections.singletonMap;
74+
import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
6975
import static org.hamcrest.Matchers.either;
7076
import static org.hamcrest.Matchers.equalTo;
7177
import static org.hamcrest.Matchers.instanceOf;
7278
import static org.hamcrest.Matchers.is;
7379

7480
public class AnalysisModuleTests extends ESTestCase {
81+
private final Settings emptyNodeSettings = Settings.builder()
82+
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
83+
.build();
7584

7685
public IndexAnalyzers getIndexAnalyzers(Settings settings) throws IOException {
7786
return getIndexAnalyzers(getNewRegistry(settings), settings);
@@ -264,6 +273,71 @@ public void testUnderscoreInAnalyzerName() throws IOException {
264273
}
265274
}
266275

276+
/**
277+
* Tests that plugins can register pre-configured token filters that vary in behavior based on Elasticsearch version, Lucene version,
278+
* and that do not vary based on version at all.
279+
*/
280+
public void testPluginPreConfiguredTokenFilters() throws IOException {
281+
// Simple token filter that appends text to the term
282+
final class AppendTokenFilter extends TokenFilter {
283+
private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
284+
private final char[] appendMe;
285+
286+
protected AppendTokenFilter(TokenStream input, String appendMe) {
287+
super(input);
288+
this.appendMe = appendMe.toCharArray();
289+
}
290+
291+
@Override
292+
public boolean incrementToken() throws IOException {
293+
if (false == input.incrementToken()) {
294+
return false;
295+
}
296+
term.resizeBuffer(term.length() + appendMe.length);
297+
System.arraycopy(appendMe, 0, term.buffer(), term.length(), appendMe.length);
298+
term.setLength(term.length() + appendMe.length);
299+
return true;
300+
}
301+
}
302+
boolean noVersionSupportsMultiTerm = randomBoolean();
303+
boolean luceneVersionSupportsMultiTerm = randomBoolean();
304+
boolean elasticsearchVersionSupportsMultiTerm = randomBoolean();
305+
AnalysisRegistry registry = new AnalysisModule(new Environment(emptyNodeSettings), singletonList(new AnalysisPlugin() {
306+
@Override
307+
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
308+
return Arrays.asList(
309+
PreConfiguredTokenFilter.singleton("no_version", noVersionSupportsMultiTerm,
310+
tokenStream -> new AppendTokenFilter(tokenStream, "no_version")),
311+
PreConfiguredTokenFilter.luceneVersion("lucene_version", luceneVersionSupportsMultiTerm,
312+
(tokenStream, luceneVersion) -> new AppendTokenFilter(tokenStream, luceneVersion.toString())),
313+
PreConfiguredTokenFilter.elasticsearchVersion("elasticsearch_version", elasticsearchVersionSupportsMultiTerm,
314+
(tokenStream, esVersion) -> new AppendTokenFilter(tokenStream, esVersion.toString()))
315+
);
316+
}
317+
})).getAnalysisRegistry();
318+
319+
Version version = VersionUtils.randomVersion(random());
320+
IndexAnalyzers analyzers = getIndexAnalyzers(registry, Settings.builder()
321+
.put("index.analysis.analyzer.no_version.tokenizer", "keyword")
322+
.put("index.analysis.analyzer.no_version.filter", "no_version")
323+
.put("index.analysis.analyzer.lucene_version.tokenizer", "keyword")
324+
.put("index.analysis.analyzer.lucene_version.filter", "lucene_version")
325+
.put("index.analysis.analyzer.elasticsearch_version.tokenizer", "keyword")
326+
.put("index.analysis.analyzer.elasticsearch_version.filter", "elasticsearch_version")
327+
.put(IndexMetaData.SETTING_VERSION_CREATED, version)
328+
.build());
329+
assertTokenStreamContents(analyzers.get("no_version").tokenStream("", "test"), new String[] {"testno_version"});
330+
assertTokenStreamContents(analyzers.get("lucene_version").tokenStream("", "test"), new String[] {"test" + version.luceneVersion});
331+
assertTokenStreamContents(analyzers.get("elasticsearch_version").tokenStream("", "test"), new String[] {"test" + version});
332+
333+
assertEquals("test" + (noVersionSupportsMultiTerm ? "no_version" : ""),
334+
analyzers.get("no_version").normalize("", "test").utf8ToString());
335+
assertEquals("test" + (luceneVersionSupportsMultiTerm ? version.luceneVersion.toString() : ""),
336+
analyzers.get("lucene_version").normalize("", "test").utf8ToString());
337+
assertEquals("test" + (elasticsearchVersionSupportsMultiTerm ? version.toString() : ""),
338+
analyzers.get("elasticsearch_version").normalize("", "test").utf8ToString());
339+
}
340+
267341
public void testRegisterHunspellDictionary() throws Exception {
268342
Settings settings = Settings.builder()
269343
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())

0 commit comments

Comments (0)