Skip to content

Commit b9ea579

Browse files
authored
Allow plugins to register pre-configured tokenizers (#24751)
Allows plugins to register pre-configured tokenizers. Many of the decisions are the same as those in #24223, #24572, and #24223. This only migrates the lowercase tokenizer, but I figure that is a good start because it proves out the features.
1 parent ae73670 commit b9ea579

File tree

14 files changed

+557
-323
lines changed

14 files changed

+557
-323
lines changed

core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java

Lines changed: 15 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -74,14 +74,15 @@ public AnalysisRegistry(Environment environment,
7474
Map<String, AnalysisProvider<TokenizerFactory>> tokenizers,
7575
Map<String, AnalysisProvider<AnalyzerProvider<?>>> analyzers,
7676
Map<String, AnalysisProvider<AnalyzerProvider<?>>> normalizers,
77-
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters) {
77+
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters,
78+
Map<String, PreConfiguredTokenizer> preConfiguredTokenizers) {
7879
this.environment = environment;
7980
this.charFilters = unmodifiableMap(charFilters);
8081
this.tokenFilters = unmodifiableMap(tokenFilters);
8182
this.tokenizers = unmodifiableMap(tokenizers);
8283
this.analyzers = unmodifiableMap(analyzers);
8384
this.normalizers = unmodifiableMap(normalizers);
84-
prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredTokenFilters);
85+
prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredTokenFilters, preConfiguredTokenizers);
8586
}
8687

8788
/**
@@ -169,12 +170,12 @@ public Map<String, TokenFilterFactory> buildTokenFilterFactories(IndexSettings i
169170
*/
170171
tokenFilters.put("synonym", requiresAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings)));
171172
tokenFilters.put("synonym_graph", requiresAnalysisSettings((is, env, name, settings) -> new SynonymGraphTokenFilterFactory(is, env, this, name, settings)));
172-
return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.tokenFilterFactories);
173+
return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
173174
}
174175

175176
public Map<String, TokenizerFactory> buildTokenizerFactories(IndexSettings indexSettings) throws IOException {
176177
final Map<String, Settings> tokenizersSettings = indexSettings.getSettings().getGroups(INDEX_ANALYSIS_TOKENIZER);
177-
return buildMapping(Component.TOKENIZER, indexSettings, tokenizersSettings, tokenizers, prebuiltAnalysis.tokenizerFactories);
178+
return buildMapping(Component.TOKENIZER, indexSettings, tokenizersSettings, tokenizers, prebuiltAnalysis.preConfiguredTokenizers);
178179
}
179180

180181
public Map<String, CharFilterFactory> buildCharFilterFactories(IndexSettings indexSettings) throws IOException {
@@ -394,31 +395,22 @@ private <T> AnalysisProvider<T> getAnalysisProvider(Component component, Map<Str
394395
private static class PrebuiltAnalysis implements Closeable {
395396

396397
final Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<?>>> analyzerProviderFactories;
397-
final Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> tokenizerFactories;
398-
final Map<String, ? extends AnalysisProvider<TokenFilterFactory>> tokenFilterFactories;
398+
final Map<String, ? extends AnalysisProvider<TokenFilterFactory>> preConfiguredTokenFilters;
399+
final Map<String, ? extends AnalysisProvider<TokenizerFactory>> preConfiguredTokenizers;
399400
final Map<String, AnalysisModule.AnalysisProvider<CharFilterFactory>> charFilterFactories;
400401

401-
private PrebuiltAnalysis(Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters) {
402+
private PrebuiltAnalysis(
403+
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters,
404+
Map<String, PreConfiguredTokenizer> preConfiguredTokenizers) {
402405
Map<String, PreBuiltAnalyzerProviderFactory> analyzerProviderFactories = new HashMap<>();
403-
Map<String, PreBuiltTokenizerFactoryFactory> tokenizerFactories = new HashMap<>();
404406
Map<String, PreBuiltCharFilterFactoryFactory> charFilterFactories = new HashMap<>();
407+
405408
// Analyzers
406409
for (PreBuiltAnalyzers preBuiltAnalyzerEnum : PreBuiltAnalyzers.values()) {
407410
String name = preBuiltAnalyzerEnum.name().toLowerCase(Locale.ROOT);
408411
analyzerProviderFactories.put(name, new PreBuiltAnalyzerProviderFactory(name, AnalyzerScope.INDICES, preBuiltAnalyzerEnum.getAnalyzer(Version.CURRENT)));
409412
}
410413

411-
// Tokenizers
412-
for (PreBuiltTokenizers preBuiltTokenizer : PreBuiltTokenizers.values()) {
413-
String name = preBuiltTokenizer.name().toLowerCase(Locale.ROOT);
414-
tokenizerFactories.put(name, new PreBuiltTokenizerFactoryFactory(preBuiltTokenizer.getTokenizerFactory(Version.CURRENT)));
415-
}
416-
417-
// Tokenizer aliases
418-
tokenizerFactories.put("nGram", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.NGRAM.getTokenizerFactory(Version.CURRENT)));
419-
tokenizerFactories.put("edgeNGram", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.EDGE_NGRAM.getTokenizerFactory(Version.CURRENT)));
420-
tokenizerFactories.put("PathHierarchy", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.PATH_HIERARCHY.getTokenizerFactory(Version.CURRENT)));
421-
422414
// Char Filters
423415
for (PreBuiltCharFilters preBuiltCharFilter : PreBuiltCharFilters.values()) {
424416
String name = preBuiltCharFilter.name().toLowerCase(Locale.ROOT);
@@ -429,20 +421,20 @@ private PrebuiltAnalysis(Map<String, PreConfiguredTokenFilter> preConfiguredToke
429421

430422
this.analyzerProviderFactories = Collections.unmodifiableMap(analyzerProviderFactories);
431423
this.charFilterFactories = Collections.unmodifiableMap(charFilterFactories);
432-
this.tokenizerFactories = Collections.unmodifiableMap(tokenizerFactories);
433-
tokenFilterFactories = preConfiguredTokenFilters;
424+
this.preConfiguredTokenFilters = preConfiguredTokenFilters;
425+
this.preConfiguredTokenizers = preConfiguredTokenizers;
434426
}
435427

436428
public AnalysisModule.AnalysisProvider<CharFilterFactory> getCharFilterFactory(String name) {
437429
return charFilterFactories.get(name);
438430
}
439431

440432
public AnalysisModule.AnalysisProvider<TokenFilterFactory> getTokenFilterFactory(String name) {
441-
return tokenFilterFactories.get(name);
433+
return preConfiguredTokenFilters.get(name);
442434
}
443435

444436
public AnalysisModule.AnalysisProvider<TokenizerFactory> getTokenizerFactory(String name) {
445-
return tokenizerFactories.get(name);
437+
return preConfiguredTokenizers.get(name);
446438
}
447439

448440
public AnalysisModule.AnalysisProvider<AnalyzerProvider<?>> getAnalyzerProvider(String name) {

core/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenizerFactoryFactory.java

Lines changed: 0 additions & 50 deletions
This file was deleted.
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.index.analysis;
21+
22+
import org.elasticsearch.Version;
23+
import org.elasticsearch.common.settings.Settings;
24+
import org.elasticsearch.env.Environment;
25+
import org.elasticsearch.index.IndexSettings;
26+
import org.elasticsearch.indices.analysis.AnalysisModule;
27+
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
28+
29+
import java.io.IOException;
30+
31+
/**
32+
* Shared implementation for pre-configured analysis components.
33+
*/
34+
public abstract class PreConfiguredAnalysisComponent<T> implements AnalysisModule.AnalysisProvider<T> {
35+
private final String name;
36+
private final PreBuiltCacheFactory.PreBuiltCache<T> cache;
37+
38+
protected PreConfiguredAnalysisComponent(String name, PreBuiltCacheFactory.CachingStrategy cache) {
39+
this.name = name;
40+
this.cache = PreBuiltCacheFactory.getCache(cache);
41+
}
42+
43+
@Override
44+
public T get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
45+
Version versionCreated = Version.indexCreated(settings);
46+
synchronized (this) {
47+
T factory = cache.get(versionCreated);
48+
if (factory == null) {
49+
factory = create(versionCreated);
50+
cache.put(versionCreated, factory);
51+
}
52+
return factory;
53+
}
54+
}
55+
56+
/**
57+
* The name of the analysis component in the API.
58+
*/
59+
public String getName() {
60+
return name;
61+
}
62+
63+
protected abstract T create(Version version);
64+
}

core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredTokenFilter.java

Lines changed: 32 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -22,21 +22,16 @@
2222
import org.apache.lucene.analysis.TokenFilter;
2323
import org.apache.lucene.analysis.TokenStream;
2424
import org.elasticsearch.Version;
25-
import org.elasticsearch.common.settings.Settings;
26-
import org.elasticsearch.env.Environment;
27-
import org.elasticsearch.index.IndexSettings;
28-
import org.elasticsearch.indices.analysis.AnalysisModule;
2925
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
3026
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
3127

32-
import java.io.IOException;
3328
import java.util.function.BiFunction;
3429
import java.util.function.Function;
3530

3631
/**
3732
* Provides pre-configured, shared {@link TokenFilter}s.
3833
*/
39-
public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisProvider<TokenFilterFactory> {
34+
public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisComponent<TokenFilterFactory> {
4035
/**
4136
* Create a pre-configured token filter that may not vary at all.
4237
*/
@@ -60,35 +55,19 @@ public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFil
6055
*/
6156
public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
6257
BiFunction<TokenStream, org.elasticsearch.Version, TokenStream> create) {
63-
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH,
64-
(tokenStream, version) -> create.apply(tokenStream, version));
58+
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH, create);
6559
}
6660

67-
private final String name;
6861
private final boolean useFilterForMultitermQueries;
69-
private final PreBuiltCacheFactory.PreBuiltCache<TokenFilterFactory> cache;
7062
private final BiFunction<TokenStream, Version, TokenStream> create;
7163

7264
private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
7365
PreBuiltCacheFactory.CachingStrategy cache, BiFunction<TokenStream, Version, TokenStream> create) {
74-
this.name = name;
66+
super(name, cache);
7567
this.useFilterForMultitermQueries = useFilterForMultitermQueries;
76-
this.cache = PreBuiltCacheFactory.getCache(cache);
7768
this.create = create;
7869
}
7970

80-
@Override
81-
public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
82-
return getTokenFilterFactory(Version.indexCreated(settings));
83-
}
84-
85-
/**
86-
* The name of the {@link TokenFilter} in the API.
87-
*/
88-
public String getName() {
89-
return name;
90-
}
91-
9271
/**
9372
* Can this {@link TokenFilter} be used in multi-term queries?
9473
*/
@@ -98,42 +77,36 @@ public boolean shouldUseFilterForMultitermQueries() {
9877

9978
private interface MultiTermAwareTokenFilterFactory extends TokenFilterFactory, MultiTermAwareComponent {}
10079

101-
private synchronized TokenFilterFactory getTokenFilterFactory(final Version version) {
102-
TokenFilterFactory factory = cache.get(version);
103-
if (factory == null) {
104-
if (useFilterForMultitermQueries) {
105-
factory = new MultiTermAwareTokenFilterFactory() {
106-
@Override
107-
public String name() {
108-
return name;
109-
}
110-
111-
@Override
112-
public TokenStream create(TokenStream tokenStream) {
113-
return create.apply(tokenStream, version);
114-
}
115-
116-
@Override
117-
public Object getMultiTermComponent() {
118-
return this;
119-
}
120-
};
121-
} else {
122-
factory = new TokenFilterFactory() {
123-
@Override
124-
public String name() {
125-
return name;
126-
}
127-
128-
@Override
129-
public TokenStream create(TokenStream tokenStream) {
130-
return create.apply(tokenStream, version);
131-
}
132-
};
133-
}
134-
cache.put(version, factory);
80+
@Override
81+
protected TokenFilterFactory create(Version version) {
82+
if (useFilterForMultitermQueries) {
83+
return new MultiTermAwareTokenFilterFactory() {
84+
@Override
85+
public String name() {
86+
return getName();
87+
}
88+
89+
@Override
90+
public TokenStream create(TokenStream tokenStream) {
91+
return create.apply(tokenStream, version);
92+
}
93+
94+
@Override
95+
public Object getMultiTermComponent() {
96+
return this;
97+
}
98+
};
13599
}
100+
return new TokenFilterFactory() {
101+
@Override
102+
public String name() {
103+
return getName();
104+
}
136105

137-
return factory;
106+
@Override
107+
public TokenStream create(TokenStream tokenStream) {
108+
return create.apply(tokenStream, version);
109+
}
110+
};
138111
}
139112
}

0 commit comments

Comments
 (0)