Skip to content

Commit ecc87f6

Browse files
authored
Move pre-configured "keyword" tokenizer to the analysis-common module (#24863)
Moves the keyword tokenizer to the analysis-common module. The keyword tokenizer is special because it is used by CustomNormalizerProvider, so I pulled it out into its own PR. To make the move work, I've reworked the lookup from a static one to one using the AnalysisRegistry. This seems safe enough. Part of #23658.
1 parent 7b35819 commit ecc87f6

File tree

5 files changed

+9
-53
lines changed

5 files changed

+9
-53
lines changed

core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ public IndexAnalyzers build(IndexSettings indexSettings,
466466
}
467467
for (Map.Entry<String, AnalyzerProvider<?>> entry : normalizerProviders.entrySet()) {
468468
processNormalizerFactory(deprecationLogger, indexSettings, entry.getKey(), entry.getValue(), normalizers,
469-
tokenFilterFactoryFactories, charFilterFactoryFactories);
469+
tokenizerFactoryFactories.get("keyword"), tokenFilterFactoryFactories, charFilterFactoryFactories);
470470
}
471471
for (Map.Entry<String, NamedAnalyzer> entry : analyzerAliases.entrySet()) {
472472
String key = entry.getKey();
@@ -585,10 +585,11 @@ private void processNormalizerFactory(DeprecationLogger deprecationLogger,
585585
String name,
586586
AnalyzerProvider<?> normalizerFactory,
587587
Map<String, NamedAnalyzer> normalizers,
588+
TokenizerFactory keywordTokenizerFactory,
588589
Map<String, TokenFilterFactory> tokenFilters,
589590
Map<String, CharFilterFactory> charFilters) {
590591
if (normalizerFactory instanceof CustomNormalizerProvider) {
591-
((CustomNormalizerProvider) normalizerFactory).build(charFilters, tokenFilters);
592+
((CustomNormalizerProvider) normalizerFactory).build(keywordTokenizerFactory, charFilters, tokenFilters);
592593
}
593594
Analyzer normalizerF = normalizerFactory.get();
594595
if (normalizerF == null) {

core/src/main/java/org/elasticsearch/index/analysis/CustomNormalizerProvider.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
import org.elasticsearch.common.settings.Settings;
2323
import org.elasticsearch.index.IndexSettings;
24-
import org.elasticsearch.indices.analysis.PreBuiltTokenizers;
2524

2625
import java.util.ArrayList;
2726
import java.util.List;
@@ -44,7 +43,8 @@ public CustomNormalizerProvider(IndexSettings indexSettings,
4443
this.analyzerSettings = settings;
4544
}
4645

47-
public void build(final Map<String, CharFilterFactory> charFilters, final Map<String, TokenFilterFactory> tokenFilters) {
46+
public void build(final TokenizerFactory keywordTokenizerFactory, final Map<String, CharFilterFactory> charFilters,
47+
final Map<String, TokenFilterFactory> tokenFilters) {
4848
String tokenizerName = analyzerSettings.get("tokenizer");
4949
if (tokenizerName != null) {
5050
throw new IllegalArgumentException("Custom normalizer [" + name() + "] cannot configure a tokenizer");
@@ -83,7 +83,7 @@ public void build(final Map<String, CharFilterFactory> charFilters, final Map<St
8383

8484
this.customAnalyzer = new CustomAnalyzer(
8585
"keyword",
86-
PreBuiltTokenizers.KEYWORD.getTokenizerFactory(indexSettings.getIndexVersionCreated()),
86+
keywordTokenizerFactory,
8787
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
8888
tokenFilterList.toArray(new TokenFilterFactory[tokenFilterList.size()])
8989
);

core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenizers.java

Lines changed: 0 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
package org.elasticsearch.indices.analysis;
2020

2121
import org.apache.lucene.analysis.Tokenizer;
22-
import org.apache.lucene.analysis.core.KeywordTokenizer;
2322
import org.apache.lucene.analysis.core.LetterTokenizer;
2423
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
2524
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
@@ -32,10 +31,7 @@
3231
import org.apache.lucene.analysis.th.ThaiTokenizer;
3332
import org.elasticsearch.Version;
3433
import org.elasticsearch.common.regex.Regex;
35-
import org.elasticsearch.index.analysis.CustomNormalizerProvider;
36-
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
3734
import org.elasticsearch.index.analysis.TokenFilterFactory;
38-
import org.elasticsearch.index.analysis.TokenizerFactory;
3935
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
4036

4137
public enum PreBuiltTokenizers {
@@ -68,13 +64,6 @@ protected Tokenizer create(Version version) {
6864
}
6965
},
7066

71-
KEYWORD(CachingStrategy.ONE) {
72-
@Override
73-
protected Tokenizer create(Version version) {
74-
return new KeywordTokenizer();
75-
}
76-
},
77-
7867
LETTER(CachingStrategy.ONE) {
7968
@Override
8069
protected Tokenizer create(Version version) {
@@ -125,50 +114,13 @@ protected TokenFilterFactory getMultiTermComponent(Version version) {
125114
return null;
126115
}
127116

128-
protected final PreBuiltCacheFactory.PreBuiltCache<TokenizerFactory> cache;
129117
private final CachingStrategy cachingStrategy;
130118

131119
PreBuiltTokenizers(CachingStrategy cachingStrategy) {
132120
this.cachingStrategy = cachingStrategy;
133-
cache = PreBuiltCacheFactory.getCache(cachingStrategy);
134121
}
135122

136123
public CachingStrategy getCachingStrategy() {
137124
return cachingStrategy;
138125
}
139-
140-
private interface MultiTermAwareTokenizerFactory extends TokenizerFactory, MultiTermAwareComponent {}
141-
142-
/**
143-
* Old style resolution for {@link TokenizerFactory}. Exists entirely to keep
144-
* {@link CustomNormalizerProvider#build(java.util.Map, java.util.Map)} working during the migration.
145-
*/
146-
public synchronized TokenizerFactory getTokenizerFactory(final Version version) {
147-
TokenizerFactory tokenizerFactory = cache.get(version);
148-
if (tokenizerFactory == null) {
149-
if (getMultiTermComponent(version) != null) {
150-
tokenizerFactory = new MultiTermAwareTokenizerFactory() {
151-
@Override
152-
public Tokenizer create() {
153-
return PreBuiltTokenizers.this.create(version);
154-
}
155-
156-
@Override
157-
public Object getMultiTermComponent() {
158-
return PreBuiltTokenizers.this.getMultiTermComponent(version);
159-
}
160-
};
161-
} else {
162-
tokenizerFactory = new TokenizerFactory() {
163-
@Override
164-
public Tokenizer create() {
165-
return PreBuiltTokenizers.this.create(version);
166-
}
167-
};
168-
}
169-
cache.put(version, tokenizerFactory);
170-
}
171-
172-
return tokenizerFactory;
173-
}
174126
}

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
3333
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
3434
import org.apache.lucene.analysis.core.DecimalDigitFilter;
35+
import org.apache.lucene.analysis.core.KeywordTokenizer;
3536
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
3637
import org.apache.lucene.analysis.core.StopAnalyzer;
3738
import org.apache.lucene.analysis.core.UpperCaseFilter;
@@ -215,6 +216,7 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
215216
@Override
216217
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
217218
List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
219+
tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
218220
tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", LowerCaseTokenizer::new, () -> new TokenFilterFactory() {
219221
@Override
220222
public String name() {

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ protected Map<String, Class<?>> getPreConfiguredTokenFilters() {
145145
@Override
146146
protected Map<String, Class<?>> getPreConfiguredTokenizers() {
147147
Map<String, Class<?>> filters = new TreeMap<>(super.getPreConfiguredTokenizers());
148+
filters.put("keyword", null);
148149
filters.put("lowercase", null);
149150
return filters;
150151
}

0 commit comments

Comments (0)