Skip to content

Commit 73307a2

Browse files
authored
Plugins can register pre-configured char filters (#25000)
Fixes the plumbing so plugins can register char filters and moves the `html_strip` char filter into analysis-common. Relates to #23658
1 parent 6600707 commit 73307a2

File tree

15 files changed

+361
-205
lines changed

15 files changed

+361
-205
lines changed

core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,6 @@
3535
import org.elasticsearch.indices.analysis.AnalysisModule;
3636
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
3737
import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
38-
import org.elasticsearch.indices.analysis.PreBuiltCharFilters;
39-
import org.elasticsearch.indices.analysis.PreBuiltTokenizers;
4038

4139
import java.io.Closeable;
4240
import java.io.IOException;
@@ -74,6 +72,7 @@ public AnalysisRegistry(Environment environment,
7472
Map<String, AnalysisProvider<TokenizerFactory>> tokenizers,
7573
Map<String, AnalysisProvider<AnalyzerProvider<?>>> analyzers,
7674
Map<String, AnalysisProvider<AnalyzerProvider<?>>> normalizers,
75+
Map<String, PreConfiguredCharFilter> preConfiguredCharFilters,
7776
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters,
7877
Map<String, PreConfiguredTokenizer> preConfiguredTokenizers) {
7978
this.environment = environment;
@@ -82,7 +81,7 @@ public AnalysisRegistry(Environment environment,
8281
this.tokenizers = unmodifiableMap(tokenizers);
8382
this.analyzers = unmodifiableMap(analyzers);
8483
this.normalizers = unmodifiableMap(normalizers);
85-
prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredTokenFilters, preConfiguredTokenizers);
84+
prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredCharFilters, preConfiguredTokenFilters, preConfiguredTokenizers);
8685
}
8786

8887
/**
@@ -180,7 +179,7 @@ public Map<String, TokenizerFactory> buildTokenizerFactories(IndexSettings index
180179

181180
public Map<String, CharFilterFactory> buildCharFilterFactories(IndexSettings indexSettings) throws IOException {
182181
final Map<String, Settings> charFiltersSettings = indexSettings.getSettings().getGroups(INDEX_ANALYSIS_CHAR_FILTER);
183-
return buildMapping(Component.CHAR_FILTER, indexSettings, charFiltersSettings, charFilters, prebuiltAnalysis.charFilterFactories);
182+
return buildMapping(Component.CHAR_FILTER, indexSettings, charFiltersSettings, charFilters, prebuiltAnalysis.preConfiguredCharFilterFactories);
184183
}
185184

186185
public Map<String, AnalyzerProvider<?>> buildAnalyzerFactories(IndexSettings indexSettings) throws IOException {
@@ -397,36 +396,28 @@ private static class PrebuiltAnalysis implements Closeable {
397396
final Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<?>>> analyzerProviderFactories;
398397
final Map<String, ? extends AnalysisProvider<TokenFilterFactory>> preConfiguredTokenFilters;
399398
final Map<String, ? extends AnalysisProvider<TokenizerFactory>> preConfiguredTokenizers;
400-
final Map<String, AnalysisModule.AnalysisProvider<CharFilterFactory>> charFilterFactories;
399+
final Map<String, ? extends AnalysisProvider<CharFilterFactory>> preConfiguredCharFilterFactories;
401400

402401
private PrebuiltAnalysis(
402+
Map<String, PreConfiguredCharFilter> preConfiguredCharFilters,
403403
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters,
404404
Map<String, PreConfiguredTokenizer> preConfiguredTokenizers) {
405405
Map<String, PreBuiltAnalyzerProviderFactory> analyzerProviderFactories = new HashMap<>();
406-
Map<String, PreBuiltCharFilterFactoryFactory> charFilterFactories = new HashMap<>();
407406

408407
// Analyzers
409408
for (PreBuiltAnalyzers preBuiltAnalyzerEnum : PreBuiltAnalyzers.values()) {
410409
String name = preBuiltAnalyzerEnum.name().toLowerCase(Locale.ROOT);
411410
analyzerProviderFactories.put(name, new PreBuiltAnalyzerProviderFactory(name, AnalyzerScope.INDICES, preBuiltAnalyzerEnum.getAnalyzer(Version.CURRENT)));
412411
}
413412

414-
// Char Filters
415-
for (PreBuiltCharFilters preBuiltCharFilter : PreBuiltCharFilters.values()) {
416-
String name = preBuiltCharFilter.name().toLowerCase(Locale.ROOT);
417-
charFilterFactories.put(name, new PreBuiltCharFilterFactoryFactory(preBuiltCharFilter.getCharFilterFactory(Version.CURRENT)));
418-
}
419-
// Char filter aliases
420-
charFilterFactories.put("htmlStrip", new PreBuiltCharFilterFactoryFactory(PreBuiltCharFilters.HTML_STRIP.getCharFilterFactory(Version.CURRENT)));
421-
422413
this.analyzerProviderFactories = Collections.unmodifiableMap(analyzerProviderFactories);
423-
this.charFilterFactories = Collections.unmodifiableMap(charFilterFactories);
414+
this.preConfiguredCharFilterFactories = preConfiguredCharFilters;
424415
this.preConfiguredTokenFilters = preConfiguredTokenFilters;
425416
this.preConfiguredTokenizers = preConfiguredTokenizers;
426417
}
427418

428419
public AnalysisModule.AnalysisProvider<CharFilterFactory> getCharFilterFactory(String name) {
429-
return charFilterFactories.get(name);
420+
return preConfiguredCharFilterFactories.get(name);
430421
}
431422

432423
public AnalysisModule.AnalysisProvider<TokenFilterFactory> getTokenFilterFactory(String name) {

core/src/main/java/org/elasticsearch/index/analysis/PreBuiltCharFilterFactoryFactory.java

Lines changed: 0 additions & 51 deletions
This file was deleted.
core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredCharFilter.java

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.index.analysis;
21+
22+
import org.apache.lucene.analysis.CharFilter;
23+
import org.apache.lucene.analysis.TokenFilter;
24+
import org.elasticsearch.Version;
25+
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
26+
27+
import java.io.Reader;
28+
import java.util.function.BiFunction;
29+
import java.util.function.Function;
30+
31+
/**
32+
* Provides pre-configured, shared {@link CharFilter}s.
33+
*/
34+
public class PreConfiguredCharFilter extends PreConfiguredAnalysisComponent<CharFilterFactory> {
35+
/**
36+
* Create a pre-configured char filter that may not vary at all.
37+
*/
38+
public static PreConfiguredCharFilter singleton(String name, boolean useFilterForMultitermQueries, Function<Reader, Reader> create) {
39+
return new PreConfiguredCharFilter(name, CachingStrategy.ONE, useFilterForMultitermQueries,
40+
(reader, version) -> create.apply(reader));
41+
}
42+
43+
/**
44+
* Create a pre-configured token filter that may vary based on the Lucene version.
45+
*/
46+
public static PreConfiguredCharFilter luceneVersion(String name, boolean useFilterForMultitermQueries,
47+
BiFunction<Reader, org.apache.lucene.util.Version, Reader> create) {
48+
return new PreConfiguredCharFilter(name, CachingStrategy.LUCENE, useFilterForMultitermQueries,
49+
(reader, version) -> create.apply(reader, version.luceneVersion));
50+
}
51+
52+
/**
53+
* Create a pre-configured char filter that may vary based on the Elasticsearch version.
54+
*/
55+
public static PreConfiguredCharFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
56+
BiFunction<Reader, org.elasticsearch.Version, Reader> create) {
57+
return new PreConfiguredCharFilter(name, CachingStrategy.ELASTICSEARCH, useFilterForMultitermQueries, create);
58+
}
59+
60+
private final boolean useFilterForMultitermQueries;
61+
private final BiFunction<Reader, Version, Reader> create;
62+
63+
protected PreConfiguredCharFilter(String name, CachingStrategy cache, boolean useFilterForMultitermQueries,
64+
BiFunction<Reader, org.elasticsearch.Version, Reader> create) {
65+
super(name, cache);
66+
this.useFilterForMultitermQueries = useFilterForMultitermQueries;
67+
this.create = create;
68+
}
69+
70+
/**
71+
* Can this {@link CharFilter} be used in multi-term queries?
72+
*/
73+
public boolean shouldUseFilterForMultitermQueries() {
74+
return useFilterForMultitermQueries;
75+
}
76+
77+
private interface MultiTermAwareCharFilterFactory extends CharFilterFactory, MultiTermAwareComponent {}
78+
79+
@Override
80+
protected CharFilterFactory create(Version version) {
81+
if (useFilterForMultitermQueries) {
82+
return new MultiTermAwareCharFilterFactory() {
83+
@Override
84+
public String name() {
85+
return getName();
86+
}
87+
88+
@Override
89+
public Reader create(Reader reader) {
90+
return create.apply(reader, version);
91+
}
92+
93+
@Override
94+
public Object getMultiTermComponent() {
95+
return this;
96+
}
97+
};
98+
}
99+
return new CharFilterFactory() {
100+
@Override
101+
public Reader create(Reader reader) {
102+
return create.apply(reader, version);
103+
}
104+
105+
@Override
106+
public String name() {
107+
return getName();
108+
}
109+
};
110+
}
111+
112+
}

core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@
101101
import org.elasticsearch.index.analysis.PersianAnalyzerProvider;
102102
import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
103103
import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider;
104+
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
104105
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
105106
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
106107
import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
@@ -173,11 +174,14 @@ public AnalysisModule(Environment environment, List<AnalysisPlugin> plugins) thr
173174
NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> analyzers = setupAnalyzers(plugins);
174175
NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> normalizers = setupNormalizers(plugins);
175176

177+
Map<String, PreConfiguredCharFilter> preConfiguredCharFilters = setupPreConfiguredCharFilters(plugins);
176178
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters = setupPreConfiguredTokenFilters(plugins);
177179
Map<String, PreConfiguredTokenizer> preConfiguredTokenizers = setupPreConfiguredTokenizers(plugins);
178180

179-
analysisRegistry = new AnalysisRegistry(environment, charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers
180-
.getRegistry(), analyzers.getRegistry(), normalizers.getRegistry(), preConfiguredTokenFilters, preConfiguredTokenizers);
181+
analysisRegistry = new AnalysisRegistry(environment,
182+
charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers.getRegistry(),
183+
analyzers.getRegistry(), normalizers.getRegistry(),
184+
preConfiguredCharFilters, preConfiguredTokenFilters, preConfiguredTokenizers);
181185
}
182186

183187
HunspellService getHunspellService() {
@@ -261,6 +265,19 @@ private NamedRegistry<AnalysisProvider<TokenFilterFactory>> setupTokenFilters(Li
261265
return tokenFilters;
262266
}
263267

268+
static Map<String, PreConfiguredCharFilter> setupPreConfiguredCharFilters(List<AnalysisPlugin> plugins) {
269+
NamedRegistry<PreConfiguredCharFilter> preConfiguredCharFilters = new NamedRegistry<>("pre-configured char_filter");
270+
271+
// No char filters are available in lucene-core so none are built into Elasticsearch core
272+
273+
for (AnalysisPlugin plugin: plugins) {
274+
for (PreConfiguredCharFilter filter : plugin.getPreConfiguredCharFilters()) {
275+
preConfiguredCharFilters.register(filter.getName(), filter);
276+
}
277+
}
278+
return unmodifiableMap(preConfiguredCharFilters.getRegistry());
279+
}
280+
264281
static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List<AnalysisPlugin> plugins) {
265282
NamedRegistry<PreConfiguredTokenFilter> preConfiguredTokenFilters = new NamedRegistry<>("pre-configured token_filter");
266283

core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCharFilters.java

Lines changed: 0 additions & 80 deletions
This file was deleted.

core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,9 @@
2828
import org.elasticsearch.index.IndexSettings;
2929
import org.elasticsearch.index.analysis.AnalyzerProvider;
3030
import org.elasticsearch.index.analysis.CharFilterFactory;
31-
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
31+
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
3232
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
33+
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
3334
import org.elasticsearch.index.analysis.TokenFilterFactory;
3435
import org.elasticsearch.index.analysis.TokenizerFactory;
3536
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
@@ -91,6 +92,13 @@ default Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getA
9192
return emptyMap();
9293
}
9394

95+
/**
96+
* Override to add additional pre-configured {@link CharFilter}s.
97+
*/
98+
default List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
99+
return emptyList();
100+
}
101+
94102
/**
95103
* Override to add additional pre-configured {@link TokenFilter}s.
96104
*/

0 commit comments

Comments
 (0)