Skip to content

Commit f6a43b5

Browse files
authored
Add a prebuilt ICU Analyzer (#34958)
The ICU plugin provides the building blocks of an analysis chain, but doesn't actually have a prebuilt analyzer. It would be better for users if there was a simple analyzer that they could use out of the box, and also something we can point to from the CJK Analyzer docs as a superior alternative. Relates to #34285
1 parent e8ec4fa commit f6a43b5

File tree

5 files changed

+192
-0
lines changed

5 files changed

+192
-0
lines changed

docs/plugins/analysis-icu.asciidoc

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,24 @@ characters.
2626
:plugin_name: analysis-icu
2727
include::install_remove.asciidoc[]
2828

29+
[[analysis-icu-analyzer]]
30+
==== ICU Analyzer
31+
32+
Performs basic normalization, tokenization and character folding, using the
33+
`icu_normalizer` char filter, `icu_tokenizer` and `icu_folding` token filter.
34+
35+
The following parameters are accepted:
36+
37+
[horizontal]
38+
39+
`method`::
40+
41+
Normalization method. Accepts `nfkc`, `nfc` or `nfkc_cf` (default).
42+
43+
`mode`::
44+
45+
Normalization mode. Accepts `compose` (default) or `decompose`.
46+
2947
[[analysis-icu-normalization-charfilter]]
3048
==== ICU Normalization Character Filter
3149

docs/reference/analysis/analyzers/lang-analyzer.asciidoc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,9 @@ PUT /catalan_example
421421
[[cjk-analyzer]]
422422
===== `cjk` analyzer
423423

424+
NOTE: You may find that `icu_analyzer` in the ICU analysis plugin works better
425+
for CJK text than the `cjk` analyzer. Experiment with your text and queries.
426+
424427
The `cjk` analyzer could be reimplemented as a `custom` analyzer as follows:
425428

426429
[source,js]
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.index.analysis;
21+
22+
import com.ibm.icu.text.Normalizer2;
23+
import org.apache.lucene.analysis.Analyzer;
24+
import org.apache.lucene.analysis.Tokenizer;
25+
import org.apache.lucene.analysis.icu.ICUFoldingFilter;
26+
import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;
27+
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
28+
import org.elasticsearch.common.settings.Settings;
29+
import org.elasticsearch.env.Environment;
30+
import org.elasticsearch.index.IndexSettings;
31+
32+
import java.io.Reader;
33+
34+
public class IcuAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> {
35+
36+
private final Normalizer2 normalizer;
37+
38+
public IcuAnalyzerProvider(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
39+
super(indexSettings, name, settings);
40+
String method = settings.get("method", "nfkc_cf");
41+
String mode = settings.get("mode", "compose");
42+
if (!"compose".equals(mode) && !"decompose".equals(mode)) {
43+
throw new IllegalArgumentException("Unknown mode [" + mode + "] in analyzer [" + name +
44+
"], expected one of [compose, decompose]");
45+
}
46+
Normalizer2 normalizer = Normalizer2.getInstance(
47+
null, method, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
48+
this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(indexSettings, normalizer, settings);
49+
}
50+
51+
@Override
52+
public Analyzer get() {
53+
return new Analyzer() {
54+
55+
@Override
56+
protected Reader initReader(String fieldName, Reader reader) {
57+
return new ICUNormalizer2CharFilter(reader, normalizer);
58+
}
59+
60+
@Override
61+
protected TokenStreamComponents createComponents(String fieldName) {
62+
Tokenizer source = new ICUTokenizer();
63+
return new TokenStreamComponents(source, new ICUFoldingFilter(source));
64+
}
65+
};
66+
}
67+
}

plugins/analysis-icu/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,11 @@
2121

2222
import static java.util.Collections.singletonMap;
2323

24+
import org.apache.lucene.analysis.Analyzer;
2425
import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
26+
import org.elasticsearch.index.analysis.AnalyzerProvider;
2527
import org.elasticsearch.index.analysis.CharFilterFactory;
28+
import org.elasticsearch.index.analysis.IcuAnalyzerProvider;
2629
import org.elasticsearch.index.analysis.IcuCollationTokenFilterFactory;
2730
import org.elasticsearch.index.analysis.IcuFoldingTokenFilterFactory;
2831
import org.elasticsearch.index.analysis.IcuNormalizerCharFilterFactory;
@@ -60,6 +63,11 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
6063
return extra;
6164
}
6265

66+
@Override
67+
public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
68+
return singletonMap("icu_analyzer", IcuAnalyzerProvider::new);
69+
}
70+
6371
@Override
6472
public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
6573
return singletonMap("icu_tokenizer", IcuTokenizerFactory::new);
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.index.analysis;
21+
22+
import org.apache.lucene.analysis.Analyzer;
23+
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
24+
import org.elasticsearch.Version;
25+
import org.elasticsearch.cluster.metadata.IndexMetaData;
26+
import org.elasticsearch.common.settings.Settings;
27+
import org.elasticsearch.index.IndexSettings;
28+
import org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin;
29+
import org.elasticsearch.test.IndexSettingsModule;
30+
31+
import java.io.IOException;
32+
33+
import static org.hamcrest.Matchers.containsString;
34+
35+
public class IcuAnalyzerTests extends BaseTokenStreamTestCase {
36+
37+
public void testMixedAlphabetTokenization() throws IOException {
38+
39+
Settings settings = Settings.builder()
40+
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
41+
.build();
42+
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
43+
44+
String input = "안녕은하철도999극장판2.1981년8월8일.일본개봉작1999년재더빙video판";
45+
46+
AnalysisICUPlugin plugin = new AnalysisICUPlugin();
47+
Analyzer analyzer = plugin.getAnalyzers().get("icu_analyzer").get(idxSettings, null, "icu", settings).get();
48+
assertAnalyzesTo(analyzer, input,
49+
new String[]{"안녕은하철도", "999", "극장판", "2.1981", "년", "8", "월", "8", "일", "일본개봉작", "1999", "년재더빙", "video", "판"});
50+
51+
}
52+
53+
public void testMiddleDots() throws IOException {
54+
Settings settings = Settings.builder()
55+
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
56+
.build();
57+
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
58+
59+
String input = "경승지·산악·협곡·해협·곶·심연·폭포·호수·급류";
60+
61+
Analyzer analyzer = new IcuAnalyzerProvider(idxSettings, null, "icu", settings).get();
62+
assertAnalyzesTo(analyzer, input,
63+
new String[]{"경승지", "산악", "협곡", "해협", "곶", "심연", "폭포", "호수", "급류"});
64+
}
65+
66+
public void testUnicodeNumericCharacters() throws IOException {
67+
68+
Settings settings = Settings.builder()
69+
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
70+
.build();
71+
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
72+
73+
String input = "① ② ③ ⑴ ⑵ ⑶ ¼ ⅓ ⅜ ¹ ² ³ ₁ ₂ ₃";
74+
75+
Analyzer analyzer = new IcuAnalyzerProvider(idxSettings, null, "icu", settings).get();
76+
assertAnalyzesTo(analyzer, input,
77+
new String[]{"1", "2", "3", "1", "2", "3", "1/4", "1/3", "3/8", "1", "2", "3", "1", "2", "3"});
78+
}
79+
80+
public void testBadSettings() {
81+
82+
Settings settings = Settings.builder()
83+
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
84+
.put("mode", "wrong")
85+
.build();
86+
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
87+
88+
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> {
89+
new IcuAnalyzerProvider(idxSettings, null, "icu", settings);
90+
});
91+
92+
assertThat(e.getMessage(), containsString("Unknown mode [wrong] in analyzer [icu], expected one of [compose, decompose]"));
93+
94+
}
95+
96+
}

0 commit comments

Comments
 (0)