From 7233acb392e5213d79e17f49d551760e38262d1a Mon Sep 17 00:00:00 2001 From: liketic Date: Fri, 23 Mar 2018 13:57:09 +0800 Subject: [PATCH 1/3] Replace parameter unicodeSetFilter with unicode_set_filter (#22823) --- .../IcuFoldingTokenFilterFactory.java | 2 +- .../IcuNormalizerCharFilterFactory.java | 2 +- .../IcuNormalizerTokenFilterFactory.java | 19 ++++++- .../test/analysis_icu/10_basic.yml | 55 +++++++++++++++++++ 4 files changed, 73 insertions(+), 5 deletions(-) diff --git a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java index 60ab831e6f1f4..165cfe1eb54c2 100644 --- a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java +++ b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java @@ -48,7 +48,7 @@ public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory imp public IcuFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); - this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(ICU_FOLDING_NORMALIZER, settings); + this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(indexSettings, ICU_FOLDING_NORMALIZER, settings); } @Override diff --git a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java index 3046d6839b9e6..f552fa78e1f32 100644 --- a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java +++ b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java @@ -49,7 +49,7 @@ public 
IcuNormalizerCharFilterFactory(IndexSettings indexSettings, Environment e } Normalizer2 normalizer = Normalizer2.getInstance( null, method, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE); - this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(normalizer, settings); + this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(indexSettings, normalizer, settings); } @Override diff --git a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java index 4e8d5d702205d..9741996bd4f79 100644 --- a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java +++ b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java @@ -24,6 +24,9 @@ import com.ibm.icu.text.UnicodeSet; import org.apache.lucene.analysis.TokenStream; +import org.elasticsearch.Version; +import org.elasticsearch.common.logging.DeprecationLogger; +import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; @@ -37,14 +40,15 @@ * */ public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { - + private final static DeprecationLogger DEPRECATION_LOGGER = + new DeprecationLogger(Loggers.getLogger(IcuNormalizerTokenFilterFactory.class)); private final Normalizer2 normalizer; public IcuNormalizerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); String method = settings.get("name", "nfkc_cf"); Normalizer2 normalizer = Normalizer2.getInstance(null, method, Normalizer2.Mode.COMPOSE); - this.normalizer = 
wrapWithUnicodeSetFilter(normalizer, settings); + this.normalizer = wrapWithUnicodeSetFilter(indexSettings, normalizer, settings); } @Override @@ -57,8 +61,17 @@ public Object getMultiTermComponent() { return this; } - static Normalizer2 wrapWithUnicodeSetFilter(final Normalizer2 normalizer, Settings settings) { + static Normalizer2 wrapWithUnicodeSetFilter(final IndexSettings indexSettings, + final Normalizer2 normalizer, + final Settings settings) { String unicodeSetFilter = settings.get("unicodeSetFilter"); + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) { + if (unicodeSetFilter != null) { + DEPRECATION_LOGGER.deprecated("[unicodeSetFilter] has been deprecated in favor of [unicode_set_filter]"); + } else { + unicodeSetFilter = settings.get("unicode_set_filter"); + } + } if (unicodeSetFilter != null) { UnicodeSet unicodeSet = new UnicodeSet(unicodeSetFilter); diff --git a/plugins/analysis-icu/src/test/resources/rest-api-spec/test/analysis_icu/10_basic.yml b/plugins/analysis-icu/src/test/resources/rest-api-spec/test/analysis_icu/10_basic.yml index 521d8f0714070..cc01ce63d0297 100644 --- a/plugins/analysis-icu/src/test/resources/rest-api-spec/test/analysis_icu/10_basic.yml +++ b/plugins/analysis-icu/src/test/resources/rest-api-spec/test/analysis_icu/10_basic.yml @@ -42,6 +42,61 @@ --- "Normalization with a UnicodeSet Filter": - do: + indices.create: + index: test + body: + settings: + index: + analysis: + char_filter: + charfilter_icu_normalizer: + type: icu_normalizer + unicode_set_filter: "[^ß]" + filter: + tokenfilter_icu_normalizer: + type: icu_normalizer + unicode_set_filter: "[^ßB]" + tokenfilter_icu_folding: + type: icu_folding + unicode_set_filter: "[^â]" + - do: + indices.analyze: + index: test + body: + char_filter: ["charfilter_icu_normalizer"] + tokenizer: keyword + text: charfilter Föo Bâr Ruß + - length: { tokens: 1 } + - match: { tokens.0.token: charfilter föo bâr ruß } + - do: + indices.analyze: + index: test + body: 
+ tokenizer: keyword + filter: ["tokenfilter_icu_normalizer"] + text: tokenfilter Föo Bâr Ruß + - length: { tokens: 1 } + - match: { tokens.0.token: tokenfilter föo Bâr ruß } + - do: + indices.analyze: + index: test + body: + tokenizer: keyword + filter: ["tokenfilter_icu_folding"] + text: icufolding Föo Bâr Ruß + - length: { tokens: 1 } + - match: { tokens.0.token: icufolding foo bâr russ } + +--- +"Normalization with a CamelCase UnicodeSet Filter": + - skip: + version: " - 6.99.99" + reason: unicodeSetFilter deprecated in 7.0.0, replaced by unicode_set_filter + features: "warnings" + + - do: + warnings: + - "[unicodeSetFilter] has been deprecated in favor of [unicode_set_filter]" indices.create: index: test body: From fe4f1369d255542d6bcde2507909997de7d7613c Mon Sep 17 00:00:00 2001 From: liketic Date: Sat, 27 Oct 2018 15:50:03 +0800 Subject: [PATCH 2/3] Update to latest getLogger --- .../index/analysis/IcuNormalizerTokenFilterFactory.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java index f8f2068d6a103..cceff91ee3658 100644 --- a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java +++ b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java @@ -23,10 +23,10 @@ import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.UnicodeSet; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.analysis.TokenStream; import org.elasticsearch.Version; import org.elasticsearch.common.logging.DeprecationLogger; -import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; @@ -38,8 +38,8 @@ *

The {@code unicodeSetFilter} attribute can be used to provide the UniCodeSet for filtering.

*/ public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { - private final static DeprecationLogger DEPRECATION_LOGGER = - new DeprecationLogger(Loggers.getLogger(IcuNormalizerTokenFilterFactory.class)); + private final static DeprecationLogger deprecationLogger = + new DeprecationLogger(LogManager.getLogger(IcuNormalizerTokenFilterFactory.class)); private final Normalizer2 normalizer; public IcuNormalizerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { @@ -65,7 +65,7 @@ static Normalizer2 wrapWithUnicodeSetFilter(final IndexSettings indexSettings, String unicodeSetFilter = settings.get("unicodeSetFilter"); if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) { if (unicodeSetFilter != null) { - DEPRECATION_LOGGER.deprecated("[unicodeSetFilter] has been deprecated in favor of [unicode_set_filter]"); + deprecationLogger.deprecated("[unicodeSetFilter] has been deprecated in favor of [unicode_set_filter]"); } else { unicodeSetFilter = settings.get("unicode_set_filter"); } From d10c1d8c44a34ed5d7e768410a47382c4f5805b8 Mon Sep 17 00:00:00 2001 From: liketic Date: Mon, 29 Oct 2018 20:49:12 +0800 Subject: [PATCH 3/3] Replace unicodeSetFilter with unicode_set_filter in document --- docs/plugins/analysis-icu.asciidoc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/plugins/analysis-icu.asciidoc b/docs/plugins/analysis-icu.asciidoc index 35265140533ed..ef5744f7dff18 100644 --- a/docs/plugins/analysis-icu.asciidoc +++ b/docs/plugins/analysis-icu.asciidoc @@ -38,7 +38,7 @@ normalization can be specified with the `name` parameter, which accepts `nfc`, convert `nfc` to `nfd` or `nfkc` to `nfkd` respectively: Which letters are normalized can be controlled by specifying the -`unicodeSetFilter` parameter, which accepts a +`unicode_set_filter` parameter, which accepts a 
http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet]. Here are two examples, the default usage and a customised character filter: @@ -194,7 +194,7 @@ with the `name` parameter, which accepts `nfc`, `nfkc`, and `nfkc_cf` (default). Which letters are normalized can be controlled by specifying the -`unicodeSetFilter` parameter, which accepts a +`unicode_set_filter` parameter, which accepts a http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet]. You should probably prefer the <>. @@ -273,7 +273,7 @@ The ICU folding token filter already does Unicode normalization, so there is no need to use Normalize character or token filter as well. Which letters are folded can be controlled by specifying the -`unicodeSetFilter` parameter, which accepts a +`unicode_set_filter` parameter, which accepts a http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet]. The following example exempts Swedish characters from folding. It is important @@ -300,7 +300,7 @@ PUT icu_sample "filter": { "swedish_folding": { "type": "icu_folding", - "unicodeSetFilter": "[^åäöÅÄÖ]" + "unicode_set_filter": "[^åäöÅÄÖ]" } } }