Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions docs/plugins/analysis-icu.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ normalization can be specified with the `name` parameter, which accepts `nfc`,
convert `nfc` to `nfd` or `nfkc` to `nfkd` respectively:

Which letters are normalized can be controlled by specifying the
`unicodeSetFilter` parameter, which accepts a
`unicode_set_filter` parameter, which accepts a
http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet].

Here are two examples, the default usage and a customised character filter:
Expand Down Expand Up @@ -194,7 +194,7 @@ with the `name` parameter, which accepts `nfc`, `nfkc`, and `nfkc_cf`
(default).

Which letters are normalized can be controlled by specifying the
`unicodeSetFilter` parameter, which accepts a
`unicode_set_filter` parameter, which accepts a
http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet].

You should probably prefer the <<analysis-icu-normalization-charfilter,Normalization character filter>>.
Expand Down Expand Up @@ -273,7 +273,7 @@ The ICU folding token filter already does Unicode normalization, so there is
no need to use the Normalize character filter or token filter as well.

Which letters are folded can be controlled by specifying the
`unicodeSetFilter` parameter, which accepts a
`unicode_set_filter` parameter, which accepts a
http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html[UnicodeSet].

The following example exempts Swedish characters from folding. It is important
Expand All @@ -300,7 +300,7 @@ PUT icu_sample
"filter": {
"swedish_folding": {
"type": "icu_folding",
"unicodeSetFilter": "[^åäöÅÄÖ]"
"unicode_set_filter": "[^åäöÅÄÖ]"
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory imp

public IcuFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(ICU_FOLDING_NORMALIZER, settings);
this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(indexSettings, ICU_FOLDING_NORMALIZER, settings);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ public IcuNormalizerCharFilterFactory(IndexSettings indexSettings, Environment e
}
Normalizer2 normalizer = Normalizer2.getInstance(
null, method, "compose".equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE);
this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(normalizer, settings);
this.normalizer = IcuNormalizerTokenFilterFactory.wrapWithUnicodeSetFilter(indexSettings, normalizer, settings);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;

import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.Version;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
Expand All @@ -35,14 +38,15 @@
* <p>The {@code unicode_set_filter} attribute (or the deprecated {@code unicodeSetFilter}) can be used to provide the UnicodeSet for filtering.</p>
*/
public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {

private final static DeprecationLogger deprecationLogger =
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@romseygeek this line is causing checkstyle violations

> Task :plugins:analysis-icu:checkstyleMain FAILED
[ant:checkstyle] [ERROR] /home/alpar/work/elastic/elasticsearch/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java:41:19: 'static' modifier out of order with the JLS suggestions. [ModifierOrder]

I'm pushing a fix to master now. FYI

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Master was merged in a few days ago and the checkstyle rules last changed 4 weeks ago, yet the PR build passed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I opened #35207 to fix these

new DeprecationLogger(LogManager.getLogger(IcuNormalizerTokenFilterFactory.class));
private final Normalizer2 normalizer;

public IcuNormalizerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
String method = settings.get("name", "nfkc_cf");
Normalizer2 normalizer = Normalizer2.getInstance(null, method, Normalizer2.Mode.COMPOSE);
this.normalizer = wrapWithUnicodeSetFilter(normalizer, settings);
this.normalizer = wrapWithUnicodeSetFilter(indexSettings, normalizer, settings);
}

@Override
Expand All @@ -55,8 +59,17 @@ public Object getMultiTermComponent() {
return this;
}

static Normalizer2 wrapWithUnicodeSetFilter(final Normalizer2 normalizer, Settings settings) {
static Normalizer2 wrapWithUnicodeSetFilter(final IndexSettings indexSettings,
final Normalizer2 normalizer,
final Settings settings) {
String unicodeSetFilter = settings.get("unicodeSetFilter");
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
if (unicodeSetFilter != null) {
deprecationLogger.deprecated("[unicodeSetFilter] has been deprecated in favor of [unicode_set_filter]");
} else {
unicodeSetFilter = settings.get("unicode_set_filter");
}
}
if (unicodeSetFilter != null) {
UnicodeSet unicodeSet = new UnicodeSet(unicodeSetFilter);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,61 @@
---
"Normalization with a UnicodeSet Filter":
- do:
indices.create:
index: test
body:
settings:
index:
analysis:
char_filter:
charfilter_icu_normalizer:
type: icu_normalizer
unicode_set_filter: "[^ß]"
filter:
tokenfilter_icu_normalizer:
type: icu_normalizer
unicode_set_filter: "[^ßB]"
tokenfilter_icu_folding:
type: icu_folding
unicode_set_filter: "[^â]"
- do:
indices.analyze:
index: test
body:
char_filter: ["charfilter_icu_normalizer"]
tokenizer: keyword
text: charfilter Föo Bâr Ruß
- length: { tokens: 1 }
- match: { tokens.0.token: charfilter föo bâr ruß }
- do:
indices.analyze:
index: test
body:
tokenizer: keyword
filter: ["tokenfilter_icu_normalizer"]
text: tokenfilter Föo Bâr Ruß
- length: { tokens: 1 }
- match: { tokens.0.token: tokenfilter föo Bâr ruß }
- do:
indices.analyze:
index: test
body:
tokenizer: keyword
filter: ["tokenfilter_icu_folding"]
text: icufolding Föo Bâr Ruß
- length: { tokens: 1 }
- match: { tokens.0.token: icufolding foo bâr russ }

---
"Normalization with a CamelCase UnicodeSet Filter":
- skip:
version: " - 6.99.99"
reason: unicodeSetFilter deprecated in 7.0.0, replaced by unicode_set_filter
features: "warnings"

- do:
warnings:
- "[unicodeSetFilter] has been deprecated in favor of [unicode_set_filter]"
indices.create:
index: test
body:
Expand Down