From 0359d5d78e4c0ff84e74b2d65e63c851e80e2b26 Mon Sep 17 00:00:00 2001 From: markharwood Date: Fri, 14 Jun 2019 18:25:54 +0100 Subject: [PATCH 1/7] =?UTF-8?q?Analysis=20enhancement=20-=20better=20plura?= =?UTF-8?q?l=20stemmer=20than=20minimal=5Fenglish.=20Drops=20the=20trailin?= =?UTF-8?q?g=20=E2=80=9Ce=E2=80=9D=20in=20taxes,=20dresses,=20watches=20et?= =?UTF-8?q?c=20that=20otherwise=20cause=20mismatches=20with=20plural=20and?= =?UTF-8?q?=20singular=20forms?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #42892 --- .../common/EnglishPluralStemFilter.java | 111 ++++++++++++++++++ .../common/StemmerTokenFilterFactory.java | 2 + .../StemmerTokenFilterFactoryTests.java | 35 ++++++ 3 files changed, 148 insertions(+) create mode 100644 modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java new file mode 100644 index 0000000000000..6d57400a3d226 --- /dev/null +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java @@ -0,0 +1,111 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.analysis.common; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.en.EnglishMinimalStemFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +import java.io.IOException; + +public final class EnglishPluralStemFilter extends TokenFilter { + private final EnglishPlurallStemmer stemmer = new EnglishPlurallStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public EnglishPluralStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } + + /** + * Plural stemmer for English based on the {@link EnglishMinimalStemFilter} + *

+ * This stemmer removes plurals but beyond EnglishMinimalStemFilter adds + * four new suffix rules to remove dangling e characters: + *

+ * See https://github.com/elastic/elasticsearch/issues/42892 + */ + public static class EnglishPlurallStemmer { + @SuppressWarnings("fallthrough") + public int stem(char s[], int len) { + if (len < 3 || s[len - 1] != 's') + return len; + + switch (s[len - 2]) { + case 'u': + case 's': + return len; + case 'e': + if (len > 3 && s[len - 3] == 'i' && s[len - 4] != 'a' && s[len - 4] != 'e') { + s[len - 3] = 'y'; + return len - 2; + } + + // Suffix rules to remove any dangling "e" + if (len > 3) { + // xes (but >1 prefix so we can stem "boxes->box" but keep "axes->axe") + if (len > 4 && s[len -3] == 'x') { + return len - 2; + } + // shes/sses + if (len > 4) { + if (s[len -4] == 's' && (s[len -3] == 'h' || s[len -3] == 's')){ + return len - 2; + } + // tches + if (len > 5) { + if (s[len -5] == 't' && s[len -4] == 'c' && s[len -3] == 'h' ){ + return len - 2; + } + } + } + } + + if (s[len - 3] == 'i' || s[len - 3] == 'a' || s[len - 3] == 'o' || s[len - 3] == 'e') + return len; /* intentional fallthrough */ + if (s[len - 3] == 'i' || s[len - 3] == 'a' || s[len - 3] == 'o' || s[len - 3] == 'e') + return len; /* intentional fallthrough */ + default: + return len - 1; + } + } + } + +} diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java index b94f7f6499a97..396db78707a36 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java @@ -139,6 +139,8 @@ public TokenStream create(TokenStream tokenStream) { return new SnowballFilter(tokenStream, new EnglishStemmer()); } else if ("minimal_english".equalsIgnoreCase(language) || "minimalEnglish".equalsIgnoreCase(language)) { return new EnglishMinimalStemFilter(tokenStream); + } else if 
("plural_english".equalsIgnoreCase(language) || "pluralEnglish".equalsIgnoreCase(language)) { + return new EnglishPluralStemFilter(tokenStream); } else if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) { return new EnglishPossessiveFilter(tokenStream); diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java index 8e3e862f462e2..e56820f5709d8 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java @@ -97,6 +97,41 @@ public void testPorter2FilterFactory() throws IOException { assertAnalyzesTo(analyzer, "possibly", new String[]{"possibl"}); } } + + public void testEnglishPluralFilter() throws IOException { + int iters = scaledRandomIntBetween(20, 100); + for (int i = 0; i < iters; i++) { + + Version v = VersionUtils.randomVersion(random()); + Settings settings = Settings.builder() + .put("index.analysis.filter.my_plurals.type", "stemmer") + .put("index.analysis.filter.my_plurals.language", "plural_english") + .put("index.analysis.analyzer.my_plurals.tokenizer","whitespace") + .put("index.analysis.analyzer.my_plurals.filter","my_plurals") + .put(SETTING_VERSION_CREATED,v) + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_plurals"); + assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class)); + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader("dresses")); + TokenStream create = tokenFilter.create(tokenizer); + 
IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers; + NamedAnalyzer analyzer = indexAnalyzers.get("my_plurals"); + assertThat(create, instanceOf(EnglishPluralStemFilter.class)); + assertAnalyzesTo(analyzer, "phones", new String[]{"phone"}); + assertAnalyzesTo(analyzer, "horses", new String[]{"horse"}); + assertAnalyzesTo(analyzer, "dresses", new String[]{"dress"}); + assertAnalyzesTo(analyzer, "watches", new String[]{"watch"}); + assertAnalyzesTo(analyzer, "possess", new String[]{"possess"}); + assertAnalyzesTo(analyzer, "possesses", new String[]{"possess"}); + assertAnalyzesTo(analyzer, "boxes", new String[]{"box"}); + assertAnalyzesTo(analyzer, "axes", new String[]{"axe"}); + assertAnalyzesTo(analyzer, "dishes", new String[]{"dish"}); + } + } public void testMultipleLanguagesThrowsException() throws IOException { Version v = VersionUtils.randomVersion(random()); From 9682315e4ec8291e394a5be709da5b941be790c0 Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 17 Jun 2019 11:49:25 +0100 Subject: [PATCH 2/7] Added ees->ee stemming so that bees match bee. Made ies->y stemming stricter so short words match eg ties==tie Removed special-case code for crazy-rare words iaes and eies --- .../common/EnglishPluralStemFilter.java | 23 ++++++++---- .../StemmerTokenFilterFactoryTests.java | 36 ++++++++++++++++++- 2 files changed, 52 insertions(+), 7 deletions(-) diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java index 6d57400a3d226..d0a6c34395633 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java @@ -60,7 +60,13 @@ public boolean incrementToken() throws IOException { *
  • shes - "dishes" becomes "dish"
  • *
  • tches - "watches" becomes "watch"
  • * - * See https://github.com/elastic/elasticsearch/issues/42892 + * See https://github.com/elastic/elasticsearch/issues/42892 + *

    + * In addition the s stemmer logic is amended so that + *

    */ public static class EnglishPlurallStemmer { @SuppressWarnings("fallthrough") @@ -73,7 +79,11 @@ public int stem(char s[], int len) { case 's': return len; case 'e': - if (len > 3 && s[len - 3] == 'i' && s[len - 4] != 'a' && s[len - 4] != 'e') { + // Modified ies->y logic from original s-stemmer - only work on strings > 4 + // so spies -> spy still but pies->pie. + // The original code also special-cased aies and eies for no good reason as far as I can tell. + // ( no words of consequence - eg http://www.thefreedictionary.com/words-that-end-in-aies ) + if (len > 4 && s[len - 3] == 'i') { s[len - 3] = 'y'; return len - 2; } @@ -89,7 +99,7 @@ public int stem(char s[], int len) { if (s[len -4] == 's' && (s[len -3] == 'h' || s[len -3] == 's')){ return len - 2; } - // tches + // tches (TODO consider just ches? Gains: lunches == lunch, losses: moustaches!= moustache if (len > 5) { if (s[len -5] == 't' && s[len -4] == 'c' && s[len -3] == 'h' ){ return len - 2; @@ -98,9 +108,10 @@ public int stem(char s[], int len) { } } - if (s[len - 3] == 'i' || s[len - 3] == 'a' || s[len - 3] == 'o' || s[len - 3] == 'e') - return len; /* intentional fallthrough */ - if (s[len - 3] == 'i' || s[len - 3] == 'a' || s[len - 3] == 'o' || s[len - 3] == 'e') + // oes condition below is taken from original s-stemmer and is a cop-out because there are too many special cases + // e.g. shoes->shoe but heroes->hero so just doesn't try stem these words at all. 
+ // TODO Would be good to find a heuristic for stemming here (see https://howtospell.co.uk/making-O-words-plural ) + if ( s[len - 3] == 'o') return len; /* intentional fallthrough */ default: return len - 1; diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java index e56820f5709d8..64d42bd267def 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java @@ -121,15 +121,49 @@ public void testEnglishPluralFilter() throws IOException { IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers; NamedAnalyzer analyzer = indexAnalyzers.get("my_plurals"); assertThat(create, instanceOf(EnglishPluralStemFilter.class)); + + // Check old EnglishMinimalStemmer ("S" stemmer) logic assertAnalyzesTo(analyzer, "phones", new String[]{"phone"}); assertAnalyzesTo(analyzer, "horses", new String[]{"horse"}); + assertAnalyzesTo(analyzer, "cameras", new String[]{"camera"}); + + // TODO The orginal s stemmer gives up on stemming oes words because English has no fixed rule for the stem + // (see https://howtospell.co.uk/making-O-words-plural ) + // Would be good to find a heuristic for stemming oes words. 
+ assertAnalyzesTo(analyzer, "toes", new String[]{"toes"}); + assertAnalyzesTo(analyzer, "shoes", new String[]{"shoes"}); + assertAnalyzesTo(analyzer, "heroes", new String[]{"heroes"}); + + // Check improved EnglishPluralStemFilter logic + //sses assertAnalyzesTo(analyzer, "dresses", new String[]{"dress"}); - assertAnalyzesTo(analyzer, "watches", new String[]{"watch"}); assertAnalyzesTo(analyzer, "possess", new String[]{"possess"}); assertAnalyzesTo(analyzer, "possesses", new String[]{"possess"}); + // xes assertAnalyzesTo(analyzer, "boxes", new String[]{"box"}); assertAnalyzesTo(analyzer, "axes", new String[]{"axe"}); + //shes assertAnalyzesTo(analyzer, "dishes", new String[]{"dish"}); + assertAnalyzesTo(analyzer, "washes", new String[]{"wash"}); + //ees + assertAnalyzesTo(analyzer, "employees", new String[]{"employee"}); + assertAnalyzesTo(analyzer, "bees", new String[]{"bee"}); + //tch + assertAnalyzesTo(analyzer, "watches", new String[]{"watch"}); + assertAnalyzesTo(analyzer, "itches", new String[]{"itch"}); + // ies->y but only for length >4 + assertAnalyzesTo(analyzer, "spies", new String[]{"spy"}); + assertAnalyzesTo(analyzer, "ties", new String[]{"tie"}); + assertAnalyzesTo(analyzer, "lies", new String[]{"lie"}); + assertAnalyzesTo(analyzer, "pies", new String[]{"pie"}); + assertAnalyzesTo(analyzer, "dies", new String[]{"die"}); + + + // *CHES - would be good to find a simple rule that solves lunches, churches but doesn't break aches + // documenting current behaviour here as a known issue: + assertAnalyzesTo(analyzer, "lunches", new String[]{"lunche"}); + assertAnalyzesTo(analyzer, "avalanches", new String[]{"avalanche"}); + assertAnalyzesTo(analyzer, "headaches", new String[]{"headache"}); } } From 3dbd2f6fc4528b159c6f32baac04346a5603bb71 Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 17 Jun 2019 11:54:52 +0100 Subject: [PATCH 3/7] Javadoc fix --- .../analysis/common/EnglishPluralStemFilter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java index d0a6c34395633..1b3af0b625740 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java @@ -64,8 +64,8 @@ public boolean incrementToken() throws IOException { *

    * In addition the s stemmer logic is amended so that *

      - *
    • ees->ee so that bees matches bee
    • - *
    • ies->y only on longer words to that ties matches tie
    • + *
    • ees->ee so that bees matches bee
    • + *
    • ies->y only on longer words so that ties matches tie
    • *
    */ public static class EnglishPlurallStemmer { From 18fcf11e95bbf23c5767f3da0a8e7acd3bb19670 Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 17 Jun 2019 15:22:04 +0100 Subject: [PATCH 4/7] Added support for oes -> o suffixes mawith small exception list for ones-> oe e.g. shoes --- .../common/EnglishPluralStemFilter.java | 52 +++++++++++++++---- .../StemmerTokenFilterFactoryTests.java | 14 +++-- 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java index 1b3af0b625740..32af6d2ef2263 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java @@ -28,7 +28,7 @@ import java.io.IOException; public final class EnglishPluralStemFilter extends TokenFilter { - private final EnglishPlurallStemmer stemmer = new EnglishPlurallStemmer(); + private final EnglishPluralStemmer stemmer = new EnglishPluralStemmer(); private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); @@ -66,9 +66,18 @@ public boolean incrementToken() throws IOException { *
      *
    • ees->ee so that bees matches bee
    • *
    • ies->y only on longer words so that ties matches tie
    • + *
    • oes->o rule so that tomatoes matches tomato, but the e is retained for a small list of exceptions, e.g. shoes becomes shoe
    • *
    */ - public static class EnglishPlurallStemmer { + public static class EnglishPluralStemmer { + + // Words ending in oes that retain the e when stemmed + public static final char [][] oesExceptions = { + "shoes".toCharArray(), + "canoes".toCharArray(), + "oboes".toCharArray() + }; + @SuppressWarnings("fallthrough") public int stem(char s[], int len) { if (len < 3 || s[len - 1] != 's') @@ -94,11 +103,21 @@ public int stem(char s[], int len) { if (len > 4 && s[len -3] == 'x') { return len - 2; } - // shes/sses + // oes + if (len > 3 && s[len -3] == 'o') { + if (isOesException(s, len)) { + // Only remove the S + return len -1; + } + // Remove the es + return len - 2; + } if (len > 4) { + // shes/sses if (s[len -4] == 's' && (s[len -3] == 'h' || s[len -3] == 's')){ return len - 2; } + // tches (TODO consider just ches? Gains: lunches == lunch, losses: moustaches!= moustache if (len > 5) { if (s[len -5] == 't' && s[len -4] == 'c' && s[len -3] == 'h' ){ @@ -107,16 +126,31 @@ public int stem(char s[], int len) { } } } - - // oes condition below is taken from original s-stemmer and is a cop-out because there are too many special cases - // e.g. shoes->shoe but heroes->hero so just doesn't try stem these words at all. 
- // TODO Would be good to find a heuristic for stemming here (see https://howtospell.co.uk/making-O-words-plural ) - if ( s[len - 3] == 'o') - return len; /* intentional fallthrough */ + default: return len - 1; } } + + private final boolean isOesException(char[] s, int len) { + for (char[] oesRule : oesExceptions) { + int rulePos = oesRule.length - 1; + int sPos = len - 1; + boolean matched = true; + while (rulePos >= 0 && sPos >= 0) { + if (oesRule[rulePos] != s[sPos]) { + matched = false; + break; + } + rulePos--; + sPos--; + } + if (matched) { + return true; + } + } + return false; + } } } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java index 64d42bd267def..49c86ba6e87f0 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java @@ -127,12 +127,16 @@ public void testEnglishPluralFilter() throws IOException { assertAnalyzesTo(analyzer, "horses", new String[]{"horse"}); assertAnalyzesTo(analyzer, "cameras", new String[]{"camera"}); - // TODO The orginal s stemmer gives up on stemming oes words because English has no fixed rule for the stem + // The orginal s stemmer gives up on stemming oes words because English has no fixed rule for the stem // (see https://howtospell.co.uk/making-O-words-plural ) - // Would be good to find a heuristic for stemming oes words. 
- assertAnalyzesTo(analyzer, "toes", new String[]{"toes"}); - assertAnalyzesTo(analyzer, "shoes", new String[]{"shoes"}); - assertAnalyzesTo(analyzer, "heroes", new String[]{"heroes"}); + // This stemmer removes the es but retains e for a small number of exceptions + assertAnalyzesTo(analyzer, "mosquitoes", new String[]{"mosquito"}); + assertAnalyzesTo(analyzer, "heroes", new String[]{"hero"}); + // oes exceptions that retain the e. + assertAnalyzesTo(analyzer, "shoes", new String[]{"shoe"}); + assertAnalyzesTo(analyzer, "horseshoes", new String[]{"horseshoe"}); + assertAnalyzesTo(analyzer, "canoes", new String[]{"canoe"}); + assertAnalyzesTo(analyzer, "oboes", new String[]{"oboe"}); // Check improved EnglishPluralStemFilter logic //sses From 942a7aff586ee4b896d5e3a209115fd0d4724e2d Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 17 Jun 2019 15:53:35 +0100 Subject: [PATCH 5/7] Remove redundant modifier --- .../elasticsearch/analysis/common/EnglishPluralStemFilter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java index 32af6d2ef2263..eb8c2bc5ba0ed 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java @@ -132,7 +132,7 @@ public int stem(char s[], int len) { } } - private final boolean isOesException(char[] s, int len) { + private boolean isOesException(char[] s, int len) { for (char[] oesRule : oesExceptions) { int rulePos = oesRule.length - 1; int sPos = len - 1; From 0171b4684f9b912c6963c6ec443fa6aa1440a459 Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 17 Jun 2019 15:59:38 +0100 Subject: [PATCH 6/7] Added docs --- 
.../reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc index b5d5426ff2710..1e82b2f47417a 100644 --- a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc @@ -84,6 +84,7 @@ English:: http://snowball.tartarus.org/algorithms/porter/stemmer.html[*`english`*], http://ciir.cs.umass.edu/pubfiles/ir-35.pdf[`light_english`], http://www.researchgate.net/publication/220433848_How_effective_is_suffixing[`minimal_english`], +https://github.com/elastic/elasticsearch/issues/42892[`plural_english`], http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/en/EnglishPossessiveFilter.html[`possessive_english`], http://snowball.tartarus.org/algorithms/english/stemmer.html[`porter2`], http://snowball.tartarus.org/algorithms/lovins/stemmer.html[`lovins`] From 3c76b8b0f52aa1ab4b6c7f59c864ab69cd2367a0 Mon Sep 17 00:00:00 2001 From: markharwood Date: Thu, 4 Jul 2019 16:03:30 +0100 Subject: [PATCH 7/7] Added support for ches suffix with exceptions for words like avalanche --- .../common/EnglishPluralStemFilter.java | 31 +++++++++++++++---- .../StemmerTokenFilterFactoryTests.java | 11 +++++-- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java index eb8c2bc5ba0ed..98e0936dc0faa 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java @@ -77,6 +77,19 @@ public static class EnglishPluralStemmer { "canoes".toCharArray(), 
"oboes".toCharArray() }; + // Words ending in ches that retain the e when stemmed + public static final char [][] chesExceptions = { + "cliches".toCharArray(), + "avalanches".toCharArray(), + "mustaches".toCharArray(), + "moustaches".toCharArray(), + "quiches".toCharArray(), + "headaches".toCharArray(), + "heartaches".toCharArray(), + "porsches".toCharArray(), + "tranches".toCharArray(), + "caches".toCharArray() + }; @SuppressWarnings("fallthrough") public int stem(char s[], int len) { @@ -105,7 +118,7 @@ public int stem(char s[], int len) { } // oes if (len > 3 && s[len -3] == 'o') { - if (isOesException(s, len)) { + if (isException(s, len, oesExceptions)) { // Only remove the S return len -1; } @@ -118,10 +131,16 @@ public int stem(char s[], int len) { return len - 2; } - // tches (TODO consider just ches? Gains: lunches == lunch, losses: moustaches!= moustache - if (len > 5) { - if (s[len -5] == 't' && s[len -4] == 'c' && s[len -3] == 'h' ){ + // ches + if (len > 4) { + if (s[len -4] == 'c' && s[len -3] == 'h' ){ + if (isException(s, len, chesExceptions)) { + // Only remove the S + return len -1; + } + // Remove the es return len - 2; + } } } @@ -132,8 +151,8 @@ public int stem(char s[], int len) { } } - private boolean isOesException(char[] s, int len) { - for (char[] oesRule : oesExceptions) { + private boolean isException(char[] s, int len, char [][] exceptionsList) { + for (char[] oesRule : exceptionsList) { int rulePos = oesRule.length - 1; int sPos = len - 1; boolean matched = true; diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java index 49c86ba6e87f0..c4f598dea2f73 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java +++ 
b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java @@ -163,11 +163,16 @@ public void testEnglishPluralFilter() throws IOException { assertAnalyzesTo(analyzer, "dies", new String[]{"die"}); - // *CHES - would be good to find a simple rule that solves lunches, churches but doesn't break aches - // documenting current behaviour here as a known issue: - assertAnalyzesTo(analyzer, "lunches", new String[]{"lunche"}); + assertAnalyzesTo(analyzer, "lunches", new String[]{"lunch"}); assertAnalyzesTo(analyzer, "avalanches", new String[]{"avalanche"}); assertAnalyzesTo(analyzer, "headaches", new String[]{"headache"}); + assertAnalyzesTo(analyzer, "caches", new String[]{"cache"}); + assertAnalyzesTo(analyzer, "beaches", new String[]{"beach"}); + assertAnalyzesTo(analyzer, "britches", new String[]{"britch"}); + assertAnalyzesTo(analyzer, "cockroaches", new String[]{"cockroach"}); + assertAnalyzesTo(analyzer, "cliches", new String[]{"cliche"}); + assertAnalyzesTo(analyzer, "quiches", new String[]{"quiche"}); + } }