From 0359d5d78e4c0ff84e74b2d65e63c851e80e2b26 Mon Sep 17 00:00:00 2001
From: markharwood <markharwood@gmail.com>
Date: Fri, 14 Jun 2019 18:25:54 +0100
Subject: [PATCH 1/7] =?UTF-8?q?Analysis=20enhancement=20-=20better=20plura?=
 =?UTF-8?q?l=20stemmer=20than=20minimal=5Fenglish.=20Drops=20the=20trailin?=
 =?UTF-8?q?g=20=E2=80=9Ce=E2=80=9D=20in=20taxes,=20dresses,=20watches=20et?=
 =?UTF-8?q?c=20that=20otherwise=20cause=20mismatches=20with=20plural=20and?=
 =?UTF-8?q?=20singular=20forms?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #42892
---
 .../common/EnglishPluralStemFilter.java       | 111 ++++++++++++++++++
 .../common/StemmerTokenFilterFactory.java     |   2 +
 .../StemmerTokenFilterFactoryTests.java       |  35 ++++++
 3 files changed, 148 insertions(+)
 create mode 100644 modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
new file mode 100644
index 0000000000000..6d57400a3d226
--- /dev/null
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+import java.io.IOException;
+
+public final class EnglishPluralStemFilter extends TokenFilter {
+    private final EnglishPlurallStemmer stemmer = new EnglishPlurallStemmer();
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+    public EnglishPluralStemFilter(TokenStream input) {
+        super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+        if (input.incrementToken()) {
+            if (!keywordAttr.isKeyword()) {
+                final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+                termAtt.setLength(newlen);
+            }
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    /**
+     * Plural stemmer for English based on the {@link EnglishMinimalStemFilter}
+     * <p>
+     * This stemmer removes plurals but beyond EnglishMinimalStemFilter adds
+     * four new suffix rules to remove dangling e characters:
+     * <ul>
+     * <li>xes - "boxes" becomes "box"</li>
+     * <li>sses - "dresses" becomes "dress"</li>
+     * <li>shes - "dishes" becomes "dish"</li>
+     * <li>tches - "watches" becomes "watch"</li>
+     * </ul>
+     * See https://github.com/elastic/elasticsearch/issues/42892
+     */
+    public static class EnglishPlurallStemmer {
+        @SuppressWarnings("fallthrough")
+        public int stem(char s[], int len) {
+            if (len < 3 || s[len - 1] != 's')
+                return len;
+
+            switch (s[len - 2]) {
+            case 'u':
+            case 's':
+                return len;
+            case 'e':
+                if (len > 3 && s[len - 3] == 'i' && s[len - 4] != 'a' && s[len - 4] != 'e') {
+                    s[len - 3] = 'y';
+                    return len - 2;
+                }
+                
+                // Suffix rules to remove any dangling "e"                
+                if (len > 3) {
+                    // xes (but >1 prefix so we can stem "boxes->box" but keep "axes->axe")
+                    if (len > 4 && s[len -3] == 'x') {
+                        return len - 2;
+                    }
+                    // shes/sses
+                    if (len > 4) {
+                        if (s[len -4] == 's' && (s[len -3] == 'h' || s[len -3] == 's')){
+                            return len - 2;
+                        }
+                        // tches
+                        if (len > 5) {
+                            if (s[len -5] == 't' && s[len -4] == 'c' && s[len -3] == 'h' ){
+                                return len - 2;
+                            }                            
+                        }                        
+                    }
+                }
+                
+                if (s[len - 3] == 'i' || s[len - 3] == 'a' || s[len - 3] == 'o' || s[len - 3] == 'e')
+                    return len; /* intentional fallthrough */
+                if (s[len - 3] == 'i' || s[len - 3] == 'a' || s[len - 3] == 'o' || s[len - 3] == 'e')
+                    return len; /* intentional fallthrough */
+            default:
+                return len - 1;
+            }
+        }
+    }
+
+}
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java
index b94f7f6499a97..396db78707a36 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java
@@ -139,6 +139,8 @@ public TokenStream create(TokenStream tokenStream) {
             return new SnowballFilter(tokenStream, new EnglishStemmer());
         } else if ("minimal_english".equalsIgnoreCase(language) || "minimalEnglish".equalsIgnoreCase(language)) {
             return new EnglishMinimalStemFilter(tokenStream);
+        } else if ("plural_english".equalsIgnoreCase(language) || "pluralEnglish".equalsIgnoreCase(language)) {
+            return new EnglishPluralStemFilter(tokenStream);
         } else if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) {
             return new EnglishPossessiveFilter(tokenStream);
 
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
index 8e3e862f462e2..e56820f5709d8 100644
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
@@ -97,6 +97,41 @@ public void testPorter2FilterFactory() throws IOException {
             assertAnalyzesTo(analyzer, "possibly", new String[]{"possibl"});
         }
     }
+    
+    public void testEnglishPluralFilter() throws IOException {
+        int iters = scaledRandomIntBetween(20, 100);
+        for (int i = 0; i < iters; i++) {
+
+            Version v = VersionUtils.randomVersion(random());
+            Settings settings = Settings.builder()
+                    .put("index.analysis.filter.my_plurals.type", "stemmer")
+                    .put("index.analysis.filter.my_plurals.language", "plural_english")
+                    .put("index.analysis.analyzer.my_plurals.tokenizer","whitespace")
+                    .put("index.analysis.analyzer.my_plurals.filter","my_plurals")
+                    .put(SETTING_VERSION_CREATED,v)
+                    .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                    .build();
+
+            ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN);
+            TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_plurals");
+            assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));
+            Tokenizer tokenizer = new WhitespaceTokenizer();
+            tokenizer.setReader(new StringReader("dresses"));
+            TokenStream create = tokenFilter.create(tokenizer);
+            IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
+            NamedAnalyzer analyzer = indexAnalyzers.get("my_plurals");
+            assertThat(create, instanceOf(EnglishPluralStemFilter.class));
+            assertAnalyzesTo(analyzer, "phones", new String[]{"phone"});
+            assertAnalyzesTo(analyzer, "horses", new String[]{"horse"});
+            assertAnalyzesTo(analyzer, "dresses", new String[]{"dress"});
+            assertAnalyzesTo(analyzer, "watches", new String[]{"watch"});
+            assertAnalyzesTo(analyzer, "possess", new String[]{"possess"});
+            assertAnalyzesTo(analyzer, "possesses", new String[]{"possess"});
+            assertAnalyzesTo(analyzer, "boxes", new String[]{"box"});
+            assertAnalyzesTo(analyzer, "axes", new String[]{"axe"});
+            assertAnalyzesTo(analyzer, "dishes", new String[]{"dish"});
+        }
+    }    
 
     public void testMultipleLanguagesThrowsException() throws IOException {
         Version v = VersionUtils.randomVersion(random());

From 9682315e4ec8291e394a5be709da5b941be790c0 Mon Sep 17 00:00:00 2001
From: markharwood <markharwood@gmail.com>
Date: Mon, 17 Jun 2019 11:49:25 +0100
Subject: [PATCH 2/7] Added ees->ee stemming so that bees matches bee. Made
 ies->y stemming stricter so short words match, e.g. ties==tie. Removed
 special-case code for the crazy-rare word endings aies and eies

---
 .../common/EnglishPluralStemFilter.java       | 23 ++++++++----
 .../StemmerTokenFilterFactoryTests.java       | 36 ++++++++++++++++++-
 2 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
index 6d57400a3d226..d0a6c34395633 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
@@ -60,7 +60,13 @@ public boolean incrementToken() throws IOException {
      * <li>shes - "dishes" becomes "dish"</li>
      * <li>tches - "watches" becomes "watch"</li>
      * </ul>
-     * See https://github.com/elastic/elasticsearch/issues/42892
+     * See https://github.com/elastic/elasticsearch/issues/42892 
+     * <p>
+     * In addition the s stemmer logic is amended so that
+     * <ul>
+     * <li>ees->ee so that bees matches bee</li>
+     * <li>ies->y only on longer words to that ties matches tie</li>
+     * </ul>
      */
     public static class EnglishPlurallStemmer {
         @SuppressWarnings("fallthrough")
@@ -73,7 +79,11 @@ public int stem(char s[], int len) {
             case 's':
                 return len;
             case 'e':
-                if (len > 3 && s[len - 3] == 'i' && s[len - 4] != 'a' && s[len - 4] != 'e') {
+                // Modified ies->y logic from original s-stemmer - only work on strings > 4
+                // so spies -> spy still but pies->pie.
+                // The original code also special-cased aies and eies for no good reason as far as I can tell.
+                // ( no words of consequence - eg http://www.thefreedictionary.com/words-that-end-in-aies )
+                if (len > 4 && s[len - 3] == 'i') {
                     s[len - 3] = 'y';
                     return len - 2;
                 }
@@ -89,7 +99,7 @@ public int stem(char s[], int len) {
                         if (s[len -4] == 's' && (s[len -3] == 'h' || s[len -3] == 's')){
                             return len - 2;
                         }
-                        // tches
+                        // tches (TODO consider just ches? Gains: lunches == lunch, losses: moustaches!= moustache
                         if (len > 5) {
                             if (s[len -5] == 't' && s[len -4] == 'c' && s[len -3] == 'h' ){
                                 return len - 2;
@@ -98,9 +108,10 @@ public int stem(char s[], int len) {
                     }
                 }
                 
-                if (s[len - 3] == 'i' || s[len - 3] == 'a' || s[len - 3] == 'o' || s[len - 3] == 'e')
-                    return len; /* intentional fallthrough */
-                if (s[len - 3] == 'i' || s[len - 3] == 'a' || s[len - 3] == 'o' || s[len - 3] == 'e')
+                // oes condition below is taken from original s-stemmer and is a cop-out because there are too many special cases 
+                // e.g. shoes->shoe but heroes->hero so just doesn't try stem these words at all.
+                // TODO Would be good to find a heuristic for stemming here (see https://howtospell.co.uk/making-O-words-plural )
+                if (  s[len - 3] == 'o')
                     return len; /* intentional fallthrough */
             default:
                 return len - 1;
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
index e56820f5709d8..64d42bd267def 100644
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
@@ -121,15 +121,49 @@ public void testEnglishPluralFilter() throws IOException {
             IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
             NamedAnalyzer analyzer = indexAnalyzers.get("my_plurals");
             assertThat(create, instanceOf(EnglishPluralStemFilter.class));
+
+            // Check old EnglishMinimalStemmer ("S" stemmer) logic
             assertAnalyzesTo(analyzer, "phones", new String[]{"phone"});
             assertAnalyzesTo(analyzer, "horses", new String[]{"horse"});
+            assertAnalyzesTo(analyzer, "cameras", new String[]{"camera"});
+            
+            // TODO The orginal s stemmer gives up on stemming oes words because English has no fixed rule for the stem
+            // (see https://howtospell.co.uk/making-O-words-plural )
+            // Would be good to find a heuristic for stemming oes words.
+            assertAnalyzesTo(analyzer, "toes", new String[]{"toes"});
+            assertAnalyzesTo(analyzer, "shoes", new String[]{"shoes"});
+            assertAnalyzesTo(analyzer, "heroes", new String[]{"heroes"});
+
+            // Check improved EnglishPluralStemFilter logic
+            //sses
             assertAnalyzesTo(analyzer, "dresses", new String[]{"dress"});
-            assertAnalyzesTo(analyzer, "watches", new String[]{"watch"});
             assertAnalyzesTo(analyzer, "possess", new String[]{"possess"});
             assertAnalyzesTo(analyzer, "possesses", new String[]{"possess"});
+            // xes
             assertAnalyzesTo(analyzer, "boxes", new String[]{"box"});
             assertAnalyzesTo(analyzer, "axes", new String[]{"axe"});
+            //shes
             assertAnalyzesTo(analyzer, "dishes", new String[]{"dish"});
+            assertAnalyzesTo(analyzer, "washes", new String[]{"wash"});
+            //ees
+            assertAnalyzesTo(analyzer, "employees", new String[]{"employee"});
+            assertAnalyzesTo(analyzer, "bees", new String[]{"bee"});
+            //tch
+            assertAnalyzesTo(analyzer, "watches", new String[]{"watch"});
+            assertAnalyzesTo(analyzer, "itches", new String[]{"itch"});
+            // ies->y but only for length >4
+            assertAnalyzesTo(analyzer, "spies", new String[]{"spy"});
+            assertAnalyzesTo(analyzer, "ties", new String[]{"tie"});
+            assertAnalyzesTo(analyzer, "lies", new String[]{"lie"});
+            assertAnalyzesTo(analyzer, "pies", new String[]{"pie"});
+            assertAnalyzesTo(analyzer, "dies", new String[]{"die"});
+
+            
+            // *CHES - would be good to find a simple rule that solves lunches, churches but doesn't break aches
+            // documenting current behaviour here as a known issue:
+            assertAnalyzesTo(analyzer, "lunches", new String[]{"lunche"});
+            assertAnalyzesTo(analyzer, "avalanches", new String[]{"avalanche"});
+            assertAnalyzesTo(analyzer, "headaches", new String[]{"headache"});
         }
     }    
 

From 3dbd2f6fc4528b159c6f32baac04346a5603bb71 Mon Sep 17 00:00:00 2001
From: markharwood <markharwood@gmail.com>
Date: Mon, 17 Jun 2019 11:54:52 +0100
Subject: [PATCH 3/7] Javadoc fix

---
 .../analysis/common/EnglishPluralStemFilter.java              | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
index d0a6c34395633..1b3af0b625740 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
@@ -64,8 +64,8 @@ public boolean incrementToken() throws IOException {
      * <p>
      * In addition the s stemmer logic is amended so that
      * <ul>
-     * <li>ees->ee so that bees matches bee</li>
-     * <li>ies->y only on longer words to that ties matches tie</li>
+     * <li>ees-&gt;ee so that bees matches bee</li>
+     * <li>ies-&gt;y only on longer words to that ties matches tie</li>
      * </ul>
      */
     public static class EnglishPlurallStemmer {

From 18fcf11e95bbf23c5767f3da0a8e7acd3bb19670 Mon Sep 17 00:00:00 2001
From: markharwood <markharwood@gmail.com>
Date: Mon, 17 Jun 2019 15:22:04 +0100
Subject: [PATCH 4/7] Added support for oes -> o suffixes with a small
 exception list for oes -> oe, e.g. shoes

---
 .../common/EnglishPluralStemFilter.java       | 52 +++++++++++++++----
 .../StemmerTokenFilterFactoryTests.java       | 14 +++--
 2 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
index 1b3af0b625740..32af6d2ef2263 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
@@ -28,7 +28,7 @@
 import java.io.IOException;
 
 public final class EnglishPluralStemFilter extends TokenFilter {
-    private final EnglishPlurallStemmer stemmer = new EnglishPlurallStemmer();
+    private final EnglishPluralStemmer stemmer = new EnglishPluralStemmer();
     private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
     private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
 
@@ -66,9 +66,18 @@ public boolean incrementToken() throws IOException {
      * <ul>
      * <li>ees-&gt;ee so that bees matches bee</li>
      * <li>ies-&gt;y only on longer words to that ties matches tie</li>
+     * <li>oes-&gt;o rule so that tomatoes matches tomato but retains e for some words eg shoes to shoe</li>
      * </ul>
      */
-    public static class EnglishPlurallStemmer {
+    public static class EnglishPluralStemmer {
+        
+        // Words ending in oes that retain the e when stemmed 
+        public static final char [][] oesExceptions = { 
+                "shoes".toCharArray(), 
+                "canoes".toCharArray(), 
+                "oboes".toCharArray() 
+                }; 
+        
         @SuppressWarnings("fallthrough")
         public int stem(char s[], int len) {
             if (len < 3 || s[len - 1] != 's')
@@ -94,11 +103,21 @@ public int stem(char s[], int len) {
                     if (len > 4 && s[len -3] == 'x') {
                         return len - 2;
                     }
-                    // shes/sses
+                    // oes
+                    if (len > 3 && s[len -3] == 'o') {
+                        if (isOesException(s, len)) {
+                            // Only remove the S
+                            return len -1;
+                        }
+                        // Remove the es 
+                        return len - 2;
+                    }                    
                     if (len > 4) {
+                        // shes/sses
                         if (s[len -4] == 's' && (s[len -3] == 'h' || s[len -3] == 's')){
                             return len - 2;
                         }
+                        
                         // tches (TODO consider just ches? Gains: lunches == lunch, losses: moustaches!= moustache
                         if (len > 5) {
                             if (s[len -5] == 't' && s[len -4] == 'c' && s[len -3] == 'h' ){
@@ -107,16 +126,31 @@ public int stem(char s[], int len) {
                         }                        
                     }
                 }
-                
-                // oes condition below is taken from original s-stemmer and is a cop-out because there are too many special cases 
-                // e.g. shoes->shoe but heroes->hero so just doesn't try stem these words at all.
-                // TODO Would be good to find a heuristic for stemming here (see https://howtospell.co.uk/making-O-words-plural )
-                if (  s[len - 3] == 'o')
-                    return len; /* intentional fallthrough */
+
             default:
                 return len - 1;
             }
         }
+
+        private final boolean isOesException(char[] s, int len) {
+            for (char[] oesRule : oesExceptions) {
+                int rulePos = oesRule.length - 1;
+                int sPos = len - 1;
+                boolean matched = true;
+                while (rulePos >= 0 && sPos >= 0) {
+                    if (oesRule[rulePos] != s[sPos]) {
+                        matched = false;
+                        break;
+                    }
+                    rulePos--;
+                    sPos--;
+                }
+                if (matched) {
+                    return true;
+                }
+            }
+            return false;
+        }
     }
 
 }
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
index 64d42bd267def..49c86ba6e87f0 100644
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
@@ -127,12 +127,16 @@ public void testEnglishPluralFilter() throws IOException {
             assertAnalyzesTo(analyzer, "horses", new String[]{"horse"});
             assertAnalyzesTo(analyzer, "cameras", new String[]{"camera"});
             
-            // TODO The orginal s stemmer gives up on stemming oes words because English has no fixed rule for the stem
+            // The orginal s stemmer gives up on stemming oes words because English has no fixed rule for the stem
             // (see https://howtospell.co.uk/making-O-words-plural )
-            // Would be good to find a heuristic for stemming oes words.
-            assertAnalyzesTo(analyzer, "toes", new String[]{"toes"});
-            assertAnalyzesTo(analyzer, "shoes", new String[]{"shoes"});
-            assertAnalyzesTo(analyzer, "heroes", new String[]{"heroes"});
+            // This stemmer removes the es but retains e for a small number of exceptions 
+            assertAnalyzesTo(analyzer, "mosquitoes", new String[]{"mosquito"});
+            assertAnalyzesTo(analyzer, "heroes", new String[]{"hero"});
+            // oes exceptions that retain the e.
+            assertAnalyzesTo(analyzer, "shoes", new String[]{"shoe"});
+            assertAnalyzesTo(analyzer, "horseshoes", new String[]{"horseshoe"});
+            assertAnalyzesTo(analyzer, "canoes", new String[]{"canoe"});
+            assertAnalyzesTo(analyzer, "oboes", new String[]{"oboe"});
 
             // Check improved EnglishPluralStemFilter logic
             //sses

From 942a7aff586ee4b896d5e3a209115fd0d4724e2d Mon Sep 17 00:00:00 2001
From: markharwood <markharwood@gmail.com>
Date: Mon, 17 Jun 2019 15:53:35 +0100
Subject: [PATCH 5/7] Remove redundant modifier

---
 .../elasticsearch/analysis/common/EnglishPluralStemFilter.java  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
index 32af6d2ef2263..eb8c2bc5ba0ed 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
@@ -132,7 +132,7 @@ public int stem(char s[], int len) {
             }
         }
 
-        private final boolean isOesException(char[] s, int len) {
+        private boolean isOesException(char[] s, int len) {
             for (char[] oesRule : oesExceptions) {
                 int rulePos = oesRule.length - 1;
                 int sPos = len - 1;

From 0171b4684f9b912c6963c6ec443fa6aa1440a459 Mon Sep 17 00:00:00 2001
From: markharwood <markharwood@gmail.com>
Date: Mon, 17 Jun 2019 15:59:38 +0100
Subject: [PATCH 6/7] Added docs

---
 .../reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc
index b5d5426ff2710..1e82b2f47417a 100644
--- a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc
@@ -84,6 +84,7 @@ English::
 http://snowball.tartarus.org/algorithms/porter/stemmer.html[*`english`*],
 http://ciir.cs.umass.edu/pubfiles/ir-35.pdf[`light_english`],
 http://www.researchgate.net/publication/220433848_How_effective_is_suffixing[`minimal_english`],
+https://github.com/elastic/elasticsearch/issues/42892[`plural_english`],
 http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/en/EnglishPossessiveFilter.html[`possessive_english`],
 http://snowball.tartarus.org/algorithms/english/stemmer.html[`porter2`],
 http://snowball.tartarus.org/algorithms/lovins/stemmer.html[`lovins`]

From 3c76b8b0f52aa1ab4b6c7f59c864ab69cd2367a0 Mon Sep 17 00:00:00 2001
From: markharwood <markharwood@gmail.com>
Date: Thu, 4 Jul 2019 16:03:30 +0100
Subject: [PATCH 7/7] Added support for ches suffix with exceptions for words
 like avalanche

---
 .../common/EnglishPluralStemFilter.java       | 31 +++++++++++++++----
 .../StemmerTokenFilterFactoryTests.java       | 11 +++++--
 2 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
index eb8c2bc5ba0ed..98e0936dc0faa 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EnglishPluralStemFilter.java
@@ -77,6 +77,19 @@ public static class EnglishPluralStemmer {
                 "canoes".toCharArray(), 
                 "oboes".toCharArray() 
                 }; 
+        // Words ending in ches that retain the e when stemmed 
+        public static final char [][] chesExceptions = { 
+                "cliches".toCharArray(), 
+                "avalanches".toCharArray(), 
+                "mustaches".toCharArray(), 
+                "moustaches".toCharArray(), 
+                "quiches".toCharArray(), 
+                "headaches".toCharArray(), 
+                "heartaches".toCharArray(), 
+                "porsches".toCharArray(), 
+                "tranches".toCharArray(), 
+                "caches".toCharArray() 
+                }; 
         
         @SuppressWarnings("fallthrough")
         public int stem(char s[], int len) {
@@ -105,7 +118,7 @@ public int stem(char s[], int len) {
                     }
                     // oes
                     if (len > 3 && s[len -3] == 'o') {
-                        if (isOesException(s, len)) {
+                        if (isException(s, len, oesExceptions)) {
                             // Only remove the S
                             return len -1;
                         }
@@ -118,10 +131,16 @@ public int stem(char s[], int len) {
                             return len - 2;
                         }
                         
-                        // tches (TODO consider just ches? Gains: lunches == lunch, losses: moustaches!= moustache
-                        if (len > 5) {
-                            if (s[len -5] == 't' && s[len -4] == 'c' && s[len -3] == 'h' ){
+                        // ches
+                        if (len > 4) {
+                            if (s[len -4] == 'c' && s[len -3] == 'h' ){
+                                if (isException(s, len, chesExceptions)) {
+                                    // Only remove the S
+                                    return len -1;
+                                }
+                                // Remove the es 
                                 return len - 2;
+
                             }                            
                         }                        
                     }
@@ -132,8 +151,8 @@ public int stem(char s[], int len) {
             }
         }
 
-        private boolean isOesException(char[] s, int len) {
-            for (char[] oesRule : oesExceptions) {
+        private boolean isException(char[] s, int len, char [][] exceptionsList) {
+            for (char[] oesRule : exceptionsList) {
                 int rulePos = oesRule.length - 1;
                 int sPos = len - 1;
                 boolean matched = true;
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
index 49c86ba6e87f0..c4f598dea2f73 100644
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java
@@ -163,11 +163,16 @@ public void testEnglishPluralFilter() throws IOException {
             assertAnalyzesTo(analyzer, "dies", new String[]{"die"});
 
             
-            // *CHES - would be good to find a simple rule that solves lunches, churches but doesn't break aches
-            // documenting current behaviour here as a known issue:
-            assertAnalyzesTo(analyzer, "lunches", new String[]{"lunche"});
+            assertAnalyzesTo(analyzer, "lunches", new String[]{"lunch"});
             assertAnalyzesTo(analyzer, "avalanches", new String[]{"avalanche"});
             assertAnalyzesTo(analyzer, "headaches", new String[]{"headache"});
+            assertAnalyzesTo(analyzer, "caches", new String[]{"cache"});
+            assertAnalyzesTo(analyzer, "beaches", new String[]{"beach"});
+            assertAnalyzesTo(analyzer, "britches", new String[]{"britch"});
+            assertAnalyzesTo(analyzer, "cockroaches", new String[]{"cockroach"});
+            assertAnalyzesTo(analyzer, "cliches", new String[]{"cliche"});
+            assertAnalyzesTo(analyzer, "quiches", new String[]{"quiche"});
+            
         }
     }