From e0a0bccbab4e40e2849901ee567e0294d7e3984b Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Thu, 23 Feb 2017 15:52:53 +0100 Subject: [PATCH 1/2] Expose WordDelimiterGraphTokenFilter This change exposes the new Lucene graph-based word delimiter token filter in the analysis filters. Unlike the `word_delimiter` filter, the token filter named `word_delimiter_graph` correctly handles multi-term expansion at query time. Closes #23104 --- .../WordDelimiterGraphTokenFilterFactory.java | 4 + .../indices/analysis/AnalysisModule.java | 2 + .../analysis/PreBuiltTokenFilters.java | 13 + .../synonym/SynonymGraphFilterTests.java | 1074 ----------------- ...rdDelimiterTokenFilterFactoryTestCase.java | 158 +++ ...DelimiterGraphTokenFilterFactoryTests.java | 56 + .../WordDelimiterTokenFilterFactoryTests.java | 129 +- .../word-delimiter-graph-tokenfilter.asciidoc | 97 ++ 8 files changed, 333 insertions(+), 1200 deletions(-) delete mode 100644 core/src/test/java/org/apache/lucene/analysis/synonym/SynonymGraphFilterTests.java create mode 100644 core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java create mode 100644 core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java create mode 100644 docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc diff --git a/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java index 7cdc215f1b34a..20fc7f0ad9c81 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java @@ -98,4 +98,8 @@ private int getFlag(int flag, Settings settings, String key, boolean defaultValu } return 0; } + + int getFlags() { + return flags; + } } diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index 1aaf3077aea93..61950942e6076 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -140,6 +140,7 @@ import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory; import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider; import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory; +import org.elasticsearch.index.analysis.WordDelimiterGraphTokenFilterFactory; import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory; import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory; import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory; @@ -225,6 +226,7 @@ private NamedRegistry> setupTokenFilters(Li tokenFilters.register("snowball", SnowballTokenFilterFactory::new); tokenFilters.register("stemmer", StemmerTokenFilterFactory::new); tokenFilters.register("word_delimiter", WordDelimiterTokenFilterFactory::new); + tokenFilters.register("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new); tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new); tokenFilters.register("elision", ElisionTokenFilterFactory::new); tokenFilters.register("flatten_graph", FlattenGraphTokenFilterFactory::new); diff --git
a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java index 53e79cb9dfe4d..6c58ab884db27 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java @@ -51,6 +51,7 @@ import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter; import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.ngram.NGramTokenFilter; import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; @@ -87,6 +88,18 @@ public TokenStream create(TokenStream tokenStream, Version version) { } }, + WORD_DELIMITER_GRAPH(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new WordDelimiterGraphFilter(tokenStream, + WordDelimiterGraphFilter.GENERATE_WORD_PARTS | + WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS | + WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE | + WordDelimiterGraphFilter.SPLIT_ON_NUMERICS | + WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null); + } + }, + STOP(CachingStrategy.LUCENE) { @Override public TokenStream create(TokenStream tokenStream, Version version) { diff --git a/core/src/test/java/org/apache/lucene/analysis/synonym/SynonymGraphFilterTests.java b/core/src/test/java/org/apache/lucene/analysis/synonym/SynonymGraphFilterTests.java deleted file mode 100644 index fafe8a954c850..0000000000000 --- a/core/src/test/java/org/apache/lucene/analysis/synonym/SynonymGraphFilterTests.java +++ /dev/null @@ -1,1074 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.lucene.analysis.synonym; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.CannedTokenStream; -import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.analysis.MockGraphTokenFilter; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.TokenStreamToAutomaton; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.RandomIndexWriter; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.TokenStreamToTermAutomatonQuery; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.CharsRefBuilder; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.TestUtil; -import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.AutomatonTestUtil; -import org.apache.lucene.util.automaton.Operations; -import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; -import org.apache.lucene.util.automaton.Transition; -import org.apache.lucene.util.fst.Util; - -import java.io.IOException; -import java.io.StringReader; -import java.text.ParseException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -public class SynonymGraphFilterTests extends BaseTokenStreamTestCase { - - /** - * Set a side effect by {@link #getAnalyzer}. 
- */ - private SynonymGraphFilter synFilter; - - // LUCENE-6664 - public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, String[] types, int[] posIncrements, int[] - posLengths) throws IOException { - assertAnalyzesTo(a, input, output, null, null, types, posIncrements, posLengths); - } - - public void testBasicKeepOrigOneOutput() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b", new String[]{"c", "x", "a", "b"}, new int[]{0, 2, 2, 4}, new int[]{1, 5, 3, 5}, new String[]{"word", - "SYNONYM", "word", "word"}, new int[]{1, 1, 0, 1}, new int[]{1, 2, 1, 1}); - a.close(); - } - - public void testMixedKeepOrig() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - add(b, "e f", "y", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b c e f g", new String[]{"c", "x", "a", "b", "c", "y", "g"}, new int[]{0, 2, 2, 4, 6, 8, 12}, new - int[]{1, 5, 3, 5, 7, 11, 13}, new String[]{"word", "SYNONYM", "word", "word", "word", "SYNONYM", "word"}, new - int[]{1, 1, 0, - 1, 1, 1, 1}, new int[]{1, 2, 1, 1, 1, 1, 1}); - a.close(); - } - - public void testNoParseAfterBuffer() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "b a", "x", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "b b b", new String[]{"b", "b", "b"}, new int[]{0, 2, 4}, new int[]{1, 3, 5}, new String[]{"word", "word", - "word"}, new int[]{1, 1, 1}, new int[]{1, 1, 1}); - a.close(); - } - - public void testOneInputMultipleOutputKeepOrig() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - add(b, "a b", "y", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b c", new String[]{"c", "x", "y", "a", "b", "c"}, new int[]{0, 2, 2, 2, 4, 6}, new int[]{1, 5, 5, 3, 5, - 7}, new String[]{"word", "SYNONYM", "SYNONYM", "word", "word", "word"}, new int[]{1, 1, 0, 0, 1, 1, 1, 1}, new - int[]{1, 2, 2, - 1, 1, 1, 1, 1}); - a.close(); - } - - /** - * parse a syn file with bad syntax - */ - public void testInvalidAnalyzesToNothingOutput() throws Exception { - String testFile = "a => 1"; - Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, false); - SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); - try { - parser.parse(new StringReader(testFile)); - fail("didn't get expected exception"); - } catch (ParseException expected) { - // expected exc - } - analyzer.close(); - } - - /** - * parse a syn file with bad syntax - */ - public void testInvalidDoubleMap() throws Exception { - String testFile = "a => b => c"; - Analyzer analyzer = new MockAnalyzer(random()); - SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); - try { - parser.parse(new StringReader(testFile)); - fail("didn't get expected exception"); - } catch (ParseException expected) { - // expected exc - } - analyzer.close(); - } - - public void testMoreThanOneLookAhead() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b c d", "x", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "a b c e", new String[]{"a", "b", "c", "e"}, new int[]{0, 2, 4, 6}, new int[]{1, 3, 5, 7}, new - String[]{"word", "word", "word", "word"}, new int[]{1, 1, 1, 1}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testLookaheadAfterParse() throws Exception { - 
SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "b b", "x", true); - add(b, "b", "y", true); - - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "b a b b", new String[]{"y", "b", "a", "x", "b", "b"}, new int[]{0, 0, 2, 4, 4, 6}, new int[]{1, 1, 3, 7, 5, - 7}, null, new int[]{1, 0, 1, 1, 0, 1}, new int[]{1, 1, 1, 2, 1, 1}, true); - } - - public void testLookaheadSecondParse() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "b b b", "x", true); - add(b, "b", "y", true); - - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "b b", new String[]{"y", "b", "y", "b"}, new int[]{0, 0, 2, 2}, new int[]{1, 1, 3, 3}, null, new int[]{1, 0, - 1, 0}, new int[]{1, 1, 1, 1}, true); - } - - public void testOneInputMultipleOutputNoKeepOrig() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", false); - add(b, "a b", "y", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b c", new String[]{"c", "x", "y", "c"}, new int[]{0, 2, 2, 6}, new int[]{1, 5, 5, 7}, new - String[]{"word", "SYNONYM", "SYNONYM", "word"}, new int[]{1, 1, 0, 1}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testOneInputMultipleOutputMixedKeepOrig() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - add(b, "a b", "y", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b c", new String[]{"c", "x", "y", "a", "b", "c"}, new int[]{0, 2, 2, 2, 4, 6}, new int[]{1, 5, 5, 3, 5, - 7}, new String[]{"word", "SYNONYM", "SYNONYM", "word", "word", "word"}, new int[]{1, 1, 0, 0, 1, 1, 1, 1}, new - int[]{1, 2, 2, - 1, 1, 1, 1, 1}); - a.close(); - } - - public void testSynAtEnd() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c d e a b", new String[]{"c", "d", "e", "x", "a", "b"}, new int[]{0, 2, 4, 6, 6, 8}, new int[]{1, 3, 5, 9, - 7, 9}, new String[]{"word", "word", "word", "SYNONYM", "word", "word"}, new int[]{1, 1, 1, 1, 0, 1}, new int[]{1, 1, 1, - 2, 1, - 1}); - a.close(); - } - - public void testTwoSynsInARow() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a", "x", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a a b", new String[]{"c", "x", "x", "b"}, new int[]{0, 2, 4, 6}, new int[]{1, 3, 5, 7}, new - String[]{"word", "SYNONYM", "SYNONYM", "word"}, new int[]{1, 1, 1, 1}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testBasicKeepOrigTwoOutputs() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x y", true); - add(b, "a b", "m n o", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b d", new String[]{"c", "x", "m", "a", "y", "n", "o", "b", "d"}, new int[]{0, 2, 2, 2, 2, 2, 2, 4, 6}, - new int[]{1, 5, 5, 3, 5, 5, 5, 5, 7}, new String[]{"word", "SYNONYM", "SYNONYM", "word", "SYNONYM", - "SYNONYM", "SYNONYM", - "word", "word"}, new int[]{1, 1, 0, 0, 1, 1, 1, 1, 1}, new int[]{1, 1, 2, 4, 4, 1, 2, 1, 1}); - a.close(); - } - - public void testNoCaptureIfNoMatch() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x y", true); - - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "c d d", new String[]{"c", "d", "d"}, new int[]{0, 2, 4}, new int[]{1, 3, 5}, new String[]{"word", "word", - "word"}, new int[]{1, 1, 1}, new int[]{1, 1, 1}); - 
assertEquals(0, synFilter.getCaptureCount()); - a.close(); - } - - public void testBasicNotKeepOrigOneOutput() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b", new String[]{"c", "x"}, new int[]{0, 2}, new int[]{1, 5}, new String[]{"word", "SYNONYM"}, new - int[]{1, 1}, new int[]{1, 1}); - a.close(); - } - - public void testBasicNoKeepOrigTwoOutputs() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x y", false); - add(b, "a b", "m n o", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b d", new String[]{"c", "x", "m", "y", "n", "o", "d"}, new int[]{0, 2, 2, 2, 2, 2, 6}, new int[]{1, 5, - 5, 5, 5, 5, 7}, new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", - "word"}, new int[]{1, 1, 0, 1, 1, - 1, 1}, new int[]{1, 1, 2, 3, 1, 1, 1}); - a.close(); - } - - public void testIgnoreCase() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x y", false); - add(b, "a b", "m n o", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c A B D", new String[]{"c", "x", "m", "y", "n", "o", "D"}, new int[]{0, 2, 2, 2, 2, 2, 6}, new int[]{1, 5, - 5, 5, 5, 5, 7}, new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", - "word"}, new int[]{1, 1, 0, 1, 1, - 1, 1}, new int[]{1, 1, 2, 3, 1, 1, 1}); - a.close(); - } - - public void testDoNotIgnoreCase() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x y", false); - add(b, "a b", "m n o", false); - - Analyzer a = getAnalyzer(b, false); - assertAnalyzesTo(a, "c A B D", new String[]{"c", "A", "B", "D"}, new int[]{0, 2, 4, 6}, new int[]{1, 3, 5, 7}, new - String[]{"word", "word", "word", "word"}, new int[]{1, 1, 1, 1}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testBufferedFinish1() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b c", "m n o", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b", new String[]{"c", "a", "b"}, new int[]{0, 2, 4}, new int[]{1, 3, 5}, new String[]{"word", "word", - "word"}, new int[]{1, 1, 1}, new int[]{1, 1, 1}); - a.close(); - } - - public void testBufferedFinish2() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "m n o", false); - add(b, "d e", "m n o", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a d", new String[]{"c", "a", "d"}, new int[]{0, 2, 4}, new int[]{1, 3, 5}, new String[]{"word", "word", - "word"}, new int[]{1, 1, 1}, new int[]{1, 1, 1}); - a.close(); - } - - public void testCanReuse() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - Analyzer a = getAnalyzer(b, true); - for (int i = 0; i < 10; i++) { - assertAnalyzesTo(a, "c a b", new String[]{"c", "x", "a", "b"}, new int[]{0, 2, 2, 4}, new int[]{1, 5, 3, 5}, new - String[]{"word", "SYNONYM", "word", "word"}, new int[]{1, 1, 0, 1}, new int[]{1, 2, 1, 1}); - } - a.close(); - } - - /** - * Multiple input tokens map to a single output token - */ - public void testManyToOne() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b c", "z", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "a b c d", new String[]{"z", "a", "b", "c", "d"}, new int[]{0, 0, 2, 4, 6}, new int[]{5, 1, 3, 5, 7}, new - 
String[]{"SYNONYM", "word", "word", "word", "word"}, new int[]{1, 0, 1, 1, 1}, new int[]{3, 1, 1, 1, 1}); - a.close(); - } - - public void testBufferAfterMatch() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b c d", "x", true); - add(b, "a b", "y", false); - - // The 'c' token has to be buffered because SynGraphFilter - // needs to know whether a b c d -> x matches: - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "f a b c e", new String[]{"f", "y", "c", "e"}, new int[]{0, 2, 6, 8}, new int[]{1, 5, 7, 9}, new - String[]{"word", "SYNONYM", "word", "word"}, new int[]{1, 1, 1, 1}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testZeroSyns() throws Exception { - Tokenizer tokenizer = new MockTokenizer(); - tokenizer.setReader(new StringReader("aa bb")); - try { - new SynonymGraphFilter(tokenizer, new SynonymMap.Builder(true).build(), true); - fail("did not hit expected exception"); - } catch (IllegalArgumentException iae) { - // expected - assertEquals("fst must be non-null", iae.getMessage()); - } - } - - // Needs TermAutomatonQuery, which is in sandbox still: - public void testAccurateGraphQuery1() throws Exception { - Directory dir = newDirectory(); - RandomIndexWriter w = new RandomIndexWriter(random(), dir); - Document doc = new Document(); - doc.add(newTextField("field", "wtf happened", Field.Store.NO)); - w.addDocument(doc); - IndexReader r = w.getReader(); - w.close(); - - IndexSearcher s = newSearcher(r); - - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "what the fudge", "wtf", true); - - SynonymMap map = b.build(); - - TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery(); - - - TokenStream in = new CannedTokenStream(0, 23, token("what", 1, 1, 0, 4), token("the", 1, 1, 5, 8), token("fudge", 1, 1, 9, 14), - token("happened", 1, 1, 15, 23)); - - assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - in = new CannedTokenStream(0, 12, token("wtf", 1, 1, 0, 3), token("happened", 1, 1, 4, 12)); - - assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - // "what happened" should NOT match: - in = new CannedTokenStream(0, 13, token("what", 1, 1, 0, 4), token("happened", 1, 1, 5, 13)); - assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - IOUtils.close(r, dir); - } - - - /** - * If we expand synonyms at search time, the results are correct. 
- */ - // Needs TermAutomatonQuery, which is in sandbox still: - public void testAccurateGraphQuery2() throws Exception { - Directory dir = newDirectory(); - RandomIndexWriter w = new RandomIndexWriter(random(), dir); - Document doc = new Document(); - doc.add(newTextField("field", "say wtf happened", Field.Store.NO)); - w.addDocument(doc); - IndexReader r = w.getReader(); - w.close(); - - IndexSearcher s = newSearcher(r); - - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "fudge", "chocolate", true); - add(b, "what the fudge", "wtf", true); - add(b, "what the", "wut", true); - add(b, "say", "say what", true); - - SynonymMap map = b.build(); - - TokenStream in = new CannedTokenStream(0, 26, token("say", 1, 1, 0, 3), token("what", 1, 1, 3, 7), token("the", 1, 1, 8, 11), - token("fudge", 1, 1, 12, 17), token("happened", 1, 1, 18, 26)); - - TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery(); - - assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - // "what happened" should NOT match: - in = new CannedTokenStream(0, 13, token("what", 1, 1, 0, 4), token("happened", 1, 1, 5, 13)); - assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - IOUtils.close(r, dir); - } - - - // Needs TermAutomatonQuery, which is in sandbox still: - public void testAccurateGraphQuery3() throws Exception { - Directory dir = newDirectory(); - RandomIndexWriter w = new RandomIndexWriter(random(), dir); - Document doc = new Document(); - doc.add(newTextField("field", "say what the fudge happened", Field.Store.NO)); - w.addDocument(doc); - IndexReader r = w.getReader(); - w.close(); - - IndexSearcher s = newSearcher(r); - - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "wtf", "what the fudge", true); - - SynonymMap map = b.build(); - - TokenStream in = new CannedTokenStream(0, 15, token("say", 1, 1, 0, 3), token("wtf", 1, 1, 3, 6), token("happened", 1, 1, 7, 15)); - - TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery(); - - assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - // "what happened" should NOT match: - in = new CannedTokenStream(0, 13, token("what", 1, 1, 0, 4), token("happened", 1, 1, 5, 13)); - assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - IOUtils.close(r, dir); - } - - private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) { - final Token t = new Token(term, startOffset, endOffset); - t.setPositionIncrement(posInc); - t.setPositionLength(posLength); - return t; - } - - private String randomNonEmptyString() { - while (true) { - String s = TestUtil.randomUnicodeString(random()).trim(); - //String s = TestUtil.randomSimpleString(random()).trim(); - if (s.length() != 0 && s.indexOf('\u0000') == -1) { - return s; - } - } - } - - // Adds MockGraphTokenFilter after SynFilter: - public void testRandomGraphAfter() throws Exception { - final int numIters = atLeast(3); - for (int i = 0; i < numIters; i++) { - SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); - final int numEntries = atLeast(10); - for (int j = 0; j < numEntries; j++) { - add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); - } - final SynonymMap map = b.build(); - final boolean ignoreCase = random().nextBoolean(); - - final Analyzer analyzer = new Analyzer() { - @Override - protected TokenStreamComponents 
createComponents(String fieldName) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true); - TokenStream syns = new SynonymGraphFilter(tokenizer, map, ignoreCase); - TokenStream graph = new MockGraphTokenFilter(random(), syns); - return new TokenStreamComponents(tokenizer, graph); - } - }; - - checkRandomData(random(), analyzer, 100); - analyzer.close(); - } - } - - public void testEmptyStringInput() throws IOException { - final int numIters = atLeast(10); - for (int i = 0; i < numIters; i++) { - SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); - final int numEntries = atLeast(10); - for (int j = 0; j < numEntries; j++) { - add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); - } - final boolean ignoreCase = random().nextBoolean(); - - Analyzer analyzer = getAnalyzer(b, ignoreCase); - - checkAnalysisConsistency(random(), analyzer, random().nextBoolean(), ""); - analyzer.close(); - } - } - - /** - * simple random test, doesn't verify correctness. - * does verify it doesnt throw exceptions, or that the stream doesn't misbehave - */ - public void testRandom2() throws Exception { - final int numIters = atLeast(3); - for (int i = 0; i < numIters; i++) { - SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); - final int numEntries = atLeast(10); - for (int j = 0; j < numEntries; j++) { - add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); - } - final boolean ignoreCase = random().nextBoolean(); - - Analyzer analyzer = getAnalyzer(b, ignoreCase); - checkRandomData(random(), analyzer, 100); - analyzer.close(); - } - } - - /** - * simple random test like testRandom2, but for larger docs - */ - public void testRandomHuge() throws Exception { - final int numIters = atLeast(3); - for (int i = 0; i < numIters; i++) { - SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); - final int numEntries = atLeast(10); - //if (VERBOSE) { - //System.out.println("TEST: iter=" + i + " numEntries=" + numEntries); - //} - for (int j = 0; j < numEntries; j++) { - add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); - } - final boolean ignoreCase = random().nextBoolean(); - - Analyzer analyzer = getAnalyzer(b, ignoreCase); - checkRandomData(random(), analyzer, 100, 1024); - analyzer.close(); - } - } - - public void testEmptyTerm() throws IOException { - final int numIters = atLeast(10); - for (int i = 0; i < numIters; i++) { - SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); - final int numEntries = atLeast(10); - for (int j = 0; j < numEntries; j++) { - add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); - } - final boolean ignoreCase = random().nextBoolean(); - - final Analyzer analyzer = getAnalyzer(b, ignoreCase); - - checkAnalysisConsistency(random(), analyzer, random().nextBoolean(), ""); - analyzer.close(); - } - } - - public void testBuilderDedup() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - final boolean keepOrig = false; - add(b, "a b", "ab", keepOrig); - add(b, "a b", "ab", keepOrig); - add(b, "a b", "ab", keepOrig); - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "a b", new String[]{"ab"}, new int[]{1}); - a.close(); - } - - public void testBuilderNoDedup() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(false); - final boolean keepOrig = false; - add(b, "a b", "ab", keepOrig); - add(b, "a b", "ab", keepOrig); - add(b, "a b", "ab", keepOrig); 
- Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "a b", new String[]{"ab", "ab", "ab"}, new int[]{1, 0, 0}); - a.close(); - } - - public void testRecursion1() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - final boolean keepOrig = false; - add(b, "zoo", "zoo", keepOrig); - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "zoo zoo $ zoo", new String[]{"zoo", "zoo", "$", "zoo"}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testRecursion2() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - final boolean keepOrig = false; - add(b, "zoo", "zoo", keepOrig); - add(b, "zoo", "zoo zoo", keepOrig); - Analyzer a = getAnalyzer(b, true); - - // verify("zoo zoo $ zoo", "zoo/zoo zoo/zoo/zoo $/zoo zoo/zoo zoo"); - assertAnalyzesTo(a, "zoo zoo $ zoo", new String[]{"zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo"}, new - int[]{1, 0, 1, 1, 0, 1, 1, 1, 0, 1}); - a.close(); - } - - public void testKeepOrig() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - final boolean keepOrig = true; - add(b, "a b", "ab", keepOrig); - add(b, "a c", "ac", keepOrig); - add(b, "a", "aa", keepOrig); - add(b, "b", "bb", keepOrig); - add(b, "z x c v", "zxcv", keepOrig); - add(b, "x c", "xc", keepOrig); - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "$", new String[]{"$"}, new int[]{1}); - assertAnalyzesTo(a, "a", new String[]{"aa", "a"}, new int[]{1, 0}); - assertAnalyzesTo(a, "a", new String[]{"aa", "a"}, new int[]{1, 0}); - assertAnalyzesTo(a, "$ a", new String[]{"$", "aa", "a"}, new int[]{1, 1, 0}); - assertAnalyzesTo(a, "a $", new String[]{"aa", "a", "$"}, new int[]{1, 0, 1}); - assertAnalyzesTo(a, "$ a !", new String[]{"$", "aa", "a", "!"}, new int[]{1, 1, 0, 1}); - assertAnalyzesTo(a, "a a", new String[]{"aa", "a", "aa", "a"}, new int[]{1, 0, 1, 0}); - assertAnalyzesTo(a, "b", new String[]{"bb", "b"}, new int[]{1, 0}); - assertAnalyzesTo(a, "z x c v", new String[]{"zxcv", "z", "x", "c", "v"}, new int[]{1, 0, 1, 1, 1}); - assertAnalyzesTo(a, "z x c $", new String[]{"z", "xc", "x", "c", "$"}, new int[]{1, 1, 0, 1, 1}); - a.close(); - } - - private Analyzer getAnalyzer(SynonymMap.Builder b, final boolean ignoreCase) throws IOException { - final SynonymMap map = b.build(); - return new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - // Make a local variable so testRandomHuge doesn't share it across threads! 
- SynonymGraphFilter synFilter = new SynonymGraphFilter(tokenizer, map, ignoreCase); - SynonymGraphFilterTests.this.synFilter = synFilter; - return new TokenStreamComponents(tokenizer, synFilter); - } - }; - } - - private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) { - if (VERBOSE) { - //System.out.println(" add input=" + input + " output=" + output + " keepOrig=" + keepOrig); - } - CharsRefBuilder inputCharsRef = new CharsRefBuilder(); - SynonymMap.Builder.join(input.split(" +"), inputCharsRef); - - CharsRefBuilder outputCharsRef = new CharsRefBuilder(); - SynonymMap.Builder.join(output.split(" +"), outputCharsRef); - - b.add(inputCharsRef.get(), outputCharsRef.get(), keepOrig); - } - - private char[] randomBinaryChars(int minLen, int maxLen, double bias, char base) { - int len = TestUtil.nextInt(random(), minLen, maxLen); - char[] chars = new char[len]; - for (int i = 0; i < len; i++) { - char ch; - if (random().nextDouble() < bias) { - ch = base; - } else { - ch = (char) (base + 1); - } - chars[i] = ch; - } - - return chars; - } - - private static String toTokenString(char[] chars) { - StringBuilder b = new StringBuilder(); - for (char c : chars) { - if (b.length() > 0) { - b.append(' '); - } - b.append(c); - } - return b.toString(); - } - - private static class OneSyn { - char[] in; - char[] out; - boolean keepOrig; - - @Override - public String toString() { - return toTokenString(in) + " --> " + toTokenString(out) + " (keepOrig=" + keepOrig + ")"; - } - } - - public void testRandomSyns() throws Exception { - int synCount = atLeast(10); - double bias = random().nextDouble(); - boolean dedup = random().nextBoolean(); - - SynonymMap.Builder b = new SynonymMap.Builder(dedup); - List syns = new ArrayList<>(); - // Makes random syns from random a / b tokens, mapping to random x / y tokens - //if (VERBOSE) { - // System.out.println("TEST: make " + synCount + " syns"); - // System.out.println(" bias for a over b=" + bias); - // System.out.println(" dedup=" + dedup); - // System.out.println(" sausage=" + sausage); - //} - - int maxSynLength = 0; - - for (int i = 0; i < synCount; i++) { - OneSyn syn = new OneSyn(); - syn.in = randomBinaryChars(1, 5, bias, 'a'); - syn.out = randomBinaryChars(1, 5, 0.5, 'x'); - syn.keepOrig = random().nextBoolean(); - syns.add(syn); - - maxSynLength = Math.max(maxSynLength, syn.in.length); - - //if (VERBOSE) { - // System.out.println(" " + syn); - //} - add(b, toTokenString(syn.in), toTokenString(syn.out), syn.keepOrig); - } - - // Only used w/ VERBOSE: - Analyzer aNoSausageed; - if (VERBOSE) { - aNoSausageed = getAnalyzer(b, true); - } else { - aNoSausageed = null; - } - - Analyzer a = getAnalyzer(b, true); - int iters = atLeast(20); - for (int iter = 0; iter < iters; iter++) { - - String doc = toTokenString(randomBinaryChars(50, 100, bias, 'a')); - //String doc = toTokenString(randomBinaryChars(10, 50, bias, 'a')); - - //if (VERBOSE) { - // System.out.println("TEST: iter=" + iter + " doc=" + doc); - //} - Automaton expected = slowSynFilter(doc, syns); - if (VERBOSE) { - //System.out.println(" expected:\n" + expected.toDot()); - } - Automaton actual = toAutomaton(a.tokenStream("field", new StringReader(doc))); - //if (VERBOSE) { - // System.out.println(" actual:\n" + actual.toDot()); - //} - - assertTrue("maxLookaheadUsed=" + synFilter.getMaxLookaheadUsed() + " maxSynLength=" + maxSynLength, synFilter - .getMaxLookaheadUsed() <= maxSynLength); - - checkAnalysisConsistency(random(), a, random().nextBoolean(), doc); - // We can 
easily have a non-deterministic automaton at this point, e.g. if - // more than one syn matched at given point, or if the syn mapped to an - // output token that also happens to be in the input: - try { - actual = Operations.determinize(actual, 50000); - } catch (TooComplexToDeterminizeException tctde) { - // Unfortunately the syns can easily create difficult-to-determinize graphs: - assertTrue(approxEquals(actual, expected)); - continue; - } - - try { - expected = Operations.determinize(expected, 50000); - } catch (TooComplexToDeterminizeException tctde) { - // Unfortunately the syns can easily create difficult-to-determinize graphs: - assertTrue(approxEquals(actual, expected)); - continue; - } - - assertTrue(approxEquals(actual, expected)); - assertTrue(Operations.sameLanguage(actual, expected)); - } - - a.close(); - } - - /** - * Only used when true equality is too costly to check! - */ - private boolean approxEquals(Automaton actual, Automaton expected) { - // Don't collapse these into one line else the thread stack won't say which direction failed!: - boolean b1 = approxSubsetOf(actual, expected); - boolean b2 = approxSubsetOf(expected, actual); - return b1 && b2; - } - - private boolean approxSubsetOf(Automaton a1, Automaton a2) { - AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(a1); - for (int i = 0; i < 2000; i++) { - int[] ints = ras.getRandomAcceptedString(random()); - IntsRef path = new IntsRef(ints, 0, ints.length); - if (accepts(a2, path) == false) { - throw new RuntimeException("a2 does not accept " + path); - } - } - - // Presumed true - return true; - } - - /** - * Like {@link Operations#run} except the incoming automaton is allowed to be non-deterministic. - */ - private static boolean accepts(Automaton a, IntsRef path) { - Set states = new HashSet<>(); - states.add(0); - Transition t = new Transition(); - for (int i = 0; i < path.length; i++) { - int digit = path.ints[path.offset + i]; - Set nextStates = new HashSet<>(); - for (int state : states) { - int count = a.initTransition(state, t); - for (int j = 0; j < count; j++) { - a.getNextTransition(t); - if (digit >= t.min && digit <= t.max) { - nextStates.add(t.dest); - } - } - } - states = nextStates; - if (states.isEmpty()) { - return false; - } - } - - for (int state : states) { - if (a.isAccept(state)) { - return true; - } - } - - return false; - } - - /** - * Stupid, slow brute-force, yet hopefully bug-free, synonym filter. 
- */ - private Automaton slowSynFilter(String doc, List syns) { - String[] tokens = doc.split(" +"); - //if (VERBOSE) { - // System.out.println(" doc has " + tokens.length + " tokens"); - //} - int i = 0; - Automaton.Builder a = new Automaton.Builder(); - int lastState = a.createState(); - while (i < tokens.length) { - // Consider all possible syn matches starting at this point: - assert tokens[i].length() == 1; - //if (VERBOSE) { - // System.out.println(" i=" + i); - //} - - List matches = new ArrayList<>(); - for (OneSyn syn : syns) { - if (i + syn.in.length <= tokens.length) { - boolean match = true; - for (int j = 0; j < syn.in.length; j++) { - if (tokens[i + j].charAt(0) != syn.in[j]) { - match = false; - break; - } - } - - if (match) { - if (matches.isEmpty() == false) { - if (syn.in.length < matches.get(0).in.length) { - // Greedy matching: we already found longer syns matching here - continue; - } else if (syn.in.length > matches.get(0).in.length) { - // Greedy matching: all previous matches were shorter, so we drop them - matches.clear(); - } else { - // Keep the current matches: we allow multiple synonyms matching the same input string - } - } - - matches.add(syn); - } - } - } - - int nextState = a.createState(); - - if (matches.isEmpty() == false) { - // We have match(es) starting at this token - //if (VERBOSE) { - // System.out.println(" matches @ i=" + i + ": " + matches); - //} - // We keepOrig if any of the matches said to: - boolean keepOrig = false; - for (OneSyn syn : matches) { - keepOrig |= syn.keepOrig; - } - - if (keepOrig) { - // Add path for the original tokens - addSidePath(a, lastState, nextState, matches.get(0).in); - } - - for (OneSyn syn : matches) { - addSidePath(a, lastState, nextState, syn.out); - } - - i += matches.get(0).in.length; - } else { - a.addTransition(lastState, nextState, tokens[i].charAt(0)); - i++; - } - - lastState = nextState; - } - - a.setAccept(lastState, true); - - return topoSort(a.finish()); - } - - /** - * Just creates a side path from startState to endState with the provided tokens. 
- */ - private static void addSidePath(Automaton.Builder a, int startState, int endState, char[] tokens) { - int lastState = startState; - for (int i = 0; i < tokens.length; i++) { - int nextState; - if (i == tokens.length - 1) { - nextState = endState; - } else { - nextState = a.createState(); - } - - a.addTransition(lastState, nextState, tokens[i]); - - lastState = nextState; - } - } - - private Automaton toAutomaton(TokenStream ts) throws IOException { - PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); - PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class); - CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - ts.reset(); - Automaton a = new Automaton(); - int srcNode = -1; - int destNode = -1; - int state = a.createState(); - while (ts.incrementToken()) { - assert termAtt.length() == 1; - char c = termAtt.charAt(0); - int posInc = posIncAtt.getPositionIncrement(); - if (posInc != 0) { - srcNode += posInc; - while (state < srcNode) { - state = a.createState(); - } - } - destNode = srcNode + posLenAtt.getPositionLength(); - while (state < destNode) { - state = a.createState(); - } - a.addTransition(srcNode, destNode, c); - } - ts.end(); - ts.close(); - a.finishState(); - a.setAccept(destNode, true); - return a; - } - - /** - * Renumbers nodes according to their topo sort - */ - private Automaton topoSort(Automaton in) { - int[] newToOld = Operations.topoSortStates(in); - int[] oldToNew = new int[newToOld.length]; - - Automaton.Builder a = new Automaton.Builder(); - //System.out.println("remap:"); - for (int i = 0; i < newToOld.length; i++) { - a.createState(); - oldToNew[newToOld[i]] = i; - //System.out.println(" " + newToOld[i] + " -> " + i); - if (in.isAccept(newToOld[i])) { - a.setAccept(i, true); - //System.out.println(" **"); - } - } - - Transition t = new Transition(); - for (int i = 0; i < newToOld.length; i++) { - int count = in.initTransition(newToOld[i], t); - for (int j = 0; j < count; j++) { - in.getNextTransition(t); - a.addTransition(i, oldToNew[t.dest], t.min, t.max); - } - } - - return a.finish(); - } - - /** - * Helper method to validate all strings that can be generated from a token stream. Uses {@link - * TokenStreamToAutomaton} to create an automaton. Asserts the finite strings of the automaton - * are all and only the given valid strings. - * - * @param analyzer analyzer containing the SynonymFilter under test. - * @param text text to be analyzed. - * @param expectedStrings all expected finite strings. - */ - public void assertAllStrings(Analyzer analyzer, String text, String[] expectedStrings) throws IOException { - TokenStream tokenStream = analyzer.tokenStream("dummy", text); - try { - Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream); - Set finiteStrings = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1); - - assertEquals("Invalid resulting strings count. 
Expected " + expectedStrings.length + " was " + finiteStrings.size(), - expectedStrings.length, finiteStrings.size()); - - Set expectedStringsSet = new HashSet<>(Arrays.asList(expectedStrings)); - - BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder(); - for (IntsRef ir : finiteStrings) { - String s = Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '); - assertTrue("Unexpected string found: " + s, expectedStringsSet.contains(s)); - } - } finally { - tokenStream.close(); - } - } -} diff --git a/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java b/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java new file mode 100644 index 0000000000000..edd202fae5fa8 --- /dev/null +++ b/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java @@ -0,0 +1,158 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.ESTokenStreamTestCase; + +import java.io.IOException; +import java.io.StringReader; + +/** + * Base class to test {@link WordDelimiterGraphTokenFilterFactory} and {@link WordDelimiterGraphTokenFilterFactory} + */ +public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESTokenStreamTestCase { + final String type; + + public BaseWordDelimiterTokenFilterFactoryTestCase(String type) { + this.type = type; + } + + public void testDefault() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testCatenateWords() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + 
.put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testCatenateNumbers() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") + .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testCatenateAll() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") + .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") + .put("index.analysis.filter.my_word_delimiter.catenate_all", "true") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testSplitOnCaseChange() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot"; + String[] expected = new String[]{"PowerShot"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testPreserveOriginal() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + 
.put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.preserve_original", "true") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testStemEnglishPossessive() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil", "s"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */ + public void testPartsAndCatenate() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot"; + String[] expected = new String[]{"Power", "PowerShot", "Shot" }; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } +} diff --git a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java new file mode 100644 index 0000000000000..793c634e6a412 --- /dev/null +++ b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java @@ -0,0 +1,56 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.test.ESTestCase; + +import java.io.IOException; +import java.io.StringReader; + +public class WordDelimiterGraphTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase { + public WordDelimiterGraphTokenFilterFactoryTests() { + super("word_delimiter_graph"); + } + + public void testMultiTerms() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_all", "true") + .put("index.analysis.filter.my_word_delimiter.preserve_original", "true") + .build()); + + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42", + "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se", + "ONeil", "O'Neil's", "O", "Neil" }; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1}; + int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1}; + assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null, + expectedIncr, expectedPosLen, null); + } +} diff --git a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java index 1a7903bcfac09..bfa93c4eb01b7 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java @@ -19,131 +19,8 @@ package org.elasticsearch.index.analysis; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.env.Environment; -import org.elasticsearch.test.ESTestCase; -import org.elasticsearch.test.ESTokenStreamTestCase; - -import java.io.IOException; -import java.io.StringReader; - -public class WordDelimiterTokenFilterFactoryTests extends ESTokenStreamTestCase { - public void testDefault() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"}; - Tokenizer 
tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testCatenateWords() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") - .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testCatenateNumbers() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") - .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testCatenateAll() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") - .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") - .put("index.analysis.filter.my_word_delimiter.catenate_all", "true") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testSplitOnCaseChange() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot"; - String[] expected = new String[]{"PowerShot"}; - 
Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testPreserveOriginal() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.preserve_original", "true") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testStemEnglishPossessive() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil", "s"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */ - public void testPartsAndCatenate() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") - .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot"; - String[] expected = new String[]{"Power", "PowerShot", "Shot" }; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); +public class WordDelimiterTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase { + public WordDelimiterTokenFilterFactoryTests() { + super("word_delimiter"); } } diff --git a/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc new file mode 100644 index 0000000000000..b415d1c376ec1 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc @@ -0,0 +1,97 @@ +[[analysis-word-delimiter-graph-tokenfilter]] +=== Word Delimiter Graph Token Filter + +experimental[] + +Named 
`word_delimiter_graph`, it Splits words into subwords and performs +optional transformations on subword groups. Words are split into +subwords with the following rules: + +* split on intra-word delimiters (by default, all non alpha-numeric +characters). +* "Wi-Fi" -> "Wi", "Fi" +* split on case transitions: "PowerShot" -> "Power", "Shot" +* split on letter-number transitions: "SD500" -> "SD", "500" +* leading and trailing intra-word delimiters on each subword are +ignored: "//hello---there, 'dude'" -> "hello", "there", "dude" +* trailing "'s" are removed for each subword: "O'Neil's" -> "O", "Neil" + +Unlike the `word_delimiter`, this token filter correctly handles positions for +multi-term expansion at search-time when any of the following options +are set to `true`: + + * `preserve_original` + * `catenate_numbers` + * `catenate_words` + * `catenate_all` + +Parameters include: + +`generate_word_parts`:: + If `true` causes parts of words to be + generated: "PowerShot" => "Power" "Shot". Defaults to `true`. + +`generate_number_parts`:: + If `true` causes number subwords to be + generated: "500-42" => "500" "42". Defaults to `true`. + +`catenate_words`:: + If `true` causes maximum runs of word parts to be + catenated: "wi-fi" => "wifi". Defaults to `false`. + +`catenate_numbers`:: + If `true` causes maximum runs of number parts to + be catenated: "500-42" => "50042". Defaults to `false`. + +`catenate_all`:: + If `true` causes all subword parts to be catenated: + "wi-fi-4000" => "wifi4000". Defaults to `false`. + +`split_on_case_change`:: + If `true` causes "PowerShot" to be two tokens; + ("Power-Shot" remains two parts regardless). Defaults to `true`. + +`preserve_original`:: + If `true` includes original words in subwords: + "500-42" => "500-42" "500" "42". Defaults to `false`. + +`split_on_numerics`:: + If `true` causes "j2se" to be three tokens; "j" + "2" "se". Defaults to `true`. + +`stem_english_possessive`:: + If `true` causes trailing "'s" to be + removed for each subword: "O'Neil's" => "O", "Neil". Defaults to `true`. + +Advanced settings include: + +`protected_words`:: + A list of words protected from being split by the filter. + Either an array, or set `protected_words_path` instead, which resolves + to a file containing the protected words (one per line). + A relative path is resolved against the `config/` directory. + +`type_table`:: + A custom type mapping table, for example (when configured + using `type_table_path`): + +[source,js] +-------------------------------------------------- + # Map the $, %, '.', and ',' characters to DIGIT + # This might be useful for financial data. + $ => DIGIT + % => DIGIT + . => DIGIT + \\u002C => DIGIT + + # in some cases you might not want to split on ZWJ + # this also tests the case where we need a bigger byte[] + # see http://en.wikipedia.org/wiki/Zero-width_joiner + \\u200D => ALPHANUM +-------------------------------------------------- + +NOTE: Using a tokenizer like the `standard` tokenizer may interfere with +the `catenate_*` and `preserve_original` parameters, as the original +string may already have lost punctuation during tokenization. Instead, +you may want to use the `whitespace` tokenizer.
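+
+As a minimal sketch, the filter could be wired into a custom analyzer with
+index settings along these lines (the `my_index`, `my_analyzer`, and
+`my_word_delimiter_graph` names are only illustrative):
+
+[source,js]
+--------------------------------------------------
+PUT /my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "type": "custom",
+          "tokenizer": "whitespace",
+          "filter": ["my_word_delimiter_graph"]
+        }
+      },
+      "filter": {
+        "my_word_delimiter_graph": {
+          "type": "word_delimiter_graph",
+          "preserve_original": true,
+          "catenate_all": true
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------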
+ From 271467e5dfdf0958f339bfc4a41903dfce681947 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Fri, 24 Feb 2017 00:52:35 +0100 Subject: [PATCH 2/2] Address review --- .../WordDelimiterGraphTokenFilterFactory.java | 4 --- ...rdDelimiterTokenFilterFactoryTestCase.java | 30 ++++++------------- ...DelimiterGraphTokenFilterFactoryTests.java | 19 ++++++++++++ .../WordDelimiterTokenFilterFactoryTests.java | 25 ++++++++++++++++ docs/build.gradle | 1 + .../word-delimiter-graph-tokenfilter.asciidoc | 2 +- 6 files changed, 55 insertions(+), 26 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java index 20fc7f0ad9c81..7cdc215f1b34a 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java @@ -98,8 +98,4 @@ private int getFlag(int flag, Settings settings, String key, boolean defaultValu } return 0; } - - int getFlags() { - return flags; - } } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java b/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java index edd202fae5fa8..713e9424759a0 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java @@ -30,7 +30,7 @@ import java.io.StringReader; /** - * Base class to test {@link WordDelimiterGraphTokenFilterFactory} and {@link WordDelimiterGraphTokenFilterFactory} + * Base class to test {@link WordDelimiterTokenFilterFactory} and {@link WordDelimiterGraphTokenFilterFactory} */ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESTokenStreamTestCase { final String type; @@ -46,7 +46,8 @@ public void testDefault() throws IOException { .build()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"}; + String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", + "fi", "4000", "j", "2", "se", "O", "Neil"}; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(source)); assertTokenStreamContents(tokenFilter.create(tokenizer), expected); @@ -76,7 +77,8 @@ public void testCatenateNumbers() throws IOException { .build()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"}; + String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", + "se", "O", "Neil"}; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(source)); assertTokenStreamContents(tokenFilter.create(tokenizer), expected); @@ -120,7 +122,8 @@ public void testPreserveOriginal() throws IOException { .build()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - 
String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"}; + String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi", + "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"}; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(source)); assertTokenStreamContents(tokenFilter.create(tokenizer), expected); @@ -134,23 +137,8 @@ public void testStemEnglishPossessive() throws IOException { .build()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil", "s"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */ - public void testPartsAndCatenate() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", type) - .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") - .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot"; - String[] expected = new String[]{"Power", "PowerShot", "Shot" }; + String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", + "se", "O", "Neil", "s"}; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(source)); assertTokenStreamContents(tokenFilter.create(tokenizer), expected); diff --git a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java index 793c634e6a412..2ae4267104a05 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java @@ -53,4 +53,23 @@ public void testMultiTerms() throws IOException { assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null, expectedIncr, expectedPosLen, null); } + + /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */ + public void testPartsAndCatenate() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot"; + int[] expectedIncr = new int[]{1, 0, 1}; + int[] expectedPosLen = 
new int[]{2, 1, 1}; + String[] expected = new String[]{"PowerShot", "Power", "Shot" }; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null, + expectedIncr, expectedPosLen, null); + } } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java index bfa93c4eb01b7..1e919e00bbb29 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java @@ -19,8 +19,33 @@ package org.elasticsearch.index.analysis; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.test.ESTestCase; + +import java.io.IOException; +import java.io.StringReader; + public class WordDelimiterTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase { public WordDelimiterTokenFilterFactoryTests() { super("word_delimiter"); } + + /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */ + public void testPartsAndCatenate() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot"; + String[] expected = new String[]{"Power", "PowerShot", "Shot" }; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } } diff --git a/docs/build.gradle b/docs/build.gradle index 36727b12e5097..9fd593e2faeb0 100644 --- a/docs/build.gradle +++ b/docs/build.gradle @@ -81,6 +81,7 @@ buildRestTests.expectedUnconvertedCandidates = [ 'reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc', 'reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc', 'reference/analysis/tokenfilters/word-delimiter-tokenfilter.asciidoc', + 'reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc', 'reference/cat/snapshots.asciidoc', 'reference/cat/templates.asciidoc', 'reference/cat/thread_pool.asciidoc', diff --git a/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc index b415d1c376ec1..01176fa5636c8 100644 --- a/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc @@ -3,7 +3,7 @@ experimental[] -Named `word_delimiter_graph`, it Splits words into subwords and performs +Named `word_delimiter_graph`, it splits words into subwords and performs optional transformations on subword groups. Words are split into subwords with the following rules: