From e0a0bccbab4e40e2849901ee567e0294d7e3984b Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Thu, 23 Feb 2017 15:52:53 +0100 Subject: [PATCH 1/2] Expose WordDelimiterGraphTokenFilter This change exposes the new Lucene graph-based word delimiter token filter in the analysis filters. Unlike the `word_delimiter` filter, the token filter named `word_delimiter_graph` correctly handles multi-term expansion at query time. Closes #23104 --- .../WordDelimiterGraphTokenFilterFactory.java | 4 + .../indices/analysis/AnalysisModule.java | 2 + .../analysis/PreBuiltTokenFilters.java | 13 + .../synonym/SynonymGraphFilterTests.java | 1074 ----------------- ...rdDelimiterTokenFilterFactoryTestCase.java | 158 +++ ...DelimiterGraphTokenFilterFactoryTests.java | 56 + .../WordDelimiterTokenFilterFactoryTests.java | 129 +- .../word-delimiter-graph-tokenfilter.asciidoc | 97 ++ 8 files changed, 333 insertions(+), 1200 deletions(-) delete mode 100644 core/src/test/java/org/apache/lucene/analysis/synonym/SynonymGraphFilterTests.java create mode 100644 core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java create mode 100644 core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java create mode 100644 docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc diff --git a/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java index 7cdc215f1b34a..20fc7f0ad9c81 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java @@ -98,4 +98,8 @@ private int getFlag(int flag, Settings settings, String key, boolean defaultValu } return 0; } + + int getFlags() { + return flags; + } } diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index 1aaf3077aea93..61950942e6076 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -140,6 +140,7 @@ import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory; import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider; import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory; +import org.elasticsearch.index.analysis.WordDelimiterGraphTokenFilterFactory; import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory; import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory; import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory; @@ -225,6 +226,7 @@ private NamedRegistry> setupTokenFilters(Li tokenFilters.register("snowball", SnowballTokenFilterFactory::new); tokenFilters.register("stemmer", StemmerTokenFilterFactory::new); tokenFilters.register("word_delimiter", WordDelimiterTokenFilterFactory::new); + tokenFilters.register("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new); tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new); tokenFilters.register("elision", ElisionTokenFilterFactory::new); tokenFilters.register("flatten_graph", FlattenGraphTokenFilterFactory::new); diff --git
a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java index 53e79cb9dfe4d..6c58ab884db27 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java @@ -51,6 +51,7 @@ import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter; import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.ngram.NGramTokenFilter; import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; @@ -87,6 +88,18 @@ public TokenStream create(TokenStream tokenStream, Version version) { } }, + WORD_DELIMITER_GRAPH(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new WordDelimiterGraphFilter(tokenStream, + WordDelimiterGraphFilter.GENERATE_WORD_PARTS | + WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS | + WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE | + WordDelimiterGraphFilter.SPLIT_ON_NUMERICS | + WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null); + } + }, + STOP(CachingStrategy.LUCENE) { @Override public TokenStream create(TokenStream tokenStream, Version version) { diff --git a/core/src/test/java/org/apache/lucene/analysis/synonym/SynonymGraphFilterTests.java b/core/src/test/java/org/apache/lucene/analysis/synonym/SynonymGraphFilterTests.java deleted file mode 100644 index fafe8a954c850..0000000000000 --- a/core/src/test/java/org/apache/lucene/analysis/synonym/SynonymGraphFilterTests.java +++ /dev/null @@ -1,1074 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.lucene.analysis.synonym; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.CannedTokenStream; -import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.analysis.MockGraphTokenFilter; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.TokenStreamToAutomaton; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.RandomIndexWriter; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.TokenStreamToTermAutomatonQuery; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.CharsRefBuilder; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.TestUtil; -import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.AutomatonTestUtil; -import org.apache.lucene.util.automaton.Operations; -import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; -import org.apache.lucene.util.automaton.Transition; -import org.apache.lucene.util.fst.Util; - -import java.io.IOException; -import java.io.StringReader; -import java.text.ParseException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -public class SynonymGraphFilterTests extends BaseTokenStreamTestCase { - - /** - * Set a side effect by {@link #getAnalyzer}. 
- */ - private SynonymGraphFilter synFilter; - - // LUCENE-6664 - public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, String[] types, int[] posIncrements, int[] - posLengths) throws IOException { - assertAnalyzesTo(a, input, output, null, null, types, posIncrements, posLengths); - } - - public void testBasicKeepOrigOneOutput() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b", new String[]{"c", "x", "a", "b"}, new int[]{0, 2, 2, 4}, new int[]{1, 5, 3, 5}, new String[]{"word", - "SYNONYM", "word", "word"}, new int[]{1, 1, 0, 1}, new int[]{1, 2, 1, 1}); - a.close(); - } - - public void testMixedKeepOrig() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - add(b, "e f", "y", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b c e f g", new String[]{"c", "x", "a", "b", "c", "y", "g"}, new int[]{0, 2, 2, 4, 6, 8, 12}, new - int[]{1, 5, 3, 5, 7, 11, 13}, new String[]{"word", "SYNONYM", "word", "word", "word", "SYNONYM", "word"}, new - int[]{1, 1, 0, - 1, 1, 1, 1}, new int[]{1, 2, 1, 1, 1, 1, 1}); - a.close(); - } - - public void testNoParseAfterBuffer() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "b a", "x", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "b b b", new String[]{"b", "b", "b"}, new int[]{0, 2, 4}, new int[]{1, 3, 5}, new String[]{"word", "word", - "word"}, new int[]{1, 1, 1}, new int[]{1, 1, 1}); - a.close(); - } - - public void testOneInputMultipleOutputKeepOrig() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - add(b, "a b", "y", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b c", new String[]{"c", "x", "y", "a", "b", "c"}, new int[]{0, 2, 2, 2, 4, 6}, new int[]{1, 5, 5, 3, 5, - 7}, new String[]{"word", "SYNONYM", "SYNONYM", "word", "word", "word"}, new int[]{1, 1, 0, 0, 1, 1, 1, 1}, new - int[]{1, 2, 2, - 1, 1, 1, 1, 1}); - a.close(); - } - - /** - * parse a syn file with bad syntax - */ - public void testInvalidAnalyzesToNothingOutput() throws Exception { - String testFile = "a => 1"; - Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, false); - SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); - try { - parser.parse(new StringReader(testFile)); - fail("didn't get expected exception"); - } catch (ParseException expected) { - // expected exc - } - analyzer.close(); - } - - /** - * parse a syn file with bad syntax - */ - public void testInvalidDoubleMap() throws Exception { - String testFile = "a => b => c"; - Analyzer analyzer = new MockAnalyzer(random()); - SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); - try { - parser.parse(new StringReader(testFile)); - fail("didn't get expected exception"); - } catch (ParseException expected) { - // expected exc - } - analyzer.close(); - } - - public void testMoreThanOneLookAhead() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b c d", "x", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "a b c e", new String[]{"a", "b", "c", "e"}, new int[]{0, 2, 4, 6}, new int[]{1, 3, 5, 7}, new - String[]{"word", "word", "word", "word"}, new int[]{1, 1, 1, 1}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testLookaheadAfterParse() throws Exception { - 
SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "b b", "x", true); - add(b, "b", "y", true); - - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "b a b b", new String[]{"y", "b", "a", "x", "b", "b"}, new int[]{0, 0, 2, 4, 4, 6}, new int[]{1, 1, 3, 7, 5, - 7}, null, new int[]{1, 0, 1, 1, 0, 1}, new int[]{1, 1, 1, 2, 1, 1}, true); - } - - public void testLookaheadSecondParse() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "b b b", "x", true); - add(b, "b", "y", true); - - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "b b", new String[]{"y", "b", "y", "b"}, new int[]{0, 0, 2, 2}, new int[]{1, 1, 3, 3}, null, new int[]{1, 0, - 1, 0}, new int[]{1, 1, 1, 1}, true); - } - - public void testOneInputMultipleOutputNoKeepOrig() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", false); - add(b, "a b", "y", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b c", new String[]{"c", "x", "y", "c"}, new int[]{0, 2, 2, 6}, new int[]{1, 5, 5, 7}, new - String[]{"word", "SYNONYM", "SYNONYM", "word"}, new int[]{1, 1, 0, 1}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testOneInputMultipleOutputMixedKeepOrig() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - add(b, "a b", "y", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b c", new String[]{"c", "x", "y", "a", "b", "c"}, new int[]{0, 2, 2, 2, 4, 6}, new int[]{1, 5, 5, 3, 5, - 7}, new String[]{"word", "SYNONYM", "SYNONYM", "word", "word", "word"}, new int[]{1, 1, 0, 0, 1, 1, 1, 1}, new - int[]{1, 2, 2, - 1, 1, 1, 1, 1}); - a.close(); - } - - public void testSynAtEnd() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c d e a b", new String[]{"c", "d", "e", "x", "a", "b"}, new int[]{0, 2, 4, 6, 6, 8}, new int[]{1, 3, 5, 9, - 7, 9}, new String[]{"word", "word", "word", "SYNONYM", "word", "word"}, new int[]{1, 1, 1, 1, 0, 1}, new int[]{1, 1, 1, - 2, 1, - 1}); - a.close(); - } - - public void testTwoSynsInARow() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a", "x", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a a b", new String[]{"c", "x", "x", "b"}, new int[]{0, 2, 4, 6}, new int[]{1, 3, 5, 7}, new - String[]{"word", "SYNONYM", "SYNONYM", "word"}, new int[]{1, 1, 1, 1}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testBasicKeepOrigTwoOutputs() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x y", true); - add(b, "a b", "m n o", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b d", new String[]{"c", "x", "m", "a", "y", "n", "o", "b", "d"}, new int[]{0, 2, 2, 2, 2, 2, 2, 4, 6}, - new int[]{1, 5, 5, 3, 5, 5, 5, 5, 7}, new String[]{"word", "SYNONYM", "SYNONYM", "word", "SYNONYM", - "SYNONYM", "SYNONYM", - "word", "word"}, new int[]{1, 1, 0, 0, 1, 1, 1, 1, 1}, new int[]{1, 1, 2, 4, 4, 1, 2, 1, 1}); - a.close(); - } - - public void testNoCaptureIfNoMatch() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x y", true); - - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "c d d", new String[]{"c", "d", "d"}, new int[]{0, 2, 4}, new int[]{1, 3, 5}, new String[]{"word", "word", - "word"}, new int[]{1, 1, 1}, new int[]{1, 1, 1}); - 
assertEquals(0, synFilter.getCaptureCount()); - a.close(); - } - - public void testBasicNotKeepOrigOneOutput() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b", new String[]{"c", "x"}, new int[]{0, 2}, new int[]{1, 5}, new String[]{"word", "SYNONYM"}, new - int[]{1, 1}, new int[]{1, 1}); - a.close(); - } - - public void testBasicNoKeepOrigTwoOutputs() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x y", false); - add(b, "a b", "m n o", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b d", new String[]{"c", "x", "m", "y", "n", "o", "d"}, new int[]{0, 2, 2, 2, 2, 2, 6}, new int[]{1, 5, - 5, 5, 5, 5, 7}, new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", - "word"}, new int[]{1, 1, 0, 1, 1, - 1, 1}, new int[]{1, 1, 2, 3, 1, 1, 1}); - a.close(); - } - - public void testIgnoreCase() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x y", false); - add(b, "a b", "m n o", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c A B D", new String[]{"c", "x", "m", "y", "n", "o", "D"}, new int[]{0, 2, 2, 2, 2, 2, 6}, new int[]{1, 5, - 5, 5, 5, 5, 7}, new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", - "word"}, new int[]{1, 1, 0, 1, 1, - 1, 1}, new int[]{1, 1, 2, 3, 1, 1, 1}); - a.close(); - } - - public void testDoNotIgnoreCase() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x y", false); - add(b, "a b", "m n o", false); - - Analyzer a = getAnalyzer(b, false); - assertAnalyzesTo(a, "c A B D", new String[]{"c", "A", "B", "D"}, new int[]{0, 2, 4, 6}, new int[]{1, 3, 5, 7}, new - String[]{"word", "word", "word", "word"}, new int[]{1, 1, 1, 1}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testBufferedFinish1() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b c", "m n o", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a b", new String[]{"c", "a", "b"}, new int[]{0, 2, 4}, new int[]{1, 3, 5}, new String[]{"word", "word", - "word"}, new int[]{1, 1, 1}, new int[]{1, 1, 1}); - a.close(); - } - - public void testBufferedFinish2() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "m n o", false); - add(b, "d e", "m n o", false); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "c a d", new String[]{"c", "a", "d"}, new int[]{0, 2, 4}, new int[]{1, 3, 5}, new String[]{"word", "word", - "word"}, new int[]{1, 1, 1}, new int[]{1, 1, 1}); - a.close(); - } - - public void testCanReuse() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b", "x", true); - Analyzer a = getAnalyzer(b, true); - for (int i = 0; i < 10; i++) { - assertAnalyzesTo(a, "c a b", new String[]{"c", "x", "a", "b"}, new int[]{0, 2, 2, 4}, new int[]{1, 5, 3, 5}, new - String[]{"word", "SYNONYM", "word", "word"}, new int[]{1, 1, 0, 1}, new int[]{1, 2, 1, 1}); - } - a.close(); - } - - /** - * Multiple input tokens map to a single output token - */ - public void testManyToOne() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b c", "z", true); - - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "a b c d", new String[]{"z", "a", "b", "c", "d"}, new int[]{0, 0, 2, 4, 6}, new int[]{5, 1, 3, 5, 7}, new - 
String[]{"SYNONYM", "word", "word", "word", "word"}, new int[]{1, 0, 1, 1, 1}, new int[]{3, 1, 1, 1, 1}); - a.close(); - } - - public void testBufferAfterMatch() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "a b c d", "x", true); - add(b, "a b", "y", false); - - // The 'c' token has to be buffered because SynGraphFilter - // needs to know whether a b c d -> x matches: - Analyzer a = getAnalyzer(b, true); - assertAnalyzesTo(a, "f a b c e", new String[]{"f", "y", "c", "e"}, new int[]{0, 2, 6, 8}, new int[]{1, 5, 7, 9}, new - String[]{"word", "SYNONYM", "word", "word"}, new int[]{1, 1, 1, 1}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testZeroSyns() throws Exception { - Tokenizer tokenizer = new MockTokenizer(); - tokenizer.setReader(new StringReader("aa bb")); - try { - new SynonymGraphFilter(tokenizer, new SynonymMap.Builder(true).build(), true); - fail("did not hit expected exception"); - } catch (IllegalArgumentException iae) { - // expected - assertEquals("fst must be non-null", iae.getMessage()); - } - } - - // Needs TermAutomatonQuery, which is in sandbox still: - public void testAccurateGraphQuery1() throws Exception { - Directory dir = newDirectory(); - RandomIndexWriter w = new RandomIndexWriter(random(), dir); - Document doc = new Document(); - doc.add(newTextField("field", "wtf happened", Field.Store.NO)); - w.addDocument(doc); - IndexReader r = w.getReader(); - w.close(); - - IndexSearcher s = newSearcher(r); - - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "what the fudge", "wtf", true); - - SynonymMap map = b.build(); - - TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery(); - - - TokenStream in = new CannedTokenStream(0, 23, token("what", 1, 1, 0, 4), token("the", 1, 1, 5, 8), token("fudge", 1, 1, 9, 14), - token("happened", 1, 1, 15, 23)); - - assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - in = new CannedTokenStream(0, 12, token("wtf", 1, 1, 0, 3), token("happened", 1, 1, 4, 12)); - - assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - // "what happened" should NOT match: - in = new CannedTokenStream(0, 13, token("what", 1, 1, 0, 4), token("happened", 1, 1, 5, 13)); - assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - IOUtils.close(r, dir); - } - - - /** - * If we expand synonyms at search time, the results are correct. 
- */ - // Needs TermAutomatonQuery, which is in sandbox still: - public void testAccurateGraphQuery2() throws Exception { - Directory dir = newDirectory(); - RandomIndexWriter w = new RandomIndexWriter(random(), dir); - Document doc = new Document(); - doc.add(newTextField("field", "say wtf happened", Field.Store.NO)); - w.addDocument(doc); - IndexReader r = w.getReader(); - w.close(); - - IndexSearcher s = newSearcher(r); - - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "fudge", "chocolate", true); - add(b, "what the fudge", "wtf", true); - add(b, "what the", "wut", true); - add(b, "say", "say what", true); - - SynonymMap map = b.build(); - - TokenStream in = new CannedTokenStream(0, 26, token("say", 1, 1, 0, 3), token("what", 1, 1, 3, 7), token("the", 1, 1, 8, 11), - token("fudge", 1, 1, 12, 17), token("happened", 1, 1, 18, 26)); - - TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery(); - - assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - // "what happened" should NOT match: - in = new CannedTokenStream(0, 13, token("what", 1, 1, 0, 4), token("happened", 1, 1, 5, 13)); - assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - IOUtils.close(r, dir); - } - - - // Needs TermAutomatonQuery, which is in sandbox still: - public void testAccurateGraphQuery3() throws Exception { - Directory dir = newDirectory(); - RandomIndexWriter w = new RandomIndexWriter(random(), dir); - Document doc = new Document(); - doc.add(newTextField("field", "say what the fudge happened", Field.Store.NO)); - w.addDocument(doc); - IndexReader r = w.getReader(); - w.close(); - - IndexSearcher s = newSearcher(r); - - SynonymMap.Builder b = new SynonymMap.Builder(true); - add(b, "wtf", "what the fudge", true); - - SynonymMap map = b.build(); - - TokenStream in = new CannedTokenStream(0, 15, token("say", 1, 1, 0, 3), token("wtf", 1, 1, 3, 6), token("happened", 1, 1, 7, 15)); - - TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery(); - - assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - // "what happened" should NOT match: - in = new CannedTokenStream(0, 13, token("what", 1, 1, 0, 4), token("happened", 1, 1, 5, 13)); - assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); - - IOUtils.close(r, dir); - } - - private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) { - final Token t = new Token(term, startOffset, endOffset); - t.setPositionIncrement(posInc); - t.setPositionLength(posLength); - return t; - } - - private String randomNonEmptyString() { - while (true) { - String s = TestUtil.randomUnicodeString(random()).trim(); - //String s = TestUtil.randomSimpleString(random()).trim(); - if (s.length() != 0 && s.indexOf('\u0000') == -1) { - return s; - } - } - } - - // Adds MockGraphTokenFilter after SynFilter: - public void testRandomGraphAfter() throws Exception { - final int numIters = atLeast(3); - for (int i = 0; i < numIters; i++) { - SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); - final int numEntries = atLeast(10); - for (int j = 0; j < numEntries; j++) { - add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); - } - final SynonymMap map = b.build(); - final boolean ignoreCase = random().nextBoolean(); - - final Analyzer analyzer = new Analyzer() { - @Override - protected TokenStreamComponents 
createComponents(String fieldName) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true); - TokenStream syns = new SynonymGraphFilter(tokenizer, map, ignoreCase); - TokenStream graph = new MockGraphTokenFilter(random(), syns); - return new TokenStreamComponents(tokenizer, graph); - } - }; - - checkRandomData(random(), analyzer, 100); - analyzer.close(); - } - } - - public void testEmptyStringInput() throws IOException { - final int numIters = atLeast(10); - for (int i = 0; i < numIters; i++) { - SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); - final int numEntries = atLeast(10); - for (int j = 0; j < numEntries; j++) { - add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); - } - final boolean ignoreCase = random().nextBoolean(); - - Analyzer analyzer = getAnalyzer(b, ignoreCase); - - checkAnalysisConsistency(random(), analyzer, random().nextBoolean(), ""); - analyzer.close(); - } - } - - /** - * simple random test, doesn't verify correctness. - * does verify it doesnt throw exceptions, or that the stream doesn't misbehave - */ - public void testRandom2() throws Exception { - final int numIters = atLeast(3); - for (int i = 0; i < numIters; i++) { - SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); - final int numEntries = atLeast(10); - for (int j = 0; j < numEntries; j++) { - add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); - } - final boolean ignoreCase = random().nextBoolean(); - - Analyzer analyzer = getAnalyzer(b, ignoreCase); - checkRandomData(random(), analyzer, 100); - analyzer.close(); - } - } - - /** - * simple random test like testRandom2, but for larger docs - */ - public void testRandomHuge() throws Exception { - final int numIters = atLeast(3); - for (int i = 0; i < numIters; i++) { - SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); - final int numEntries = atLeast(10); - //if (VERBOSE) { - //System.out.println("TEST: iter=" + i + " numEntries=" + numEntries); - //} - for (int j = 0; j < numEntries; j++) { - add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); - } - final boolean ignoreCase = random().nextBoolean(); - - Analyzer analyzer = getAnalyzer(b, ignoreCase); - checkRandomData(random(), analyzer, 100, 1024); - analyzer.close(); - } - } - - public void testEmptyTerm() throws IOException { - final int numIters = atLeast(10); - for (int i = 0; i < numIters; i++) { - SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); - final int numEntries = atLeast(10); - for (int j = 0; j < numEntries; j++) { - add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); - } - final boolean ignoreCase = random().nextBoolean(); - - final Analyzer analyzer = getAnalyzer(b, ignoreCase); - - checkAnalysisConsistency(random(), analyzer, random().nextBoolean(), ""); - analyzer.close(); - } - } - - public void testBuilderDedup() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - final boolean keepOrig = false; - add(b, "a b", "ab", keepOrig); - add(b, "a b", "ab", keepOrig); - add(b, "a b", "ab", keepOrig); - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "a b", new String[]{"ab"}, new int[]{1}); - a.close(); - } - - public void testBuilderNoDedup() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(false); - final boolean keepOrig = false; - add(b, "a b", "ab", keepOrig); - add(b, "a b", "ab", keepOrig); - add(b, "a b", "ab", keepOrig); 
- Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "a b", new String[]{"ab", "ab", "ab"}, new int[]{1, 0, 0}); - a.close(); - } - - public void testRecursion1() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - final boolean keepOrig = false; - add(b, "zoo", "zoo", keepOrig); - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "zoo zoo $ zoo", new String[]{"zoo", "zoo", "$", "zoo"}, new int[]{1, 1, 1, 1}); - a.close(); - } - - public void testRecursion2() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - final boolean keepOrig = false; - add(b, "zoo", "zoo", keepOrig); - add(b, "zoo", "zoo zoo", keepOrig); - Analyzer a = getAnalyzer(b, true); - - // verify("zoo zoo $ zoo", "zoo/zoo zoo/zoo/zoo $/zoo zoo/zoo zoo"); - assertAnalyzesTo(a, "zoo zoo $ zoo", new String[]{"zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo"}, new - int[]{1, 0, 1, 1, 0, 1, 1, 1, 0, 1}); - a.close(); - } - - public void testKeepOrig() throws Exception { - SynonymMap.Builder b = new SynonymMap.Builder(true); - final boolean keepOrig = true; - add(b, "a b", "ab", keepOrig); - add(b, "a c", "ac", keepOrig); - add(b, "a", "aa", keepOrig); - add(b, "b", "bb", keepOrig); - add(b, "z x c v", "zxcv", keepOrig); - add(b, "x c", "xc", keepOrig); - Analyzer a = getAnalyzer(b, true); - - assertAnalyzesTo(a, "$", new String[]{"$"}, new int[]{1}); - assertAnalyzesTo(a, "a", new String[]{"aa", "a"}, new int[]{1, 0}); - assertAnalyzesTo(a, "a", new String[]{"aa", "a"}, new int[]{1, 0}); - assertAnalyzesTo(a, "$ a", new String[]{"$", "aa", "a"}, new int[]{1, 1, 0}); - assertAnalyzesTo(a, "a $", new String[]{"aa", "a", "$"}, new int[]{1, 0, 1}); - assertAnalyzesTo(a, "$ a !", new String[]{"$", "aa", "a", "!"}, new int[]{1, 1, 0, 1}); - assertAnalyzesTo(a, "a a", new String[]{"aa", "a", "aa", "a"}, new int[]{1, 0, 1, 0}); - assertAnalyzesTo(a, "b", new String[]{"bb", "b"}, new int[]{1, 0}); - assertAnalyzesTo(a, "z x c v", new String[]{"zxcv", "z", "x", "c", "v"}, new int[]{1, 0, 1, 1, 1}); - assertAnalyzesTo(a, "z x c $", new String[]{"z", "xc", "x", "c", "$"}, new int[]{1, 1, 0, 1, 1}); - a.close(); - } - - private Analyzer getAnalyzer(SynonymMap.Builder b, final boolean ignoreCase) throws IOException { - final SynonymMap map = b.build(); - return new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - // Make a local variable so testRandomHuge doesn't share it across threads! 
- SynonymGraphFilter synFilter = new SynonymGraphFilter(tokenizer, map, ignoreCase); - SynonymGraphFilterTests.this.synFilter = synFilter; - return new TokenStreamComponents(tokenizer, synFilter); - } - }; - } - - private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) { - if (VERBOSE) { - //System.out.println(" add input=" + input + " output=" + output + " keepOrig=" + keepOrig); - } - CharsRefBuilder inputCharsRef = new CharsRefBuilder(); - SynonymMap.Builder.join(input.split(" +"), inputCharsRef); - - CharsRefBuilder outputCharsRef = new CharsRefBuilder(); - SynonymMap.Builder.join(output.split(" +"), outputCharsRef); - - b.add(inputCharsRef.get(), outputCharsRef.get(), keepOrig); - } - - private char[] randomBinaryChars(int minLen, int maxLen, double bias, char base) { - int len = TestUtil.nextInt(random(), minLen, maxLen); - char[] chars = new char[len]; - for (int i = 0; i < len; i++) { - char ch; - if (random().nextDouble() < bias) { - ch = base; - } else { - ch = (char) (base + 1); - } - chars[i] = ch; - } - - return chars; - } - - private static String toTokenString(char[] chars) { - StringBuilder b = new StringBuilder(); - for (char c : chars) { - if (b.length() > 0) { - b.append(' '); - } - b.append(c); - } - return b.toString(); - } - - private static class OneSyn { - char[] in; - char[] out; - boolean keepOrig; - - @Override - public String toString() { - return toTokenString(in) + " --> " + toTokenString(out) + " (keepOrig=" + keepOrig + ")"; - } - } - - public void testRandomSyns() throws Exception { - int synCount = atLeast(10); - double bias = random().nextDouble(); - boolean dedup = random().nextBoolean(); - - SynonymMap.Builder b = new SynonymMap.Builder(dedup); - List syns = new ArrayList<>(); - // Makes random syns from random a / b tokens, mapping to random x / y tokens - //if (VERBOSE) { - // System.out.println("TEST: make " + synCount + " syns"); - // System.out.println(" bias for a over b=" + bias); - // System.out.println(" dedup=" + dedup); - // System.out.println(" sausage=" + sausage); - //} - - int maxSynLength = 0; - - for (int i = 0; i < synCount; i++) { - OneSyn syn = new OneSyn(); - syn.in = randomBinaryChars(1, 5, bias, 'a'); - syn.out = randomBinaryChars(1, 5, 0.5, 'x'); - syn.keepOrig = random().nextBoolean(); - syns.add(syn); - - maxSynLength = Math.max(maxSynLength, syn.in.length); - - //if (VERBOSE) { - // System.out.println(" " + syn); - //} - add(b, toTokenString(syn.in), toTokenString(syn.out), syn.keepOrig); - } - - // Only used w/ VERBOSE: - Analyzer aNoSausageed; - if (VERBOSE) { - aNoSausageed = getAnalyzer(b, true); - } else { - aNoSausageed = null; - } - - Analyzer a = getAnalyzer(b, true); - int iters = atLeast(20); - for (int iter = 0; iter < iters; iter++) { - - String doc = toTokenString(randomBinaryChars(50, 100, bias, 'a')); - //String doc = toTokenString(randomBinaryChars(10, 50, bias, 'a')); - - //if (VERBOSE) { - // System.out.println("TEST: iter=" + iter + " doc=" + doc); - //} - Automaton expected = slowSynFilter(doc, syns); - if (VERBOSE) { - //System.out.println(" expected:\n" + expected.toDot()); - } - Automaton actual = toAutomaton(a.tokenStream("field", new StringReader(doc))); - //if (VERBOSE) { - // System.out.println(" actual:\n" + actual.toDot()); - //} - - assertTrue("maxLookaheadUsed=" + synFilter.getMaxLookaheadUsed() + " maxSynLength=" + maxSynLength, synFilter - .getMaxLookaheadUsed() <= maxSynLength); - - checkAnalysisConsistency(random(), a, random().nextBoolean(), doc); - // We can 
easily have a non-deterministic automaton at this point, e.g. if - // more than one syn matched at given point, or if the syn mapped to an - // output token that also happens to be in the input: - try { - actual = Operations.determinize(actual, 50000); - } catch (TooComplexToDeterminizeException tctde) { - // Unfortunately the syns can easily create difficult-to-determinize graphs: - assertTrue(approxEquals(actual, expected)); - continue; - } - - try { - expected = Operations.determinize(expected, 50000); - } catch (TooComplexToDeterminizeException tctde) { - // Unfortunately the syns can easily create difficult-to-determinize graphs: - assertTrue(approxEquals(actual, expected)); - continue; - } - - assertTrue(approxEquals(actual, expected)); - assertTrue(Operations.sameLanguage(actual, expected)); - } - - a.close(); - } - - /** - * Only used when true equality is too costly to check! - */ - private boolean approxEquals(Automaton actual, Automaton expected) { - // Don't collapse these into one line else the thread stack won't say which direction failed!: - boolean b1 = approxSubsetOf(actual, expected); - boolean b2 = approxSubsetOf(expected, actual); - return b1 && b2; - } - - private boolean approxSubsetOf(Automaton a1, Automaton a2) { - AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(a1); - for (int i = 0; i < 2000; i++) { - int[] ints = ras.getRandomAcceptedString(random()); - IntsRef path = new IntsRef(ints, 0, ints.length); - if (accepts(a2, path) == false) { - throw new RuntimeException("a2 does not accept " + path); - } - } - - // Presumed true - return true; - } - - /** - * Like {@link Operations#run} except the incoming automaton is allowed to be non-deterministic. - */ - private static boolean accepts(Automaton a, IntsRef path) { - Set states = new HashSet<>(); - states.add(0); - Transition t = new Transition(); - for (int i = 0; i < path.length; i++) { - int digit = path.ints[path.offset + i]; - Set nextStates = new HashSet<>(); - for (int state : states) { - int count = a.initTransition(state, t); - for (int j = 0; j < count; j++) { - a.getNextTransition(t); - if (digit >= t.min && digit <= t.max) { - nextStates.add(t.dest); - } - } - } - states = nextStates; - if (states.isEmpty()) { - return false; - } - } - - for (int state : states) { - if (a.isAccept(state)) { - return true; - } - } - - return false; - } - - /** - * Stupid, slow brute-force, yet hopefully bug-free, synonym filter. 
- */ - private Automaton slowSynFilter(String doc, List syns) { - String[] tokens = doc.split(" +"); - //if (VERBOSE) { - // System.out.println(" doc has " + tokens.length + " tokens"); - //} - int i = 0; - Automaton.Builder a = new Automaton.Builder(); - int lastState = a.createState(); - while (i < tokens.length) { - // Consider all possible syn matches starting at this point: - assert tokens[i].length() == 1; - //if (VERBOSE) { - // System.out.println(" i=" + i); - //} - - List matches = new ArrayList<>(); - for (OneSyn syn : syns) { - if (i + syn.in.length <= tokens.length) { - boolean match = true; - for (int j = 0; j < syn.in.length; j++) { - if (tokens[i + j].charAt(0) != syn.in[j]) { - match = false; - break; - } - } - - if (match) { - if (matches.isEmpty() == false) { - if (syn.in.length < matches.get(0).in.length) { - // Greedy matching: we already found longer syns matching here - continue; - } else if (syn.in.length > matches.get(0).in.length) { - // Greedy matching: all previous matches were shorter, so we drop them - matches.clear(); - } else { - // Keep the current matches: we allow multiple synonyms matching the same input string - } - } - - matches.add(syn); - } - } - } - - int nextState = a.createState(); - - if (matches.isEmpty() == false) { - // We have match(es) starting at this token - //if (VERBOSE) { - // System.out.println(" matches @ i=" + i + ": " + matches); - //} - // We keepOrig if any of the matches said to: - boolean keepOrig = false; - for (OneSyn syn : matches) { - keepOrig |= syn.keepOrig; - } - - if (keepOrig) { - // Add path for the original tokens - addSidePath(a, lastState, nextState, matches.get(0).in); - } - - for (OneSyn syn : matches) { - addSidePath(a, lastState, nextState, syn.out); - } - - i += matches.get(0).in.length; - } else { - a.addTransition(lastState, nextState, tokens[i].charAt(0)); - i++; - } - - lastState = nextState; - } - - a.setAccept(lastState, true); - - return topoSort(a.finish()); - } - - /** - * Just creates a side path from startState to endState with the provided tokens. 
- */ - private static void addSidePath(Automaton.Builder a, int startState, int endState, char[] tokens) { - int lastState = startState; - for (int i = 0; i < tokens.length; i++) { - int nextState; - if (i == tokens.length - 1) { - nextState = endState; - } else { - nextState = a.createState(); - } - - a.addTransition(lastState, nextState, tokens[i]); - - lastState = nextState; - } - } - - private Automaton toAutomaton(TokenStream ts) throws IOException { - PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); - PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class); - CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - ts.reset(); - Automaton a = new Automaton(); - int srcNode = -1; - int destNode = -1; - int state = a.createState(); - while (ts.incrementToken()) { - assert termAtt.length() == 1; - char c = termAtt.charAt(0); - int posInc = posIncAtt.getPositionIncrement(); - if (posInc != 0) { - srcNode += posInc; - while (state < srcNode) { - state = a.createState(); - } - } - destNode = srcNode + posLenAtt.getPositionLength(); - while (state < destNode) { - state = a.createState(); - } - a.addTransition(srcNode, destNode, c); - } - ts.end(); - ts.close(); - a.finishState(); - a.setAccept(destNode, true); - return a; - } - - /** - * Renumbers nodes according to their topo sort - */ - private Automaton topoSort(Automaton in) { - int[] newToOld = Operations.topoSortStates(in); - int[] oldToNew = new int[newToOld.length]; - - Automaton.Builder a = new Automaton.Builder(); - //System.out.println("remap:"); - for (int i = 0; i < newToOld.length; i++) { - a.createState(); - oldToNew[newToOld[i]] = i; - //System.out.println(" " + newToOld[i] + " -> " + i); - if (in.isAccept(newToOld[i])) { - a.setAccept(i, true); - //System.out.println(" **"); - } - } - - Transition t = new Transition(); - for (int i = 0; i < newToOld.length; i++) { - int count = in.initTransition(newToOld[i], t); - for (int j = 0; j < count; j++) { - in.getNextTransition(t); - a.addTransition(i, oldToNew[t.dest], t.min, t.max); - } - } - - return a.finish(); - } - - /** - * Helper method to validate all strings that can be generated from a token stream. Uses {@link - * TokenStreamToAutomaton} to create an automaton. Asserts the finite strings of the automaton - * are all and only the given valid strings. - * - * @param analyzer analyzer containing the SynonymFilter under test. - * @param text text to be analyzed. - * @param expectedStrings all expected finite strings. - */ - public void assertAllStrings(Analyzer analyzer, String text, String[] expectedStrings) throws IOException { - TokenStream tokenStream = analyzer.tokenStream("dummy", text); - try { - Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream); - Set finiteStrings = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1); - - assertEquals("Invalid resulting strings count. 
Expected " + expectedStrings.length + " was " + finiteStrings.size(), - expectedStrings.length, finiteStrings.size()); - - Set expectedStringsSet = new HashSet<>(Arrays.asList(expectedStrings)); - - BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder(); - for (IntsRef ir : finiteStrings) { - String s = Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '); - assertTrue("Unexpected string found: " + s, expectedStringsSet.contains(s)); - } - } finally { - tokenStream.close(); - } - } -} diff --git a/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java b/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java new file mode 100644 index 0000000000000..edd202fae5fa8 --- /dev/null +++ b/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java @@ -0,0 +1,158 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.ESTokenStreamTestCase; + +import java.io.IOException; +import java.io.StringReader; + +/** + * Base class to test {@link WordDelimiterGraphTokenFilterFactory} and {@link WordDelimiterGraphTokenFilterFactory} + */ +public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESTokenStreamTestCase { + final String type; + + public BaseWordDelimiterTokenFilterFactoryTestCase(String type) { + this.type = type; + } + + public void testDefault() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testCatenateWords() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + 
.put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testCatenateNumbers() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") + .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testCatenateAll() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") + .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") + .put("index.analysis.filter.my_word_delimiter.catenate_all", "true") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testSplitOnCaseChange() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot"; + String[] expected = new String[]{"PowerShot"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testPreserveOriginal() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + 
.put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.preserve_original", "true") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + public void testStemEnglishPossessive() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil", "s"}; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */ + public void testPartsAndCatenate() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot"; + String[] expected = new String[]{"Power", "PowerShot", "Shot" }; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } +} diff --git a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java new file mode 100644 index 0000000000000..793c634e6a412 --- /dev/null +++ b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java @@ -0,0 +1,56 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.test.ESTestCase; + +import java.io.IOException; +import java.io.StringReader; + +public class WordDelimiterGraphTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase { + public WordDelimiterGraphTokenFilterFactoryTests() { + super("word_delimiter_graph"); + } + + public void testMultiTerms() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_all", "true") + .put("index.analysis.filter.my_word_delimiter.preserve_original", "true") + .build()); + + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; + String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42", + "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se", + "ONeil", "O'Neil's", "O", "Neil" }; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1}; + int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1}; + assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null, + expectedIncr, expectedPosLen, null); + } +} diff --git a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java index 1a7903bcfac09..bfa93c4eb01b7 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java @@ -19,131 +19,8 @@ package org.elasticsearch.index.analysis; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.env.Environment; -import org.elasticsearch.test.ESTestCase; -import org.elasticsearch.test.ESTokenStreamTestCase; - -import java.io.IOException; -import java.io.StringReader; - -public class WordDelimiterTokenFilterFactoryTests extends ESTokenStreamTestCase { - public void testDefault() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"}; - Tokenizer 
tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testCatenateWords() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") - .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testCatenateNumbers() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") - .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testCatenateAll() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") - .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") - .put("index.analysis.filter.my_word_delimiter.catenate_all", "true") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testSplitOnCaseChange() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot"; - String[] expected = new String[]{"PowerShot"}; - 
Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testPreserveOriginal() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.preserve_original", "true") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - public void testStemEnglishPossessive() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil", "s"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */ - public void testPartsAndCatenate() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter") - .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") - .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot"; - String[] expected = new String[]{"Power", "PowerShot", "Shot" }; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); +public class WordDelimiterTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase { + public WordDelimiterTokenFilterFactoryTests() { + super("word_delimiter"); } } diff --git a/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc new file mode 100644 index 0000000000000..b415d1c376ec1 --- /dev/null +++ b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc @@ -0,0 +1,97 @@ +[[analysis-word-delimiter-graph-tokenfilter]] +=== Word Delimiter Graph Token Filter + +experimental[] + +Named 
`word_delimiter_graph`, it Splits words into subwords and performs +optional transformations on subword groups. Words are split into +subwords with the following rules: + +* split on intra-word delimiters (by default, all non alpha-numeric +characters). +* "Wi-Fi" -> "Wi", "Fi" +* split on case transitions: "PowerShot" -> "Power", "Shot" +* split on letter-number transitions: "SD500" -> "SD", "500" +* leading and trailing intra-word delimiters on each subword are +ignored: "//hello---there, 'dude'" -> "hello", "there", "dude" +* trailing "'s" are removed for each subword: "O'Neil's" -> "O", "Neil" + +Unlike the `word_delimiter`, this token filter correctly handles positions for +multi-term expansion at search-time when any of the following options +are set to `true`: + + * `preserve_original` + * `catenate_numbers` + * `catenate_words` + * `catenate_all` + +Parameters include: + +`generate_word_parts`:: + If `true` causes parts of words to be + generated: "PowerShot" => "Power" "Shot". Defaults to `true`. + +`generate_number_parts`:: + If `true` causes number subwords to be + generated: "500-42" => "500" "42". Defaults to `true`. + +`catenate_words`:: + If `true` causes maximum runs of word parts to be + catenated: "wi-fi" => "wifi". Defaults to `false`. + +`catenate_numbers`:: + If `true` causes maximum runs of number parts to + be catenated: "500-42" => "50042". Defaults to `false`. + +`catenate_all`:: + If `true` causes all subword parts to be catenated: + "wi-fi-4000" => "wifi4000". Defaults to `false`. + +`split_on_case_change`:: + If `true` causes "PowerShot" to be two tokens; + ("Power-Shot" remains two parts regardless). Defaults to `true`. + +`preserve_original`:: + If `true` includes original words in subwords: + "500-42" => "500-42" "500" "42". Defaults to `false`. + +`split_on_numerics`:: + If `true` causes "j2se" to be three tokens; "j" + "2" "se". Defaults to `true`. + +`stem_english_possessive`:: + If `true` causes trailing "'s" to be + removed for each subword: "O'Neil's" => "O", "Neil". Defaults to `true`. + +Advanced settings include: + +`protected_words`:: + A list of words protected from being split by the filter. + Either an array, or set `protected_words_path` instead, which resolves + to a file containing the protected words (one per line). + A relative path is resolved against the `config/` directory. + +`type_table`:: + A custom type mapping table, for example (when configured + using `type_table_path`): + +[source,js] +-------------------------------------------------- + # Map the $, %, '.', and ',' characters to DIGIT + # This might be useful for financial data. + $ => DIGIT + % => DIGIT + . => DIGIT + \\u002C => DIGIT + + # in some cases you might not want to split on ZWJ + # this also tests the case where we need a bigger byte[] + # see http://en.wikipedia.org/wiki/Zero-width_joiner + \\u200D => ALPHANUM +-------------------------------------------------- + +NOTE: Using a tokenizer like the `standard` tokenizer may interfere with +the `catenate_*` and `preserve_original` parameters, as the original +string may already have lost punctuation during tokenization. Instead, +you may want to use the `whitespace` tokenizer.
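+
+As a minimal sketch, the filter could be wired into a custom analyzer with
+index settings along these lines (the `my_index`, `my_analyzer`, and
+`my_word_delimiter_graph` names are only illustrative):
+
+[source,js]
+--------------------------------------------------
+PUT /my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "type": "custom",
+          "tokenizer": "whitespace",
+          "filter": ["my_word_delimiter_graph"]
+        }
+      },
+      "filter": {
+        "my_word_delimiter_graph": {
+          "type": "word_delimiter_graph",
+          "preserve_original": true,
+          "catenate_all": true
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------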
+ From 271467e5dfdf0958f339bfc4a41903dfce681947 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Fri, 24 Feb 2017 00:52:35 +0100 Subject: [PATCH 2/2] Address review --- .../WordDelimiterGraphTokenFilterFactory.java | 4 --- ...rdDelimiterTokenFilterFactoryTestCase.java | 30 ++++++------------- ...DelimiterGraphTokenFilterFactoryTests.java | 19 ++++++++++++ .../WordDelimiterTokenFilterFactoryTests.java | 25 ++++++++++++++++ docs/build.gradle | 1 + .../word-delimiter-graph-tokenfilter.asciidoc | 2 +- 6 files changed, 55 insertions(+), 26 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java index 20fc7f0ad9c81..7cdc215f1b34a 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java @@ -98,8 +98,4 @@ private int getFlag(int flag, Settings settings, String key, boolean defaultValu } return 0; } - - int getFlags() { - return flags; - } } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java b/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java index edd202fae5fa8..713e9424759a0 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java @@ -30,7 +30,7 @@ import java.io.StringReader; /** - * Base class to test {@link WordDelimiterGraphTokenFilterFactory} and {@link WordDelimiterGraphTokenFilterFactory} + * Base class to test {@link WordDelimiterTokenFilterFactory} and {@link WordDelimiterGraphTokenFilterFactory} */ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESTokenStreamTestCase { final String type; @@ -46,7 +46,8 @@ public void testDefault() throws IOException { .build()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"}; + String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", + "fi", "4000", "j", "2", "se", "O", "Neil"}; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(source)); assertTokenStreamContents(tokenFilter.create(tokenizer), expected); @@ -76,7 +77,8 @@ public void testCatenateNumbers() throws IOException { .build()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"}; + String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", + "se", "O", "Neil"}; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(source)); assertTokenStreamContents(tokenFilter.create(tokenizer), expected); @@ -120,7 +122,8 @@ public void testPreserveOriginal() throws IOException { .build()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - 
String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"}; + String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi", + "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"}; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(source)); assertTokenStreamContents(tokenFilter.create(tokenizer), expected); @@ -134,23 +137,8 @@ public void testStemEnglishPossessive() throws IOException { .build()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil", "s"}; - Tokenizer tokenizer = new WhitespaceTokenizer(); - tokenizer.setReader(new StringReader(source)); - assertTokenStreamContents(tokenFilter.create(tokenizer), expected); - } - - /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */ - public void testPartsAndCatenate() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", type) - .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") - .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") - .build()); - TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); - String source = "PowerShot"; - String[] expected = new String[]{"Power", "PowerShot", "Shot" }; + String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", + "se", "O", "Neil", "s"}; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(source)); assertTokenStreamContents(tokenFilter.create(tokenizer), expected); diff --git a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java index 793c634e6a412..2ae4267104a05 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java @@ -53,4 +53,23 @@ public void testMultiTerms() throws IOException { assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null, expectedIncr, expectedPosLen, null); } + + /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */ + public void testPartsAndCatenate() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot"; + int[] expectedIncr = new int[]{1, 0, 1}; + int[] expectedPosLen = 
new int[]{2, 1, 1}; + String[] expected = new String[]{"PowerShot", "Power", "Shot" }; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null, + expectedIncr, expectedPosLen, null); + } } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java index bfa93c4eb01b7..1e919e00bbb29 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java @@ -19,8 +19,33 @@ package org.elasticsearch.index.analysis; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.test.ESTestCase; + +import java.io.IOException; +import java.io.StringReader; + public class WordDelimiterTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase { public WordDelimiterTokenFilterFactoryTests() { super("word_delimiter"); } + + /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */ + public void testPartsAndCatenate() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") + .build()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); + String source = "PowerShot"; + String[] expected = new String[]{"Power", "PowerShot", "Shot" }; + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } } diff --git a/docs/build.gradle b/docs/build.gradle index 36727b12e5097..9fd593e2faeb0 100644 --- a/docs/build.gradle +++ b/docs/build.gradle @@ -81,6 +81,7 @@ buildRestTests.expectedUnconvertedCandidates = [ 'reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc', 'reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc', 'reference/analysis/tokenfilters/word-delimiter-tokenfilter.asciidoc', + 'reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc', 'reference/cat/snapshots.asciidoc', 'reference/cat/templates.asciidoc', 'reference/cat/thread_pool.asciidoc', diff --git a/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc index b415d1c376ec1..01176fa5636c8 100644 --- a/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/word-delimiter-graph-tokenfilter.asciidoc @@ -3,7 +3,7 @@ experimental[] -Named `word_delimiter_graph`, it Splits words into subwords and performs +Named `word_delimiter_graph`, it splits words into subwords and performs optional transformations on subword groups. Words are split into subwords with the following rules: