From b8188b82b7deb341928cbbeb9dc3cecfb397caa8 Mon Sep 17 00:00:00 2001 From: Andy Bristol Date: Wed, 7 Jun 2017 10:46:48 -0700 Subject: [PATCH 1/4] expose simplepattern and simplepatternsplit tokenizers Register these experimental tokenizers. Their default patterns are both set to the empty string. These tokenizers only seem useful if there is a pattern the user has in mind, so there aren't really "sensible" defaults. However tokenizer factories are instantiated at index creation time, so they blow up if there's no default pattern. Add a rest test and entries in the reference for each tokenizer For #23363 --- .../SimplePatternSplitTokenizerFactory.java | 46 +++++++ .../SimplePatternTokenizerFactory.java | 46 +++++++ .../indices/analysis/AnalysisModule.java | 4 + docs/reference/analysis/tokenizers.asciidoc | 16 ++- .../simplepattern-tokenizer.asciidoc | 117 +++++++++++++++++ .../simplepatternsplit-tokenizer.asciidoc | 118 ++++++++++++++++++ .../test/analysis-common/30_tokenizers.yml | 30 +++++ .../analysis/AnalysisFactoryTestCase.java | 32 ++--- 8 files changed, 392 insertions(+), 17 deletions(-) create mode 100644 core/src/main/java/org/elasticsearch/index/analysis/SimplePatternSplitTokenizerFactory.java create mode 100644 core/src/main/java/org/elasticsearch/index/analysis/SimplePatternTokenizerFactory.java create mode 100644 docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc create mode 100644 docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc diff --git a/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternSplitTokenizerFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternSplitTokenizerFactory.java new file mode 100644 index 0000000000000..5d08cf903f062 --- /dev/null +++ b/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternSplitTokenizerFactory.java @@ -0,0 +1,46 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizer; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; + +public class SimplePatternSplitTokenizerFactory extends AbstractTokenizerFactory { + + private final String pattern; + + public SimplePatternSplitTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + super(indexSettings, name, settings); + + String pattern = settings.get("pattern", ""); + if (pattern == null) { + throw new IllegalArgumentException("pattern is missing for [" + name + "] tokenizer of type 'simplepatternsplit'"); + } + this.pattern = pattern; + } + + @Override + public Tokenizer create() { + return new SimplePatternSplitTokenizer(pattern); + } +} diff --git a/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternTokenizerFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternTokenizerFactory.java new file mode 100644 index 0000000000000..bb23a4609abbf --- /dev/null +++ b/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternTokenizerFactory.java @@ 
-0,0 +1,46 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.pattern.SimplePatternTokenizer; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; + +public class SimplePatternTokenizerFactory extends AbstractTokenizerFactory { + + private final String pattern; + + public SimplePatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + super(indexSettings, name, settings); + + String pattern = settings.get("pattern", ""); + if (pattern == null) { + throw new IllegalArgumentException("pattern is missing for [" + name + "] tokenizer of type 'simplepattern'"); + } + this.pattern = pattern; + } + + @Override + public Tokenizer create() { + return new SimplePatternTokenizer(pattern); + } +} diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index 3f26b722f41ce..e7dccdc9fd2d9 100644 --- 
a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -113,6 +113,8 @@ import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory; import org.elasticsearch.index.analysis.ShingleTokenFilterFactory; import org.elasticsearch.index.analysis.SimpleAnalyzerProvider; +import org.elasticsearch.index.analysis.SimplePatternSplitTokenizerFactory; +import org.elasticsearch.index.analysis.SimplePatternTokenizerFactory; import org.elasticsearch.index.analysis.SnowballAnalyzerProvider; import org.elasticsearch.index.analysis.SoraniAnalyzerProvider; import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory; @@ -343,6 +345,8 @@ private NamedRegistry> setupTokenizers(List>:: + +The `simplepattern` tokenizer uses a regular expression to capture matching +text as terms. It uses a restricted subset of regular expression features +and is generally faster than the `pattern` tokenizer. + +<>:: + +The `simplepatternsplit` tokenizer uses the same restricted subset as +the `simplepattern` tokenizer, but splits the input at matches rather than +returning the matches as terms. 
+ <>:: The `path_hierarchy` tokenizer takes a hierarchical value like a filesystem @@ -131,6 +143,8 @@ include::tokenizers/keyword-tokenizer.asciidoc[] include::tokenizers/pattern-tokenizer.asciidoc[] -include::tokenizers/pathhierarchy-tokenizer.asciidoc[] +include::tokenizers/simplepattern-tokenizer.asciidoc[] +include::tokenizers/simplepatternsplit-tokenizer.asciidoc[] +include::tokenizers/pathhierarchy-tokenizer.asciidoc[] diff --git a/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc new file mode 100644 index 0000000000000..a0c3c7dcf1288 --- /dev/null +++ b/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc @@ -0,0 +1,117 @@ +[[analysis-simplepattern-tokenizer]] +=== Simple Pattern Tokenizer + +experimental[] + +The `simplepattern` tokenizer uses a regular expression to capture matching +text as terms. The set of regular expression features it supports is more +limited than the <> tokenizer, but the +tokenization is generally faster. + +This tokenizer does not support splitting the input on a pattern match, unlike +the <> tokenizer. To split on pattern +matches using the same restricted regular expression subset, see the +<> tokenizer. + +This tokenizer uses http://lucene.apache.org/core//6_5_1/core/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions]. +For an explanation of the supported features and syntax, see <>. + +The default pattern is the empty string, which produces no terms. This +tokenizer should always be configured with a non-default pattern. + +[WARNING] +.Beware of Pathological Regular Expressions +======================================== + +A badly written regular expression could run very slowly or even throw a +StackOverflowError and cause the node it is running on to exit suddenly. + +Read more about http://www.regular-expressions.info/catastrophic.html[pathological regular expressions and how to avoid them]. 
+ +======================================== + +[float] +=== Configuration + +The `simplepattern` tokenizer accepts the following parameters: + +[horizontal] +`pattern`:: + + A http://lucene.apache.org/core//6_5_1/core/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string. + +[float] +=== Example configuration + +This example configures the `simplepattern` tokenizer to produce terms that are +three-digit numbers. + +[source,js] +---------------------------- +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "my_tokenizer" + } + }, + "tokenizer": { + "my_tokenizer": { + "type": "simplepattern", + "pattern": "[0123456789]{3}" + } + } + } + } +} + +POST my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "fd-786-335-514-x" +} +---------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens" : [ + { + "token" : "786", + "start_offset" : 3, + "end_offset" : 6, + "type" : "word", + "position" : 0 + }, + { + "token" : "335", + "start_offset" : 7, + "end_offset" : 10, + "type" : "word", + "position" : 1 + }, + { + "token" : "514", + "start_offset" : 11, + "end_offset" : 14, + "type" : "word", + "position" : 2 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + +The above example produces these terms: + +[source,text] +--------------------------- +[ 786, 335, 514 ] +--------------------------- diff --git a/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc new file mode 100644 index 0000000000000..04e17a0e68501 --- /dev/null +++ b/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc @@ -0,0 +1,118 @@ +[[analysis-simplepatternsplit-tokenizer]] +=== Simple Pattern Split Tokenizer + +experimental[] + +The `simplepatternsplit` tokenizer uses a regular expression to split 
the +input into terms at pattern matches. The set of regular expression features it +supports is more limited than the <> +tokenizer, but the tokenization is generally faster. + +This tokenizer does not produce terms from the matches themselves. To produce +terms from matches using patterns in the same restricted regular expression +subset, see the <> +tokenizer. + +This tokenizer uses http://lucene.apache.org/core//6_5_1/core/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions]. +For an explanation of the supported features and syntax, see <>. + +The default pattern is the empty string, which produces one term containing the +full input. This tokenizer should always be configured with a non-default +pattern. + +[WARNING] +.Beware of Pathological Regular Expressions +======================================== + +A badly written regular expression could run very slowly or even throw a +StackOverflowError and cause the node it is running on to exit suddenly. + +Read more about http://www.regular-expressions.info/catastrophic.html[pathological regular expressions and how to avoid them]. + +======================================== + +[float] +=== Configuration + +The `simplepatternsplit` tokenizer accepts the following parameters: + +[horizontal] +`pattern`:: + + A http://lucene.apache.org/core//6_5_1/core/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string. + +[float] +=== Example configuration + +This example configures the `simplepatternsplit` tokenizer to split the input +text on underscores. 
+ +[source,js] +---------------------------- +PUT my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "my_tokenizer" + } + }, + "tokenizer": { + "my_tokenizer": { + "type": "simplepatternsplit", + "pattern": "_" + } + } + } + } +} + +POST my_index/_analyze +{ + "analyzer": "my_analyzer", + "text": "an_underscored_phrase" +} +---------------------------- +// CONSOLE + +///////////////////// + +[source,js] +---------------------------- +{ + "tokens" : [ + { + "token" : "an", + "start_offset" : 0, + "end_offset" : 2, + "type" : "word", + "position" : 0 + }, + { + "token" : "underscored", + "start_offset" : 3, + "end_offset" : 14, + "type" : "word", + "position" : 1 + }, + { + "token" : "phrase", + "start_offset" : 15, + "end_offset" : 21, + "type" : "word", + "position" : 2 + } + ] +} +---------------------------- +// TESTRESPONSE + +///////////////////// + +The above example produces these terms: + +[source,text] +--------------------------- +[ an, underscored, phrase ] +--------------------------- diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml index 174a15f772bd9..7063437ad4643 100644 --- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml @@ -25,3 +25,33 @@ - match: { detail.tokenizer.tokens.0.token: go } - match: { detail.tokenizer.tokens.1.token: oo } - match: { detail.tokenizer.tokens.2.token: od } + +--- +"simplepattern": + - do: + indices.analyze: + body: + text: "a6bf fooo ff61" + explain: true + tokenizer: + type: simplepattern + pattern: "[abcdef0123456789]{4}" + - length: { detail.tokenizer.tokens: 2 } + - match: { detail.tokenizer.name: _anonymous_tokenizer } + - match: { detail.tokenizer.tokens.0.token: a6bf } 
+ - match: { detail.tokenizer.tokens.1.token: ff61 } + +--- +"simplepatternsplit": + - do: + indices.analyze: + body: + text: "foo==bar" + explain: true + tokenizer: + type: simplepatternsplit + pattern: == + - length: { detail.tokenizer.tokens: 2 } + - match: { detail.tokenizer.name: _anonymous_tokenizer } + - match: { detail.tokenizer.tokens.0.token: foo } + - match: { detail.tokenizer.tokens.1.token: bar } diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index fd8a5e7cd9aed..a962bd323fd20 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -71,6 +71,8 @@ import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory; import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory; import org.elasticsearch.index.analysis.ShingleTokenFilterFactory; +import org.elasticsearch.index.analysis.SimplePatternSplitTokenizerFactory; +import org.elasticsearch.index.analysis.SimplePatternTokenizerFactory; import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory; import org.elasticsearch.index.analysis.StandardTokenFilterFactory; import org.elasticsearch.index.analysis.StandardTokenizerFactory; @@ -129,25 +131,23 @@ private static String toCamelCase(String s) { static final Map> KNOWN_TOKENIZERS = new MapBuilder>() // exposed in ES - .put("classic", ClassicTokenizerFactory.class) - .put("edgengram", EdgeNGramTokenizerFactory.class) - .put("keyword", KeywordTokenizerFactory.class) - .put("letter", LetterTokenizerFactory.class) - .put("lowercase", LowerCaseTokenizerFactory.class) - .put("ngram", NGramTokenizerFactory.class) - .put("pathhierarchy", PathHierarchyTokenizerFactory.class) - .put("pattern", PatternTokenizerFactory.class) - 
.put("standard", StandardTokenizerFactory.class) - .put("thai", ThaiTokenizerFactory.class) - .put("uax29urlemail", UAX29URLEmailTokenizerFactory.class) - .put("whitespace", WhitespaceTokenizerFactory.class) + .put("classic", ClassicTokenizerFactory.class) + .put("edgengram", EdgeNGramTokenizerFactory.class) + .put("keyword", KeywordTokenizerFactory.class) + .put("letter", LetterTokenizerFactory.class) + .put("lowercase", LowerCaseTokenizerFactory.class) + .put("ngram", NGramTokenizerFactory.class) + .put("pathhierarchy", PathHierarchyTokenizerFactory.class) + .put("pattern", PatternTokenizerFactory.class) + .put("simplepattern", SimplePatternTokenizerFactory.class) + .put("simplepatternsplit", SimplePatternSplitTokenizerFactory.class) + .put("standard", StandardTokenizerFactory.class) + .put("thai", ThaiTokenizerFactory.class) + .put("uax29urlemail", UAX29URLEmailTokenizerFactory.class) + .put("whitespace", WhitespaceTokenizerFactory.class) // this one "seems to mess up offsets". probably shouldn't be a tokenizer... .put("wikipedia", Void.class) - - // TODO: expose these - .put("simplepattern", Void.class) - .put("simplepatternsplit", Void.class) .immutableMap(); static final Map> KNOWN_TOKENFILTERS = new MapBuilder>() From 66592578c7655e93e4ae0efa10a6e971b7061306 Mon Sep 17 00:00:00 2001 From: Andy Bristol Date: Mon, 12 Jun 2017 07:57:04 -0700 Subject: [PATCH 2/4] expose simplepattern and simplepatternsplit tokenizers Fixes for code review Take out admonition blocks in reference detail pages on these tokenizers because Lucene's regexes are better protected against being too complex or causing deep stacks. 
Move these tokenizers to the common-analysis module because that's where we're relocating code that depends on lucene-analyzers-common For #23363 --- .../indices/analysis/AnalysisModule.java | 4 ---- .../tokenizers/simplepattern-tokenizer.asciidoc | 11 ----------- .../tokenizers/simplepatternsplit-tokenizer.asciidoc | 11 ----------- .../analysis/common/CommonAnalysisPlugin.java | 10 ++++++++++ .../common}/SimplePatternSplitTokenizerFactory.java | 6 ++---- .../common}/SimplePatternTokenizerFactory.java | 6 ++---- .../analysis/common/CommonAnalysisFactoryTests.java | 2 ++ .../indices/analysis/AnalysisFactoryTestCase.java | 6 ++---- 8 files changed, 18 insertions(+), 38 deletions(-) rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/SimplePatternSplitTokenizerFactory.java (88%) rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/SimplePatternTokenizerFactory.java (88%) diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index e7dccdc9fd2d9..3f26b722f41ce 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -113,8 +113,6 @@ import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory; import org.elasticsearch.index.analysis.ShingleTokenFilterFactory; import org.elasticsearch.index.analysis.SimpleAnalyzerProvider; -import org.elasticsearch.index.analysis.SimplePatternSplitTokenizerFactory; -import org.elasticsearch.index.analysis.SimplePatternTokenizerFactory; import org.elasticsearch.index.analysis.SnowballAnalyzerProvider; import org.elasticsearch.index.analysis.SoraniAnalyzerProvider; import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory; @@ 
-345,8 +343,6 @@ private NamedRegistry> setupTokenizers(List> getTokenFilters() { return filters; } + @Override public Map> getCharFilters() { Map> filters = new TreeMap<>(); filters.put("html_strip", HtmlStripCharFilterFactory::new); @@ -108,6 +110,14 @@ public Map> getCharFilters() { return filters; } + @Override + public Map> getTokenizers() { + Map> tokenizers = new TreeMap<>(); + tokenizers.put("simplepattern", SimplePatternTokenizerFactory::new); + tokenizers.put("simplepatternsplit", SimplePatternSplitTokenizerFactory::new); + return tokenizers; + } + @Override public List getPreConfiguredCharFilters() { List filters = new ArrayList<>(); diff --git a/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternSplitTokenizerFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java similarity index 88% rename from core/src/main/java/org/elasticsearch/index/analysis/SimplePatternSplitTokenizerFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java index 5d08cf903f062..b4c16b35dd946 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternSplitTokenizerFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java @@ -17,13 +17,14 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizer; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenizerFactory; public class SimplePatternSplitTokenizerFactory extends AbstractTokenizerFactory { @@ -33,9 +34,6 @@ public SimplePatternSplitTokenizerFactory(IndexSettings indexSettings, Environme super(indexSettings, name, settings); String pattern = settings.get("pattern", ""); - if (pattern == null) { - throw new IllegalArgumentException("pattern is missing for [" + name + "] tokenizer of type 'simplepatternsplit'"); - } this.pattern = pattern; } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternTokenizerFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java similarity index 88% rename from core/src/main/java/org/elasticsearch/index/analysis/SimplePatternTokenizerFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java index bb23a4609abbf..530f7e5bef4e6 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternTokenizerFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java @@ -17,13 +17,14 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.pattern.SimplePatternTokenizer; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenizerFactory; public class SimplePatternTokenizerFactory extends AbstractTokenizerFactory { @@ -33,9 +34,6 @@ public SimplePatternTokenizerFactory(IndexSettings indexSettings, Environment en super(indexSettings, name, settings); String pattern = settings.get("pattern", ""); - if (pattern == null) { - throw new IllegalArgumentException("pattern is missing for [" + name + "] tokenizer of type 'simplepattern'"); - } this.pattern = pattern; } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java index 59164f7506504..f7313572e13ee 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -43,6 +43,8 @@ public CommonAnalysisFactoryTests() { @Override protected Map> getTokenizers() { Map> tokenizers = new TreeMap<>(super.getTokenizers()); + tokenizers.put("simplepattern", SimplePatternTokenizerFactory.class); + tokenizers.put("simplepatternsplit", SimplePatternSplitTokenizerFactory.class); return tokenizers; } diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index a962bd323fd20..bcb9dfbe5a7b0 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ 
b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -71,8 +71,6 @@ import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory; import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory; import org.elasticsearch.index.analysis.ShingleTokenFilterFactory; -import org.elasticsearch.index.analysis.SimplePatternSplitTokenizerFactory; -import org.elasticsearch.index.analysis.SimplePatternTokenizerFactory; import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory; import org.elasticsearch.index.analysis.StandardTokenFilterFactory; import org.elasticsearch.index.analysis.StandardTokenizerFactory; @@ -139,8 +137,8 @@ private static String toCamelCase(String s) { .put("ngram", NGramTokenizerFactory.class) .put("pathhierarchy", PathHierarchyTokenizerFactory.class) .put("pattern", PatternTokenizerFactory.class) - .put("simplepattern", SimplePatternTokenizerFactory.class) - .put("simplepatternsplit", SimplePatternSplitTokenizerFactory.class) + .put("simplepattern", MovedToAnalysisCommon.class) + .put("simplepatternsplit", MovedToAnalysisCommon.class) .put("standard", StandardTokenizerFactory.class) .put("thai", ThaiTokenizerFactory.class) .put("uax29urlemail", UAX29URLEmailTokenizerFactory.class) From d97846a6eea5d312df8d5c393ca7c7bd5588d4ec Mon Sep 17 00:00:00 2001 From: Andy Bristol Date: Mon, 12 Jun 2017 11:39:05 -0700 Subject: [PATCH 3/4] expose simplepattern and simplepatternsplit tokenizers Fix for code review to cleanup unnecessary variables For #23363 --- .../analysis/common/SimplePatternSplitTokenizerFactory.java | 3 +-- .../analysis/common/SimplePatternTokenizerFactory.java | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java index 
b4c16b35dd946..f861ec3792f5e 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java @@ -33,8 +33,7 @@ public class SimplePatternSplitTokenizerFactory extends AbstractTokenizerFactory public SimplePatternSplitTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); - String pattern = settings.get("pattern", ""); - this.pattern = pattern; + pattern = settings.get("pattern", ""); } @Override diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java index 530f7e5bef4e6..6db3cfa67a318 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java @@ -33,8 +33,7 @@ public class SimplePatternTokenizerFactory extends AbstractTokenizerFactory { public SimplePatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); - String pattern = settings.get("pattern", ""); - this.pattern = pattern; + pattern = settings.get("pattern", ""); } @Override From b1e503c2e25f29c894f2161240513319df3a9f07 Mon Sep 17 00:00:00 2001 From: Andy Bristol Date: Mon, 12 Jun 2017 16:01:58 -0700 Subject: [PATCH 4/4] expose simplepattern and simplepatternsplit tokenizers Make links to lucene javadocs relative to the lucene-core-javadoc property so they'll stay up to date as we change lucene versions Whitespace formatting in tokenizer docs Whitespace formatting in AnalysisFactoryTestCase so that we don't have to 
change spacing every time we edit that map Clearer usage in the header for simplepatternsplit's section For #23363 --- docs/reference/analysis/tokenizers.asciidoc | 6 ++-- .../simplepattern-tokenizer.asciidoc | 5 ++-- .../simplepatternsplit-tokenizer.asciidoc | 5 ++-- .../analysis/AnalysisFactoryTestCase.java | 28 +++++++++---------- 4 files changed, 21 insertions(+), 23 deletions(-) diff --git a/docs/reference/analysis/tokenizers.asciidoc b/docs/reference/analysis/tokenizers.asciidoc index c26974b9cad61..f1e0899d7abf1 100644 --- a/docs/reference/analysis/tokenizers.asciidoc +++ b/docs/reference/analysis/tokenizers.asciidoc @@ -105,9 +105,9 @@ and is generally faster than the `pattern` tokenizer. <>:: -The `simplepatternsplit` tokenizer uses the same restricted subset as -the `simplepattern` tokenizer, but splits the input at matches rather than -returning the matches as terms. +The `simplepatternsplit` tokenizer uses the same restricted regular expression +subset as the `simplepattern` tokenizer, but splits the input at matches rather +than returning the matches as terms. <>:: diff --git a/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc index 997b3b5251dd7..bee92c75d26cd 100644 --- a/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc @@ -13,7 +13,7 @@ the <> tokenizer. To split on pattern matches using the same restricted regular expression subset, see the <> tokenizer. -This tokenizer uses http://lucene.apache.org/core//6_5_1/core/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions]. +This tokenizer uses {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions]. For an explanation of the supported features and syntax, see <>. The default pattern is the empty string, which produces no terms. 
This @@ -26,8 +26,7 @@ The `simplepattern` tokenizer accepts the following parameters: [horizontal] `pattern`:: - - A http://lucene.apache.org/core//6_5_1/core/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string. + {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string. [float] === Example configuration diff --git a/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc index dc850d09fc16b..c009f8cb7a400 100644 --- a/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc @@ -13,7 +13,7 @@ terms from matches using patterns in the same restricted regular expression subset, see the <> tokenizer. -This tokenizer uses http://lucene.apache.org/core//6_5_1/core/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions]. +This tokenizer uses {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions]. For an explanation of the supported features and syntax, see <>. The default pattern is the empty string, which produces one term containing the @@ -27,8 +27,7 @@ The `simplepatternsplit` tokenizer accepts the following parameters: [horizontal] `pattern`:: - - A http://lucene.apache.org/core//6_5_1/core/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string. + A {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string. 
[float] === Example configuration diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index bcb9dfbe5a7b0..a3fe52d005c24 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -129,23 +129,23 @@ private static String toCamelCase(String s) { static final Map> KNOWN_TOKENIZERS = new MapBuilder>() // exposed in ES - .put("classic", ClassicTokenizerFactory.class) - .put("edgengram", EdgeNGramTokenizerFactory.class) - .put("keyword", KeywordTokenizerFactory.class) - .put("letter", LetterTokenizerFactory.class) - .put("lowercase", LowerCaseTokenizerFactory.class) - .put("ngram", NGramTokenizerFactory.class) - .put("pathhierarchy", PathHierarchyTokenizerFactory.class) - .put("pattern", PatternTokenizerFactory.class) - .put("simplepattern", MovedToAnalysisCommon.class) + .put("classic", ClassicTokenizerFactory.class) + .put("edgengram", EdgeNGramTokenizerFactory.class) + .put("keyword", KeywordTokenizerFactory.class) + .put("letter", LetterTokenizerFactory.class) + .put("lowercase", LowerCaseTokenizerFactory.class) + .put("ngram", NGramTokenizerFactory.class) + .put("pathhierarchy", PathHierarchyTokenizerFactory.class) + .put("pattern", PatternTokenizerFactory.class) + .put("simplepattern", MovedToAnalysisCommon.class) .put("simplepatternsplit", MovedToAnalysisCommon.class) - .put("standard", StandardTokenizerFactory.class) - .put("thai", ThaiTokenizerFactory.class) - .put("uax29urlemail", UAX29URLEmailTokenizerFactory.class) - .put("whitespace", WhitespaceTokenizerFactory.class) + .put("standard", StandardTokenizerFactory.class) + .put("thai", ThaiTokenizerFactory.class) + .put("uax29urlemail", UAX29URLEmailTokenizerFactory.class) + .put("whitespace", 
WhitespaceTokenizerFactory.class) // this one "seems to mess up offsets". probably shouldn't be a tokenizer... - .put("wikipedia", Void.class) + .put("wikipedia", Void.class) .immutableMap(); static final Map> KNOWN_TOKENFILTERS = new MapBuilder>()