diff --git a/docs/reference/analysis/tokenizers.asciidoc b/docs/reference/analysis/tokenizers.asciidoc
index e042268a72f11..f1e0899d7abf1 100644
--- a/docs/reference/analysis/tokenizers.asciidoc
+++ b/docs/reference/analysis/tokenizers.asciidoc
@@ -97,6 +97,18 @@ The `pattern` tokenizer uses a regular expression to either split text into
 terms whenever it matches a word separator, or to capture matching text as
 terms.
 
+<<analysis-simplepattern-tokenizer,Simple Pattern Tokenizer>>::
+
+The `simplepattern` tokenizer uses a regular expression to capture matching
+text as terms. It uses a restricted subset of regular expression features
+and is generally faster than the `pattern` tokenizer.
+
+<<analysis-simplepatternsplit-tokenizer,Simple Pattern Split Tokenizer>>::
+
+The `simplepatternsplit` tokenizer uses the same restricted regular expression
+subset as the `simplepattern` tokenizer, but splits the input at matches rather
+than returning the matches as terms.
+
 <<analysis-pathhierarchy-tokenizer,Path Hierarchy Tokenizer>>::
 
 The `path_hierarchy` tokenizer takes a hierarchical value like a filesystem
@@ -131,6 +143,8 @@ include::tokenizers/keyword-tokenizer.asciidoc[]
 
 include::tokenizers/pattern-tokenizer.asciidoc[]
 
-include::tokenizers/pathhierarchy-tokenizer.asciidoc[]
+include::tokenizers/simplepattern-tokenizer.asciidoc[]
+include::tokenizers/simplepatternsplit-tokenizer.asciidoc[]
+include::tokenizers/pathhierarchy-tokenizer.asciidoc[]
diff --git a/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc
new file mode 100644
index 0000000000000..bee92c75d26cd
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc
@@ -0,0 +1,105 @@
+[[analysis-simplepattern-tokenizer]]
+=== Simple Pattern Tokenizer
+
+experimental[]
+
+The `simplepattern` tokenizer uses a regular expression to capture matching
+text as terms. The set of regular expression features it supports is more
+limited than that of the <<analysis-pattern-tokenizer,`pattern`>> tokenizer,
+but the tokenization is generally faster.
+
+This tokenizer does not support splitting the input on a pattern match, unlike
+the <<analysis-pattern-tokenizer,`pattern`>> tokenizer. To split on pattern
+matches using the same restricted regular expression subset, see the
+<<analysis-simplepatternsplit-tokenizer,`simplepatternsplit`>> tokenizer.
+
+This tokenizer uses {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions].
+For an explanation of the supported features and syntax, see
+<<regexp-syntax,Regular Expression Syntax>>.
+
+The default pattern is the empty string, which produces no terms. This
+tokenizer should always be configured with a non-default pattern.
+
+[float]
+=== Configuration
+
+The `simplepattern` tokenizer accepts the following parameters:
+
+[horizontal]
+`pattern`::
+    A {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string.
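+
+For a quick check of a pattern without creating an index, the tokenizer can
+also be defined inline in the `_analyze` API. This is only a sketch; it
+mirrors the form of the REST test added in this change:
+
+[source,js]
+----------------------------
+POST _analyze
+{
+  "tokenizer": {
+    "type": "simplepattern",
+    "pattern": "[abcdef0123456789]{4}"
+  },
+  "text": "a6bf fooo ff61"
+}
+----------------------------
+// CONSOLE
+
+This request produces the two terms `a6bf` and `ff61`.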
+
+[float]
+=== Example configuration
+
+This example configures the `simplepattern` tokenizer to produce terms that
+are three-digit numbers.
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "my_tokenizer"
+        }
+      },
+      "tokenizer": {
+        "my_tokenizer": {
+          "type": "simplepattern",
+          "pattern": "[0123456789]{3}"
+        }
+      }
+    }
+  }
+}
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "fd-786-335-514-x"
+}
+----------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens" : [
+    {
+      "token" : "786",
+      "start_offset" : 3,
+      "end_offset" : 6,
+      "type" : "word",
+      "position" : 0
+    },
+    {
+      "token" : "335",
+      "start_offset" : 7,
+      "end_offset" : 10,
+      "type" : "word",
+      "position" : 1
+    },
+    {
+      "token" : "514",
+      "start_offset" : 11,
+      "end_offset" : 14,
+      "type" : "word",
+      "position" : 2
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+The above example produces these terms:
+
+[source,text]
+---------------------------
+[ 786, 335, 514 ]
+---------------------------
diff --git a/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc
new file mode 100644
index 0000000000000..c009f8cb7a400
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc
@@ -0,0 +1,106 @@
+[[analysis-simplepatternsplit-tokenizer]]
+=== Simple Pattern Split Tokenizer
+
+experimental[]
+
+The `simplepatternsplit` tokenizer uses a regular expression to split the
+input into terms at pattern matches. The set of regular expression features it
+supports is more limited than that of the
+<<analysis-pattern-tokenizer,`pattern`>> tokenizer, but the tokenization is
+generally faster.
+
+This tokenizer does not produce terms from the matches themselves. To produce
+terms from matches using patterns in the same restricted regular expression
+subset, see the <<analysis-simplepattern-tokenizer,`simplepattern`>>
+tokenizer.
+
+This tokenizer uses {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions].
+For an explanation of the supported features and syntax, see
+<<regexp-syntax,Regular Expression Syntax>>.
+
+The default pattern is the empty string, which produces one term containing the
+full input. This tokenizer should always be configured with a non-default
+pattern.
+
+[float]
+=== Configuration
+
+The `simplepatternsplit` tokenizer accepts the following parameters:
+
+[horizontal]
+`pattern`::
+    A {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string.
+
+[float]
+=== Example configuration
+
+This example configures the `simplepatternsplit` tokenizer to split the input
+text on underscores.
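+
+As before, a quick sketch without an index (again mirroring the form of the
+REST test added in this change) can define the tokenizer inline in the
+`_analyze` API:
+
+[source,js]
+----------------------------
+POST _analyze
+{
+  "tokenizer": {
+    "type": "simplepatternsplit",
+    "pattern": "_"
+  },
+  "text": "an_underscored_phrase"
+}
+----------------------------
+// CONSOLE
+
+The full index configuration follows.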
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "my_tokenizer"
+        }
+      },
+      "tokenizer": {
+        "my_tokenizer": {
+          "type": "simplepatternsplit",
+          "pattern": "_"
+        }
+      }
+    }
+  }
+}
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "an_underscored_phrase"
+}
+----------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens" : [
+    {
+      "token" : "an",
+      "start_offset" : 0,
+      "end_offset" : 2,
+      "type" : "word",
+      "position" : 0
+    },
+    {
+      "token" : "underscored",
+      "start_offset" : 3,
+      "end_offset" : 14,
+      "type" : "word",
+      "position" : 1
+    },
+    {
+      "token" : "phrase",
+      "start_offset" : 15,
+      "end_offset" : 21,
+      "type" : "word",
+      "position" : 2
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+The above example produces these terms:
+
+[source,text]
+---------------------------
+[ an, underscored, phrase ]
+---------------------------
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
index c33023d1cb251..2f8f1d7405a96 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -73,6 +73,7 @@
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenizerFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.plugins.Plugin;
@@ -100,6 +101,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
         return filters;
     }
 
+    @Override
     public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
         Map<String, AnalysisProvider<CharFilterFactory>> filters = new TreeMap<>();
         filters.put("html_strip", HtmlStripCharFilterFactory::new);
@@ -108,6 +110,14 @@ public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
         return filters;
     }
 
+    @Override
+    public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
+        Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new TreeMap<>();
+        tokenizers.put("simplepattern", SimplePatternTokenizerFactory::new);
+        tokenizers.put("simplepatternsplit", SimplePatternSplitTokenizerFactory::new);
+        return tokenizers;
+    }
+
     @Override
     public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
         List<PreConfiguredCharFilter> filters = new ArrayList<>();
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java
new file mode 100644
index 0000000000000..f861ec3792f5e
--- /dev/null
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
+
+public class SimplePatternSplitTokenizerFactory extends AbstractTokenizerFactory {
+
+    private final String pattern;
+
+    public SimplePatternSplitTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+
+        pattern = settings.get("pattern", "");
+    }
+
+    @Override
+    public Tokenizer create() {
+        return new SimplePatternSplitTokenizer(pattern);
+    }
+}
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java
new file mode 100644
index 0000000000000..6db3cfa67a318
--- /dev/null
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.pattern.SimplePatternTokenizer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
+
+public class SimplePatternTokenizerFactory extends AbstractTokenizerFactory {
+
+    private final String pattern;
+
+    public SimplePatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+
+        pattern = settings.get("pattern", "");
+    }
+
+    @Override
+    public Tokenizer create() {
+        return new SimplePatternTokenizer(pattern);
+    }
+}
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
index 59164f7506504..f7313572e13ee 100644
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
@@ -43,6 +43,8 @@ public CommonAnalysisFactoryTests() {
     @Override
     protected Map<String, Class<?>> getTokenizers() {
         Map<String, Class<?>> tokenizers = new TreeMap<>(super.getTokenizers());
+        tokenizers.put("simplepattern", SimplePatternTokenizerFactory.class);
+        tokenizers.put("simplepatternsplit", SimplePatternSplitTokenizerFactory.class);
         return tokenizers;
     }
diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml
index 174a15f772bd9..7063437ad4643 100644
--- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml
+++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml
@@ -25,3 +25,33 @@
     - match: { detail.tokenizer.tokens.0.token: go }
     - match: { detail.tokenizer.tokens.1.token: oo }
     - match: { detail.tokenizer.tokens.2.token: od }
+
+---
+"simplepattern":
+    - do:
+        indices.analyze:
+          body:
+            text: "a6bf fooo ff61"
+            explain: true
+            tokenizer:
+              type: simplepattern
+              pattern: "[abcdef0123456789]{4}"
+    - length: { detail.tokenizer.tokens: 2 }
+    - match: { detail.tokenizer.name: _anonymous_tokenizer }
+    - match: { detail.tokenizer.tokens.0.token: a6bf }
+    - match: { detail.tokenizer.tokens.1.token: ff61 }
+
+---
+"simplepatternsplit":
+    - do:
+        indices.analyze:
+          body:
+            text: "foo==bar"
+            explain: true
+            tokenizer:
+              type: simplepatternsplit
+              pattern: ==
+    - length: { detail.tokenizer.tokens: 2 }
+    - match: { detail.tokenizer.name: _anonymous_tokenizer }
+    - match: { detail.tokenizer.tokens.0.token: foo }
+    - match: { detail.tokenizer.tokens.1.token: bar }
diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
index fd8a5e7cd9aed..a3fe52d005c24 100644
--- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
@@ -129,25 +129,23 @@ private static String toCamelCase(String s) {
     static final Map<String, Class<?>> KNOWN_TOKENIZERS = new MapBuilder<String, Class<?>>()
         // exposed in ES
-        .put("classic",        ClassicTokenizerFactory.class)
-        .put("edgengram",      EdgeNGramTokenizerFactory.class)
-        .put("keyword",        KeywordTokenizerFactory.class)
-        .put("letter",         LetterTokenizerFactory.class)
-        .put("lowercase",      LowerCaseTokenizerFactory.class)
-        .put("ngram",          NGramTokenizerFactory.class)
+        .put("classic", ClassicTokenizerFactory.class)
+        .put("edgengram", EdgeNGramTokenizerFactory.class)
+        .put("keyword", KeywordTokenizerFactory.class)
+        .put("letter", LetterTokenizerFactory.class)
+        .put("lowercase", LowerCaseTokenizerFactory.class)
+        .put("ngram", NGramTokenizerFactory.class)
         .put("pathhierarchy", PathHierarchyTokenizerFactory.class)
-        .put("pattern",        PatternTokenizerFactory.class)
-        .put("standard",       StandardTokenizerFactory.class)
-        .put("thai",           ThaiTokenizerFactory.class)
+        .put("pattern", PatternTokenizerFactory.class)
+        .put("simplepattern", MovedToAnalysisCommon.class)
+        .put("simplepatternsplit", MovedToAnalysisCommon.class)
+        .put("standard", StandardTokenizerFactory.class)
+        .put("thai", ThaiTokenizerFactory.class)
         .put("uax29urlemail", UAX29URLEmailTokenizerFactory.class)
-        .put("whitespace",     WhitespaceTokenizerFactory.class)
+        .put("whitespace", WhitespaceTokenizerFactory.class)
         // this one "seems to mess up offsets". probably shouldn't be a tokenizer...
-        .put("wikipedia", Void.class)
-
-        // TODO: expose these
-        .put("simplepattern", Void.class)
-        .put("simplepatternsplit", Void.class)
+        .put("wikipedia", Void.class)
         .immutableMap();
 
     static final Map<String, Class<?>> KNOWN_TOKENFILTERS = new MapBuilder<String, Class<?>>()