16 changes: 15 additions & 1 deletion docs/reference/analysis/tokenizers.asciidoc
@@ -97,6 +97,18 @@ The `pattern` tokenizer uses a regular expression to either split text into
terms whenever it matches a word separator, or to capture matching text as
terms.

<<analysis-simplepattern-tokenizer,Simple Pattern Tokenizer>>::

The `simplepattern` tokenizer uses a regular expression to capture matching
text as terms. It uses a restricted subset of regular expression features
and is generally faster than the `pattern` tokenizer.

<<analysis-simplepatternsplit-tokenizer,Simple Pattern Split Tokenizer>>::

The `simplepatternsplit` tokenizer uses the same restricted regular expression
subset as the `simplepattern` tokenizer, but splits the input at matches rather
than returning the matches as terms.

<<analysis-pathhierarchy-tokenizer,Path Tokenizer>>::

The `path_hierarchy` tokenizer takes a hierarchical value like a filesystem
@@ -131,6 +143,8 @@ include::tokenizers/keyword-tokenizer.asciidoc[]

include::tokenizers/pattern-tokenizer.asciidoc[]

include::tokenizers/pathhierarchy-tokenizer.asciidoc[]
include::tokenizers/simplepattern-tokenizer.asciidoc[]

include::tokenizers/simplepatternsplit-tokenizer.asciidoc[]

include::tokenizers/pathhierarchy-tokenizer.asciidoc[]
105 changes: 105 additions & 0 deletions docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc
@@ -0,0 +1,105 @@
[[analysis-simplepattern-tokenizer]]
=== Simple Pattern Tokenizer

experimental[]

The `simplepattern` tokenizer uses a regular expression to capture matching
text as terms. The set of regular expression features it supports is more
limited than the <<analysis-pattern-tokenizer,`pattern`>> tokenizer, but the
tokenization is generally faster.

This tokenizer does not support splitting the input on a pattern match, unlike
the <<analysis-pattern-tokenizer,`pattern`>> tokenizer. To split on pattern
matches using the same restricted regular expression subset, see the
<<analysis-simplepatternsplit-tokenizer,`simplepatternsplit`>> tokenizer.

This tokenizer uses {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions].
For an explanation of the supported features and syntax, see <<regexp-syntax,Regular Expression Syntax>>.

The default pattern is the empty string, which produces no terms. This
tokenizer should always be configured with a non-default pattern.
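
For instance (a sketch following the `_analyze` convention used below, not a tested snippet from this change), running the unconfigured tokenizer over any input should return an empty token list:

[source,js]
----------------------------
POST _analyze
{
  "tokenizer": "simplepattern",
  "text": "fd-786-335-514-x"
}
----------------------------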

[float]
=== Configuration

The `simplepattern` tokenizer accepts the following parameters:

[horizontal]
`pattern`::
A {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string.

[float]
=== Example configuration

This example configures the `simplepattern` tokenizer to produce terms that are
three-digit numbers:

[source,js]
----------------------------
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "simplepattern",
          "pattern": "[0123456789]{3}"
        }
      }
    }
  }
}

POST my_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "fd-786-335-514-x"
}
----------------------------
// CONSOLE

/////////////////////

[source,js]
----------------------------
{
  "tokens" : [
    {
      "token" : "786",
      "start_offset" : 3,
      "end_offset" : 6,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "335",
      "start_offset" : 7,
      "end_offset" : 10,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "514",
      "start_offset" : 11,
      "end_offset" : 14,
      "type" : "word",
      "position" : 2
    }
  ]
}
----------------------------
// TESTRESPONSE

/////////////////////

The above example produces these terms:

[source,text]
---------------------------
[ 786, 335, 514 ]
---------------------------
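
Character ranges are part of the restricted syntax, so the pattern above can be
written more compactly. A variant of the tokenizer definition (a sketch only,
not a tested snippet from this change) that should produce the same terms:

[source,js]
----------------------------
"tokenizer": {
  "my_tokenizer": {
    "type": "simplepattern",
    "pattern": "[0-9]{3}"
  }
}
----------------------------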
106 changes: 106 additions & 0 deletions docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc
@@ -0,0 +1,106 @@
[[analysis-simplepatternsplit-tokenizer]]
=== Simple Pattern Split Tokenizer

experimental[]

The `simplepatternsplit` tokenizer uses a regular expression to split the
input into terms at pattern matches. The set of regular expression features it
supports is more limited than the <<analysis-pattern-tokenizer,`pattern`>>
tokenizer, but the tokenization is generally faster.

This tokenizer does not produce terms from the matches themselves. To produce
terms from matches using patterns in the same restricted regular expression
subset, see the <<analysis-simplepattern-tokenizer,`simplepattern`>>
tokenizer.

This tokenizer uses {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions].
For an explanation of the supported features and syntax, see <<regexp-syntax,Regular Expression Syntax>>.

The default pattern is the empty string, which produces one term containing the
full input. This tokenizer should always be configured with a non-default
pattern.

[float]
=== Configuration

The `simplepatternsplit` tokenizer accepts the following parameters:

[horizontal]
`pattern`::
A {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string.

[float]
=== Example configuration

This example configures the `simplepatternsplit` tokenizer to split the input
text on underscores.

[source,js]
----------------------------
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "simplepatternsplit",
          "pattern": "_"
        }
      }
    }
  }
}

POST my_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "an_underscored_phrase"
}
----------------------------
// CONSOLE

/////////////////////

[source,js]
----------------------------
{
  "tokens" : [
    {
      "token" : "an",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "underscored",
      "start_offset" : 3,
      "end_offset" : 14,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "phrase",
      "start_offset" : 15,
      "end_offset" : 21,
      "type" : "word",
      "position" : 2
    }
  ]
}
----------------------------
// TESTRESPONSE

/////////////////////

The above example produces these terms:

[source,text]
---------------------------
[ an, underscored, phrase ]
---------------------------
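
The split pattern need not be a single literal character. A variant request
(same index setup as above, sketched here rather than taken from this change)
that splits on runs of hyphens and should produce the terms `one`, `two`, and
`three`:

[source,js]
----------------------------
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": { "tokenizer": "my_tokenizer" }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "simplepatternsplit",
          "pattern": "-+"
        }
      }
    }
  }
}

POST my_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "one--two---three"
}
----------------------------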
@@ -73,6 +73,7 @@
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;
@@ -100,6 +101,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        return filters;
    }

    @Override
    public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
        Map<String, AnalysisProvider<CharFilterFactory>> filters = new TreeMap<>();
        filters.put("html_strip", HtmlStripCharFilterFactory::new);
@@ -108,6 +110,14 @@ public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
        return filters;
    }

    @Override
    public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
        Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new TreeMap<>();
        tokenizers.put("simplepattern", SimplePatternTokenizerFactory::new);
        tokenizers.put("simplepatternsplit", SimplePatternSplitTokenizerFactory::new);
        return tokenizers;
    }

    @Override
    public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
        List<PreConfiguredCharFilter> filters = new ArrayList<>();
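The constructor references registered above compile because the four-argument factory constructors line up with the functional interface expected by the registration maps. A paraphrased sketch of that shape (adapted from the `AnalysisModule.AnalysisProvider` import above, not the verbatim Elasticsearch source):

[source,java]
----------------------------
// Paraphrased: the registration maps take providers with this functional shape,
// so SimplePatternTokenizerFactory::new satisfies AnalysisProvider<TokenizerFactory>.
@FunctionalInterface
interface AnalysisProvider<T> {
    T get(IndexSettings indexSettings, Environment environment,
          String name, Settings settings) throws IOException;
}
----------------------------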
@@ -0,0 +1,43 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;

public class SimplePatternSplitTokenizerFactory extends AbstractTokenizerFactory {

    private final String pattern;

    public SimplePatternSplitTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);

        pattern = settings.get("pattern", "");
    }

    @Override
    public Tokenizer create() {
        return new SimplePatternSplitTokenizer(pattern);
    }
}
@@ -0,0 +1,43 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.SimplePatternTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;

public class SimplePatternTokenizerFactory extends AbstractTokenizerFactory {

    private final String pattern;

    public SimplePatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);

        pattern = settings.get("pattern", "");
    }

    @Override
    public Tokenizer create() {
        return new SimplePatternTokenizer(pattern);
    }
}
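
Both factories delegate directly to the Lucene tokenizers of the same names, as the `create()` methods show. A minimal standalone sketch (a hypothetical demo class, assuming the Lucene analysis module that provides `org.apache.lucene.analysis.pattern` is on the classpath) that mirrors the two documentation examples above:

[source,java]
----------------------------
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizer;
import org.apache.lucene.analysis.pattern.SimplePatternTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical demo class; not part of this change.
public class SimplePatternDemo {

    // Run a tokenizer over the input and print each emitted term.
    static void tokenize(Tokenizer tokenizer, String input) throws IOException {
        tokenizer.setReader(new StringReader(input));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }

    public static void main(String[] args) throws IOException {
        // Capture matches as terms: prints 786, 335, 514
        tokenize(new SimplePatternTokenizer("[0123456789]{3}"), "fd-786-335-514-x");
        // Split at matches: prints an, underscored, phrase
        tokenize(new SimplePatternSplitTokenizer("_"), "an_underscored_phrase");
    }
}
----------------------------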
@@ -43,6 +43,8 @@ public CommonAnalysisFactoryTests() {
    @Override
    protected Map<String, Class<?>> getTokenizers() {
        Map<String, Class<?>> tokenizers = new TreeMap<>(super.getTokenizers());
        tokenizers.put("simplepattern", SimplePatternTokenizerFactory.class);
        tokenizers.put("simplepatternsplit", SimplePatternSplitTokenizerFactory.class);
        return tokenizers;
    }
