From b8188b82b7deb341928cbbeb9dc3cecfb397caa8 Mon Sep 17 00:00:00 2001
From: Andy Bristol <andy.bristol@elastic.co>
Date: Wed, 7 Jun 2017 10:46:48 -0700
Subject: [PATCH 1/4] expose simplepattern and simplepatternsplit tokenizers

Register these experimental tokenizers. Their default patterns
are both set to the empty string. These tokenizers only seem
useful if there is a pattern the user has in mind, so there
aren't really "sensible" defaults. However tokenizer factories
are instantiated at index creation time, so they blow up if
there's no default pattern.

Add a REST test and reference documentation entries for each tokenizer.

For #23363
---
 .../SimplePatternSplitTokenizerFactory.java   |  46 +++++++
 .../SimplePatternTokenizerFactory.java        |  46 +++++++
 .../indices/analysis/AnalysisModule.java      |   4 +
 docs/reference/analysis/tokenizers.asciidoc   |  16 ++-
 .../simplepattern-tokenizer.asciidoc          | 117 +++++++++++++++++
 .../simplepatternsplit-tokenizer.asciidoc     | 118 ++++++++++++++++++
 .../test/analysis-common/30_tokenizers.yml    |  30 +++++
 .../analysis/AnalysisFactoryTestCase.java     |  32 ++---
 8 files changed, 392 insertions(+), 17 deletions(-)
 create mode 100644 core/src/main/java/org/elasticsearch/index/analysis/SimplePatternSplitTokenizerFactory.java
 create mode 100644 core/src/main/java/org/elasticsearch/index/analysis/SimplePatternTokenizerFactory.java
 create mode 100644 docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc
 create mode 100644 docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc

diff --git a/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternSplitTokenizerFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternSplitTokenizerFactory.java
new file mode 100644
index 0000000000000..5d08cf903f062
--- /dev/null
+++ b/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternSplitTokenizerFactory.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+public class SimplePatternSplitTokenizerFactory extends AbstractTokenizerFactory {
+
+    private final String pattern;
+
+    public SimplePatternSplitTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+
+        String pattern = settings.get("pattern", "");
+        if (pattern == null) {
+            throw new IllegalArgumentException("pattern is missing for [" + name + "] tokenizer of type 'simplepatternsplit'");
+        }
+        this.pattern = pattern;
+    }
+
+    @Override
+    public Tokenizer create() {
+        return new SimplePatternSplitTokenizer(pattern);
+    }
+}
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternTokenizerFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternTokenizerFactory.java
new file mode 100644
index 0000000000000..bb23a4609abbf
--- /dev/null
+++ b/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternTokenizerFactory.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.pattern.SimplePatternTokenizer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+public class SimplePatternTokenizerFactory extends AbstractTokenizerFactory {
+
+    private final String pattern;
+
+    public SimplePatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+
+        String pattern = settings.get("pattern", "");
+        if (pattern == null) {
+            throw new IllegalArgumentException("pattern is missing for [" + name + "] tokenizer of type 'simplepattern'");
+        }
+        this.pattern = pattern;
+    }
+
+    @Override
+    public Tokenizer create() {
+        return new SimplePatternTokenizer(pattern);
+    }
+}
diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
index 3f26b722f41ce..e7dccdc9fd2d9 100644
--- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
@@ -113,6 +113,8 @@
 import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
 import org.elasticsearch.index.analysis.SimpleAnalyzerProvider;
+import org.elasticsearch.index.analysis.SimplePatternSplitTokenizerFactory;
+import org.elasticsearch.index.analysis.SimplePatternTokenizerFactory;
 import org.elasticsearch.index.analysis.SnowballAnalyzerProvider;
 import org.elasticsearch.index.analysis.SoraniAnalyzerProvider;
 import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
@@ -343,6 +345,8 @@ private NamedRegistry<AnalysisProvider<TokenizerFactory>> setupTokenizers(List<A
         tokenizers.register("edgeNGram", EdgeNGramTokenizerFactory::new);
         tokenizers.register("edge_ngram", EdgeNGramTokenizerFactory::new);
         tokenizers.register("pattern", PatternTokenizerFactory::new);
+        tokenizers.register("simplepattern", SimplePatternTokenizerFactory::new);
+        tokenizers.register("simplepatternsplit", SimplePatternSplitTokenizerFactory::new);
         tokenizers.register("classic", ClassicTokenizerFactory::new);
         tokenizers.register("thai", ThaiTokenizerFactory::new);
         tokenizers.extractAndRegister(plugins, AnalysisPlugin::getTokenizers);
diff --git a/docs/reference/analysis/tokenizers.asciidoc b/docs/reference/analysis/tokenizers.asciidoc
index e042268a72f11..c26974b9cad61 100644
--- a/docs/reference/analysis/tokenizers.asciidoc
+++ b/docs/reference/analysis/tokenizers.asciidoc
@@ -97,6 +97,18 @@ The `pattern` tokenizer uses a regular expression to either split text into
 terms whenever it matches a word separator, or to capture matching text as
 terms.
 
+<<analysis-simplepattern-tokenizer,Simple Pattern Tokenizer>>::
+
+The `simplepattern` tokenizer uses a regular expression to capture matching
+text as terms. It uses a restricted subset of regular expression features
+and is generally faster than the `pattern` tokenizer.
+
+<<analysis-simplepatternsplit-tokenizer,Simple Pattern Split Tokenizer>>::
+
+The `simplepatternsplit` tokenizer uses the same restricted regular expression
+subset as the `simplepattern` tokenizer, but splits the input at matches rather
+than returning the matches as terms.
+
 <<analysis-pathhierarchy-tokenizer,Path Tokenizer>>::
 
 The `path_hierarchy` tokenizer takes a hierarchical value like a filesystem
@@ -131,6 +143,8 @@ include::tokenizers/keyword-tokenizer.asciidoc[]
 
 include::tokenizers/pattern-tokenizer.asciidoc[]
 
-include::tokenizers/pathhierarchy-tokenizer.asciidoc[]
+include::tokenizers/simplepattern-tokenizer.asciidoc[]
 
+include::tokenizers/simplepatternsplit-tokenizer.asciidoc[]
 
+include::tokenizers/pathhierarchy-tokenizer.asciidoc[]
diff --git a/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc
new file mode 100644
index 0000000000000..a0c3c7dcf1288
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc
@@ -0,0 +1,117 @@
+[[analysis-simplepattern-tokenizer]]
+=== Simple Pattern Tokenizer
+
+experimental[]
+
+The `simplepattern` tokenizer uses a regular expression to capture matching
+text as terms. The set of regular expression features it supports is more
+limited than the <<analysis-pattern-tokenizer,`pattern`>> tokenizer, but the
+tokenization is generally faster.
+
+This tokenizer does not support splitting the input on a pattern match, unlike
+the <<analysis-pattern-tokenizer,`pattern`>> tokenizer. To split on pattern
+matches using the same restricted regular expression subset, see the
+<<analysis-simplepatternsplit-tokenizer,`simplepatternsplit`>> tokenizer.
+
+This tokenizer uses http://lucene.apache.org/core/6_5_1/core/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions].
+For an explanation of the supported features and syntax, see <<regexp-syntax,Regular Expression Syntax>>.
+
+The default pattern is the empty string, which produces no terms. This
+tokenizer should always be configured with a non-default pattern.
+
+[WARNING]
+.Beware of Pathological Regular Expressions
+========================================
+
+A badly written regular expression could run very slowly or even throw a
+StackOverflowError and cause the node it is running on to exit suddenly.
+
+Read more about http://www.regular-expressions.info/catastrophic.html[pathological regular expressions and how to avoid them].
+
+========================================
+
+[float]
+=== Configuration
+
+The `simplepattern` tokenizer accepts the following parameters:
+
+[horizontal]
+`pattern`::
+
+    A http://lucene.apache.org/core/6_5_1/core/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string.
+
+[float]
+=== Example configuration
+
+This example configures the `simplepattern` tokenizer to produce terms that are
+three-digit numbers.
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "my_tokenizer"
+        }
+      },
+      "tokenizer": {
+        "my_tokenizer": {
+          "type": "simplepattern",
+          "pattern": "[0123456789]{3}"
+        }
+      }
+    }
+  }
+}
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "fd-786-335-514-x"
+}
+----------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens" : [
+    {
+      "token" : "786",
+      "start_offset" : 3,
+      "end_offset" : 6,
+      "type" : "word",
+      "position" : 0
+    },
+    {
+      "token" : "335",
+      "start_offset" : 7,
+      "end_offset" : 10,
+      "type" : "word",
+      "position" : 1
+    },
+    {
+      "token" : "514",
+      "start_offset" : 11,
+      "end_offset" : 14,
+      "type" : "word",
+      "position" : 2
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+The above example produces these terms:
+
+[source,text]
+---------------------------
+[ 786, 335, 514 ]
+---------------------------
diff --git a/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc
new file mode 100644
index 0000000000000..04e17a0e68501
--- /dev/null
+++ b/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc
@@ -0,0 +1,118 @@
+[[analysis-simplepatternsplit-tokenizer]]
+=== Simple Pattern Split Tokenizer
+
+experimental[]
+
+The `simplepatternsplit` tokenizer uses a regular expression to split the
+input into terms at pattern matches. The set of regular expression features it
+supports is more limited than the <<analysis-pattern-tokenizer,`pattern`>>
+tokenizer, but the tokenization is generally faster.
+
+This tokenizer does not produce terms from the matches themselves. To produce
+terms from matches using patterns in the same restricted regular expression
+subset, see the <<analysis-simplepattern-tokenizer,`simplepattern`>>
+tokenizer.
+
+This tokenizer uses http://lucene.apache.org/core/6_5_1/core/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions].
+For an explanation of the supported features and syntax, see <<regexp-syntax,Regular Expression Syntax>>.
+
+The default pattern is the empty string, which produces one term containing the
+full input. This tokenizer should always be configured with a non-default
+pattern.
+
+[WARNING]
+.Beware of Pathological Regular Expressions
+========================================
+
+A badly written regular expression could run very slowly or even throw a
+StackOverflowError and cause the node it is running on to exit suddenly.
+
+Read more about http://www.regular-expressions.info/catastrophic.html[pathological regular expressions and how to avoid them].
+
+========================================
+
+[float]
+=== Configuration
+
+The `simplepatternsplit` tokenizer accepts the following parameters:
+
+[horizontal]
+`pattern`::
+
+  A http://lucene.apache.org/core/6_5_1/core/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string.
+
+[float]
+=== Example configuration
+
+This example configures the `simplepatternsplit` tokenizer to split the input
+text on underscores.
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "my_tokenizer"
+        }
+      },
+      "tokenizer": {
+        "my_tokenizer": {
+          "type": "simplepatternsplit",
+          "pattern": "_"
+        }
+      }
+    }
+  }
+}
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "an_underscored_phrase"
+}
+----------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens" : [
+    {
+      "token" : "an",
+      "start_offset" : 0,
+      "end_offset" : 2,
+      "type" : "word",
+      "position" : 0
+    },
+    {
+      "token" : "underscored",
+      "start_offset" : 3,
+      "end_offset" : 14,
+      "type" : "word",
+      "position" : 1
+    },
+    {
+      "token" : "phrase",
+      "start_offset" : 15,
+      "end_offset" : 21,
+      "type" : "word",
+      "position" : 2
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+The above example produces these terms:
+
+[source,text]
+---------------------------
+[ an, underscored, phrase ]
+---------------------------
diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml
index 174a15f772bd9..7063437ad4643 100644
--- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml
+++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml
@@ -25,3 +25,33 @@
     - match:  { detail.tokenizer.tokens.0.token: go }
     - match:  { detail.tokenizer.tokens.1.token: oo }
     - match:  { detail.tokenizer.tokens.2.token: od }
+
+---
+"simplepattern":
+    - do:
+        indices.analyze:
+          body:
+            text: "a6bf fooo ff61"
+            explain: true
+            tokenizer:
+              type: simplepattern
+              pattern: "[abcdef0123456789]{4}"
+    - length: { detail.tokenizer.tokens: 2 }
+    - match:  { detail.tokenizer.name: _anonymous_tokenizer }
+    - match:  { detail.tokenizer.tokens.0.token: a6bf }
+    - match:  { detail.tokenizer.tokens.1.token: ff61 }
+
+---
+"simplepatternsplit":
+    - do:
+        indices.analyze:
+          body:
+            text: "foo==bar"
+            explain: true
+            tokenizer:
+              type: simplepatternsplit
+              pattern: ==
+    - length: { detail.tokenizer.tokens: 2 }
+    - match:  { detail.tokenizer.name: _anonymous_tokenizer }
+    - match:  { detail.tokenizer.tokens.0.token: foo }
+    - match:  { detail.tokenizer.tokens.1.token: bar }
diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
index fd8a5e7cd9aed..a962bd323fd20 100644
--- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
@@ -71,6 +71,8 @@
 import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
+import org.elasticsearch.index.analysis.SimplePatternSplitTokenizerFactory;
+import org.elasticsearch.index.analysis.SimplePatternTokenizerFactory;
 import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.StandardTokenFilterFactory;
 import org.elasticsearch.index.analysis.StandardTokenizerFactory;
@@ -129,25 +131,23 @@ private static String toCamelCase(String s) {
 
     static final Map<String,Class<?>> KNOWN_TOKENIZERS = new MapBuilder<String,Class<?>>()
         // exposed in ES
-        .put("classic",       ClassicTokenizerFactory.class)
-        .put("edgengram",     EdgeNGramTokenizerFactory.class)
-        .put("keyword",       KeywordTokenizerFactory.class)
-        .put("letter",        LetterTokenizerFactory.class)
-        .put("lowercase",     LowerCaseTokenizerFactory.class)
-        .put("ngram",         NGramTokenizerFactory.class)
-        .put("pathhierarchy", PathHierarchyTokenizerFactory.class)
-        .put("pattern",       PatternTokenizerFactory.class)
-        .put("standard",      StandardTokenizerFactory.class)
-        .put("thai",          ThaiTokenizerFactory.class)
-        .put("uax29urlemail", UAX29URLEmailTokenizerFactory.class)
-        .put("whitespace",    WhitespaceTokenizerFactory.class)
+        .put("classic",            ClassicTokenizerFactory.class)
+        .put("edgengram",          EdgeNGramTokenizerFactory.class)
+        .put("keyword",            KeywordTokenizerFactory.class)
+        .put("letter",             LetterTokenizerFactory.class)
+        .put("lowercase",          LowerCaseTokenizerFactory.class)
+        .put("ngram",              NGramTokenizerFactory.class)
+        .put("pathhierarchy",      PathHierarchyTokenizerFactory.class)
+        .put("pattern",            PatternTokenizerFactory.class)
+        .put("simplepattern",      SimplePatternTokenizerFactory.class)
+        .put("simplepatternsplit", SimplePatternSplitTokenizerFactory.class)
+        .put("standard",           StandardTokenizerFactory.class)
+        .put("thai",               ThaiTokenizerFactory.class)
+        .put("uax29urlemail",      UAX29URLEmailTokenizerFactory.class)
+        .put("whitespace",         WhitespaceTokenizerFactory.class)
 
         // this one "seems to mess up offsets". probably shouldn't be a tokenizer...
         .put("wikipedia",     Void.class)
-
-        // TODO: expose these
-        .put("simplepattern",    Void.class)
-        .put("simplepatternsplit",    Void.class)
         .immutableMap();
 
     static final Map<String,Class<?>> KNOWN_TOKENFILTERS = new MapBuilder<String,Class<?>>()

From 66592578c7655e93e4ae0efa10a6e971b7061306 Mon Sep 17 00:00:00 2001
From: Andy Bristol <andy.bristol@elastic.co>
Date: Mon, 12 Jun 2017 07:57:04 -0700
Subject: [PATCH 2/4] expose simplepattern and simplepatternsplit tokenizers

Fixes for code review

Take out admonition blocks in reference detail pages on
these tokenizers because Lucene's regexes are better protected
against being too complex or causing deep stacks.

Move these tokenizers to the analysis-common module because
that's where we're relocating code that depends on
lucene-analyzers-common.

For #23363
---
 .../indices/analysis/AnalysisModule.java              |  4 ----
 .../tokenizers/simplepattern-tokenizer.asciidoc       | 11 -----------
 .../tokenizers/simplepatternsplit-tokenizer.asciidoc  | 11 -----------
 .../analysis/common/CommonAnalysisPlugin.java         | 10 ++++++++++
 .../common}/SimplePatternSplitTokenizerFactory.java   |  6 ++----
 .../common}/SimplePatternTokenizerFactory.java        |  6 ++----
 .../analysis/common/CommonAnalysisFactoryTests.java   |  2 ++
 .../indices/analysis/AnalysisFactoryTestCase.java     |  6 ++----
 8 files changed, 18 insertions(+), 38 deletions(-)
 rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/SimplePatternSplitTokenizerFactory.java (88%)
 rename {core/src/main/java/org/elasticsearch/index/analysis => modules/analysis-common/src/main/java/org/elasticsearch/analysis/common}/SimplePatternTokenizerFactory.java (88%)

diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
index e7dccdc9fd2d9..3f26b722f41ce 100644
--- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
@@ -113,8 +113,6 @@
 import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
 import org.elasticsearch.index.analysis.SimpleAnalyzerProvider;
-import org.elasticsearch.index.analysis.SimplePatternSplitTokenizerFactory;
-import org.elasticsearch.index.analysis.SimplePatternTokenizerFactory;
 import org.elasticsearch.index.analysis.SnowballAnalyzerProvider;
 import org.elasticsearch.index.analysis.SoraniAnalyzerProvider;
 import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
@@ -345,8 +343,6 @@ private NamedRegistry<AnalysisProvider<TokenizerFactory>> setupTokenizers(List<A
         tokenizers.register("edgeNGram", EdgeNGramTokenizerFactory::new);
         tokenizers.register("edge_ngram", EdgeNGramTokenizerFactory::new);
         tokenizers.register("pattern", PatternTokenizerFactory::new);
-        tokenizers.register("simplepattern", SimplePatternTokenizerFactory::new);
-        tokenizers.register("simplepatternsplit", SimplePatternSplitTokenizerFactory::new);
         tokenizers.register("classic", ClassicTokenizerFactory::new);
         tokenizers.register("thai", ThaiTokenizerFactory::new);
         tokenizers.extractAndRegister(plugins, AnalysisPlugin::getTokenizers);
diff --git a/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc
index a0c3c7dcf1288..997b3b5251dd7 100644
--- a/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc
+++ b/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc
@@ -19,17 +19,6 @@ For an explanation of the supported features and syntax, see <<regexp-syntax,Reg
 The default pattern is the empty string, which produces no terms. This
 tokenizer should always be configured with a non-default pattern.
 
-[WARNING]
-.Beware of Pathological Regular Expressions
-========================================
-
-A badly written regular expression could run very slowly or even throw a
-StackOverflowError and cause the node it is running on to exit suddenly.
-
-Read more about http://www.regular-expressions.info/catastrophic.html[pathological regular expressions and how to avoid them].
-
-========================================
-
 [float]
 === Configuration
 
diff --git a/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc
index 04e17a0e68501..dc850d09fc16b 100644
--- a/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc
+++ b/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc
@@ -20,17 +20,6 @@ The default pattern is the empty string, which produces one term containing the
 full input. This tokenizer should always be configured with a non-default
 pattern.
 
-[WARNING]
-.Beware of Pathological Regular Expressions
-========================================
-
-A badly written regular expression could run very slowly or even throw a
-StackOverflowError and cause the node it is running on to exit suddenly.
-
-Read more about http://www.regular-expressions.info/catastrophic.html[pathological regular expressions and how to avoid them].
-
-========================================
-
 [float]
 === Configuration
 
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
index c33023d1cb251..2f8f1d7405a96 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -73,6 +73,7 @@
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenizerFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.plugins.Plugin;
@@ -100,6 +101,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
         return filters;
     }
 
+    @Override
     public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
         Map<String, AnalysisProvider<CharFilterFactory>> filters = new TreeMap<>();
         filters.put("html_strip", HtmlStripCharFilterFactory::new);
@@ -108,6 +110,14 @@ public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
         return filters;
     }
 
+    @Override
+    public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
+        Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new TreeMap<>();
+        tokenizers.put("simplepattern", SimplePatternTokenizerFactory::new);
+        tokenizers.put("simplepatternsplit", SimplePatternSplitTokenizerFactory::new);
+        return tokenizers;
+    }
+
     @Override
     public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
         List<PreConfiguredCharFilter> filters = new ArrayList<>();
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternSplitTokenizerFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java
similarity index 88%
rename from core/src/main/java/org/elasticsearch/index/analysis/SimplePatternSplitTokenizerFactory.java
rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java
index 5d08cf903f062..b4c16b35dd946 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternSplitTokenizerFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java
@@ -17,13 +17,14 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 public class SimplePatternSplitTokenizerFactory extends AbstractTokenizerFactory {
 
@@ -33,9 +34,6 @@ public SimplePatternSplitTokenizerFactory(IndexSettings indexSettings, Environme
         super(indexSettings, name, settings);
 
         String pattern = settings.get("pattern", "");
-        if (pattern == null) {
-            throw new IllegalArgumentException("pattern is missing for [" + name + "] tokenizer of type 'simplepatternsplit'");
-        }
         this.pattern = pattern;
     }
 
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternTokenizerFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java
similarity index 88%
rename from core/src/main/java/org/elasticsearch/index/analysis/SimplePatternTokenizerFactory.java
rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java
index bb23a4609abbf..530f7e5bef4e6 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/SimplePatternTokenizerFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java
@@ -17,13 +17,14 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.pattern.SimplePatternTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 public class SimplePatternTokenizerFactory extends AbstractTokenizerFactory {
 
@@ -33,9 +34,6 @@ public SimplePatternTokenizerFactory(IndexSettings indexSettings, Environment en
         super(indexSettings, name, settings);
 
         String pattern = settings.get("pattern", "");
-        if (pattern == null) {
-            throw new IllegalArgumentException("pattern is missing for [" + name + "] tokenizer of type 'simplepattern'");
-        }
         this.pattern = pattern;
     }
 
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
index 59164f7506504..f7313572e13ee 100644
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
@@ -43,6 +43,8 @@ public CommonAnalysisFactoryTests() {
     @Override
     protected Map<String, Class<?>> getTokenizers() {
         Map<String, Class<?>> tokenizers = new TreeMap<>(super.getTokenizers());
+        tokenizers.put("simplepattern", SimplePatternTokenizerFactory.class);
+        tokenizers.put("simplepatternsplit", SimplePatternSplitTokenizerFactory.class);
         return tokenizers;
     }
 
diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
index a962bd323fd20..bcb9dfbe5a7b0 100644
--- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
@@ -71,8 +71,6 @@
 import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
-import org.elasticsearch.index.analysis.SimplePatternSplitTokenizerFactory;
-import org.elasticsearch.index.analysis.SimplePatternTokenizerFactory;
 import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.StandardTokenFilterFactory;
 import org.elasticsearch.index.analysis.StandardTokenizerFactory;
@@ -139,8 +137,8 @@ private static String toCamelCase(String s) {
         .put("ngram",              NGramTokenizerFactory.class)
         .put("pathhierarchy",      PathHierarchyTokenizerFactory.class)
         .put("pattern",            PatternTokenizerFactory.class)
-        .put("simplepattern",      SimplePatternTokenizerFactory.class)
-        .put("simplepatternsplit", SimplePatternSplitTokenizerFactory.class)
+        .put("simplepattern",      MovedToAnalysisCommon.class)
+        .put("simplepatternsplit", MovedToAnalysisCommon.class)
         .put("standard",           StandardTokenizerFactory.class)
         .put("thai",               ThaiTokenizerFactory.class)
         .put("uax29urlemail",      UAX29URLEmailTokenizerFactory.class)

From d97846a6eea5d312df8d5c393ca7c7bd5588d4ec Mon Sep 17 00:00:00 2001
From: Andy Bristol <andy.bristol@elastic.co>
Date: Mon, 12 Jun 2017 11:39:05 -0700
Subject: [PATCH 3/4] expose simplepattern and simplepatternsplit tokenizers

Fix for code review to clean up unnecessary variables

For #23363
---
 .../analysis/common/SimplePatternSplitTokenizerFactory.java    | 3 +--
 .../analysis/common/SimplePatternTokenizerFactory.java         | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java
index b4c16b35dd946..f861ec3792f5e 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java
@@ -33,8 +33,7 @@ public class SimplePatternSplitTokenizerFactory extends AbstractTokenizerFactory
     public SimplePatternSplitTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
 
-        String pattern = settings.get("pattern", "");
-        this.pattern = pattern;
+        pattern = settings.get("pattern", "");
     }
 
     @Override
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java
index 530f7e5bef4e6..6db3cfa67a318 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java
@@ -33,8 +33,7 @@ public class SimplePatternTokenizerFactory extends AbstractTokenizerFactory {
     public SimplePatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
 
-        String pattern = settings.get("pattern", "");
-        this.pattern = pattern;
+        pattern = settings.get("pattern", "");
     }
 
     @Override

From b1e503c2e25f29c894f2161240513319df3a9f07 Mon Sep 17 00:00:00 2001
From: Andy Bristol <andy.bristol@elastic.co>
Date: Mon, 12 Jun 2017 16:01:58 -0700
Subject: [PATCH 4/4] expose simplepattern and simplepatternsplit tokenizers

Make links to Lucene javadocs relative to the
lucene-core-javadoc property so they'll stay up to date
as we change Lucene versions.

Whitespace formatting in tokenizer docs

Whitespace formatting in AnalysisFactoryTestCase so that
we don't have to change spacing every time we edit that map

Clearer usage in the header for simplepatternsplit's section

For #23363
---
 docs/reference/analysis/tokenizers.asciidoc   |  6 ++--
 .../simplepattern-tokenizer.asciidoc          |  5 ++--
 .../simplepatternsplit-tokenizer.asciidoc     |  5 ++--
 .../analysis/AnalysisFactoryTestCase.java     | 28 +++++++++----------
 4 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/docs/reference/analysis/tokenizers.asciidoc b/docs/reference/analysis/tokenizers.asciidoc
index c26974b9cad61..f1e0899d7abf1 100644
--- a/docs/reference/analysis/tokenizers.asciidoc
+++ b/docs/reference/analysis/tokenizers.asciidoc
@@ -105,9 +105,9 @@ and is generally faster than the `pattern` tokenizer.
 
 <<analysis-simplepatternsplit-tokenizer,Simple Pattern Split Tokenizer>>::
 
-The `simplepatternsplit` tokenizer uses the same restricted subset as
-the `simplepattern` tokenizer, but splits the input at matches rather than 
-returning the matches as terms.
+The `simplepatternsplit` tokenizer uses the same restricted regular expression
+subset as the `simplepattern` tokenizer, but splits the input at matches rather
+than returning the matches as terms.
 
 <<analysis-pathhierarchy-tokenizer,Path Tokenizer>>::
 
diff --git a/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc
index 997b3b5251dd7..bee92c75d26cd 100644
--- a/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc
+++ b/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc
@@ -13,7 +13,7 @@ the <<analysis-pattern-tokenizer,`pattern`>> tokenizer. To split on pattern
 matches using the same restricted regular expression subset, see the
 <<analysis-simplepatternsplit-tokenizer,`simplepatternsplit`>> tokenizer.
 
-This tokenizer uses http://lucene.apache.org/core//6_5_1/core/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions].
+This tokenizer uses {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions].
 For an explanation of the supported features and syntax, see <<regexp-syntax,Regular Expression Syntax>>.
 
 The default pattern is the empty string, which produces no terms. This
@@ -26,8 +26,7 @@ The `simplepattern` tokenizer accepts the following parameters:
 
 [horizontal]
 `pattern`::
-
-    A http://lucene.apache.org/core//6_5_1/core/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string.
+    A {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string.
 
 [float]
 === Example configuration
diff --git a/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc
index dc850d09fc16b..c009f8cb7a400 100644
--- a/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc
+++ b/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc
@@ -13,7 +13,7 @@ terms from matches using patterns in the same restricted regular expression
 subset, see the <<analysis-simplepattern-tokenizer,`simplepattern`>>
 tokenizer.
 
-This tokenizer uses http://lucene.apache.org/core//6_5_1/core/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions].
+This tokenizer uses {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions].
 For an explanation of the supported features and syntax, see <<regexp-syntax,Regular Expression Syntax>>.
 
 The default pattern is the empty string, which produces one term containing the
@@ -27,8 +27,7 @@ The `simplepatternsplit` tokenizer accepts the following parameters:
 
 [horizontal]
 `pattern`::
-
-  A http://lucene.apache.org/core//6_5_1/core/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string.
+    A {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string.
 
 [float]
 === Example configuration
diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
index bcb9dfbe5a7b0..a3fe52d005c24 100644
--- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
@@ -129,23 +129,23 @@ private static String toCamelCase(String s) {
 
     static final Map<String,Class<?>> KNOWN_TOKENIZERS = new MapBuilder<String,Class<?>>()
         // exposed in ES
-        .put("classic",            ClassicTokenizerFactory.class)
-        .put("edgengram",          EdgeNGramTokenizerFactory.class)
-        .put("keyword",            KeywordTokenizerFactory.class)
-        .put("letter",             LetterTokenizerFactory.class)
-        .put("lowercase",          LowerCaseTokenizerFactory.class)
-        .put("ngram",              NGramTokenizerFactory.class)
-        .put("pathhierarchy",      PathHierarchyTokenizerFactory.class)
-        .put("pattern",            PatternTokenizerFactory.class)
-        .put("simplepattern",      MovedToAnalysisCommon.class)
+        .put("classic", ClassicTokenizerFactory.class)
+        .put("edgengram", EdgeNGramTokenizerFactory.class)
+        .put("keyword", KeywordTokenizerFactory.class)
+        .put("letter", LetterTokenizerFactory.class)
+        .put("lowercase", LowerCaseTokenizerFactory.class)
+        .put("ngram", NGramTokenizerFactory.class)
+        .put("pathhierarchy", PathHierarchyTokenizerFactory.class)
+        .put("pattern", PatternTokenizerFactory.class)
+        .put("simplepattern", MovedToAnalysisCommon.class)
         .put("simplepatternsplit", MovedToAnalysisCommon.class)
-        .put("standard",           StandardTokenizerFactory.class)
-        .put("thai",               ThaiTokenizerFactory.class)
-        .put("uax29urlemail",      UAX29URLEmailTokenizerFactory.class)
-        .put("whitespace",         WhitespaceTokenizerFactory.class)
+        .put("standard", StandardTokenizerFactory.class)
+        .put("thai", ThaiTokenizerFactory.class)
+        .put("uax29urlemail", UAX29URLEmailTokenizerFactory.class)
+        .put("whitespace", WhitespaceTokenizerFactory.class)
 
         // this one "seems to mess up offsets". probably shouldn't be a tokenizer...
-        .put("wikipedia",     Void.class)
+        .put("wikipedia", Void.class)
         .immutableMap();
 
     static final Map<String,Class<?>> KNOWN_TOKENFILTERS = new MapBuilder<String,Class<?>>()