diff --git a/src/Nest/Analysis/Tokenizers/CharGroupTokenizer.cs b/src/Nest/Analysis/Tokenizers/CharGroupTokenizer.cs
new file mode 100644
index 00000000000..3996824fc8a
--- /dev/null
+++ b/src/Nest/Analysis/Tokenizers/CharGroupTokenizer.cs
@@ -0,0 +1,48 @@
+using System.Collections.Generic;
+using Newtonsoft.Json;
+
+namespace Nest
+{
+ /// <summary>
+ /// A tokenizer that breaks text into terms whenever it encounters a character which is in a defined set. It is mostly useful
+ /// for cases where a simple custom tokenization is desired, and the overhead of using <see cref="PatternTokenizer" /> is not acceptable.
+ /// </summary>
+ public interface ICharGroupTokenizer : ITokenizer
+ {
+ /// <summary>
+ /// A list of characters to tokenize the string on. Whenever a character from this list is encountered, a
+ /// new token is started. This accepts either single characters, e.g. -, or character groups: whitespace, letter, digit,
+ /// punctuation, symbol.
+ /// </summary>
+ [JsonProperty("tokenize_on_chars")]
+ IEnumerable<string> TokenizeOnCharacters { get; set; }
+ }
+
+ /// <inheritdoc cref="ICharGroupTokenizer" />
+ public class CharGroupTokenizer : TokenizerBase, ICharGroupTokenizer
+ {
+ internal const string TokenizerType = "char_group";
+
+ public CharGroupTokenizer() => this.Type = TokenizerType;
+
+ /// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters" />
+ public IEnumerable<string> TokenizeOnCharacters { get; set; }
+ }
+
+ /// <inheritdoc cref="ICharGroupTokenizer" />
+ public class CharGroupTokenizerDescriptor
+ : TokenizerDescriptorBase<CharGroupTokenizerDescriptor, ICharGroupTokenizer>, ICharGroupTokenizer
+ {
+ protected override string Type => CharGroupTokenizer.TokenizerType;
+
+ IEnumerable<string> ICharGroupTokenizer.TokenizeOnCharacters { get; set; }
+
+ /// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters" />
+ public CharGroupTokenizerDescriptor TokenizeOnCharacters(params string[] characters) =>
+ Assign(a => a.TokenizeOnCharacters = characters);
+
+ /// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters" />
+ public CharGroupTokenizerDescriptor TokenizeOnCharacters(IEnumerable<string> characters) =>
+ Assign(a => a.TokenizeOnCharacters = characters);
+ }
+}
diff --git a/src/Nest/Analysis/Tokenizers/Tokenizers.cs b/src/Nest/Analysis/Tokenizers/Tokenizers.cs
index 78536994658..bc23e2d5f4c 100644
--- a/src/Nest/Analysis/Tokenizers/Tokenizers.cs
+++ b/src/Nest/Analysis/Tokenizers/Tokenizers.cs
@@ -112,5 +112,9 @@ public TokenizersDescriptor Kuromoji(string name, Func<KuromojiTokenizerDescriptor, IKuromojiTokenizer> selector) =>
public TokenizersDescriptor Icu(string name, Func<IcuTokenizerDescriptor, IIcuTokenizer> selector) =>
Assign(name, selector?.Invoke(new IcuTokenizerDescriptor()));
+
+ /// <inheritdoc cref="ICharGroupTokenizer" />
+ public TokenizersDescriptor CharGroup(string name, Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
+ Assign(name, selector?.Invoke(new CharGroupTokenizerDescriptor()));
}
}
diff --git a/src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs b/src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs
index bdc38962321..edb9133e86c 100644
--- a/src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs
+++ b/src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs
@@ -91,5 +91,8 @@ public ITokenizer Kuromoji(Func<KuromojiTokenizerDescriptor, IKuromojiTokenizer> selector) =>
/// </summary>
public ITokenizer Icu(Func<IcuTokenizerDescriptor, IIcuTokenizer> selector) =>
(selector?.Invoke(new IcuTokenizerDescriptor()));
+
+ /// <inheritdoc cref="ICharGroupTokenizer" />
+ public ITokenizer CharGroup(Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) => selector?.Invoke(new CharGroupTokenizerDescriptor());
}
}
diff --git a/src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs b/src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
index a36399ad2a3..8d27bff379d 100644
--- a/src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
+++ b/src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
@@ -203,5 +203,25 @@ public class StandardTests : TokenizerAssertionBase<StandardTests>
public override object Json => new {type = "standard"};
}
+
+ public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
+ {
+ private readonly string[] _chars = {"whitespace", "-", "\n"};
+ public override string Name => "char_group";
+ public override ITokenizer Initializer => new CharGroupTokenizer
+ {
+ TokenizeOnCharacters = _chars
+ };
+
+ public override FuncTokenizer Fluent => (n, t) => t.CharGroup(n, e => e
+ .TokenizeOnCharacters(_chars)
+ );
+
+ public override object Json => new
+ {
+ tokenize_on_chars = _chars,
+ type = "char_group"
+ };
+ }
}
}
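
For reference, a minimal usage sketch of the fluent API added above. This is not part of the change itself: it assumes a default ElasticClient pointed at a local node, and the index name ("chargroup-example"), tokenizer name ("my_char_group"), and analyzer name ("my_analyzer") are illustrative only.

using System;
using Nest;

public static class CharGroupTokenizerUsageSketch
{
	public static void Main()
	{
		// Assumes a node reachable on the default http://localhost:9200.
		var client = new ElasticClient();

		// Register the new char_group tokenizer in the index analysis settings,
		// splitting tokens on whitespace, '-' and newline characters, and wire it
		// into a custom analyzer.
		var createIndexResponse = client.CreateIndex("chargroup-example", c => c
			.Settings(s => s
				.Analysis(a => a
					.Tokenizers(t => t
						.CharGroup("my_char_group", cg => cg
							.TokenizeOnCharacters("whitespace", "-", "\n")
						)
					)
					.Analyzers(an => an
						.Custom("my_analyzer", ca => ca
							.Tokenizer("my_char_group")
						)
					)
				)
			)
		);

		Console.WriteLine(createIndexResponse.IsValid);
	}
}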