diff --git a/src/Nest/Analysis/Tokenizers/CharGroupTokenizer.cs b/src/Nest/Analysis/Tokenizers/CharGroupTokenizer.cs
new file mode 100644
index 00000000000..3996824fc8a
--- /dev/null
+++ b/src/Nest/Analysis/Tokenizers/CharGroupTokenizer.cs
@@ -0,0 +1,48 @@
+using System.Collections.Generic;
+using Newtonsoft.Json;
+
+namespace Nest
+{
+	/// <summary>
+	/// A tokenizer that breaks text into terms whenever it encounters a character which is in a defined set. It is mostly useful
+	/// for cases where a simple custom tokenization is desired, and the overhead of use of <see cref="PatternTokenizer" /> is not acceptable.
+	/// </summary>
+	public interface ICharGroupTokenizer : ITokenizer
+	{
+		/// <summary>
+		/// A list of characters to tokenize the string on. Whenever a character from this list is encountered, a
+		/// new token is started. This accepts either single characters, e.g. -, or character groups: whitespace, letter, digit,
+		/// punctuation, symbol.
+		/// </summary>
+		[JsonProperty("tokenize_on_chars")]
+		IEnumerable<string> TokenizeOnCharacters { get; set; }
+	}
+
+	/// <inheritdoc cref="ICharGroupTokenizer" />
+	public class CharGroupTokenizer : TokenizerBase, ICharGroupTokenizer
+	{
+		internal const string TokenizerType = "char_group";
+
+		public CharGroupTokenizer() => this.Type = TokenizerType;
+
+		/// <inheritdoc />
+		public IEnumerable<string> TokenizeOnCharacters { get; set; }
+	}
+
+	/// <inheritdoc cref="ICharGroupTokenizer" />
+	public class CharGroupTokenizerDescriptor
+		: TokenizerDescriptorBase<CharGroupTokenizerDescriptor, ICharGroupTokenizer>, ICharGroupTokenizer
+	{
+		protected override string Type => CharGroupTokenizer.TokenizerType;
+
+		IEnumerable<string> ICharGroupTokenizer.TokenizeOnCharacters { get; set; }
+
+		/// <inheritdoc />
+		public CharGroupTokenizerDescriptor TokenizeOnCharacters(params string[] characters) =>
+			Assign(a => a.TokenizeOnCharacters = characters);
+
+		/// <inheritdoc />
+		public CharGroupTokenizerDescriptor TokenizeOnCharacters(IEnumerable<string> characters) =>
+			Assign(a => a.TokenizeOnCharacters = characters);
+	}
+}
diff --git a/src/Nest/Analysis/Tokenizers/Tokenizers.cs b/src/Nest/Analysis/Tokenizers/Tokenizers.cs
index 78536994658..bc23e2d5f4c 100644
--- a/src/Nest/Analysis/Tokenizers/Tokenizers.cs
+++ b/src/Nest/Analysis/Tokenizers/Tokenizers.cs
@@ -112,5 +112,9 @@ public TokenizersDescriptor Kuromoji(string name, Func<KuromojiTokenizerDescriptor, IKuromojiTokenizer> selector) =>
 
 		public TokenizersDescriptor Icu(string name, Func<IcuTokenizerDescriptor, IIcuTokenizer> selector) =>
 			Assign(name, selector?.Invoke(new IcuTokenizerDescriptor()));
+
+		/// <inheritdoc cref="ICharGroupTokenizer" />
+		public TokenizersDescriptor CharGroup(string name, Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
+			Assign(name, selector?.Invoke(new CharGroupTokenizerDescriptor()));
 	}
 }
diff --git a/src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs b/src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs
index bdc38962321..edb9133e86c 100644
--- a/src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs
+++ b/src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs
@@ -91,5 +91,8 @@ public ITokenizer Kuromoji(Func<KuromojiTokenizerDescriptor, IKuromojiTokenizer> selector) =>
 		/// </summary>
 		public ITokenizer Icu(Func<IcuTokenizerDescriptor, IIcuTokenizer> selector) =>
 			(selector?.Invoke(new IcuTokenizerDescriptor()));
+
+		/// <inheritdoc cref="ICharGroupTokenizer" />
+		public ITokenizer CharGroup(Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) => selector?.Invoke(new CharGroupTokenizerDescriptor());
 	}
 }
diff --git a/src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs b/src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
index a36399ad2a3..8d27bff379d 100644
--- a/src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
+++ b/src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
@@ -203,5 +203,25 @@ public class StandardTests : TokenizerAssertionBase<StandardTests>
 
 			public override object Json => new {type = "standard"};
 		}
+
+		public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
+		{
+			private readonly string[] _chars = {"whitespace", "-", "\n"};
"uax"; + public override ITokenizer Initializer => new CharGroupTokenizer + { + TokenizeOnCharacters = _chars + }; + + public override FuncTokenizer Fluent => (n, t) => t.CharGroup(n, e => e + .TokenizeOnCharacters(_chars) + ); + + public override object Json => new + { + tokenize_on_chars = _chars, + type = "char_group" + }; + } } }