Skip to content

Commit db95308

Browse files
Add max_token_length to chargroup tokenizer (#4911) (#4926)
Relates: elastic/elasticsearch#56860
Co-authored-by: Russ Cam <[email protected]>
1 parent f344142 commit db95308

File tree

2 files changed

+47
-2
lines changed

2 files changed

+47
-2
lines changed

src/Nest/Analysis/Tokenizers/CharGroupTokenizer.cs

Lines changed: 21 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,10 @@
1-
// Licensed to Elasticsearch B.V under one or more agreements.
1+
// Licensed to Elasticsearch B.V under one or more agreements.
22
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
33
// See the LICENSE file in the project root for more information
44

5-
using System.Collections.Generic;
5+
using System.Collections.Generic;
66
using System.Runtime.Serialization;
7+
using Elasticsearch.Net.Utf8Json;
78

89
namespace Nest
910
{
@@ -20,6 +21,16 @@ public interface ICharGroupTokenizer : ITokenizer
2021
/// </summary>
2122
[DataMember(Name ="tokenize_on_chars")]
2223
IEnumerable<string> TokenizeOnCharacters { get; set; }
24+
25+
/// <summary>
26+
/// The maximum token length. If a token is seen that exceeds this length then
27+
/// it is split at <see cref="MaxTokenLength"/> intervals. Defaults to `255`.
28+
/// <para />
29+
/// Valid in Elasticsearch 7.9.0+
30+
/// </summary>
31+
[DataMember(Name = "max_token_length")]
32+
[JsonFormatter(typeof(NullableStringIntFormatter))]
33+
int? MaxTokenLength { get; set; }
2334
}
2435

2536
/// <inheritdoc cref="ICharGroupTokenizer" />
@@ -31,6 +42,9 @@ public class CharGroupTokenizer : TokenizerBase, ICharGroupTokenizer
3142

3243
/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters" />
3344
public IEnumerable<string> TokenizeOnCharacters { get; set; }
45+
46+
/// <inheritdoc cref="ICharGroupTokenizer.MaxTokenLength" />
47+
public int? MaxTokenLength { get; set; }
3448
}
3549

3650
/// <inheritdoc cref="ICharGroupTokenizer" />
@@ -40,6 +54,7 @@ public class CharGroupTokenizerDescriptor
4054
protected override string Type => CharGroupTokenizer.TokenizerType;
4155

4256
IEnumerable<string> ICharGroupTokenizer.TokenizeOnCharacters { get; set; }
57+
int? ICharGroupTokenizer.MaxTokenLength { get; set; }
4358

4459
/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters" />
4560
public CharGroupTokenizerDescriptor TokenizeOnCharacters(params string[] characters) =>
@@ -48,5 +63,9 @@ public CharGroupTokenizerDescriptor TokenizeOnCharacters(params string[] charact
4863
/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters" />
4964
public CharGroupTokenizerDescriptor TokenizeOnCharacters(IEnumerable<string> characters) =>
5065
Assign(characters, (a, v) => a.TokenizeOnCharacters = v);
66+
67+
/// <inheritdoc cref="ICharGroupTokenizer.MaxTokenLength" />
68+
public CharGroupTokenizerDescriptor MaxTokenLength(int? maxTokenLength) =>
69+
Assign(maxTokenLength, (a, v) => a.MaxTokenLength = v);
5170
}
5271
}

tests/Tests/Analysis/Tokenizers/TokenizerTests.cs

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -337,6 +337,32 @@ public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
337337
public override string Name => "char_group";
338338
}
339339

340+
[SkipVersion("<7.9.0", "max_token_length introduced in 7.9.0")]
341+
public class CharGroupMaxTokenLengthTests : TokenizerAssertionBase<CharGroupMaxTokenLengthTests>
342+
{
343+
private readonly string[] _chars = { "whitespace", "-", "\n" };
344+
345+
public override FuncTokenizer Fluent => (n, t) => t.CharGroup(n, e => e
346+
.TokenizeOnCharacters(_chars)
347+
.MaxTokenLength(255)
348+
);
349+
350+
public override ITokenizer Initializer => new CharGroupTokenizer
351+
{
352+
TokenizeOnCharacters = _chars,
353+
MaxTokenLength = 255
354+
};
355+
356+
public override object Json => new
357+
{
358+
tokenize_on_chars = _chars,
359+
type = "char_group",
360+
max_token_length = 255
361+
};
362+
363+
public override string Name => "char_group_max_token_length";
364+
}
365+
340366
[SkipVersion("<7.7.0", "discard_punctuation introduced in 7.7.0")]
341367
public class DiscardPunctuationTests : TokenizerAssertionBase<DiscardPunctuationTests>
342368
{

0 commit comments

Comments
 (0)