Skip to content

Commit 8ace22d

Browse files
committed
Add User Dictionary Rules to NoriTokenizer (#3634)
Relates #3615, elastic/elasticsearch#3620
1 parent 74efe05 commit 8ace22d

File tree

2 files changed

+45
-1
lines changed

2 files changed

+45
-1
lines changed

src/Nest/Analysis/Tokenizers/NoriTokenizer.cs

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
using System.Runtime.Serialization;
1+
using System.Collections.Generic;
2+
using System.Runtime.Serialization;
23
using Elasticsearch.Net;
34

45
namespace Nest
@@ -36,6 +37,16 @@ public interface INoriTokenizer : ITokenizer
3637
/// </summary>
3738
[DataMember(Name = "user_dictionary")]
3839
string UserDictionary { get; set; }
40+
41+
/// <summary>
42+
/// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with custom nouns (NNG)
43+
/// can be specified inline with this property.
44+
/// </summary>
45+
/// <remarks>
46+
/// Valid for Elasticsearch 6.6.0+
47+
/// </remarks>
48+
[DataMember(Name = "user_dictionary_rules")]
49+
IEnumerable<string> UserDictionaryRules { get; set; }
3950
}
4051

4152
/// <inheritdoc cref="INoriTokenizer" />
@@ -48,6 +59,9 @@ public class NoriTokenizer : TokenizerBase, INoriTokenizer
4859

4960
/// <inheritdoc cref="INoriTokenizer.UserDictionary" />
5061
public string UserDictionary { get; set; }
62+
63+
/// <inheritdoc cref="INoriTokenizer.UserDictionaryRules" />
64+
public IEnumerable<string> UserDictionaryRules { get; set; }
5165
}
5266

5367
/// <inheritdoc cref="INoriTokenizer" />
@@ -58,11 +72,18 @@ public class NoriTokenizerDescriptor
5872

5973
NoriDecompoundMode? INoriTokenizer.DecompoundMode { get; set; }
6074
string INoriTokenizer.UserDictionary { get; set; }
75+
IEnumerable<string> INoriTokenizer.UserDictionaryRules { get; set; }
6176

6277
/// <inheritdoc cref="INoriTokenizer.DecompoundMode" />
6378
public NoriTokenizerDescriptor DecompoundMode(NoriDecompoundMode? mode)
{
    // Hands the mode to Assign, which stores it in the DecompoundMode property; Assign's result is returned.
    return Assign(mode, (tokenizer, value) => tokenizer.DecompoundMode = value);
}
6479

6580
/// <inheritdoc cref="INoriTokenizer.UserDictionary" />
6681
public NoriTokenizerDescriptor UserDictionary(string path)
{
    // Hands the path to Assign, which stores it in the UserDictionary property; Assign's result is returned.
    return Assign(path, (tokenizer, value) => tokenizer.UserDictionary = value);
}
82+
83+
/// <inheritdoc cref="INoriTokenizer.UserDictionaryRules" />
84+
public NoriTokenizerDescriptor UserDictionaryRules(params string[] rules)
{
    // The params array is passed to Assign unchanged and stored as the UserDictionaryRules value.
    return Assign(rules, (tokenizer, value) => tokenizer.UserDictionaryRules = value);
}
85+
86+
/// <inheritdoc cref="INoriTokenizer.UserDictionaryRules" />
87+
public NoriTokenizerDescriptor UserDictionaryRules(IEnumerable<string> rules)
{
    // The sequence is passed to Assign as-is (not copied here) and stored as the UserDictionaryRules value.
    return Assign(rules, (tokenizer, value) => tokenizer.UserDictionaryRules = value);
}
6788
}
6889
}

src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,29 @@ public class NoriTests : TokenizerAssertionBase<NoriTests>
222222
public override string Name => "nori";
223223
}
224224

225+
// Tokenizer test case for the nori tokenizer configured with inline user dictionary rules.
// Supplies the same configuration three ways (fluent descriptor, object initializer, expected JSON);
// presumably TokenizerAssertionBase compares them against each other - confirm in the base class.
// Skipped for versions below 6.6.0, per the SkipVersion attribute.
[SkipVersion("<6.6.0", "inline user dictionary rules introduced in 6.6.0")]
226+
public class NoriWithUserDictionaryTests : TokenizerAssertionBase<NoriWithUserDictionaryTests>
227+
{
228+
// Fluent form: mixed decompound mode plus four inline rules passed through the params overload.
public override FuncTokenizer Fluent => (n, t) => t.Nori(n, e => e
229+
.DecompoundMode(NoriDecompoundMode.Mixed)
230+
.UserDictionaryRules("c++", "C샤프", "세종", "세종시 세종 시")
231+
);
232+
233+
// Object-initializer form carrying the same mode and the same four rules as Fluent.
public override ITokenizer Initializer => new NoriTokenizer
234+
{
235+
DecompoundMode = NoriDecompoundMode.Mixed,
236+
UserDictionaryRules = new [] { "c++", "C샤프", "세종", "세종시 세종 시" }
237+
};
238+
239+
// Expected JSON shape: the rules appear under "user_dictionary_rules" as a string array.
public override object Json => new
240+
{
241+
type = "nori_tokenizer",
242+
decompound_mode = "mixed",
243+
user_dictionary_rules = new [] { "c++", "C샤프", "세종", "세종시 세종 시" }
244+
};
245+
// Name for this tokenizer test case; presumably the key it is registered under - confirm in the base class.
public override string Name => "nori_userdictionary";
246+
}
247+
225248
[SkipVersion("<6.4.0", "char_group introduced in 6.4.0")]
226249
public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
227250
{

0 commit comments

Comments
 (0)