Skip to content

Commit d319d53

Browse files
committed
expose word delimiter graph token filter as per elastic/elasticsearch#23327
1 parent 9e0145f commit d319d53

File tree

4 files changed

+254
-0
lines changed

4 files changed

+254
-0
lines changed

src/Nest/Analysis/TokenFilters/TokenFilterJsonConverter.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ public override object ReadJson(JsonReader reader, Type objectType, object exist
5454
case "unique": return o.ToObject<UniqueTokenFilter>(ElasticContractResolver.Empty);
5555
case "uppercase": return o.ToObject<UppercaseTokenFilter>(ElasticContractResolver.Empty);
5656
case "word_delimiter": return o.ToObject<WordDelimiterTokenFilter>(ElasticContractResolver.Empty);
57+
case "word_delimiter_graph": return o.ToObject<WordDelimiterGraphTokenFilter>(ElasticContractResolver.Empty);
5758
case "fingerprint": return o.ToObject<FingerprintTokenFilter>(ElasticContractResolver.Empty);
5859
case "kuromoji_readingform": return o.ToObject<KuromojiReadingFormTokenFilter>(ElasticContractResolver.Empty);
5960
case "kuromoji_part_of_speech": return o.ToObject<KuromojiPartOfSpeechTokenFilter>(ElasticContractResolver.Empty);

src/Nest/Analysis/TokenFilters/TokenFilters.cs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,13 @@ public TokenFiltersDescriptor SynonymGraph(string name, Func<SynonymGraphTokenFi
9595
public TokenFiltersDescriptor WordDelimiter(string name, Func<WordDelimiterTokenFilterDescriptor, IWordDelimiterTokenFilter> selector) =>
9696
Assign(name, selector?.Invoke(new WordDelimiterTokenFilterDescriptor()));
9797

98+
/// <summary>
99+
/// A token filter of type word_delimiter_graph that splits words into subwords and performs optional transformations on subword groups.
100+
/// <para> Unlike word_delimiter, it correctly handles multi terms expansion at query time.</para>
101+
/// </summary>
102+
public TokenFiltersDescriptor WordDelimiterGraph(string name, Func<WordDelimiterGraphTokenFilterDescriptor, IWordDelimiterGraphTokenFilter> selector) =>
103+
Assign(name, selector?.Invoke(new WordDelimiterGraphTokenFilterDescriptor()));
104+
98105
/// <summary>
99106
/// A token filter of type asciifolding that converts alphabetic, numeric, and symbolic Unicode characters which are
100107
/// <para> not in the first 127 ASCII characters (the “Basic Latin” Unicode block) into their ASCII equivalents, if one exists.</para>
Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
using System.Collections.Generic;
2+
using Newtonsoft.Json;
3+
4+
namespace Nest
5+
{
6+
/// <summary>
7+
/// Named word_delimiter_graph, it splits words into subwords and performs optional transformations on subword groups.
8+
/// Unlike word_delimiter, this token filter correctly handles multi terms expansion at query time.
9+
/// </summary>
10+
public interface IWordDelimiterGraphTokenFilter : ITokenFilter
11+
{
12+
/// <summary>
13+
/// If true causes parts of words to be generated: "PowerShot" ⇒ "Power" "Shot". Defaults to true.
14+
/// </summary>
15+
[JsonProperty("generate_word_parts")]
16+
bool? GenerateWordParts { get; set; }
17+
18+
/// <summary>
19+
/// If true causes number subwords to be generated: "500-42" ⇒ "500" "42". Defaults to true.
20+
/// </summary>
21+
[JsonProperty("generate_number_parts")]
22+
bool? GenerateNumberParts { get; set; }
23+
24+
/// <summary>
25+
/// If true causes maximum runs of word parts to be catenated: "wi-fi" ⇒ "wifi". Defaults to false.
26+
/// </summary>
27+
[JsonProperty("catenate_words")]
28+
bool? CatenateWords { get; set; }
29+
30+
/// <summary>
31+
/// If true causes maximum runs of number parts to be catenated: "500-42" ⇒ "50042". Defaults to false.
32+
/// </summary>
33+
[JsonProperty("catenate_numbers")]
34+
bool? CatenateNumbers { get; set; }
35+
36+
/// <summary>
37+
/// If true causes all subword parts to be catenated: "wi-fi-4000" ⇒ "wifi4000". Defaults to false.
38+
/// </summary>
39+
[JsonProperty("catenate_all")]
40+
bool? CatenateAll { get; set; }
41+
42+
/// <summary>
43+
/// If true causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless). Defaults to true.
44+
/// </summary>
45+
[JsonProperty("split_on_case_change")]
46+
bool? SplitOnCaseChange { get; set; }
47+
48+
/// <summary>
49+
/// If true includes original words in subwords: "500-42" ⇒ "500-42" "500" "42". Defaults to false.
50+
/// </summary>
51+
[JsonProperty("preserve_original")]
52+
bool? PreserveOriginal { get; set; }
53+
54+
/// <summary>
55+
/// If true causes "j2se" to be three tokens; "j" "2" "se". Defaults to true.
56+
/// </summary>
57+
[JsonProperty("split_on_numerics")]
58+
bool? SplitOnNumerics { get; set; }
59+
60+
/// <summary>
61+
/// If true causes trailing "'s" to be removed for each subword: "O’Neil’s" ⇒ "O", "Neil". Defaults to true.
62+
/// </summary>
63+
[JsonProperty("stem_english_possessive")]
64+
bool? StemEnglishPossessive { get; set; }
65+
66+
/// <summary>
67+
/// A list of words protected from being delimited.
68+
/// </summary>
69+
[JsonProperty("protected_words")]
70+
IEnumerable<string> ProtectedWords { get; set; }
71+
72+
/// <summary>
73+
/// A path, sent as protected_words_path, to a file configured with protected words (one on each line).
74+
/// Automatically resolves to a config/ based location if it exists.
75+
/// </summary>
76+
[JsonProperty("protected_words_path")]
77+
string ProtectedWordsPath { get; set; }
78+
79+
/// <summary>
80+
/// A custom type mapping table
81+
/// </summary>
82+
[JsonProperty("type_table")]
83+
IEnumerable<string> TypeTable { get; set; }
84+
85+
/// <summary>
86+
/// A path to a custom type mapping table file
87+
/// </summary>
88+
[JsonProperty("type_table_path")]
89+
string TypeTablePath { get; set; }
90+
91+
}
92+
93+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter"/>
94+
public class WordDelimiterGraphTokenFilter : TokenFilterBase, IWordDelimiterGraphTokenFilter
95+
{
96+
public WordDelimiterGraphTokenFilter() : base("word_delimiter_graph") { } // hands the "word_delimiter_graph" type name to TokenFilterBase
97+
98+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.GenerateWordParts"/>
99+
public bool? GenerateWordParts { get; set; }
100+
101+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.GenerateNumberParts"/>
102+
public bool? GenerateNumberParts { get; set; }
103+
104+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.CatenateWords"/>
105+
public bool? CatenateWords { get; set; }
106+
107+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.CatenateNumbers"/>
108+
public bool? CatenateNumbers { get; set; }
109+
110+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.CatenateAll"/>
111+
public bool? CatenateAll { get; set; }
112+
113+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.SplitOnCaseChange"/>
114+
public bool? SplitOnCaseChange { get; set; }
115+
116+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.PreserveOriginal"/>
117+
public bool? PreserveOriginal { get; set; }
118+
119+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.SplitOnNumerics"/>
120+
public bool? SplitOnNumerics { get; set; }
121+
122+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.StemEnglishPossessive"/>
123+
public bool? StemEnglishPossessive { get; set; }
124+
125+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.ProtectedWords"/>
126+
public IEnumerable<string> ProtectedWords { get; set; }
127+
128+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.ProtectedWordsPath"/>
129+
public string ProtectedWordsPath { get; set; }
130+
131+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.TypeTable"/>
132+
public IEnumerable<string> TypeTable { get; set; }
133+
134+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.TypeTablePath"/>
135+
public string TypeTablePath { get; set; }
136+
}
137+
138+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter"/>
139+
public class WordDelimiterGraphTokenFilterDescriptor
140+
: TokenFilterDescriptorBase<WordDelimiterGraphTokenFilterDescriptor, IWordDelimiterGraphTokenFilter>, IWordDelimiterGraphTokenFilter
141+
{
142+
protected override string Type => "word_delimiter_graph";
143+
144+
IEnumerable<string> IWordDelimiterGraphTokenFilter.ProtectedWords { get; set; }
145+
string IWordDelimiterGraphTokenFilter.ProtectedWordsPath { get; set; }
146+
IEnumerable<string> IWordDelimiterGraphTokenFilter.TypeTable { get; set; }
147+
string IWordDelimiterGraphTokenFilter.TypeTablePath { get; set; }
148+
bool? IWordDelimiterGraphTokenFilter.GenerateWordParts { get; set; }
149+
bool? IWordDelimiterGraphTokenFilter.GenerateNumberParts { get; set; }
150+
bool? IWordDelimiterGraphTokenFilter.CatenateWords { get; set; }
151+
bool? IWordDelimiterGraphTokenFilter.CatenateNumbers { get; set; }
152+
bool? IWordDelimiterGraphTokenFilter.CatenateAll { get; set; }
153+
bool? IWordDelimiterGraphTokenFilter.SplitOnCaseChange { get; set; }
154+
bool? IWordDelimiterGraphTokenFilter.PreserveOriginal { get; set; }
155+
bool? IWordDelimiterGraphTokenFilter.SplitOnNumerics { get; set; }
156+
bool? IWordDelimiterGraphTokenFilter.StemEnglishPossessive { get; set; }
157+
158+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.GenerateWordParts"/>
159+
public WordDelimiterGraphTokenFilterDescriptor GenerateWordParts(bool? generateWordParts = true) => Assign(a => a.GenerateWordParts = generateWordParts);
160+
161+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.GenerateNumberParts"/>
162+
public WordDelimiterGraphTokenFilterDescriptor GenerateNumberParts(bool? generateNumberParts = true) => Assign(a => a.GenerateNumberParts = generateNumberParts);
163+
164+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.CatenateWords"/>
165+
public WordDelimiterGraphTokenFilterDescriptor CatenateWords(bool? catenateWords = true) => Assign(a => a.CatenateWords = catenateWords);
166+
167+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.CatenateNumbers"/>
168+
public WordDelimiterGraphTokenFilterDescriptor CatenateNumbers(bool? catenateNumbers = true) => Assign(a => a.CatenateNumbers = catenateNumbers);
169+
170+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.CatenateAll"/>
171+
public WordDelimiterGraphTokenFilterDescriptor CatenateAll(bool? catenateAll = true) => Assign(a => a.CatenateAll = catenateAll);
172+
173+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.SplitOnCaseChange"/>
174+
public WordDelimiterGraphTokenFilterDescriptor SplitOnCaseChange(bool? split = true) => Assign(a => a.SplitOnCaseChange = split);
175+
176+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.SplitOnNumerics"/>
177+
public WordDelimiterGraphTokenFilterDescriptor SplitOnNumerics(bool? split = true) => Assign(a => a.SplitOnNumerics = split);
178+
179+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.PreserveOriginal"/>
180+
public WordDelimiterGraphTokenFilterDescriptor PreserveOriginal(bool? preserve = true) => Assign(a => a.PreserveOriginal = preserve);
181+
182+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.StemEnglishPossessive"/>
183+
public WordDelimiterGraphTokenFilterDescriptor StemEnglishPossessive(bool? stem = true) => Assign(a => a.StemEnglishPossessive = stem);
184+
185+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.ProtectedWords"/>
186+
public WordDelimiterGraphTokenFilterDescriptor ProtectedWords(IEnumerable<string> protectedWords) => Assign(a => a.ProtectedWords = protectedWords);
187+
188+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.ProtectedWords"/>
189+
public WordDelimiterGraphTokenFilterDescriptor ProtectedWords(params string[] protectedWords) => Assign(a => a.ProtectedWords = protectedWords);
190+
191+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.ProtectedWordsPath"/>
192+
public WordDelimiterGraphTokenFilterDescriptor ProtectedWordsPath(string path) => Assign(a => a.ProtectedWordsPath = path);
193+
194+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.TypeTable"/>
195+
public WordDelimiterGraphTokenFilterDescriptor TypeTable(IEnumerable<string> typeTable) => Assign(a => a.TypeTable = typeTable);
196+
197+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.TypeTable"/>
198+
public WordDelimiterGraphTokenFilterDescriptor TypeTable(params string[] typeTable) => Assign(a => a.TypeTable = typeTable);
199+
200+
/// <inheritdoc cref="IWordDelimiterGraphTokenFilter.TypeTablePath"/>
201+
public WordDelimiterGraphTokenFilterDescriptor TypeTablePath(string path) => Assign(a => a.TypeTablePath = path);
202+
203+
}
204+
205+
}

src/Tests/Analysis/TokenFilters/TokenFilterUsageTests.cs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,20 @@ public class TokenFilterUsageTests : PromiseUsageTestBase<IIndexSettings, IndexS
267267
split_on_numerics = true,
268268
stem_english_possessive = true,
269269
protected_words = new[] {"x", "y", "z"}
270+
},
271+
wdg = new
272+
{
273+
type = "word_delimiter_graph",
274+
generate_word_parts = true,
275+
generate_number_parts = true,
276+
catenate_words = true,
277+
catenate_numbers = true,
278+
catenate_all = true,
279+
split_on_case_change = true,
280+
preserve_original = true,
281+
split_on_numerics = true,
282+
stem_english_possessive = true,
283+
protected_words = new[] {"x", "y", "z"}
270284
}
271285
}
272286
}
@@ -403,6 +417,18 @@ public class TokenFilterUsageTests : PromiseUsageTestBase<IIndexSettings, IndexS
403417
.SplitOnNumerics()
404418
.StemEnglishPossessive()
405419
)
420+
.WordDelimiterGraph("wdg", t => t
421+
.CatenateAll()
422+
.CatenateNumbers()
423+
.CatenateWords()
424+
.GenerateNumberParts()
425+
.GenerateWordParts()
426+
.PreserveOriginal()
427+
.ProtectedWords("x", "y", "z")
428+
.SplitOnCaseChange()
429+
.SplitOnNumerics()
430+
.StemEnglishPossessive()
431+
)
406432
.KuromojiPartOfSpeech("kpos", t => t
407433
.StopTags("# verb-main:", "動詞-自立")
408434
)
@@ -549,6 +575,21 @@ public class TokenFilterUsageTests : PromiseUsageTestBase<IIndexSettings, IndexS
549575
StemEnglishPossessive = true
550576
}
551577
},
578+
{
579+
"wdg", new WordDelimiterGraphTokenFilter
580+
{
581+
CatenateAll = true,
582+
CatenateNumbers = true,
583+
CatenateWords = true,
584+
GenerateNumberParts = true,
585+
GenerateWordParts = true,
586+
PreserveOriginal = true,
587+
ProtectedWords = new[] {"x", "y", "z"},
588+
SplitOnCaseChange = true,
589+
SplitOnNumerics = true,
590+
StemEnglishPossessive = true
591+
}
592+
},
552593
{"kpos", new KuromojiPartOfSpeechTokenFilter {StopTags = new[] {"# verb-main:", "動詞-自立"}}},
553594
{"kfr", new KuromojiReadingFormTokenFilter {UseRomaji = true}},
554595
{"ks", new KuromojiStemmerTokenFilter {MinimumLength = 4}},

0 commit comments

Comments
 (0)