3434import org .apache .lucene .analysis .commongrams .CommonGramsFilter ;
3535import org .apache .lucene .analysis .core .DecimalDigitFilter ;
3636import org .apache .lucene .analysis .core .KeywordTokenizer ;
37+ import org .apache .lucene .analysis .core .LetterTokenizer ;
3738import org .apache .lucene .analysis .core .LowerCaseTokenizer ;
3839import org .apache .lucene .analysis .core .StopAnalyzer ;
3940import org .apache .lucene .analysis .core .UpperCaseFilter ;
41+ import org .apache .lucene .analysis .core .WhitespaceTokenizer ;
4042import org .apache .lucene .analysis .cz .CzechStemFilter ;
4143import org .apache .lucene .analysis .de .GermanNormalizationFilter ;
4244import org .apache .lucene .analysis .de .GermanStemFilter ;
5860import org .apache .lucene .analysis .miscellaneous .WordDelimiterFilter ;
5961import org .apache .lucene .analysis .miscellaneous .WordDelimiterGraphFilter ;
6062import org .apache .lucene .analysis .ngram .EdgeNGramTokenFilter ;
63+ import org .apache .lucene .analysis .ngram .EdgeNGramTokenizer ;
6164import org .apache .lucene .analysis .ngram .NGramTokenFilter ;
65+ import org .apache .lucene .analysis .ngram .NGramTokenizer ;
66+ import org .apache .lucene .analysis .path .PathHierarchyTokenizer ;
67+ import org .apache .lucene .analysis .pattern .PatternTokenizer ;
6268import org .apache .lucene .analysis .payloads .DelimitedPayloadTokenFilter ;
6369import org .apache .lucene .analysis .payloads .TypeAsPayloadTokenFilter ;
6470import org .apache .lucene .analysis .reverse .ReverseStringFilter ;
6571import org .apache .lucene .analysis .shingle .ShingleFilter ;
6672import org .apache .lucene .analysis .snowball .SnowballFilter ;
6773import org .apache .lucene .analysis .standard .ClassicFilter ;
74+ import org .apache .lucene .analysis .standard .ClassicTokenizer ;
75+ import org .apache .lucene .analysis .standard .UAX29URLEmailTokenizer ;
76+ import org .apache .lucene .analysis .th .ThaiTokenizer ;
6877import org .apache .lucene .analysis .tr .ApostropheFilter ;
6978import org .apache .lucene .analysis .util .ElisionFilter ;
7079import org .elasticsearch .common .logging .DeprecationLogger ;
7180import org .elasticsearch .common .logging .Loggers ;
81+ import org .elasticsearch .common .regex .Regex ;
7282import org .elasticsearch .index .analysis .CharFilterFactory ;
7383import org .elasticsearch .index .analysis .PreConfiguredCharFilter ;
7484import org .elasticsearch .index .analysis .PreConfiguredTokenFilter ;
@@ -169,6 +179,19 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
169179 Map <String , AnalysisProvider <TokenizerFactory >> tokenizers = new TreeMap <>();
170180 tokenizers .put ("simple_pattern" , SimplePatternTokenizerFactory ::new );
171181 tokenizers .put ("simple_pattern_split" , SimplePatternSplitTokenizerFactory ::new );
182+ tokenizers .put ("thai" , ThaiTokenizerFactory ::new );
183+ tokenizers .put ("nGram" , NGramTokenizerFactory ::new );
184+ tokenizers .put ("ngram" , NGramTokenizerFactory ::new );
185+ tokenizers .put ("edgeNGram" , EdgeNGramTokenizerFactory ::new );
186+ tokenizers .put ("edge_ngram" , EdgeNGramTokenizerFactory ::new );
187+ tokenizers .put ("classic" , ClassicTokenizerFactory ::new );
188+ tokenizers .put ("letter" , LetterTokenizerFactory ::new );
189+ tokenizers .put ("lowercase" , LowerCaseTokenizerFactory ::new );
190+ tokenizers .put ("path_hierarchy" , PathHierarchyTokenizerFactory ::new );
191+ tokenizers .put ("PathHierarchy" , PathHierarchyTokenizerFactory ::new );
192+ tokenizers .put ("pattern" , PatternTokenizerFactory ::new );
193+ tokenizers .put ("uax_url_email" , UAX29URLEmailTokenizerFactory ::new );
194+ tokenizers .put ("whitespace" , WhitespaceTokenizerFactory ::new );
172195 return tokenizers ;
173196 }
174197
@@ -283,6 +306,16 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
283306 public List <PreConfiguredTokenizer > getPreConfiguredTokenizers () {
284307 List <PreConfiguredTokenizer > tokenizers = new ArrayList <>();
285308 tokenizers .add (PreConfiguredTokenizer .singleton ("keyword" , KeywordTokenizer ::new , null ));
309+ tokenizers .add (PreConfiguredTokenizer .singleton ("classic" , ClassicTokenizer ::new , null ));
310+ tokenizers .add (PreConfiguredTokenizer .singleton ("uax_url_email" , UAX29URLEmailTokenizer ::new , null ));
311+ tokenizers .add (PreConfiguredTokenizer .singleton ("path_hierarchy" , PathHierarchyTokenizer ::new , null ));
312+ tokenizers .add (PreConfiguredTokenizer .singleton ("letter" , LetterTokenizer ::new , null ));
313+ tokenizers .add (PreConfiguredTokenizer .singleton ("whitespace" , WhitespaceTokenizer ::new , null ));
314+ tokenizers .add (PreConfiguredTokenizer .singleton ("ngram" , NGramTokenizer ::new , null ));
315+ tokenizers .add (PreConfiguredTokenizer .singleton ("edge_ngram" ,
316+ () -> new EdgeNGramTokenizer (EdgeNGramTokenizer .DEFAULT_MIN_GRAM_SIZE , EdgeNGramTokenizer .DEFAULT_MAX_GRAM_SIZE ), null ));
317+ tokenizers .add (PreConfiguredTokenizer .singleton ("pattern" , () -> new PatternTokenizer (Regex .compile ("\\ W+" , null ), -1 ), null ));
318+ tokenizers .add (PreConfiguredTokenizer .singleton ("thai" , ThaiTokenizer ::new , null ));
286319 tokenizers .add (PreConfiguredTokenizer .singleton ("lowercase" , LowerCaseTokenizer ::new , () -> new TokenFilterFactory () {
287320 @ Override
288321 public String name () {
@@ -294,6 +327,13 @@ public TokenStream create(TokenStream tokenStream) {
294327 return new LowerCaseFilter (tokenStream );
295328 }
296329 }));
330+
331+ // Temporary shim for aliases. TODO deprecate after they are moved
332+ tokenizers .add (PreConfiguredTokenizer .singleton ("nGram" , NGramTokenizer ::new , null ));
333+ tokenizers .add (PreConfiguredTokenizer .singleton ("edgeNGram" ,
334+ () -> new EdgeNGramTokenizer (EdgeNGramTokenizer .DEFAULT_MIN_GRAM_SIZE , EdgeNGramTokenizer .DEFAULT_MAX_GRAM_SIZE ), null ));
335+ tokenizers .add (PreConfiguredTokenizer .singleton ("PathHierarchy" , PathHierarchyTokenizer ::new , null ));
336+
297337 return tokenizers ;
298338 }
299339}