Skip to content

Commit 7b95470

Browse files
authored
Moved tokenizers to analysis common module (#30538)
The following tokenizers were moved: classic, edge_ngram, letter, lowercase, ngram, path_hierarchy, pattern, thai, uax_url_email and whitespace. Left the keyword tokenizer factory in the server module, because normalizers directly depend on it. This should be addressed in a follow-up change. Relates to #23658
1 parent 9014361 commit 7b95470

File tree

41 files changed

+679
-336
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed

+679
-336
lines changed

server/src/main/java/org/elasticsearch/index/analysis/CharMatcher.java renamed to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharMatcher.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
* under the License.
1818
*/
1919

20-
package org.elasticsearch.index.analysis;
20+
package org.elasticsearch.analysis.common;
2121

2222
import java.util.HashSet;
2323
import java.util.Set;

server/src/main/java/org/elasticsearch/index/analysis/ClassicTokenizerFactory.java renamed to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ClassicTokenizerFactory.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,15 @@
1717
* under the License.
1818
*/
1919

20-
package org.elasticsearch.index.analysis;
20+
package org.elasticsearch.analysis.common;
2121

2222
import org.apache.lucene.analysis.Tokenizer;
2323
import org.apache.lucene.analysis.standard.ClassicTokenizer;
2424
import org.apache.lucene.analysis.standard.StandardAnalyzer;
2525
import org.elasticsearch.common.settings.Settings;
2626
import org.elasticsearch.env.Environment;
2727
import org.elasticsearch.index.IndexSettings;
28+
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
2829

2930
/**
3031
* Factory for {@link ClassicTokenizer}
@@ -33,7 +34,7 @@ public class ClassicTokenizerFactory extends AbstractTokenizerFactory {
3334

3435
private final int maxTokenLength;
3536

36-
public ClassicTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
37+
ClassicTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
3738
super(indexSettings, name, settings);
3839
maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
3940
}

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,11 @@
3434
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
3535
import org.apache.lucene.analysis.core.DecimalDigitFilter;
3636
import org.apache.lucene.analysis.core.KeywordTokenizer;
37+
import org.apache.lucene.analysis.core.LetterTokenizer;
3738
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
3839
import org.apache.lucene.analysis.core.StopAnalyzer;
3940
import org.apache.lucene.analysis.core.UpperCaseFilter;
41+
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
4042
import org.apache.lucene.analysis.cz.CzechStemFilter;
4143
import org.apache.lucene.analysis.de.GermanNormalizationFilter;
4244
import org.apache.lucene.analysis.de.GermanStemFilter;
@@ -58,17 +60,25 @@
5860
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
5961
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
6062
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
63+
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
6164
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
65+
import org.apache.lucene.analysis.ngram.NGramTokenizer;
66+
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
67+
import org.apache.lucene.analysis.pattern.PatternTokenizer;
6268
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
6369
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
6470
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
6571
import org.apache.lucene.analysis.shingle.ShingleFilter;
6672
import org.apache.lucene.analysis.snowball.SnowballFilter;
6773
import org.apache.lucene.analysis.standard.ClassicFilter;
74+
import org.apache.lucene.analysis.standard.ClassicTokenizer;
75+
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
76+
import org.apache.lucene.analysis.th.ThaiTokenizer;
6877
import org.apache.lucene.analysis.tr.ApostropheFilter;
6978
import org.apache.lucene.analysis.util.ElisionFilter;
7079
import org.elasticsearch.common.logging.DeprecationLogger;
7180
import org.elasticsearch.common.logging.Loggers;
81+
import org.elasticsearch.common.regex.Regex;
7282
import org.elasticsearch.index.analysis.CharFilterFactory;
7383
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
7484
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
@@ -169,6 +179,19 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
169179
Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new TreeMap<>();
170180
tokenizers.put("simple_pattern", SimplePatternTokenizerFactory::new);
171181
tokenizers.put("simple_pattern_split", SimplePatternSplitTokenizerFactory::new);
182+
tokenizers.put("thai", ThaiTokenizerFactory::new);
183+
tokenizers.put("nGram", NGramTokenizerFactory::new);
184+
tokenizers.put("ngram", NGramTokenizerFactory::new);
185+
tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
186+
tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
187+
tokenizers.put("classic", ClassicTokenizerFactory::new);
188+
tokenizers.put("letter", LetterTokenizerFactory::new);
189+
tokenizers.put("lowercase", LowerCaseTokenizerFactory::new);
190+
tokenizers.put("path_hierarchy", PathHierarchyTokenizerFactory::new);
191+
tokenizers.put("PathHierarchy", PathHierarchyTokenizerFactory::new);
192+
tokenizers.put("pattern", PatternTokenizerFactory::new);
193+
tokenizers.put("uax_url_email", UAX29URLEmailTokenizerFactory::new);
194+
tokenizers.put("whitespace", WhitespaceTokenizerFactory::new);
172195
return tokenizers;
173196
}
174197

@@ -283,6 +306,16 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
283306
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
284307
List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
285308
tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
309+
tokenizers.add(PreConfiguredTokenizer.singleton("classic", ClassicTokenizer::new, null));
310+
tokenizers.add(PreConfiguredTokenizer.singleton("uax_url_email", UAX29URLEmailTokenizer::new, null));
311+
tokenizers.add(PreConfiguredTokenizer.singleton("path_hierarchy", PathHierarchyTokenizer::new, null));
312+
tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new, null));
313+
tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new, null));
314+
tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new, null));
315+
tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
316+
() -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
317+
tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
318+
tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
286319
tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", LowerCaseTokenizer::new, () -> new TokenFilterFactory() {
287320
@Override
288321
public String name() {
@@ -294,6 +327,13 @@ public TokenStream create(TokenStream tokenStream) {
294327
return new LowerCaseFilter(tokenStream);
295328
}
296329
}));
330+
331+
// Temporary shim for aliases. TODO deprecate after they are moved
332+
tokenizers.add(PreConfiguredTokenizer.singleton("nGram", NGramTokenizer::new, null));
333+
tokenizers.add(PreConfiguredTokenizer.singleton("edgeNGram",
334+
() -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
335+
tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new, null));
336+
297337
return tokenizers;
298338
}
299339
}

server/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenizerFactory.java renamed to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerFactory.java

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,27 +17,25 @@
1717
* under the License.
1818
*/
1919

20-
package org.elasticsearch.index.analysis;
20+
package org.elasticsearch.analysis.common;
2121

2222
import org.apache.lucene.analysis.Tokenizer;
2323
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
2424
import org.apache.lucene.analysis.ngram.NGramTokenizer;
2525
import org.elasticsearch.common.settings.Settings;
2626
import org.elasticsearch.env.Environment;
2727
import org.elasticsearch.index.IndexSettings;
28+
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
2829

29-
import static org.elasticsearch.index.analysis.NGramTokenizerFactory.parseTokenChars;
30+
import static org.elasticsearch.analysis.common.NGramTokenizerFactory.parseTokenChars;
3031

3132
public class EdgeNGramTokenizerFactory extends AbstractTokenizerFactory {
3233

3334
private final int minGram;
34-
3535
private final int maxGram;
36-
3736
private final CharMatcher matcher;
3837

39-
40-
public EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
38+
EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
4139
super(indexSettings, name, settings);
4240
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
4341
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);

server/src/main/java/org/elasticsearch/index/analysis/LetterTokenizerFactory.java renamed to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/LetterTokenizerFactory.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,18 @@
1717
* under the License.
1818
*/
1919

20-
package org.elasticsearch.index.analysis;
20+
package org.elasticsearch.analysis.common;
2121

2222
import org.apache.lucene.analysis.Tokenizer;
2323
import org.apache.lucene.analysis.core.LetterTokenizer;
2424
import org.elasticsearch.common.settings.Settings;
2525
import org.elasticsearch.env.Environment;
2626
import org.elasticsearch.index.IndexSettings;
27+
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
2728

2829
public class LetterTokenizerFactory extends AbstractTokenizerFactory {
2930

30-
public LetterTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
31+
LetterTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
3132
super(indexSettings, name, settings);
3233
}
3334

server/src/main/java/org/elasticsearch/index/analysis/LowerCaseTokenizerFactory.java renamed to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/LowerCaseTokenizerFactory.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,19 @@
1717
* under the License.
1818
*/
1919

20-
package org.elasticsearch.index.analysis;
20+
package org.elasticsearch.analysis.common;
2121

2222
import org.apache.lucene.analysis.Tokenizer;
2323
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
2424
import org.elasticsearch.common.settings.Settings;
2525
import org.elasticsearch.env.Environment;
2626
import org.elasticsearch.index.IndexSettings;
27+
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
28+
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
2729

2830
public class LowerCaseTokenizerFactory extends AbstractTokenizerFactory implements MultiTermAwareComponent {
2931

30-
public LowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
32+
LowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
3133
super(indexSettings, name, settings);
3234
}
3335

server/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java renamed to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenizerFactory.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,15 @@
1717
* under the License.
1818
*/
1919

20-
package org.elasticsearch.index.analysis;
20+
package org.elasticsearch.analysis.common;
2121

2222
import org.apache.lucene.analysis.Tokenizer;
2323
import org.apache.lucene.analysis.ngram.NGramTokenizer;
2424
import org.elasticsearch.Version;
2525
import org.elasticsearch.common.settings.Settings;
2626
import org.elasticsearch.env.Environment;
2727
import org.elasticsearch.index.IndexSettings;
28+
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
2829

2930
import java.lang.reflect.Field;
3031
import java.lang.reflect.Modifier;
@@ -83,7 +84,7 @@ static CharMatcher parseTokenChars(List<String> characterClasses) {
8384
return builder.build();
8485
}
8586

86-
public NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
87+
NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
8788
super(indexSettings, name, settings);
8889
int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
8990
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);

server/src/main/java/org/elasticsearch/index/analysis/PathHierarchyTokenizerFactory.java renamed to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PathHierarchyTokenizerFactory.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,15 @@
1717
* under the License.
1818
*/
1919

20-
package org.elasticsearch.index.analysis;
20+
package org.elasticsearch.analysis.common;
2121

2222
import org.apache.lucene.analysis.Tokenizer;
2323
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
2424
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
2525
import org.elasticsearch.common.settings.Settings;
2626
import org.elasticsearch.env.Environment;
2727
import org.elasticsearch.index.IndexSettings;
28+
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
2829

2930
public class PathHierarchyTokenizerFactory extends AbstractTokenizerFactory {
3031

@@ -35,7 +36,7 @@ public class PathHierarchyTokenizerFactory extends AbstractTokenizerFactory {
3536
private final int skip;
3637
private final boolean reverse;
3738

38-
public PathHierarchyTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
39+
PathHierarchyTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
3940
super(indexSettings, name, settings);
4041
bufferSize = settings.getAsInt("buffer_size", 1024);
4142
String delimiter = settings.get("delimiter");

server/src/main/java/org/elasticsearch/index/analysis/PatternTokenizerFactory.java renamed to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PatternTokenizerFactory.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,15 @@
1717
* under the License.
1818
*/
1919

20-
package org.elasticsearch.index.analysis;
20+
package org.elasticsearch.analysis.common;
2121

2222
import org.apache.lucene.analysis.Tokenizer;
2323
import org.apache.lucene.analysis.pattern.PatternTokenizer;
2424
import org.elasticsearch.common.regex.Regex;
2525
import org.elasticsearch.common.settings.Settings;
2626
import org.elasticsearch.env.Environment;
2727
import org.elasticsearch.index.IndexSettings;
28+
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
2829

2930
import java.util.regex.Pattern;
3031

@@ -33,7 +34,7 @@ public class PatternTokenizerFactory extends AbstractTokenizerFactory {
3334
private final Pattern pattern;
3435
private final int group;
3536

36-
public PatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
37+
PatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
3738
super(indexSettings, name, settings);
3839

3940
String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);

server/src/main/java/org/elasticsearch/index/analysis/ThaiTokenizerFactory.java renamed to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ThaiTokenizerFactory.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,21 @@
1717
* under the License.
1818
*/
1919

20-
package org.elasticsearch.index.analysis;
20+
package org.elasticsearch.analysis.common;
2121

2222
import org.apache.lucene.analysis.Tokenizer;
2323
import org.apache.lucene.analysis.th.ThaiTokenizer;
2424
import org.elasticsearch.common.settings.Settings;
2525
import org.elasticsearch.env.Environment;
2626
import org.elasticsearch.index.IndexSettings;
27+
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
2728

2829
/**
2930
* Factory for {@link ThaiTokenizer}
3031
*/
3132
public class ThaiTokenizerFactory extends AbstractTokenizerFactory {
3233

33-
public ThaiTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
34+
ThaiTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
3435
super(indexSettings, name, settings);
3536
}
3637

0 commit comments

Comments
 (0)