(file name not shown in this diff)
@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import java.util.HashSet;
 import java.util.Set;

ClassicTokenizerFactory.java
@@ -17,14 +17,15 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.ClassicTokenizer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 /**
  * Factory for {@link ClassicTokenizer}
@@ -33,7 +34,7 @@ public class ClassicTokenizerFactory extends AbstractTokenizerFactory {
 
     private final int maxTokenLength;
 
-    public ClassicTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    ClassicTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
         maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
     }

CommonAnalysisPlugin.java
@@ -34,9 +34,11 @@
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
 import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.core.LetterTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseTokenizer;
 import org.apache.lucene.analysis.core.StopAnalyzer;
 import org.apache.lucene.analysis.core.UpperCaseFilter;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.cz.CzechStemFilter;
 import org.apache.lucene.analysis.de.GermanNormalizationFilter;
 import org.apache.lucene.analysis.de.GermanStemFilter;
@@ -58,17 +60,25 @@
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
+import org.apache.lucene.analysis.ngram.NGramTokenizer;
+import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
+import org.apache.lucene.analysis.pattern.PatternTokenizer;
 import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
 import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
 import org.apache.lucene.analysis.reverse.ReverseStringFilter;
 import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.standard.ClassicFilter;
+import org.apache.lucene.analysis.standard.ClassicTokenizer;
+import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
+import org.apache.lucene.analysis.th.ThaiTokenizer;
 import org.apache.lucene.analysis.tr.ApostropheFilter;
 import org.apache.lucene.analysis.util.ElisionFilter;
 import org.elasticsearch.common.logging.DeprecationLogger;
 import org.elasticsearch.common.logging.Loggers;
+import org.elasticsearch.common.regex.Regex;
 import org.elasticsearch.index.analysis.CharFilterFactory;
 import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
@@ -169,6 +179,19 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
         Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new TreeMap<>();
         tokenizers.put("simple_pattern", SimplePatternTokenizerFactory::new);
         tokenizers.put("simple_pattern_split", SimplePatternSplitTokenizerFactory::new);
+        tokenizers.put("thai", ThaiTokenizerFactory::new);
+        tokenizers.put("nGram", NGramTokenizerFactory::new);
+        tokenizers.put("ngram", NGramTokenizerFactory::new);
+        tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
+        tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
+        tokenizers.put("classic", ClassicTokenizerFactory::new);
+        tokenizers.put("letter", LetterTokenizerFactory::new);
+        tokenizers.put("lowercase", LowerCaseTokenizerFactory::new);
+        tokenizers.put("path_hierarchy", PathHierarchyTokenizerFactory::new);
+        tokenizers.put("PathHierarchy", PathHierarchyTokenizerFactory::new);
+        tokenizers.put("pattern", PatternTokenizerFactory::new);
+        tokenizers.put("uax_url_email", UAX29URLEmailTokenizerFactory::new);
+        tokenizers.put("whitespace", WhitespaceTokenizerFactory::new);
         return tokenizers;
     }
 
@@ -283,6 +306,16 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
     public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
         List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
         tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("classic", ClassicTokenizer::new, null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("uax_url_email", UAX29URLEmailTokenizer::new, null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("path_hierarchy", PathHierarchyTokenizer::new, null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new, null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new, null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new, null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
+            () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
         tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", LowerCaseTokenizer::new, () -> new TokenFilterFactory() {
             @Override
             public String name() {
@@ -294,6 +327,13 @@ public TokenStream create(TokenStream tokenStream) {
                 return new LowerCaseFilter(tokenStream);
             }
         }));
+
+        // Temporary shim for aliases. TODO deprecate after they are moved
+        tokenizers.add(PreConfiguredTokenizer.singleton("nGram", NGramTokenizer::new, null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("edgeNGram",
+            () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new, null));
+
         return tokenizers;
     }
 }
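
The pre-configured entries above wrap plain Lucene tokenizers with their library defaults; the camel-case nGram, edgeNGram, and PathHierarchy entries at the end are kept only as aliases until they can be deprecated, per the TODO. As a minimal sketch of what the "edge_ngram" singleton emits (the demo class and input string are invented for illustration; in Lucene, EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE and DEFAULT_MAX_GRAM_SIZE are both 1):

```java
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class EdgeNGramDemo { // hypothetical demo class, not part of this change
    public static void main(String[] args) throws IOException {
        // Same construction as the "edge_ngram" singleton registered above.
        Tokenizer tokenizer = new EdgeNGramTokenizer(
            EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
        tokenizer.setReader(new StringReader("quick"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()); // with the 1/1 defaults this prints just "q"
        }
        tokenizer.end();
        tokenizer.close();
    }
}
```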

EdgeNGramTokenizerFactory.java
@@ -17,27 +17,25 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
-import static org.elasticsearch.index.analysis.NGramTokenizerFactory.parseTokenChars;
+import static org.elasticsearch.analysis.common.NGramTokenizerFactory.parseTokenChars;
 
 public class EdgeNGramTokenizerFactory extends AbstractTokenizerFactory {
 
     private final int minGram;
-
     private final int maxGram;
-
     private final CharMatcher matcher;
 
-
-    public EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
         this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
         this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);

LetterTokenizerFactory.java
@@ -17,17 +17,18 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.LetterTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 public class LetterTokenizerFactory extends AbstractTokenizerFactory {
 
-    public LetterTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    LetterTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
 

LowerCaseTokenizerFactory.java
@@ -17,17 +17,19 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.LowerCaseTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;
 
 public class LowerCaseTokenizerFactory extends AbstractTokenizerFactory implements MultiTermAwareComponent {
 
-    public LowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    LowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
 

NGramTokenizerFactory.java
@@ -17,14 +17,15 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenizer;
 import org.elasticsearch.Version;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 import java.lang.reflect.Field;
 import java.lang.reflect.Modifier;
@@ -83,7 +84,7 @@ static CharMatcher parseTokenChars(List<String> characterClasses) {
         return builder.build();
     }
 
-    public NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
         int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
         this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
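
The constructor reads min_gram and max_gram from the tokenizer settings, falling back to Lucene's NGramTokenizer defaults of 1 and 2, and captures indexSettings.getMaxNgramDiff(), the index-level bound on how far max_gram may exceed min_gram. A minimal sketch of the underlying Lucene tokenizer with those defaults (demo class and input invented):

```java
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class NGramDemo { // hypothetical demo class, not part of this change
    public static void main(String[] args) throws IOException {
        // min_gram = 1, max_gram = 2: the Lucene defaults the factory falls back to.
        Tokenizer tokenizer = new NGramTokenizer(1, 2);
        tokenizer.setReader(new StringReader("ab"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()); // "a", "ab", "b"
        }
        tokenizer.end();
        tokenizer.close();
    }
}
```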

PathHierarchyTokenizerFactory.java
@@ -17,14 +17,15 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
 import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 public class PathHierarchyTokenizerFactory extends AbstractTokenizerFactory {
 
@@ -35,7 +36,7 @@ public class PathHierarchyTokenizerFactory extends AbstractTokenizerFactory {
     private final int skip;
     private final boolean reverse;
 
-    public PathHierarchyTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    PathHierarchyTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
         bufferSize = settings.getAsInt("buffer_size", 1024);
         String delimiter = settings.get("delimiter");

PatternTokenizerFactory.java
@@ -17,14 +17,15 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.pattern.PatternTokenizer;
 import org.elasticsearch.common.regex.Regex;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 import java.util.regex.Pattern;
 
@@ -33,7 +34,7 @@ public class PatternTokenizerFactory extends AbstractTokenizerFactory {
     private final Pattern pattern;
     private final int group;
 
-    public PatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    PatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
 
         String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
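
The factory compiles the "pattern" setting (defaulting to the non-word pattern "\W+") and hands Lucene's PatternTokenizer a capture-group index; the pre-configured "pattern" entry earlier uses group -1, which splits on matches instead of extracting a group. A minimal sketch of that behavior, using java.util.regex directly rather than the Regex helper (demo class and input invented):

```java
import java.io.IOException;
import java.io.StringReader;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PatternDemo { // hypothetical demo class, not part of this change
    public static void main(String[] args) throws IOException {
        // group -1: each regex match is a separator; emit the text between matches.
        Tokenizer tokenizer = new PatternTokenizer(Pattern.compile("\\W+"), -1);
        tokenizer.setReader(new StringReader("foo,bar baz"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()); // "foo", "bar", "baz"
        }
        tokenizer.end();
        tokenizer.close();
    }
}
```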

ThaiTokenizerFactory.java
@@ -17,20 +17,21 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.th.ThaiTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 /**
  * Factory for {@link ThaiTokenizer}
  */
 public class ThaiTokenizerFactory extends AbstractTokenizerFactory {
 
-    public ThaiTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    ThaiTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
 

UAX29URLEmailTokenizerFactory.java
@@ -17,20 +17,21 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 public class UAX29URLEmailTokenizerFactory extends AbstractTokenizerFactory {
 
     private final int maxTokenLength;
 
-    public UAX29URLEmailTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    UAX29URLEmailTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
         maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
     }
@@ -41,4 +42,4 @@ public Tokenizer create() {
         tokenizer.setMaxTokenLength(maxTokenLength);
         return tokenizer;
     }
-}
+}

WhitespaceTokenizerFactory.java
@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -26,13 +26,14 @@
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 public class WhitespaceTokenizerFactory extends AbstractTokenizerFactory {
 
     static final String MAX_TOKEN_LENGTH = "max_token_length";
     private Integer maxTokenLength;
 
-    public WhitespaceTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    WhitespaceTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
         maxTokenLength = settings.getAsInt(MAX_TOKEN_LENGTH, StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
     }
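
The factory reads a max_token_length setting, defaulting to StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH (255); tokens longer than the limit are emitted in limit-sized chunks. A sketch of that behavior, assuming Lucene's WhitespaceTokenizer(AttributeFactory, int) constructor, which this setting appears to rely on (demo class, limit, and input invented):

```java
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class WhitespaceDemo { // hypothetical demo class, not part of this change
    public static void main(String[] args) throws IOException {
        // max token length 3: anything longer is split into 3-char chunks.
        Tokenizer tokenizer = new WhitespaceTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, 3);
        tokenizer.setReader(new StringReader("elastic search"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()); // "ela", "sti", "c", "sea", "rch"
        }
        tokenizer.end();
        tokenizer.close();
    }
}
```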

(test file; name not shown in this diff)
@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.elasticsearch.test.ESTestCase;
 