Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion buildSrc/src/main/resources/checkstyle_suppressions.xml
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,6 @@
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CustomAnalyzerProvider.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]ShingleTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StemmerOverrideTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]HyphenationCompoundWordTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]cache[/\\]bitset[/\\]BitsetFilterCache.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]codec[/\\]PerFieldMappingPostingFormatCodec.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]engine[/\\]ElasticsearchConcurrentMergeScheduler.java" checks="LineLength" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
* under the License.
*/

package org.elasticsearch.index.analysis.compound;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
Expand All @@ -38,7 +38,7 @@ public abstract class AbstractCompoundWordTokenFilterFactory extends AbstractTok
protected final boolean onlyLongestMatch;
protected final CharArraySet wordList;

public AbstractCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
protected AbstractCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);

minWordSize = settings.getAsInt("min_word_size", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@
import org.elasticsearch.index.analysis.DutchAnalyzerProvider;
import org.elasticsearch.index.analysis.DutchStemTokenFilterFactory;
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
import org.elasticsearch.index.analysis.ElisionTokenFilterFactory;
import org.elasticsearch.index.analysis.EnglishAnalyzerProvider;
import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider;
import org.elasticsearch.index.analysis.FingerprintTokenFilterFactory;
Expand All @@ -75,7 +74,6 @@
import org.elasticsearch.index.analysis.IndonesianAnalyzerProvider;
import org.elasticsearch.index.analysis.IrishAnalyzerProvider;
import org.elasticsearch.index.analysis.ItalianAnalyzerProvider;
import org.elasticsearch.index.analysis.KStemTokenFilterFactory;
import org.elasticsearch.index.analysis.KeepTypesFilterFactory;
import org.elasticsearch.index.analysis.KeepWordFilterFactory;
import org.elasticsearch.index.analysis.KeywordAnalyzerProvider;
Expand All @@ -99,7 +97,6 @@
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
import org.elasticsearch.index.analysis.RomanianAnalyzerProvider;
import org.elasticsearch.index.analysis.RussianAnalyzerProvider;
import org.elasticsearch.index.analysis.RussianStemTokenFilterFactory;
Expand All @@ -116,22 +113,17 @@
import org.elasticsearch.index.analysis.StandardHtmlStripAnalyzerProvider;
import org.elasticsearch.index.analysis.StandardTokenFilterFactory;
import org.elasticsearch.index.analysis.StandardTokenizerFactory;
import org.elasticsearch.index.analysis.StemmerOverrideTokenFilterFactory;
import org.elasticsearch.index.analysis.StemmerTokenFilterFactory;
import org.elasticsearch.index.analysis.StopAnalyzerProvider;
import org.elasticsearch.index.analysis.StopTokenFilterFactory;
import org.elasticsearch.index.analysis.SwedishAnalyzerProvider;
import org.elasticsearch.index.analysis.ThaiAnalyzerProvider;
import org.elasticsearch.index.analysis.ThaiTokenizerFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.index.analysis.TruncateTokenFilterFactory;
import org.elasticsearch.index.analysis.TurkishAnalyzerProvider;
import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
import org.elasticsearch.plugins.AnalysisPlugin;

import java.io.IOException;
Expand Down Expand Up @@ -201,31 +193,23 @@ private NamedRegistry<AnalysisProvider<TokenFilterFactory>> setupTokenFilters(Li
hunspellService) {
NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = new NamedRegistry<>("token_filter");
tokenFilters.register("stop", StopTokenFilterFactory::new);
tokenFilters.register("reverse", ReverseTokenFilterFactory::new);
tokenFilters.register("kstem", KStemTokenFilterFactory::new);
tokenFilters.register("standard", StandardTokenFilterFactory::new);
tokenFilters.register("shingle", ShingleTokenFilterFactory::new);
tokenFilters.register("min_hash", MinHashTokenFilterFactory::new);
tokenFilters.register("truncate", requriesAnalysisSettings(TruncateTokenFilterFactory::new));
tokenFilters.register("limit", LimitTokenCountFilterFactory::new);
tokenFilters.register("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new));
tokenFilters.register("stemmer", StemmerTokenFilterFactory::new);
tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
tokenFilters.register("elision", ElisionTokenFilterFactory::new);
tokenFilters.register("keep", requriesAnalysisSettings(KeepWordFilterFactory::new));
tokenFilters.register("keep_types", requriesAnalysisSettings(KeepTypesFilterFactory::new));
tokenFilters.register("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
tokenFilters.register("pattern_replace", requriesAnalysisSettings(PatternReplaceTokenFilterFactory::new));
tokenFilters.register("dictionary_decompounder", requriesAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new));
tokenFilters.register("hyphenation_decompounder", requriesAnalysisSettings(HyphenationCompoundWordTokenFilterFactory::new));
tokenFilters.register("arabic_stem", ArabicStemTokenFilterFactory::new);
tokenFilters.register("brazilian_stem", BrazilianStemTokenFilterFactory::new);
tokenFilters.register("czech_stem", CzechStemTokenFilterFactory::new);
tokenFilters.register("dutch_stem", DutchStemTokenFilterFactory::new);
tokenFilters.register("french_stem", FrenchStemTokenFilterFactory::new);
tokenFilters.register("german_stem", GermanStemTokenFilterFactory::new);
tokenFilters.register("russian_stem", RussianStemTokenFilterFactory::new);
tokenFilters.register("stemmer_override", requriesAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
tokenFilters.register("arabic_normalization", ArabicNormalizationFilterFactory::new);
tokenFilters.register("german_normalization", GermanNormalizationFilterFactory::new);
tokenFilters.register("hindi_normalization", HindiNormalizationFilterFactory::new);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
import org.elasticsearch.index.analysis.StandardTokenizerFactory;
import org.elasticsearch.index.analysis.StopTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.filter1.MyFilterTokenFilterFactory;
import org.elasticsearch.index.analysis.MyFilterTokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.test.ESTestCase;
Expand Down Expand Up @@ -196,18 +196,6 @@ private void testSimpleConfiguration(Settings settings) throws IOException {
// assertThat(czechstemmeranalyzer.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class));
// assertThat(czechstemmeranalyzer.tokenFilters().length, equalTo(4));
// assertThat(czechstemmeranalyzer.tokenFilters()[3], instanceOf(CzechStemTokenFilterFactory.class));
//
// // check dictionary decompounder
// analyzer = analysisService.analyzer("decompoundingAnalyzer").analyzer();
// assertThat(analyzer, instanceOf(CustomAnalyzer.class));
// CustomAnalyzer dictionaryDecompounderAnalyze = (CustomAnalyzer) analyzer;
// assertThat(dictionaryDecompounderAnalyze.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class));
// assertThat(dictionaryDecompounderAnalyze.tokenFilters().length, equalTo(1));
// assertThat(dictionaryDecompounderAnalyze.tokenFilters()[0], instanceOf(DictionaryCompoundWordTokenFilterFactory.class));

Set<?> wordList = Analysis.getWordSet(null, Version.CURRENT, settings, "index.analysis.filter.dict_dec.word_list");
MatcherAssert.assertThat(wordList.size(), equalTo(6));
// MatcherAssert.assertThat(wordList, hasItems("donau", "dampf", "schiff", "spargel", "creme", "suppe"));
}

public void testWordListPath() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,16 +93,16 @@ public void testAnalyzeWithNoIndex() throws Exception {
assertThat(analyzeResponse.getTokens().size(), equalTo(1));
assertThat(analyzeResponse.getTokens().get(0).getTerm(), equalTo("this is a test"));

analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A TEST").setTokenizer("standard").addTokenFilter("lowercase").addTokenFilter("reverse").get();
analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A TEST").setTokenizer("standard").addTokenFilter("lowercase").get();
assertThat(analyzeResponse.getTokens().size(), equalTo(4));
AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(0);
assertThat(token.getTerm(), equalTo("siht"));
assertThat(token.getTerm(), equalTo("this"));
token = analyzeResponse.getTokens().get(1);
assertThat(token.getTerm(), equalTo("si"));
assertThat(token.getTerm(), equalTo("is"));
token = analyzeResponse.getTokens().get(2);
assertThat(token.getTerm(), equalTo("a"));
token = analyzeResponse.getTokens().get(3);
assertThat(token.getTerm(), equalTo("tset"));
assertThat(token.getTerm(), equalTo("test"));

analyzeResponse = client().admin().indices().prepareAnalyze("of course").setTokenizer("standard").addTokenFilter("stop").get();
assertThat(analyzeResponse.getTokens().size(), equalTo(1));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -445,8 +445,6 @@ public void testStopwordsOnlyPhraseSuggest() throws IOException {
public void testPrefixLength() throws IOException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
.put(SETTING_NUMBER_OF_SHARDS, 1)
.put("index.analysis.analyzer.reverse.tokenizer", "standard")
.putArray("index.analysis.analyzer.reverse.filter", "lowercase", "reverse")
.put("index.analysis.analyzer.body.tokenizer", "standard")
.putArray("index.analysis.analyzer.body.filter", "lowercase")
.put("index.analysis.analyzer.bigram.tokenizer", "standard")
Expand All @@ -458,7 +456,6 @@ public void testPrefixLength() throws IOException {
XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1")
.startObject("properties")
.startObject("body").field("type", "text").field("analyzer", "body").endObject()
.startObject("body_reverse").field("type", "text").field("analyzer", "reverse").endObject()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Weird that this wasn't used!

.startObject("bigram").field("type", "text").field("analyzer", "bigram").endObject()
.endObject()
.endObject().endObject();
Expand Down Expand Up @@ -486,8 +483,6 @@ public void testPrefixLength() throws IOException {
public void testBasicPhraseSuggest() throws IOException, URISyntaxException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
.put(indexSettings())
.put("index.analysis.analyzer.reverse.tokenizer", "standard")
.putArray("index.analysis.analyzer.reverse.filter", "lowercase", "reverse")
.put("index.analysis.analyzer.body.tokenizer", "standard")
.putArray("index.analysis.analyzer.body.filter", "lowercase")
.put("index.analysis.analyzer.bigram.tokenizer", "standard")
Expand All @@ -503,10 +498,6 @@ public void testBasicPhraseSuggest() throws IOException, URISyntaxException {
field("type", "text").
field("analyzer", "body")
.endObject()
.startObject("body_reverse").
field("type", "text").
field("analyzer", "reverse")
.endObject()
.startObject("bigram").
field("type", "text").
field("analyzer", "bigram")
Expand Down Expand Up @@ -536,7 +527,7 @@ public void testBasicPhraseSuggest() throws IOException, URISyntaxException {
"Police sergeant who stops the film",
};
for (String line : strings) {
index("test", "type1", line, "body", line, "body_reverse", line, "bigram", line);
index("test", "type1", line, "body", line, "bigram", line);
}
refresh();

Expand Down Expand Up @@ -576,14 +567,6 @@ public void testBasicPhraseSuggest() throws IOException, URISyntaxException {
searchSuggest = searchSuggest( "Arthur, King of the Britons", "simple_phrase", phraseSuggest);
assertSuggestion(searchSuggest, 0, "simple_phrase", "arthur king of the britons");

//test reverse suggestions with pre & post filter
phraseSuggest
.addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"))
.addCandidateGenerator(candidateGenerator("body_reverse").minWordLength(1).suggestMode("always").preFilter("reverse")
.postFilter("reverse"));
searchSuggest = searchSuggest( "Artur, Ging of the Britons", "simple_phrase", phraseSuggest);
assertSuggestion(searchSuggest, 0, "simple_phrase", "arthur king of the britons");

// set all mass to trigrams (not indexed)
phraseSuggest.clearCandidateGenerators()
.addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"))
Expand Down Expand Up @@ -633,8 +616,6 @@ public void testBasicPhraseSuggest() throws IOException, URISyntaxException {
public void testSizeParam() throws IOException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
.put(SETTING_NUMBER_OF_SHARDS, 1)
.put("index.analysis.analyzer.reverse.tokenizer", "standard")
.putArray("index.analysis.analyzer.reverse.filter", "lowercase", "reverse")
.put("index.analysis.analyzer.body.tokenizer", "standard")
.putArray("index.analysis.analyzer.body.filter", "lowercase")
.put("index.analysis.analyzer.bigram.tokenizer", "standard")
Expand All @@ -652,10 +633,6 @@ public void testSizeParam() throws IOException {
.field("type", "text")
.field("analyzer", "body")
.endObject()
.startObject("body_reverse")
.field("type", "text")
.field("analyzer", "reverse")
.endObject()
.startObject("bigram")
.field("type", "text")
.field("analyzer", "bigram")
Expand All @@ -667,9 +644,9 @@ public void testSizeParam() throws IOException {
ensureGreen();

String line = "xorr the god jewel";
index("test", "type1", "1", "body", line, "body_reverse", line, "bigram", line);
index("test", "type1", "1", "body", line, "bigram", line);
line = "I got it this time";
index("test", "type1", "2", "body", line, "body_reverse", line, "bigram", line);
index("test", "type1", "2", "body", line, "bigram", line);
refresh();

PhraseSuggestionBuilder phraseSuggestion = phraseSuggestion("bigram")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,6 @@
},
"my":{
"type":"myfilter"
},
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if it is time to remove this entire file somehow!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh! I see. You don't need to get rid of it because you can plug mocks into it. Great.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Naming this the same as the one for compound analysis seems bad. Maybe each one should have its own name.

"dict_dec":{
"type":"dictionary_decompounder",
"word_list":["donau", "dampf", "schiff", "spargel", "creme", "suppe"]
}
},
"analyzer":{
Expand All @@ -43,10 +39,6 @@
"czechAnalyzerWithStemmer":{
"tokenizer":"standard",
"filter":["standard", "lowercase", "stop", "czech_stem"]
},
"decompoundingAnalyzer":{
"tokenizer":"standard",
"filter":["dict_dec"]
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,6 @@ index :
stopwords : [stop2-1, stop2-2]
my :
type : myfilter
dict_dec :
type : dictionary_decompounder
word_list : [donau, dampf, schiff, spargel, creme, suppe]
analyzer :
standard :
type : standard
Expand All @@ -34,6 +31,3 @@ index :
czechAnalyzerWithStemmer :
tokenizer : standard
filter : [standard, lowercase, stop, czech_stem]
decompoundingAnalyzer :
tokenizer : standard
filter : [dict_dec]
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,14 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
filters.put("ngram", NGramTokenFilterFactory::new);
filters.put("edgeNGram", EdgeNGramTokenFilterFactory::new);
filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new);
filters.put("stemmer", StemmerTokenFilterFactory::new);
filters.put("stemmer_override", requriesAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
filters.put("kstem", KStemTokenFilterFactory::new);
filters.put("dictionary_decompounder", requriesAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new));
filters.put("hyphenation_decompounder", requriesAnalysisSettings(HyphenationCompoundWordTokenFilterFactory::new));
filters.put("reverse", ReverseTokenFilterFactory::new);
filters.put("elision", ElisionTokenFilterFactory::new);
filters.put("truncate", requriesAnalysisSettings(TruncateTokenFilterFactory::new));
return filters;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
* under the License.
*/

package org.elasticsearch.index.analysis.compound;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
Expand All @@ -33,7 +33,7 @@
*/
public class DictionaryCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {

public DictionaryCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
DictionaryCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, env, name, settings);
}

Expand Down
Loading