Skip to content

Commit 99c2e47

Browse files
avdv and cbuescher
authored and committed
Fix kuromoji default stoptags (#26600)
Initialize the default stop tags in `KuromojiPartOfSpeechFilterFactory` when `stoptags` is not given in the config. Also add a test that checks that part-of-speech tokens are removed when using the `kuromoji_part_of_speech` filter.
1 parent fc49997 commit 99c2e47

File tree

2 files changed

+19
-1
lines changed

2 files changed

+19
-1
lines changed

plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
package org.elasticsearch.index.analysis;
2121

2222
import org.apache.lucene.analysis.TokenStream;
23+
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
2324
import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter;
2425
import org.elasticsearch.common.settings.Settings;
2526
import org.elasticsearch.env.Environment;
@@ -38,6 +39,8 @@ public KuromojiPartOfSpeechFilterFactory(IndexSettings indexSettings, Environmen
3839
List<String> wordList = Analysis.getWordList(env, settings, "stoptags");
3940
if (wordList != null) {
4041
stopTags.addAll(wordList);
42+
} else {
43+
stopTags.addAll(JapaneseAnalyzer.getDefaultStopTags());
4144
}
4245
}
4346

plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,21 @@ public void testBaseFormFilterFactory() throws IOException {
9595
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
9696
}
9797

98+
public void testPartOfSpeechFilter() throws IOException {
99+
TestAnalysis analysis = createTestAnalysis();
100+
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_part_of_speech");
101+
102+
assertThat(tokenFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));
103+
104+
String source = "寿司がおいしいね";
105+
String[] expected_tokens = new String[]{"寿司", "おいしい"};
106+
107+
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
108+
tokenizer.setReader(new StringReader(source));
109+
110+
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens);
111+
}
112+
98113
public void testReadingFormFilterFactory() throws IOException {
99114
TestAnalysis analysis = createTestAnalysis();
100115
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_rf");
@@ -210,7 +225,7 @@ public static void assertSimpleTSOutput(TokenStream stream,
210225
int i = 0;
211226
while (stream.incrementToken()) {
212227
assertThat(expected.length, greaterThan(i));
213-
assertThat( "expected different term at index " + i, expected[i++], equalTo(termAttr.toString()));
228+
assertThat("expected different term at index " + i, termAttr.toString(), equalTo(expected[i++]));
214229
}
215230
assertThat("not all tokens produced", i, equalTo(expected.length));
216231
}

0 commit comments

Comments
 (0)