Skip to content

Commit 7184cf8

Browse files
avdvcbuescher
authored and committed
Fix kuromoji default stoptags (#26600)
Initialize the default stop-tags in `KuromojiPartOfSpeechFilterFactory` if `stoptags` is not given in the config. Also add a test which checks that tokens whose part of speech matches the default stop-tags are removed when using the `kuromoji_part_of_speech` filter.
1 parent 7f74a62 commit 7184cf8

File tree

2 files changed

+19
-1
lines changed

2 files changed

+19
-1
lines changed

plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
package org.elasticsearch.index.analysis;
2121

2222
import org.apache.lucene.analysis.TokenStream;
23+
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
2324
import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter;
2425
import org.elasticsearch.common.settings.Settings;
2526
import org.elasticsearch.env.Environment;
@@ -38,6 +39,8 @@ public KuromojiPartOfSpeechFilterFactory(IndexSettings indexSettings, Environmen
3839
List<String> wordList = Analysis.getWordList(env, settings, "stoptags");
3940
if (wordList != null) {
4041
stopTags.addAll(wordList);
42+
} else {
43+
stopTags.addAll(JapaneseAnalyzer.getDefaultStopTags());
4144
}
4245
}
4346

plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,21 @@ public void testBaseFormFilterFactory() throws IOException {
9393
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
9494
}
9595

96+
public void testPartOfSpeechFilter() throws IOException {
97+
TestAnalysis analysis = createTestAnalysis();
98+
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_part_of_speech");
99+
100+
assertThat(tokenFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));
101+
102+
String source = "寿司がおいしいね";
103+
String[] expected_tokens = new String[]{"寿司", "おいしい"};
104+
105+
Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
106+
tokenizer.setReader(new StringReader(source));
107+
108+
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens);
109+
}
110+
96111
public void testReadingFormFilterFactory() throws IOException {
97112
TestAnalysis analysis = createTestAnalysis();
98113
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_rf");
@@ -208,7 +223,7 @@ public static void assertSimpleTSOutput(TokenStream stream,
208223
int i = 0;
209224
while (stream.incrementToken()) {
210225
assertThat(expected.length, greaterThan(i));
211-
assertThat( "expected different term at index " + i, expected[i++], equalTo(termAttr.toString()));
226+
assertThat("expected different term at index " + i, termAttr.toString(), equalTo(expected[i++]));
212227
}
213228
assertThat("not all tokens produced", i, equalTo(expected.length));
214229
}

0 commit comments

Comments
 (0)