Fix kuromoji default stoptags (#26600)

avdv · cbuescher · commit 7184cf8b5b34 · 2017-09-15T12:25:09.000+02:00
Initialize the default stop tags in `KuromojiPartOfSpeechFilterFactory` when the
`stoptags` setting is not given in the config. Also add a test that checks that
tokens whose part-of-speech tag is among the default stop tags are removed when
using the `kuromoji_part_of_speech` filter.
diff --git a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiPartOfSpeechFilterFactory.java
@@ -20,6 +20,7 @@
 package org.elasticsearch.index.analysis;
 
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
 import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
@@ -38,6 +39,8 @@ public KuromojiPartOfSpeechFilterFactory(IndexSettings indexSettings, Environmen
         List<String> wordList = Analysis.getWordList(env, settings, "stoptags");
         if (wordList != null) {
             stopTags.addAll(wordList);
+        } else {
+            stopTags.addAll(JapaneseAnalyzer.getDefaultStopTags());
         }
     }
 
diff --git a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
@@ -93,6 +93,21 @@ public void testBaseFormFilterFactory() throws IOException {
         assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
     }
 
+    public void testPartOfSpeechFilter() throws IOException {
+        TestAnalysis analysis = createTestAnalysis();
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_part_of_speech");
+
+        assertThat(tokenFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));
+
+        String source = "寿司がおいしいね";
+        String[] expected_tokens = new String[]{"寿司", "おいしい"};
+
+        Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
+        tokenizer.setReader(new StringReader(source));
+
+        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens);
+    }
+
     public void testReadingFormFilterFactory() throws IOException {
         TestAnalysis analysis = createTestAnalysis();
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_rf");
@@ -208,7 +223,7 @@ public static void assertSimpleTSOutput(TokenStream stream,
         int i = 0;
         while (stream.incrementToken()) {
             assertThat(expected.length, greaterThan(i));
-            assertThat( "expected different term at index " + i, expected[i++], equalTo(termAttr.toString()));
+            assertThat("expected different term at index " + i, termAttr.toString(), equalTo(expected[i++]));
         }
         assertThat("not all tokens produced", i, equalTo(expected.length));
     }

Original file line number	Diff line number	Diff line change
`@@ -20,6 +20,7 @@`
`20`	`20`	`package org.elasticsearch.index.analysis;`
`21`	`21`
`22`	`22`	`import org.apache.lucene.analysis.TokenStream;`
	`23`	`+import org.apache.lucene.analysis.ja.JapaneseAnalyzer;`
`23`	`24`	`import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter;`
`24`	`25`	`import org.elasticsearch.common.settings.Settings;`
`25`	`26`	`import org.elasticsearch.env.Environment;`
`@@ -38,6 +39,8 @@ public KuromojiPartOfSpeechFilterFactory(IndexSettings indexSettings, Environmen`
`38`	`39`	`List<String> wordList = Analysis.getWordList(env, settings, "stoptags");`
`39`	`40`	`if (wordList != null) {`
`40`	`41`	`stopTags.addAll(wordList);`
	`42`	`+ } else {`
	`43`	`+ stopTags.addAll(JapaneseAnalyzer.getDefaultStopTags());`
`41`	`44`	`}`
`42`	`45`	`}`
`43`	`46`