diff --git a/docs/plugins/analysis-kuromoji.asciidoc b/docs/plugins/analysis-kuromoji.asciidoc index 383df5afb485b..8ccea28beda9f 100644 --- a/docs/plugins/analysis-kuromoji.asciidoc +++ b/docs/plugins/analysis-kuromoji.asciidoc @@ -98,6 +98,39 @@ dictionary to `$ES_HOME/config/userdict_ja.txt`: 東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞 ----------------------- +-- + +You can also inline the rules directly in the tokenizer definition using +the `user_dictionary_rules` option: + +[source,js] +-------------------------------------------------- +PUT kuromoji_sample +{ + "settings": { + "index": { + "analysis": { + "tokenizer": { + "kuromoji_user_dict": { + "type": "kuromoji_tokenizer", + "mode": "extended", + "user_dictionary_rules": ["東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞"] + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "kuromoji_user_dict" + } + } + } + } + } +} +-------------------------------------------------- +// CONSOLE +-- + `nbest_cost`/`nbest_examples`:: + -- diff --git a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java index 22000cf7979ea..c7e71fab63486 100644 --- a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java +++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java @@ -23,17 +23,22 @@ import org.apache.lucene.analysis.ja.JapaneseTokenizer; import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode; import org.apache.lucene.analysis.ja.dict.UserDictionary; +import org.apache.lucene.analysis.ja.util.CSVUtil; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import java.io.IOException; -import java.io.Reader; +import
java.io.StringReader; +import java.util.HashSet; +import java.util.List; +import java.util.Set; public class KuromojiTokenizerFactory extends AbstractTokenizerFactory { - private static final String USER_DICT_OPTION = "user_dictionary"; + private static final String USER_DICT_PATH_OPTION = "user_dictionary"; + private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules"; private static final String NBEST_COST = "nbest_cost"; private static final String NBEST_EXAMPLES = "nbest_examples"; @@ -54,17 +59,33 @@ public KuromojiTokenizerFactory(IndexSettings indexSettings, Environment env, St } public static UserDictionary getUserDictionary(Environment env, Settings settings) { + if (settings.get(USER_DICT_PATH_OPTION) != null && settings.get(USER_DICT_RULES_OPTION) != null) { + throw new IllegalArgumentException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" + + " with [" + USER_DICT_RULES_OPTION + "]"); + } try { - final Reader reader = Analysis.getReaderFromFile(env, settings, USER_DICT_OPTION); - if (reader == null) { + List ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, false); + if (ruleList == null || ruleList.isEmpty()) { return null; - } else { - try { - return UserDictionary.open(reader); - } finally { - reader.close(); + } + Set dup = new HashSet<>(); + int lineNum = 0; + for (String line : ruleList) { + // ignore comments + if (line.startsWith("#") == false) { + String[] values = CSVUtil.parse(line); + if (dup.add(values[0]) == false) { + throw new IllegalArgumentException("Found duplicate term [" + values[0] + "] in user dictionary " + + "at line [" + lineNum + "]"); + } } + ++ lineNum; + } + StringBuilder sb = new StringBuilder(); + for (String line : ruleList) { + sb.append(line).append(System.lineSeparator()); } + return UserDictionary.open(new StringReader(sb.toString())); } catch (IOException e) { throw new ElasticsearchException("failed to load kuromoji user 
dictionary", e); } diff --git a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java index 29e73d5a9fa29..9add830c26c68 100644 --- a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java +++ b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java @@ -19,6 +19,7 @@ package org.elasticsearch.index.analysis; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.ja.JapaneseAnalyzer; @@ -39,6 +40,8 @@ import java.nio.file.Files; import java.nio.file.Path; +import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents; +import static org.hamcrest.CoreMatchers.containsString; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.instanceOf; @@ -307,4 +310,55 @@ public void testNumberFilterFactory() throws Exception { tokenizer.setReader(new StringReader(source)); assertSimpleTSOutput(tokenFilter.create(tokenizer), expected); } + + public void testKuromojiAnalyzerUserDict() throws Exception { + Settings settings = Settings.builder() + .put("index.analysis.analyzer.my_analyzer.type", "kuromoji") + .putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++,c++,w,w", "制限スピード,制限スピード,セイゲンスピード,テスト名詞") + .build(); + TestAnalysis analysis = createTestAnalysis(settings); + Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer"); + try (TokenStream stream = analyzer.tokenStream("", "制限スピード")) { + assertTokenStreamContents(stream, new String[]{"制限スピード"}); + } + + try (TokenStream stream = analyzer.tokenStream("", "c++world")) { + assertTokenStreamContents(stream, new String[]{"c++", "world"}); + } + } + + 
public void testKuromojiAnalyzerInvalidUserDictOption() throws Exception { + Settings settings = Settings.builder() + .put("index.analysis.analyzer.my_analyzer.type", "kuromoji") + .put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt") + .putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++,c++,w,w") + .build(); + IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings)); + assertThat(exc.getMessage(), containsString("It is not allowed to use [user_dictionary] in conjunction " + + "with [user_dictionary_rules]")); + } + + public void testKuromojiAnalyzerDuplicateUserDictRule() throws Exception { + Settings settings = Settings.builder() + .put("index.analysis.analyzer.my_analyzer.type", "kuromoji") + .putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", + "c++,c++,w,w", "#comment", "制限スピード,制限スピード,セイゲンスピード,テスト名詞", "制限スピード,制限スピード,セイゲンスピード,テスト名詞") + .build(); + IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings)); + assertThat(exc.getMessage(), containsString("[制限スピード] in user dictionary at line [3]")); + } + + private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException { + InputStream dict = KuromojiAnalysisTests.class.getResourceAsStream("user_dict.txt"); + Path home = createTempDir(); + Path config = home.resolve("config"); + Files.createDirectory(config); + Files.copy(dict, config.resolve("user_dict.txt")); + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put(Environment.PATH_HOME_SETTING.getKey(), home) + .put(analysisSettings) + .build(); + return AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new AnalysisKuromojiPlugin()); + } } diff --git a/plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml 
b/plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_kuromoji/10_basic.yml similarity index 100% rename from plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml rename to plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_kuromoji/10_basic.yml diff --git a/plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_nori/20_search.yml b/plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_kuromoji/20_search.yml similarity index 100% rename from plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_nori/20_search.yml rename to plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_kuromoji/20_search.yml diff --git a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java index 8830cf7c9772a..bac5dd2a77065 100644 --- a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java +++ b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java @@ -51,7 +51,7 @@ public static UserDictionary getUserDictionary(Environment env, Settings setting throw new IllegalArgumentException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" + " with [" + USER_DICT_RULES_OPTION + "]"); } - List ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION); + List ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, true); StringBuilder sb = new StringBuilder(); if (ruleList == null || ruleList.isEmpty()) { return null; diff --git a/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java b/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java index 303e7049306a7..01b3e97af5988 100644 --- 
a/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/Analysis.java @@ -215,7 +215,7 @@ public static CharArraySet getWordSet(Environment env, Settings settings, String * If the word list cannot be found at either key. */ public static List getWordList(Environment env, Settings settings, String settingPrefix) { - return getWordList(env, settings, settingPrefix + "_path", settingPrefix); + return getWordList(env, settings, settingPrefix + "_path", settingPrefix, true); } /** @@ -225,7 +225,8 @@ public static List getWordList(Environment env, Settings settings, Strin * @throws IllegalArgumentException * If the word list cannot be found at either key. */ - public static List getWordList(Environment env, Settings settings, String settingPath, String settingList) { + public static List getWordList(Environment env, Settings settings, + String settingPath, String settingList, boolean removeComments) { String wordListPath = settings.get(settingPath, null); if (wordListPath == null) { @@ -240,7 +241,7 @@ public static List getWordList(Environment env, Settings settings, Strin final Path path = env.configFile().resolve(wordListPath); try { - return loadWordList(path, "#"); + return loadWordList(path, removeComments); } catch (CharacterCodingException ex) { String message = String.format(Locale.ROOT, "Unsupported character encoding detected while reading %s: %s - files must be UTF-8 encoded", @@ -252,15 +253,15 @@ public static List getWordList(Environment env, Settings settings, Strin } } - private static List loadWordList(Path path, String comment) throws IOException { + private static List loadWordList(Path path, boolean removeComments) throws IOException { final List result = new ArrayList<>(); try (BufferedReader br = Files.newBufferedReader(path, StandardCharsets.UTF_8)) { String word; while ((word = br.readLine()) != null) { - if (!Strings.hasText(word)) { + if (Strings.hasText(word) == false) 
{ continue; } - if (!word.startsWith(comment)) { + if (removeComments == false || word.startsWith("#") == false) { result.add(word.trim()); } }