diff --git a/plugins/analysis-ukrainian/src/main/java/org/apache/lucene/analysis/uk/XUkrainianMorfologikAnalyzer.java b/plugins/analysis-ukrainian/src/main/java/org/apache/lucene/analysis/uk/XUkrainianMorfologikAnalyzer.java
new file mode 100644
index 0000000000000..d11bb1b96ad44
--- /dev/null
+++ b/plugins/analysis-ukrainian/src/main/java/org/apache/lucene/analysis/uk/XUkrainianMorfologikAnalyzer.java
@@ -0,0 +1,179 @@
+/*@notice
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.uk;
+
+import morfologik.stemming.Dictionary;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.charfilter.MappingCharFilter;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.morfologik.MorfologikFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.IOUtils;
+import org.elasticsearch.common.SuppressForbidden;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * A dictionary-based {@link Analyzer} for Ukrainian.
+ *
+ * Modified from lucene 8.8.0 sources to incorporate a bugfix for
+ * https://issues.apache.org/jira/browse/LUCENE-9930
+ */
+public final class XUkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
+    private final CharArraySet stemExclusionSet;
+
+    /** File containing default Ukrainian stopwords. */
+    public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+    /**
+     * Character normalization applied to every reader by {@link #initReader(String, Reader)}.
+     * Built once when this class is initialized and shared, rather than rebuilt on every call.
+     */
+    private static final NormalizeCharMap NORMALIZER_MAP = buildNormalizerMap();
+
+    /**
+     * Returns an unmodifiable instance of the default stop words set.
+     * @return default stop words set.
+     */
+    public static CharArraySet getDefaultStopSet() {
+        return DefaultSetHolder.DEFAULT_STOP_SET;
+    }
+
+    /**
+     * Atomically loads the DEFAULT_STOP_SET and DICTIONARY in a lazy fashion once the outer class
+     * accesses the static final set the first time.
+     */
+    @SuppressForbidden(reason="Lucene uses IOUtils")
+    private static class DefaultSetHolder {
+        static final CharArraySet DEFAULT_STOP_SET;
+        static final Dictionary DICTIONARY;
+
+        static {
+            try {
+                // resources are resolved through the UkrainianMorfologikAnalyzer class rather than this modified copy
+                DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(UkrainianMorfologikAnalyzer.class,
+                    DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8));
+                DICTIONARY = Dictionary.read(
+                    UkrainianMorfologikAnalyzer.class.getClassLoader().getResource("ua/net/nlp/ukrainian.dict"));
+            } catch (IOException ex) {
+                // default set should always be present as it is part of the
+                // distribution (JAR)
+                throw new RuntimeException("Unable to load resources", ex);
+            }
+        }
+    }
+
+    /**
+     * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+     */
+    public XUkrainianMorfologikAnalyzer() {
+        this(DefaultSetHolder.DEFAULT_STOP_SET);
+    }
+
+    /**
+     * Builds an analyzer with the given stop words.
+     *
+     * @param stopwords a stopword set
+     */
+    public XUkrainianMorfologikAnalyzer(CharArraySet stopwords) {
+        this(stopwords, CharArraySet.EMPTY_SET);
+    }
+
+    /**
+     * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+     * provided this analyzer will add a {@link SetKeywordMarkerFilter} before
+     * stemming.
+     *
+     * @param stopwords a stopword set
+     * @param stemExclusionSet a set of terms not to be stemmed
+     */
+    public XUkrainianMorfologikAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
+        super(stopwords);
+        this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
+    }
+
+    /**
+     * Builds the character mapping used to normalize input text before tokenization.
+     *
+     * @return the mapping stored in {@link #NORMALIZER_MAP}
+     */
+    private static NormalizeCharMap buildNormalizerMap() {
+        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
+        // different apostrophes
+        builder.add("\u2019", "'");
+        builder.add("\u2018", "'");
+        builder.add("\u02BC", "'");
+        builder.add("`", "'");
+        builder.add("´", "'");
+        // ignored characters
+        builder.add("\u0301", "");
+        builder.add("\u00AD", "");
+        // ґ is folded into г, in both cases
+        builder.add("ґ", "г");
+        builder.add("Ґ", "Г");
+        return builder.build();
+    }
+
+    /**
+     * Wraps the reader in a {@link MappingCharFilter} that applies {@link #NORMALIZER_MAP}.
+     *
+     * @param fieldName name of the field being analyzed (not used)
+     * @param reader the raw character stream
+     * @return a reader that yields the normalized characters
+     */
+    @Override
+    protected Reader initReader(String fieldName, Reader reader) {
+        return new MappingCharFilter(NORMALIZER_MAP, reader);
+    }
+
+    /**
+     * Creates a
+     * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
+     * which tokenizes all the text in the provided {@link Reader}.
+     *
+     * @return A
+     *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
+     *         built from an {@link StandardTokenizer} filtered with
+     *         {@link LowerCaseFilter}, {@link StopFilter}
+     *         , {@link SetKeywordMarkerFilter} if a stem exclusion set is
+     *         provided and {@link MorfologikFilter} on the Ukrainian dictionary.
+     */
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer source = new StandardTokenizer();
+        TokenStream result = new LowerCaseFilter(source);
+        result = new StopFilter(result, stopwords);
+
+        if (stemExclusionSet.isEmpty() == false) {
+            result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+        }
+
+        result = new MorfologikFilter(result, DefaultSetHolder.DICTIONARY);
+        return new TokenStreamComponents(source, result);
+    }
+
+}
diff --git a/plugins/analysis-ukrainian/src/main/java/org/elasticsearch/index/analysis/UkrainianAnalyzerProvider.java b/plugins/analysis-ukrainian/src/main/java/org/elasticsearch/index/analysis/UkrainianAnalyzerProvider.java
index 9c1880e8ca3ff..8802640835f25 100644
--- a/plugins/analysis-ukrainian/src/main/java/org/elasticsearch/index/analysis/UkrainianAnalyzerProvider.java
+++ b/plugins/analysis-ukrainian/src/main/java/org/elasticsearch/index/analysis/UkrainianAnalyzerProvider.java
@@ -10,17 +10,18 @@
 
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer;
+import org.apache.lucene.analysis.uk.XUkrainianMorfologikAnalyzer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 
-public class UkrainianAnalyzerProvider extends AbstractIndexAnalyzerProvider {
+public class UkrainianAnalyzerProvider extends AbstractIndexAnalyzerProvider {
 
-    private final UkrainianMorfologikAnalyzer analyzer;
+    private final XUkrainianMorfologikAnalyzer analyzer;
 
     public UkrainianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name, settings);
-        analyzer = new UkrainianMorfologikAnalyzer(
+        analyzer = new XUkrainianMorfologikAnalyzer(
             Analysis.parseStopWords(env, settings, UkrainianMorfologikAnalyzer.getDefaultStopSet()),
             Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
         );
@@ -28,9 +29,8 @@ public UkrainianAnalyzerProvider(IndexSettings indexSettings, Environment env, S
     }
 
     @Override
-    public UkrainianMorfologikAnalyzer get() {
+    public XUkrainianMorfologikAnalyzer get() {
         return this.analyzer;
     }
 
-
 }
diff --git a/plugins/analysis-ukrainian/src/test/java/org/elasticsearch/index/analysis/UkrainianAnalysisTests.java b/plugins/analysis-ukrainian/src/test/java/org/elasticsearch/index/analysis/UkrainianAnalysisTests.java
index 7fe6b189ad933..fa8d75fdcf2c3 100644
--- a/plugins/analysis-ukrainian/src/test/java/org/elasticsearch/index/analysis/UkrainianAnalysisTests.java
+++ b/plugins/analysis-ukrainian/src/test/java/org/elasticsearch/index/analysis/UkrainianAnalysisTests.java
@@ -9,7 +9,7 @@
 package org.elasticsearch.index.analysis;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer;
+import org.apache.lucene.analysis.uk.XUkrainianMorfologikAnalyzer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.plugin.analysis.ukrainian.AnalysisUkrainianPlugin;
@@ -27,6 +27,6 @@ public void testDefaultsUkranianAnalysis() throws IOException {
             new AnalysisUkrainianPlugin());
 
         Analyzer analyzer = analysis.indexAnalyzers.get("ukrainian").analyzer();
-        MatcherAssert.assertThat(analyzer, instanceOf(UkrainianMorfologikAnalyzer.class));
+        MatcherAssert.assertThat(analyzer, instanceOf(XUkrainianMorfologikAnalyzer.class));
     }
 }