From 5059c42989eb8c7093296ac340ac7cfc5f9eb6ab Mon Sep 17 00:00:00 2001 From: Amit Khandelwal Date: Sun, 19 Apr 2020 17:26:54 +0530 Subject: [PATCH] Completed TODO task of adding preserve_original setting in ngram-token-filter and added the missing test class. --- .../tokenfilters/ngram-tokenfilter.asciidoc | 4 ++ .../common/NGramTokenFilterFactory.java | 8 +-- .../common/NGramTokenFilterFactoryTests.java | 66 +++++++++++++++++++ 3 files changed, 74 insertions(+), 4 deletions(-) create mode 100644 modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenFilterFactoryTests.java diff --git a/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc index 1dbe6886c53c1..0ffd143aff422 100644 --- a/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc @@ -185,6 +185,10 @@ Maximum length of characters in a gram. Defaults to `2`. (Optional, integer) Minimum length of characters in a gram. Defaults to `1`. +`preserve_original`:: +(Optional, boolean) +Emits the original token when set to `true`. Defaults to `false`. + You can use the <<index-max-ngram-diff,`index.max_ngram_diff`>> index-level setting to control the maximum allowed difference between the `max_gram` and `min_gram` values. 
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java index 5c8d2f6003a56..e03f0dabe83e3 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java @@ -29,10 +29,10 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory { - private final int minGram; - private final int maxGram; + private final boolean preserveOriginal; + private static final String PRESERVE_ORIG_KEY = "preserve_original"; NGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); @@ -46,12 +46,12 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory { + maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. 
This limit can be set by changing the [" + IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting."); } + preserveOriginal = settings.getAsBoolean(PRESERVE_ORIG_KEY, false); } @Override public TokenStream create(TokenStream tokenStream) { - // TODO: Expose preserveOriginal - return new NGramTokenFilter(tokenStream, minGram, maxGram, false); + return new NGramTokenFilter(tokenStream, minGram, maxGram, preserveOriginal); } @Override diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenFilterFactoryTests.java new file mode 100644 index 0000000000000..4271b5aab60fa --- /dev/null +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenFilterFactoryTests.java @@ -0,0 +1,66 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.analysis.common; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.analysis.AnalysisTestsHelper; +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.ESTokenStreamTestCase; + +import java.io.IOException; +import java.io.StringReader; + +/** Token-stream tests for a filter registered with type {@code ngram} through {@code CommonAnalysisPlugin}: one case without extra options and one with {@code preserve_original} enabled. */ public class NGramTokenFilterFactoryTests extends ESTokenStreamTestCase { + + /** Configures the filter with only its type and expects the input {@code foo} to produce its one- and two-character grams, in the order listed in {@code expected}. */ public void testDefault() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_ngram.type", "ngram") + .build(), + new CommonAnalysisPlugin()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ngram"); + String source = "foo"; + /* preserve_original is not set here, so the full input term is not among the expected tokens */ String[] expected = new String[]{"f", "fo", "o", "oo", "o"}; + Tokenizer tokenizer = new StandardTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + /** Same setup as the default case plus {@code preserve_original: true}; the expected output is the same grams followed by the unmodified input term. */ public void testPreserveOriginal() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_ngram.type", "ngram") + .put("index.analysis.filter.my_ngram.preserve_original", true) + .build(), + new CommonAnalysisPlugin()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ngram"); + String source = "foo"; + /* the last entry is the original token, expected after all of the grams */ String[] expected = new String[]{"f", "fo", "o", "oo", "o", "foo"}; + Tokenizer tokenizer = new StandardTokenizer(); + tokenizer.setReader(new StringReader(source)); + 
assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } +}