From f4e97b93e9dad17fba195a4eb85af727918e8e4d Mon Sep 17 00:00:00 2001 From: Amit Khandelwal Date: Mon, 4 May 2020 14:36:37 +0530 Subject: [PATCH] Analysis enhancement - add preserve_original setting in ngram-token-filter (#55432) --- .../tokenfilters/ngram-tokenfilter.asciidoc | 4 ++ .../common/NGramTokenFilterFactory.java | 7 +- .../common/NGramTokenFilterFactoryTests.java | 66 +++++++++++++++++++ 3 files changed, 74 insertions(+), 3 deletions(-) create mode 100644 modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenFilterFactoryTests.java diff --git a/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc index 1dbe6886c53c1..0ffd143aff422 100644 --- a/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc @@ -185,6 +185,10 @@ Maximum length of characters in a gram. Defaults to `2`. (Optional, integer) Minimum length of characters in a gram. Defaults to `1`. +`preserve_original`:: +(Optional, boolean) +Emits the original token when set to `true`. Defaults to `false`. + You can use the <<index-max-ngram-diff,`index.max_ngram_diff`>> index-level setting to control the maximum allowed difference between the `max_gram` and `min_gram` values. 
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java index 1154bf0a88a32..7a10137a4b38c 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java @@ -37,8 +37,9 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory { = new DeprecationLogger(LogManager.getLogger(NGramTokenFilterFactory.class)); private final int minGram; - private final int maxGram; + private final boolean preserveOriginal; + private static final String PRESERVE_ORIG_KEY = "preserve_original"; NGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); @@ -58,12 +59,12 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory { + "expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]"); } } + preserveOriginal = settings.getAsBoolean(PRESERVE_ORIG_KEY, false); } @Override public TokenStream create(TokenStream tokenStream) { - // TODO: Expose preserveOriginal - return new NGramTokenFilter(tokenStream, minGram, maxGram, false); + return new NGramTokenFilter(tokenStream, minGram, maxGram, preserveOriginal); } @Override diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenFilterFactoryTests.java new file mode 100644 index 0000000000000..4271b5aab60fa --- /dev/null +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenFilterFactoryTests.java @@ -0,0 +1,66 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.analysis.common; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.analysis.AnalysisTestsHelper; +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.ESTokenStreamTestCase; + +import java.io.IOException; +import java.io.StringReader; + +public class NGramTokenFilterFactoryTests extends ESTokenStreamTestCase { + + public void testDefault() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_ngram.type", "ngram") + .build(), + new CommonAnalysisPlugin()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ngram"); + String source = "foo"; + String[] expected = new String[]{"f", "fo", "o", "oo", "o"}; + Tokenizer tokenizer = new StandardTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } + + 
public void testPreserveOriginal() throws IOException { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_ngram.type", "ngram") + .put("index.analysis.filter.my_ngram.preserve_original", true) + .build(), + new CommonAnalysisPlugin()); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ngram"); + String source = "foo"; + String[] expected = new String[]{"f", "fo", "o", "oo", "o", "foo"}; + Tokenizer tokenizer = new StandardTokenizer(); + tokenizer.setReader(new StringReader(source)); + assertTokenStreamContents(tokenFilter.create(tokenizer), expected); + } +}