
Commit 00fef6d

Analysis enhancement - add preserve_original setting in ngram-token-filter (#55432)
1 parent caf6c5a commit 00fef6d

3 files changed: +74 -4 lines changed

docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc

Lines changed: 4 additions & 0 deletions

@@ -185,6 +185,10 @@ Maximum length of characters in a gram. Defaults to `2`.
 (Optional, integer)
 Minimum length of characters in a gram. Defaults to `1`.
 
+`preserve_original`::
+(Optional, boolean)
+Emits original token when set to `true`. Defaults to `false`.
+
 You can use the <<index-max-ngram-diff,`index.max_ngram_diff`>> index-level
 setting to control the maximum allowed difference between the `max_gram` and
 `min_gram` values.
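For context, a minimal sketch of how the documented setting is consumed through index analysis settings, mirroring the keys used by the test added in this commit. The filter name `my_ngram` and the `min_gram`/`max_gram` values are illustrative, not part of this change:

import org.elasticsearch.common.settings.Settings;

public class NGramPreserveOriginalSettingsSketch {
    public static void main(String[] args) {
        // Illustrative only: "my_ngram" and the gram sizes are placeholders;
        // "preserve_original" is the key this commit adds (defaults to false).
        Settings indexSettings = Settings.builder()
            .put("index.analysis.filter.my_ngram.type", "ngram")
            .put("index.analysis.filter.my_ngram.min_gram", 1)
            .put("index.analysis.filter.my_ngram.max_gram", 2)
            .put("index.analysis.filter.my_ngram.preserve_original", true)
            .build();
        System.out.println(indexSettings);
    }
}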

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java

Lines changed: 4 additions & 4 deletions

@@ -29,10 +29,10 @@
 
 
 public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
-
     private final int minGram;
-
     private final int maxGram;
+    private final boolean preserveOriginal;
+    private static final String PRESERVE_ORIG_KEY = "preserve_original";
 
     NGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
@@ -46,12 +46,12 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
                 + maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
                 + IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.");
         }
+        preserveOriginal = settings.getAsBoolean(PRESERVE_ORIG_KEY, false);
     }
 
     @Override
     public TokenStream create(TokenStream tokenStream) {
-        // TODO: Expose preserveOriginal
-        return new NGramTokenFilter(tokenStream, minGram, maxGram, false);
+        return new NGramTokenFilter(tokenStream, minGram, maxGram, preserveOriginal);
     }
 
     @Override
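The factory change resolves the old TODO by reading the setting and forwarding it as the last constructor argument of Lucene's NGramTokenFilter. Below is a minimal standalone sketch of that Lucene-level behaviour, assuming the Lucene analysis-common module is on the classpath; the class name and tokenizer choice are illustrative. It should print the grams plus the preserved original token, matching the expectations of the test added below:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PreserveOriginalDemo {
    public static void main(String[] args) throws Exception {
        Tokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader("foo"));
        // The final boolean is the preserveOriginal flag exposed by this commit.
        TokenStream stream = new NGramTokenFilter(tokenizer, 1, 2, true);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString()); // expected: f, fo, o, oo, o, foo
        }
        stream.end();
        stream.close();
    }
}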
modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenFilterFactoryTests.java

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+public class NGramTokenFilterFactoryTests extends ESTokenStreamTestCase {
+
+    public void testDefault() throws IOException {
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+            Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.filter.my_ngram.type", "ngram")
+                .build(),
+            new CommonAnalysisPlugin());
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ngram");
+        String source = "foo";
+        String[] expected = new String[]{"f", "fo", "o", "oo", "o"};
+        Tokenizer tokenizer = new StandardTokenizer();
+        tokenizer.setReader(new StringReader(source));
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
+    }
+
+    public void testPreserveOriginal() throws IOException {
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+            Settings.builder()
+                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put("index.analysis.filter.my_ngram.type", "ngram")
+                .put("index.analysis.filter.my_ngram.preserve_original", true)
+                .build(),
+            new CommonAnalysisPlugin());
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ngram");
+        String source = "foo";
+        String[] expected = new String[]{"f", "fo", "o", "oo", "o", "foo"};
+        Tokenizer tokenizer = new StandardTokenizer();
+        tokenizer.setReader(new StringReader(source));
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
+    }
+}
