
Commit 7ef3900

Move remaining pre-configured token filters into analysis-common (#24716)
Moves the remaining pre-configured token filters into the analysis-common module. There were a couple of tests in core that depended on the pre-configured token filters, so I had to touch them:

* `GetTermVectorsCheckDocFreqIT` depended on `type_as_payload` but didn't do anything important with it, so I dropped the dependency. Then I moved the test to a single node test case because we're trying to cut down on the number of `ESIntegTestCase` subclasses.
* `AbstractTermVectorsTestCase` and its subclasses depended on `type_as_payload`. I dropped their usage of the token filter and added an integration test for the termvectors API that uses `type_as_payload` to the `analysis-common` module.
* `AnalysisModuleTests` expected a few pre-configured token filters to be registered by default. They no longer are, so I dropped this assertion. We now assert that `CommonAnalysisPlugin` registers these pre-built token filters in `CommonAnalysisFactoryTests`.
* `SearchQueryIT` and `SuggestSearchIT` had tests that depended on the specific behavior of the token filters, so I moved those tests to integration tests in `analysis-common`.
1 parent a001659 commit 7ef3900
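With this change, pre-configured token filters reach the registry only through `AnalysisPlugin#getPreConfiguredTokenFilters()` (the plugin loop kept as context in the `AnalysisModule.java` hunk below). The following is a minimal sketch of that registration path, reusing the `PreConfiguredTokenFilter.singleton` factory that the removed shim called; the plugin class is hypothetical, not the actual `CommonAnalysisPlugin` source:

    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
    import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
    import org.elasticsearch.plugins.AnalysisPlugin;
    import org.elasticsearch.plugins.Plugin;

    // Hypothetical plugin demonstrating the registration path that replaces the
    // removed PreBuiltTokenFilters shim in AnalysisModule.
    public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
        @Override
        public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
            List<PreConfiguredTokenFilter> filters = new ArrayList<>();
            // singleton(name, useFilterForMultiTermQueries, create) mirrors the call
            // the shim made once per PreBuiltTokenFilters constant.
            filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false,
                    TypeAsPayloadTokenFilter::new));
            return filters;
        }
    }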

15 files changed: +402 −700 lines changed


core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java

Lines changed: 0 additions & 16 deletions
@@ -278,22 +278,6 @@ static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List
          * version uses a set of English stop words that are in
          * lucene-analyzers-common so "stop" is defined in the analysis-common
          * module. */
-
-        // Add token filters declared in PreBuiltTokenFilters until they have all been migrated
-        for (PreBuiltTokenFilters preBuilt : PreBuiltTokenFilters.values()) {
-            switch (preBuilt) {
-            case LOWERCASE:
-                // This has been migrated but has to stick around until PreBuiltTokenizers is removed.
-                continue;
-            default:
-                if (CachingStrategy.ONE != preBuilt.getCachingStrategy()) {
-                    throw new UnsupportedOperationException("shim not available for " + preBuilt.getCachingStrategy());
-                }
-                String name = preBuilt.name().toLowerCase(Locale.ROOT);
-                preConfiguredTokenFilters.register(name, PreConfiguredTokenFilter.singleton(name, preBuilt.isMultiTermAware(),
-                    tokenStream -> preBuilt.create(tokenStream, Version.CURRENT)));
-            }
-        }

         for (AnalysisPlugin plugin: plugins) {
             for (PreConfiguredTokenFilter filter : plugin.getPreConfiguredTokenFilters()) {

core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java

Lines changed: 1 addition & 251 deletions
@@ -20,38 +20,10 @@

 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
-import org.apache.lucene.analysis.ar.ArabicStemFilter;
-import org.apache.lucene.analysis.br.BrazilianStemFilter;
-import org.apache.lucene.analysis.cjk.CJKBigramFilter;
-import org.apache.lucene.analysis.cjk.CJKWidthFilter;
-import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
-import org.apache.lucene.analysis.core.DecimalDigitFilter;
-import org.apache.lucene.analysis.cz.CzechStemFilter;
-import org.apache.lucene.analysis.de.GermanNormalizationFilter;
-import org.apache.lucene.analysis.de.GermanStemFilter;
-import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
-import org.apache.lucene.analysis.fr.FrenchAnalyzer;
-import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
-import org.apache.lucene.analysis.in.IndicNormalizationFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
-import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
-import org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter;
-import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
-import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
-import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
-import org.apache.lucene.analysis.shingle.ShingleFilter;
-import org.apache.lucene.analysis.snowball.SnowballFilter;
-import org.apache.lucene.analysis.tr.ApostropheFilter;
-import org.apache.lucene.analysis.util.ElisionFilter;
 import org.elasticsearch.Version;
-import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
-import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
 import org.elasticsearch.index.analysis.MultiTermAwareComponent;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
-import org.tartarus.snowball.ext.DutchStemmer;
-import org.tartarus.snowball.ext.FrenchStemmer;

 import java.util.Locale;

@@ -66,229 +38,7 @@ public TokenStream create(TokenStream tokenStream, Version version) {
         protected boolean isMultiTermAware() {
             return true;
         }
-    },
-
-    // Extended Token Filters
-    ELISION(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new ElisionFilter(tokenStream, FrenchAnalyzer.DEFAULT_ARTICLES);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    ARABIC_STEM(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new ArabicStemFilter(tokenStream);
-        }
-    },
-
-    BRAZILIAN_STEM(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new BrazilianStemFilter(tokenStream);
-        }
-    },
-
-    CZECH_STEM(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new CzechStemFilter(tokenStream);
-        }
-    },
-
-    DUTCH_STEM(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new SnowballFilter(tokenStream, new DutchStemmer());
-        }
-    },
-
-    FRENCH_STEM(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new SnowballFilter(tokenStream, new FrenchStemmer());
-        }
-    },
-
-    GERMAN_STEM(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new GermanStemFilter(tokenStream);
-        }
-    },
-
-    RUSSIAN_STEM(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new SnowballFilter(tokenStream, "Russian");
-        }
-    },
-
-    KEYWORD_REPEAT(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new KeywordRepeatFilter(tokenStream);
-        }
-    },
-
-    ARABIC_NORMALIZATION(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new ArabicNormalizationFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    PERSIAN_NORMALIZATION(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new PersianNormalizationFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    TYPE_AS_PAYLOAD(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new TypeAsPayloadTokenFilter(tokenStream);
-        }
-    },
-
-    SHINGLE(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new ShingleFilter(tokenStream);
-        }
-    },
-
-    GERMAN_NORMALIZATION(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new GermanNormalizationFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    HINDI_NORMALIZATION(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new HindiNormalizationFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    INDIC_NORMALIZATION(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new IndicNormalizationFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    SORANI_NORMALIZATION(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new SoraniNormalizationFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    SCANDINAVIAN_NORMALIZATION(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new ScandinavianNormalizationFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    SCANDINAVIAN_FOLDING(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new ScandinavianFoldingFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    APOSTROPHE(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new ApostropheFilter(tokenStream);
-        }
-    },
-
-    CJK_WIDTH(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new CJKWidthFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    DECIMAL_DIGIT(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new DecimalDigitFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    CJK_BIGRAM(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new CJKBigramFilter(tokenStream);
-        }
-    },
-
-    DELIMITED_PAYLOAD_FILTER(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new DelimitedPayloadTokenFilter(tokenStream, DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER, DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER);
-        }
-    },
-
-    LIMIT(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new LimitTokenCountFilter(tokenStream, LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT, LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS);
-        }
-    },
-
-    ;
+    };

     protected boolean isMultiTermAware() {
         return false;
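After this deletion, `PreBuiltTokenFilters` is left with only the lowercase shim, which sticks around until `PreBuiltTokenizers` is removed. A condensed sketch of the surviving shape, assuming the constructor plumbing is unchanged; the cache and `getTokenFilterFactory` machinery of the real class is elided:

    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.elasticsearch.Version;
    import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;

    // Condensed sketch only: the real class keeps its caching and factory
    // plumbing, which is omitted here.
    public enum PreBuiltTokenFilters {
        LOWERCASE(CachingStrategy.ONE) {
            @Override
            public TokenStream create(TokenStream tokenStream, Version version) {
                return new LowerCaseFilter(tokenStream);
            }
            @Override
            protected boolean isMultiTermAware() {
                return true;
            }
        };

        private final CachingStrategy cachingStrategy;

        PreBuiltTokenFilters(CachingStrategy cachingStrategy) {
            this.cachingStrategy = cachingStrategy;
        }

        public CachingStrategy getCachingStrategy() {
            return cachingStrategy;
        }

        public abstract TokenStream create(TokenStream tokenStream, Version version);

        protected boolean isMultiTermAware() {
            return false;
        }
    }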

core/src/test/java/org/elasticsearch/action/termvectors/AbstractTermVectorsTestCase.java

Lines changed: 2 additions & 7 deletions
@@ -66,7 +66,6 @@
 import static org.hamcrest.Matchers.equalTo;

 public abstract class AbstractTermVectorsTestCase extends ESIntegTestCase {
-
     protected static class TestFieldSetting {
         public final String name;
         public final boolean storedOffset;
@@ -211,7 +210,7 @@ protected void createIndexBasedOnFieldSettings(String index, String alias, TestF
         Settings.Builder settings = Settings.builder()
                 .put(indexSettings())
                 .put("index.analysis.analyzer.tv_test.tokenizer", "standard")
-                .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase");
+                .putArray("index.analysis.analyzer.tv_test.filter", "lowercase");
         assertAcked(prepareCreate(index).addMapping("type1", mappingBuilder).setSettings(settings).addAlias(new Alias(alias)));
     }

@@ -395,11 +394,7 @@ protected void validateResponse(TermVectorsResponse esResponse, Fields luceneFie
                     assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.startOffset(), equalTo(-1));
                     assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.endOffset(), equalTo(-1));
                 }
-                if (field.storedPayloads && testConfig.requestPayloads) {
-                    assertThat("Payload test failed" + failDesc, luceneDocsPosEnum.getPayload(), equalTo(esDocsPosEnum.getPayload()));
-                } else {
-                    assertThat("Missing payload test failed" + failDesc, esDocsPosEnum.getPayload(), equalTo(null));
-                }
+                assertNull("Missing payload test failed" + failDesc, esDocsPosEnum.getPayload());
             }
         }
         assertNull("Es returned terms are done but lucene isn't", luceneTermEnum.next());
