diff --git a/core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzerProvider.java b/core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzerProvider.java index fa0ce5223f85c..3aba3f882fb72 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzerProvider.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzerProvider.java @@ -33,17 +33,19 @@ */ public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider { - public static ParseField MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.MAX_OUTPUT_SIZE; + public static ParseField SEPARATOR = new ParseField("separator"); + public static ParseField MAX_OUTPUT_SIZE = new ParseField("max_output_size"); - public static int DEFAULT_MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE; + public static int DEFAULT_MAX_OUTPUT_SIZE = 255; public static CharArraySet DEFAULT_STOP_WORDS = CharArraySet.EMPTY_SET; + public static final char DEFAULT_SEPARATOR = ' '; private final FingerprintAnalyzer analyzer; public FingerprintAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); - char separator = FingerprintTokenFilterFactory.parseSeparator(settings); + char separator = parseSeparator(settings); int maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(),DEFAULT_MAX_OUTPUT_SIZE); CharArraySet stopWords = Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, DEFAULT_STOP_WORDS); @@ -54,4 +56,16 @@ public FingerprintAnalyzerProvider(IndexSettings indexSettings, Environment env, public FingerprintAnalyzer get() { return analyzer; } + + public static char parseSeparator(Settings settings) throws IllegalArgumentException { + String customSeparator = settings.get(SEPARATOR.getPreferredName()); + if (customSeparator == null) { + return DEFAULT_SEPARATOR; + } else if (customSeparator.length() == 1) { + return customSeparator.charAt(0); + } + + throw new IllegalArgumentException("Setting [separator] must be a single, non-null character. 
[" + + customSeparator + "] was provided."); + } } diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index 5eb33bd64c38b..ffba1969753f6 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -29,7 +29,6 @@ import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AnalysisRegistry; import org.elasticsearch.index.analysis.AnalyzerProvider; -import org.elasticsearch.index.analysis.ApostropheFilterFactory; import org.elasticsearch.index.analysis.ArabicAnalyzerProvider; import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory; import org.elasticsearch.index.analysis.ArmenianAnalyzerProvider; @@ -41,19 +40,15 @@ import org.elasticsearch.index.analysis.CharFilterFactory; import org.elasticsearch.index.analysis.ChineseAnalyzerProvider; import org.elasticsearch.index.analysis.CjkAnalyzerProvider; -import org.elasticsearch.index.analysis.ClassicFilterFactory; import org.elasticsearch.index.analysis.ClassicTokenizerFactory; import org.elasticsearch.index.analysis.CzechAnalyzerProvider; import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory; import org.elasticsearch.index.analysis.DanishAnalyzerProvider; -import org.elasticsearch.index.analysis.DecimalDigitFilterFactory; -import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory; import org.elasticsearch.index.analysis.DutchAnalyzerProvider; import org.elasticsearch.index.analysis.DutchStemTokenFilterFactory; import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory; import org.elasticsearch.index.analysis.EnglishAnalyzerProvider; import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider; -import org.elasticsearch.index.analysis.FingerprintTokenFilterFactory; import org.elasticsearch.index.analysis.FinnishAnalyzerProvider; import org.elasticsearch.index.analysis.FrenchAnalyzerProvider; import org.elasticsearch.index.analysis.FrenchStemTokenFilterFactory; @@ -67,15 +62,12 @@ import org.elasticsearch.index.analysis.IndonesianAnalyzerProvider; import org.elasticsearch.index.analysis.IrishAnalyzerProvider; import org.elasticsearch.index.analysis.ItalianAnalyzerProvider; -import org.elasticsearch.index.analysis.KeepTypesFilterFactory; -import org.elasticsearch.index.analysis.KeepWordFilterFactory; import org.elasticsearch.index.analysis.KeywordAnalyzerProvider; import org.elasticsearch.index.analysis.KeywordTokenizerFactory; import org.elasticsearch.index.analysis.LatvianAnalyzerProvider; import org.elasticsearch.index.analysis.LetterTokenizerFactory; import org.elasticsearch.index.analysis.LithuanianAnalyzerProvider; import org.elasticsearch.index.analysis.LowerCaseTokenizerFactory; -import org.elasticsearch.index.analysis.MinHashTokenFilterFactory; import org.elasticsearch.index.analysis.NGramTokenizerFactory; import org.elasticsearch.index.analysis.NorwegianAnalyzerProvider; import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory; @@ -89,7 +81,6 @@ import org.elasticsearch.index.analysis.RomanianAnalyzerProvider; import org.elasticsearch.index.analysis.RussianAnalyzerProvider; import org.elasticsearch.index.analysis.RussianStemTokenFilterFactory; -import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory; import org.elasticsearch.index.analysis.ShingleTokenFilterFactory; import 
org.elasticsearch.index.analysis.SimpleAnalyzerProvider; import org.elasticsearch.index.analysis.SnowballAnalyzerProvider; @@ -181,10 +172,6 @@ private NamedRegistry> setupTokenFilters(Li tokenFilters.register("stop", StopTokenFilterFactory::new); tokenFilters.register("standard", StandardTokenFilterFactory::new); tokenFilters.register("shingle", ShingleTokenFilterFactory::new); - tokenFilters.register("min_hash", MinHashTokenFilterFactory::new); - tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new); - tokenFilters.register("keep", requriesAnalysisSettings(KeepWordFilterFactory::new)); - tokenFilters.register("keep_types", requriesAnalysisSettings(KeepTypesFilterFactory::new)); tokenFilters.register("arabic_stem", ArabicStemTokenFilterFactory::new); tokenFilters.register("brazilian_stem", BrazilianStemTokenFilterFactory::new); tokenFilters.register("czech_stem", CzechStemTokenFilterFactory::new); @@ -192,15 +179,10 @@ private NamedRegistry> setupTokenFilters(Li tokenFilters.register("french_stem", FrenchStemTokenFilterFactory::new); tokenFilters.register("german_stem", GermanStemTokenFilterFactory::new); tokenFilters.register("russian_stem", RussianStemTokenFilterFactory::new); - tokenFilters.register("scandinavian_folding", ScandinavianFoldingFilterFactory::new); tokenFilters.register("hunspell", requriesAnalysisSettings((indexSettings, env, name, settings) -> new HunspellTokenFilterFactory (indexSettings, name, settings, hunspellService))); - tokenFilters.register("apostrophe", ApostropheFilterFactory::new); - tokenFilters.register("classic", ClassicFilterFactory::new); - tokenFilters.register("decimal_digit", DecimalDigitFilterFactory::new); - tokenFilters.register("fingerprint", FingerprintTokenFilterFactory::new); tokenFilters.extractAndRegister(plugins, AnalysisPlugin::getTokenFilters); return tokenFilters; } diff --git a/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java b/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java index 79e9975484e7e..54c7ba3aab084 100644 --- a/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java +++ b/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java @@ -319,14 +319,14 @@ public void testUnknown() throws IOException { public void testNonPreBuildTokenFilter() throws IOException { AnalyzeRequest request = new AnalyzeRequest(); request.tokenizer("whitespace"); - request.addTokenFilter("min_hash"); + request.addTokenFilter("stop"); // stop token filter is not prebuilt in AnalysisModule#setupPreConfiguredTokenFilters() request.text("the quick brown fox"); AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment); List tokens = analyze.getTokens(); - int default_hash_count = 1; - int default_bucket_size = 512; - int default_hash_set_size = 1; - assertEquals(default_hash_count * default_bucket_size * default_hash_set_size, tokens.size()); + assertEquals(3, tokens.size()); + assertEquals("quick", tokens.get(0).getTerm()); + assertEquals("brown", tokens.get(1).getTerm()); + assertEquals("fox", tokens.get(2).getTerm()); } public void testNormalizerWithIndex() throws IOException { diff --git a/core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java b/core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java index cc7a73278efe0..bbd7d5501783c 100644 --- 
a/core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java +++ b/core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java @@ -19,9 +19,6 @@ package org.elasticsearch.action.termvectors; -import com.carrotsearch.hppc.ObjectIntHashMap; - -import org.apache.lucene.analysis.payloads.PayloadHelper; import org.apache.lucene.document.FieldType; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.Fields; @@ -29,7 +26,6 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; -import org.elasticsearch.ElasticsearchException; import org.elasticsearch.action.ActionFuture; import org.elasticsearch.action.admin.cluster.shards.ClusterSearchShardsResponse; import org.elasticsearch.action.admin.indices.alias.Alias; @@ -374,171 +370,6 @@ public void testDuelESLucene() throws Exception { } } - public void testRandomPayloadWithDelimitedPayloadTokenFilter() throws IOException { - //create the test document - int encoding = randomIntBetween(0, 2); - String encodingString = ""; - if (encoding == 0) { - encodingString = "float"; - } - if (encoding == 1) { - encodingString = "int"; - } - if (encoding == 2) { - encodingString = "identity"; - } - String[] tokens = crateRandomTokens(); - Map> payloads = createPayloads(tokens, encoding); - String delimiter = createRandomDelimiter(tokens); - String queryString = createString(tokens, payloads, encoding, delimiter.charAt(0)); - //create the mapping - XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties") - .startObject("field").field("type", "text").field("term_vector", "with_positions_offsets_payloads") - .field("analyzer", "payload_test").endObject().endObject().endObject().endObject(); - assertAcked(prepareCreate("test").addMapping("type1", mapping).setSettings( - Settings.builder() - .put(indexSettings()) - .put("index.analysis.analyzer.payload_test.tokenizer", "whitespace") - .putArray("index.analysis.analyzer.payload_test.filter", "my_delimited_payload_filter") - .put("index.analysis.filter.my_delimited_payload_filter.delimiter", delimiter) - .put("index.analysis.filter.my_delimited_payload_filter.encoding", encodingString) - .put("index.analysis.filter.my_delimited_payload_filter.type", "delimited_payload_filter"))); - - client().prepareIndex("test", "type1", Integer.toString(1)) - .setSource(jsonBuilder().startObject().field("field", queryString).endObject()).execute().actionGet(); - refresh(); - TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(1)).setPayloads(true).setOffsets(true) - .setPositions(true).setSelectedFields(); - TermVectorsResponse response = resp.execute().actionGet(); - assertThat("doc id 1 doesn't exists but should", response.isExists(), equalTo(true)); - Fields fields = response.getFields(); - assertThat(fields.size(), equalTo(1)); - Terms terms = fields.terms("field"); - TermsEnum iterator = terms.iterator(); - while (iterator.next() != null) { - String term = iterator.term().utf8ToString(); - PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL); - assertThat(docsAndPositions.nextDoc(), equalTo(0)); - List curPayloads = payloads.get(term); - assertThat(term, curPayloads, notNullValue()); - assertNotNull(docsAndPositions); - for (int k = 0; k < docsAndPositions.freq(); k++) { - docsAndPositions.nextPosition(); - if (docsAndPositions.getPayload()!=null){ - String infoString = "\nterm: " + term + " has 
payload \n"+ docsAndPositions.getPayload().toString() + "\n but should have payload \n"+curPayloads.get(k).toString(); - assertThat(infoString, docsAndPositions.getPayload(), equalTo(curPayloads.get(k))); - } else { - String infoString = "\nterm: " + term + " has no payload but should have payload \n"+curPayloads.get(k).toString(); - assertThat(infoString, curPayloads.get(k).length, equalTo(0)); - } - } - } - assertThat(iterator.next(), nullValue()); - } - - private String createRandomDelimiter(String[] tokens) { - String delimiter = ""; - boolean isTokenOrWhitespace = true; - while(isTokenOrWhitespace) { - isTokenOrWhitespace = false; - delimiter = randomUnicodeOfLength(1); - for(String token:tokens) { - if(token.contains(delimiter)) { - isTokenOrWhitespace = true; - } - } - if(Character.isWhitespace(delimiter.charAt(0))) { - isTokenOrWhitespace = true; - } - } - return delimiter; - } - - private String createString(String[] tokens, Map> payloads, int encoding, char delimiter) { - String resultString = ""; - ObjectIntHashMap payloadCounter = new ObjectIntHashMap<>(); - for (String token : tokens) { - if (!payloadCounter.containsKey(token)) { - payloadCounter.putIfAbsent(token, 0); - } else { - payloadCounter.put(token, payloadCounter.get(token) + 1); - } - resultString = resultString + token; - BytesRef payload = payloads.get(token).get(payloadCounter.get(token)); - if (payload.length > 0) { - resultString = resultString + delimiter; - switch (encoding) { - case 0: { - resultString = resultString + Float.toString(PayloadHelper.decodeFloat(payload.bytes, payload.offset)); - break; - } - case 1: { - resultString = resultString + Integer.toString(PayloadHelper.decodeInt(payload.bytes, payload.offset)); - break; - } - case 2: { - resultString = resultString + payload.utf8ToString(); - break; - } - default: { - throw new ElasticsearchException("unsupported encoding type"); - } - } - } - resultString = resultString + " "; - } - return resultString; - } - - private Map> createPayloads(String[] tokens, int encoding) { - Map> payloads = new HashMap<>(); - for (String token : tokens) { - if (payloads.get(token) == null) { - payloads.put(token, new ArrayList()); - } - boolean createPayload = randomBoolean(); - if (createPayload) { - switch (encoding) { - case 0: { - float theFloat = randomFloat(); - payloads.get(token).add(new BytesRef(PayloadHelper.encodeFloat(theFloat))); - break; - } - case 1: { - payloads.get(token).add(new BytesRef(PayloadHelper.encodeInt(randomInt()))); - break; - } - case 2: { - String payload = randomUnicodeOfLengthBetween(50, 100); - for (int c = 0; c < payload.length(); c++) { - if (Character.isWhitespace(payload.charAt(c))) { - payload = payload.replace(payload.charAt(c), 'w'); - } - } - payloads.get(token).add(new BytesRef(payload)); - break; - } - default: { - throw new ElasticsearchException("unsupported encoding type"); - } - } - } else { - payloads.get(token).add(new BytesRef()); - } - } - return payloads; - } - - private String[] crateRandomTokens() { - String[] tokens = { "the", "quick", "brown", "fox" }; - int numTokensWithDuplicates = randomIntBetween(3, 15); - String[] finalTokens = new String[numTokensWithDuplicates]; - for (int i = 0; i < numTokensWithDuplicates; i++) { - finalTokens[i] = tokens[randomIntBetween(0, tokens.length - 1)]; - } - return finalTokens; - } - // like testSimpleTermVectors but we create fields with no term vectors public void testSimpleTermVectorsWithGenerate() throws IOException { String[] fieldNames = new String[10]; diff --git 
a/core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsTests.java b/core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsTests.java new file mode 100644 index 0000000000000..e4d55da9f92b1 --- /dev/null +++ b/core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsTests.java @@ -0,0 +1,294 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.action.termvectors; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.payloads.FloatEncoder; +import org.apache.lucene.analysis.payloads.IdentityEncoder; +import org.apache.lucene.analysis.payloads.IntegerEncoder; +import org.apache.lucene.analysis.payloads.PayloadEncoder; +import org.apache.lucene.analysis.payloads.PayloadHelper; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.ElasticsearchException; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.indices.analysis.AnalysisModule; +import org.elasticsearch.plugins.AnalysisPlugin; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.test.ESSingleNodeTestCase; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.notNullValue; +import static org.hamcrest.Matchers.nullValue; + +public class GetTermVectorsTests extends ESSingleNodeTestCase { + + @Override + protected Collection> getPlugins() { + return Collections.singleton(MockPayloadAnalyzerPlugin.class); + } + + // The delimited payload token filter was moved to the analysis-common module. + // This test relies heavily on that filter even though the filter itself is not under test here, + // so for now we copy what the delimited payload token filter does into this test. + // Unfortunately MockPayloadAnalyzer could not be used, as it lacks the required functionality.
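+    // The plugin below registers a "mock_payload_filter" token filter that splits each token at a
+    // configurable delimiter and stores the remainder as a payload (float, integer, or identity
+    // encoded), mirroring the behavior of the real DelimitedPayloadTokenFilter.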
+ public static class MockPayloadAnalyzerPlugin extends Plugin implements AnalysisPlugin { + + @Override + public Map> getTokenFilters() { + return Collections.singletonMap("mock_payload_filter", (indexSettings, environment, name, settings) -> { + return new TokenFilterFactory() { + @Override + public String name() { + return "mock_payload_filter"; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + String delimiter = settings.get("delimiter"); + PayloadEncoder encoder = null; + if (settings.get("encoding").equals("float")) { + encoder = new FloatEncoder(); + } else if (settings.get("encoding").equals("int")) { + encoder = new IntegerEncoder(); + } else if (settings.get("encoding").equals("identity")) { + encoder = new IdentityEncoder(); + } + return new MockPayloadTokenFilter(tokenStream, delimiter.charAt(0), encoder); + } + }; + }); + } + + // Based on DelimitedPayloadTokenFilter: + final class MockPayloadTokenFilter extends TokenFilter { + private final char delimiter; + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PayloadAttribute payAtt = addAttribute(PayloadAttribute.class); + private final PayloadEncoder encoder; + + + MockPayloadTokenFilter(TokenStream input, char delimiter, PayloadEncoder encoder) { + super(input); + this.delimiter = delimiter; + this.encoder = encoder; + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + final char[] buffer = termAtt.buffer(); + final int length = termAtt.length(); + for (int i = 0; i < length; i++) { + if (buffer[i] == delimiter) { + payAtt.setPayload(encoder.encode(buffer, i + 1, (length - (i + 1)))); + termAtt.setLength(i); // simply set a new length + return true; + } + } + // we have not seen the delimiter + payAtt.setPayload(null); + return true; + } else { + return false; + } + } + } + } + + public void testRandomPayloadWithDelimitedPayloadTokenFilter() throws IOException { + //create the test document + int encoding = randomIntBetween(0, 2); + String encodingString = ""; + if (encoding == 0) { + encodingString = "float"; + } + if (encoding == 1) { + encodingString = "int"; + } + if (encoding == 2) { + encodingString = "identity"; + } + String[] tokens = crateRandomTokens(); + Map> payloads = createPayloads(tokens, encoding); + String delimiter = createRandomDelimiter(tokens); + String queryString = createString(tokens, payloads, encoding, delimiter.charAt(0)); + //create the mapping + XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties") + .startObject("field").field("type", "text").field("term_vector", "with_positions_offsets_payloads") + .field("analyzer", "payload_test").endObject().endObject().endObject().endObject(); + Settings setting = Settings.builder() + .put("index.analysis.analyzer.payload_test.tokenizer", "whitespace") + .putArray("index.analysis.analyzer.payload_test.filter", "my_delimited_payload_filter") + .put("index.analysis.filter.my_delimited_payload_filter.delimiter", delimiter) + .put("index.analysis.filter.my_delimited_payload_filter.encoding", encodingString) + .put("index.analysis.filter.my_delimited_payload_filter.type", "mock_payload_filter").build(); + createIndex("test", setting, "type1", mapping); + + client().prepareIndex("test", "type1", Integer.toString(1)) + .setSource(jsonBuilder().startObject().field("field", queryString).endObject()).execute().actionGet(); + client().admin().indices().prepareRefresh().get(); + 
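// Fetch the term vectors for the document just indexed; payloads, offsets and positions are
+        // retrievable because the field mapping stores them via "with_positions_offsets_payloads".
+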
TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(1)) + .setPayloads(true).setOffsets(true).setPositions(true).setSelectedFields(); + TermVectorsResponse response = resp.execute().actionGet(); + assertThat("doc id 1 doesn't exist but should", response.isExists(), equalTo(true)); + Fields fields = response.getFields(); + assertThat(fields.size(), equalTo(1)); + Terms terms = fields.terms("field"); + TermsEnum iterator = terms.iterator(); + while (iterator.next() != null) { + String term = iterator.term().utf8ToString(); + PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL); + assertThat(docsAndPositions.nextDoc(), equalTo(0)); + List curPayloads = payloads.get(term); + assertThat(term, curPayloads, notNullValue()); + assertNotNull(docsAndPositions); + for (int k = 0; k < docsAndPositions.freq(); k++) { + docsAndPositions.nextPosition(); + if (docsAndPositions.getPayload()!=null){ + String infoString = "\nterm: " + term + " has payload \n"+ docsAndPositions.getPayload().toString() + + "\n but should have payload \n"+curPayloads.get(k).toString(); + assertThat(infoString, docsAndPositions.getPayload(), equalTo(curPayloads.get(k))); + } else { + String infoString = "\nterm: " + term + " has no payload but should have payload \n"+curPayloads.get(k).toString(); + assertThat(infoString, curPayloads.get(k).length, equalTo(0)); + } + } + } + assertThat(iterator.next(), nullValue()); + } + + private String createString(String[] tokens, Map> payloads, int encoding, char delimiter) { + String resultString = ""; + Map payloadCounter = new HashMap<>(); + for (String token : tokens) { + if (!payloadCounter.containsKey(token)) { + payloadCounter.putIfAbsent(token, 0); + } else { + payloadCounter.put(token, payloadCounter.get(token) + 1); + } + resultString = resultString + token; + BytesRef payload = payloads.get(token).get(payloadCounter.get(token)); + if (payload.length > 0) { + resultString = resultString + delimiter; + switch (encoding) { + case 0: { + resultString = resultString + Float.toString(PayloadHelper.decodeFloat(payload.bytes, payload.offset)); + break; + } + case 1: { + resultString = resultString + Integer.toString(PayloadHelper.decodeInt(payload.bytes, payload.offset)); + break; + } + case 2: { + resultString = resultString + payload.utf8ToString(); + break; + } + default: { + throw new ElasticsearchException("unsupported encoding type"); + } + } + } + resultString = resultString + " "; + } + return resultString; + } + + private String[] crateRandomTokens() { + String[] tokens = { "the", "quick", "brown", "fox" }; + int numTokensWithDuplicates = randomIntBetween(3, 15); + String[] finalTokens = new String[numTokensWithDuplicates]; + for (int i = 0; i < numTokensWithDuplicates; i++) { + finalTokens[i] = tokens[randomIntBetween(0, tokens.length - 1)]; + } + return finalTokens; + } + + private String createRandomDelimiter(String[] tokens) { + String delimiter = ""; + boolean isTokenOrWhitespace = true; + while(isTokenOrWhitespace) { + isTokenOrWhitespace = false; + delimiter = randomUnicodeOfLength(1); + for(String token:tokens) { + if(token.contains(delimiter)) { + isTokenOrWhitespace = true; + } + } + if(Character.isWhitespace(delimiter.charAt(0))) { + isTokenOrWhitespace = true; + } + } + return delimiter; + } + + private Map> createPayloads(String[] tokens, int encoding) { + Map> payloads = new HashMap<>(); + for (String token : tokens) { + payloads.computeIfAbsent(token, k -> new ArrayList<>()); + boolean createPayload 
= randomBoolean(); + if (createPayload) { + switch (encoding) { + case 0: { + float theFloat = randomFloat(); + payloads.get(token).add(new BytesRef(PayloadHelper.encodeFloat(theFloat))); + break; + } + case 1: { + payloads.get(token).add(new BytesRef(PayloadHelper.encodeInt(randomInt()))); + break; + } + case 2: { + String payload = randomUnicodeOfLengthBetween(50, 100); + for (int c = 0; c < payload.length(); c++) { + if (Character.isWhitespace(payload.charAt(c))) { + payload = payload.replace(payload.charAt(c), 'w'); + } + } + payloads.get(token).add(new BytesRef(payload)); + break; + } + default: { + throw new ElasticsearchException("unsupported encoding type"); + } + } + } else { + payloads.get(token).add(new BytesRef()); + } + } + return payloads; + } +} diff --git a/core/src/main/java/org/elasticsearch/index/analysis/ApostropheFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ApostropheFilterFactory.java similarity index 85% rename from core/src/main/java/org/elasticsearch/index/analysis/ApostropheFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ApostropheFilterFactory.java index 0ab84f7caf64e..a0de5507d109c 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/ApostropheFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ApostropheFilterFactory.java @@ -16,20 +16,21 @@ * specific language governing permissions and limitations * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tr.ApostropheFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; /** * Factory for {@link ApostropheFilter} */ public class ApostropheFilterFactory extends AbstractTokenFilterFactory { - public ApostropheFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + ApostropheFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/ClassicFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ClassicFilterFactory.java similarity index 85% rename from core/src/main/java/org/elasticsearch/index/analysis/ClassicFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ClassicFilterFactory.java index defda87c7f7e0..141112a0176c0 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/ClassicFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ClassicFilterFactory.java @@ -16,20 +16,21 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.ClassicFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; /** * Factory for {@link ClassicFilter} */ public class ClassicFilterFactory extends AbstractTokenFilterFactory { - public ClassicFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + ClassicFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index 562484cf07f97..e01a79f56342c 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -67,7 +67,6 @@ import org.apache.lucene.analysis.tr.ApostropheFilter; import org.apache.lucene.analysis.util.ElisionFilter; import org.elasticsearch.index.analysis.CharFilterFactory; -import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory; import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory; import org.elasticsearch.index.analysis.PreConfiguredCharFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; @@ -92,44 +91,53 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin { @Override public Map> getTokenFilters() { Map> filters = new TreeMap<>(); + filters.put("apostrophe", ApostropheFilterFactory::new); + filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new); filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new); - filters.put("keyword_marker", requriesAnalysisSettings(KeywordMarkerTokenFilterFactory::new)); - filters.put("porter_stem", PorterStemTokenFilterFactory::new); - filters.put("snowball", SnowballTokenFilterFactory::new); - filters.put("trim", TrimTokenFilterFactory::new); - filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new); - filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new); - filters.put("unique", UniqueTokenFilterFactory::new); - filters.put("flatten_graph", FlattenGraphTokenFilterFactory::new); - filters.put("length", LengthTokenFilterFactory::new); - filters.put("lowercase", LowerCaseTokenFilterFactory::new); - filters.put("uppercase", UpperCaseTokenFilterFactory::new); - filters.put("nGram", NGramTokenFilterFactory::new); - filters.put("ngram", NGramTokenFilterFactory::new); - filters.put("edgeNGram", EdgeNGramTokenFilterFactory::new); - filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new); - filters.put("stemmer", StemmerTokenFilterFactory::new); - filters.put("stemmer_override", requriesAnalysisSettings(StemmerOverrideTokenFilterFactory::new)); - filters.put("kstem", KStemTokenFilterFactory::new); + filters.put("cjk_bigram", CJKBigramFilterFactory::new); + filters.put("cjk_width", CJKWidthFilterFactory::new); + filters.put("classic", ClassicFilterFactory::new); + filters.put("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new)); + filters.put("decimal_digit", 
DecimalDigitFilterFactory::new); + filters.put("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new); filters.put("dictionary_decompounder", requriesAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new)); - filters.put("hyphenation_decompounder", requriesAnalysisSettings(HyphenationCompoundWordTokenFilterFactory::new)); - filters.put("reverse", ReverseTokenFilterFactory::new); + filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new); + filters.put("edgeNGram", EdgeNGramTokenFilterFactory::new); filters.put("elision", ElisionTokenFilterFactory::new); - filters.put("truncate", requriesAnalysisSettings(TruncateTokenFilterFactory::new)); - filters.put("limit", LimitTokenCountFilterFactory::new); - filters.put("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new)); - filters.put("pattern_replace", requriesAnalysisSettings(PatternReplaceTokenFilterFactory::new)); - filters.put("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new)); - filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new); + filters.put("fingerprint", FingerprintTokenFilterFactory::new); + filters.put("flatten_graph", FlattenGraphTokenFilterFactory::new); filters.put("german_normalization", GermanNormalizationFilterFactory::new); filters.put("hindi_normalization", HindiNormalizationFilterFactory::new); + filters.put("hyphenation_decompounder", requriesAnalysisSettings(HyphenationCompoundWordTokenFilterFactory::new)); filters.put("indic_normalization", IndicNormalizationFilterFactory::new); + filters.put("keep", requriesAnalysisSettings(KeepWordFilterFactory::new)); + filters.put("keep_types", requriesAnalysisSettings(KeepTypesFilterFactory::new)); + filters.put("keyword_marker", requriesAnalysisSettings(KeywordMarkerTokenFilterFactory::new)); + filters.put("kstem", KStemTokenFilterFactory::new); + filters.put("length", LengthTokenFilterFactory::new); + filters.put("limit", LimitTokenCountFilterFactory::new); + filters.put("lowercase", LowerCaseTokenFilterFactory::new); + filters.put("min_hash", MinHashTokenFilterFactory::new); + filters.put("ngram", NGramTokenFilterFactory::new); + filters.put("nGram", NGramTokenFilterFactory::new); + filters.put("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new)); + filters.put("pattern_replace", requriesAnalysisSettings(PatternReplaceTokenFilterFactory::new)); filters.put("persian_normalization", PersianNormalizationFilterFactory::new); + filters.put("porter_stem", PorterStemTokenFilterFactory::new); + filters.put("reverse", ReverseTokenFilterFactory::new); + filters.put("scandinavian_folding", ScandinavianFoldingFilterFactory::new); filters.put("scandinavian_normalization", ScandinavianNormalizationFilterFactory::new); filters.put("serbian_normalization", SerbianNormalizationFilterFactory::new); + filters.put("snowball", SnowballTokenFilterFactory::new); filters.put("sorani_normalization", SoraniNormalizationFilterFactory::new); - filters.put("cjk_width", CJKWidthFilterFactory::new); - filters.put("cjk_bigram", CJKBigramFilterFactory::new); + filters.put("stemmer_override", requriesAnalysisSettings(StemmerOverrideTokenFilterFactory::new)); + filters.put("stemmer", StemmerTokenFilterFactory::new); + filters.put("trim", TrimTokenFilterFactory::new); + filters.put("truncate", requriesAnalysisSettings(TruncateTokenFilterFactory::new)); + filters.put("unique", UniqueTokenFilterFactory::new); + filters.put("uppercase", UpperCaseTokenFilterFactory::new); + 
filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new); + filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new); return filters; } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/DecimalDigitFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/DecimalDigitFilterFactory.java similarity index 83% rename from core/src/main/java/org/elasticsearch/index/analysis/DecimalDigitFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/DecimalDigitFilterFactory.java index b4806ab707392..2fa4b91f3c812 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/DecimalDigitFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/DecimalDigitFilterFactory.java @@ -17,20 +17,22 @@ * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.DecimalDigitFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.MultiTermAwareComponent; /** * Factory for {@link DecimalDigitFilter} */ public final class DecimalDigitFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { - public DecimalDigitFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { + DecimalDigitFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/DelimitedPayloadTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/DelimitedPayloadTokenFilterFactory.java similarity index 84% rename from core/src/main/java/org/elasticsearch/index/analysis/DelimitedPayloadTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/DelimitedPayloadTokenFilterFactory.java index 12d0a041bd5e4..e4c299d172ce5 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/DelimitedPayloadTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/DelimitedPayloadTokenFilterFactory.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; @@ -28,6 +28,7 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; public class DelimitedPayloadTokenFilterFactory extends AbstractTokenFilterFactory { @@ -37,11 +38,10 @@ public class DelimitedPayloadTokenFilterFactory extends AbstractTokenFilterFacto static final String ENCODING = "encoding"; static final String DELIMITER = "delimiter"; - char delimiter; - PayloadEncoder encoder; + private final char delimiter; + private final PayloadEncoder encoder; - public DelimitedPayloadTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, - Settings settings) { + DelimitedPayloadTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); String delimiterConf = settings.get(DELIMITER); if (delimiterConf != null) { @@ -57,6 +57,8 @@ public DelimitedPayloadTokenFilterFactory(IndexSettings indexSettings, Environme encoder = new IntegerEncoder(); } else if (settings.get(ENCODING).equals("identity")) { encoder = new IdentityEncoder(); + } else { + encoder = DEFAULT_ENCODER; } } else { encoder = DEFAULT_ENCODER; @@ -65,8 +67,7 @@ public DelimitedPayloadTokenFilterFactory(IndexSettings indexSettings, Environme @Override public TokenStream create(TokenStream tokenStream) { - DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(tokenStream, delimiter, encoder); - return filter; + return new DelimitedPayloadTokenFilter(tokenStream, delimiter, encoder); } } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/FingerprintTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FingerprintTokenFilterFactory.java similarity index 58% rename from core/src/main/java/org/elasticsearch/index/analysis/FingerprintTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FingerprintTokenFilterFactory.java index 55623e8f831da..bee11bbec7412 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/FingerprintTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FingerprintTokenFilterFactory.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.FingerprintFilter; @@ -25,24 +25,21 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider; +import static org.elasticsearch.index.analysis.FingerprintAnalyzerProvider.DEFAULT_MAX_OUTPUT_SIZE; +import static org.elasticsearch.index.analysis.FingerprintAnalyzerProvider.MAX_OUTPUT_SIZE; public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory { private final char separator; private final int maxOutputSize; - public static ParseField SEPARATOR = new ParseField("separator"); - public static ParseField MAX_OUTPUT_SIZE = new ParseField("max_output_size"); - - public static final char DEFAULT_SEPARATOR = ' '; - public static final int DEFAULT_MAX_OUTPUT_SIZE = 255; - - public FingerprintTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + FingerprintTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); - this.separator = parseSeparator(settings); - this.maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(), - FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE); + this.separator = FingerprintAnalyzerProvider.parseSeparator(settings); + this.maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(), DEFAULT_MAX_OUTPUT_SIZE); } @Override @@ -52,15 +49,4 @@ public TokenStream create(TokenStream tokenStream) { return result; } - public static char parseSeparator(Settings settings) throws IllegalArgumentException { - String customSeparator = settings.get(SEPARATOR.getPreferredName()); - if (customSeparator == null) { - return FingerprintTokenFilterFactory.DEFAULT_SEPARATOR; - } else if (customSeparator.length() == 1) { - return customSeparator.charAt(0); - } - - throw new IllegalArgumentException("Setting [separator] must be a single, non-null character. [" - + customSeparator + "] was provided."); - } } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/KeepTypesFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java similarity index 88% rename from core/src/main/java/org/elasticsearch/index/analysis/KeepTypesFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java index 1f44657a89733..4da560836eb13 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/KeepTypesFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java @@ -17,13 +17,15 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.TypeTokenFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.TokenFilterFactory; import java.util.Arrays; import java.util.HashSet; @@ -43,8 +45,7 @@ public class KeepTypesFilterFactory extends AbstractTokenFilterFactory { private final Set keepTypes; private static final String KEEP_TYPES_KEY = "types"; - public KeepTypesFilterFactory(IndexSettings indexSettings, - Environment env, String name, Settings settings) { + KeepTypesFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); final String[] arrayKeepTypes = settings.getAsArray(KEEP_TYPES_KEY, null); diff --git a/core/src/main/java/org/elasticsearch/index/analysis/KeepWordFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepWordFilterFactory.java similarity index 89% rename from core/src/main/java/org/elasticsearch/index/analysis/KeepWordFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepWordFilterFactory.java index 631040d233309..f42797e0ff644 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/KeepWordFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepWordFilterFactory.java @@ -17,7 +17,7 @@ * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; @@ -26,6 +26,10 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.Analysis; +import org.elasticsearch.index.analysis.StopTokenFilterFactory; +import org.elasticsearch.index.analysis.TokenFilterFactory; /** * A {@link TokenFilterFactory} for {@link KeepWordFilter}. 
This filter only @@ -54,8 +58,7 @@ public class KeepWordFilterFactory extends AbstractTokenFilterFactory { // unsupported ancient option private static final String ENABLE_POS_INC_KEY = "enable_position_increments"; - public KeepWordFilterFactory(IndexSettings indexSettings, - Environment env, String name, Settings settings) { + KeepWordFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); final String[] arrayKeepWords = settings.getAsArray(KEEP_WORDS_KEY, null); diff --git a/core/src/main/java/org/elasticsearch/index/analysis/MinHashTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MinHashTokenFilterFactory.java similarity index 89% rename from core/src/main/java/org/elasticsearch/index/analysis/MinHashTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MinHashTokenFilterFactory.java index 19213dffe2ab9..bc3f96c087efe 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/MinHashTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MinHashTokenFilterFactory.java @@ -17,13 +17,14 @@ * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.minhash.MinHashFilterFactory; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import java.util.HashMap; import java.util.Map; @@ -36,7 +37,7 @@ public class MinHashTokenFilterFactory extends AbstractTokenFilterFactory { private final MinHashFilterFactory minHashFilterFactory; - public MinHashTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + MinHashTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); minHashFilterFactory = new MinHashFilterFactory(convertSettings(settings)); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/ScandinavianFoldingFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScandinavianFoldingFilterFactory.java similarity index 83% rename from core/src/main/java/org/elasticsearch/index/analysis/ScandinavianFoldingFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScandinavianFoldingFilterFactory.java index c55b487fe79a6..6a1dbfdb19228 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/ScandinavianFoldingFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScandinavianFoldingFilterFactory.java @@ -16,20 +16,22 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.MultiTermAwareComponent; /** * Factory for {@link ScandinavianFoldingFilter} */ public class ScandinavianFoldingFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { - public ScandinavianFoldingFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + ScandinavianFoldingFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java index 47f37f3a0e513..fbc57d3b9cf52 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -22,7 +22,6 @@ import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory; import org.apache.lucene.analysis.en.PorterStemFilterFactory; import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory; -import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory; import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory; import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory; import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory; @@ -117,6 +116,15 @@ protected Map> getTokenFilters() { filters.put("soraninormalization", SoraniNormalizationFilterFactory.class); filters.put("cjkwidth", CJKWidthFilterFactory.class); filters.put("cjkbigram", CJKBigramFilterFactory.class); + filters.put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class); + filters.put("keepword", KeepWordFilterFactory.class); + filters.put("type", KeepTypesFilterFactory.class); + filters.put("classic", ClassicFilterFactory.class); + filters.put("apostrophe", ApostropheFilterFactory.class); + filters.put("decimaldigit", DecimalDigitFilterFactory.class); + filters.put("fingerprint", FingerprintTokenFilterFactory.class); + filters.put("minhash", MinHashTokenFilterFactory.class); + filters.put("scandinavianfolding", ScandinavianFoldingFilterFactory.class); return filters; } @@ -155,7 +163,7 @@ protected Map> getPreConfiguredTokenFilters() { filters.put("common_grams", null); filters.put("czech_stem", null); filters.put("decimal_digit", null); - filters.put("delimited_payload_filter", DelimitedPayloadTokenFilterFactory.class); + filters.put("delimited_payload_filter", org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class); filters.put("dutch_stem", SnowballPorterFilterFactory.class); filters.put("edge_ngram", null); filters.put("edgeNGram", null); diff --git a/core/src/test/java/org/elasticsearch/index/analysis/KeepFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepFilterFactoryTests.java similarity index 87% rename from 
core/src/test/java/org/elasticsearch/index/analysis/KeepFilterFactoryTests.java rename to modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepFilterFactoryTests.java index d5a6a590e78ae..83373e169b418 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/KeepFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepFilterFactoryTests.java @@ -17,12 +17,14 @@ * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; +import org.elasticsearch.index.analysis.AnalysisTestsHelper; +import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTokenStreamTestCase; import org.junit.Assert; @@ -33,10 +35,11 @@ import static org.hamcrest.Matchers.instanceOf; public class KeepFilterFactoryTests extends ESTokenStreamTestCase { - private static final String RESOURCE = "/org/elasticsearch/index/analysis/keep_analysis.json"; + private static final String RESOURCE = "/org/elasticsearch/analysis/common/keep_analysis.json"; public void testLoadWithoutSettings() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath( + createTempDir(), RESOURCE, new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep"); Assert.assertNull(tokenFilter); } @@ -49,7 +52,7 @@ public void testLoadOverConfiguredSettings() { .put("index.analysis.filter.broken_keep_filter.keep_words", "[\"Hello\", \"worlD\"]") .build(); try { - AnalysisTestsHelper.createTestAnalysisFromSettings(settings); + AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()); Assert.fail("path and array are configured"); } catch (IllegalArgumentException e) { } catch (IOException e) { @@ -65,7 +68,7 @@ public void testKeepWordsPathSettings() { .build(); try { // test our none existing setup is picked up - AnalysisTestsHelper.createTestAnalysisFromSettings(settings); + AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()); fail("expected an exception due to non existent keep_words_path"); } catch (IllegalArgumentException e) { } catch (IOException e) { @@ -77,7 +80,7 @@ public void testKeepWordsPathSettings() { .build(); try { // test our none existing setup is picked up - AnalysisTestsHelper.createTestAnalysisFromSettings(settings); + AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()); fail("expected an exception indicating that you can't use [keep_words_path] with [keep_words] "); } catch (IllegalArgumentException e) { } catch (IOException e) { @@ -87,7 +90,8 @@ public void testKeepWordsPathSettings() { } public void testCaseInsensitiveMapping() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath( + createTempDir(), RESOURCE, new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_keep_filter"); assertThat(tokenFilter, 
instanceOf(KeepWordFilterFactory.class)); String source = "hello small world"; @@ -98,7 +102,8 @@ public void testCaseInsensitiveMapping() throws IOException { } public void testCaseSensitiveMapping() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath( + createTempDir(), RESOURCE, new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_case_sensitive_keep_filter"); assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class)); String source = "Hello small world"; diff --git a/core/src/test/java/org/elasticsearch/index/analysis/KeepTypesFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java similarity index 90% rename from core/src/test/java/org/elasticsearch/index/analysis/KeepTypesFilterFactoryTests.java rename to modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java index 48ce1139d8f20..4df1fb780e932 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/KeepTypesFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java @@ -17,12 +17,14 @@ * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; +import org.elasticsearch.index.analysis.AnalysisTestsHelper; +import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTokenStreamTestCase; @@ -38,7 +40,7 @@ public void testKeepTypes() throws IOException { .put("index.analysis.filter.keep_numbers.type", "keep_types") .putArray("index.analysis.filter.keep_numbers.types", new String[] {"<NUM>", "<SOMETHINGELSE>"}) .build(); - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers"); assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class)); String source = "Hello 123 world"; diff --git a/core/src/test/java/org/elasticsearch/index/analysis/MinHashFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/MinHashFilterFactoryTests.java similarity index 91% rename from core/src/test/java/org/elasticsearch/index/analysis/MinHashFilterFactoryTests.java rename to modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/MinHashFilterFactoryTests.java index fc78afa7ab9cc..52879c6b7e7aa 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/MinHashFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/MinHashFilterFactoryTests.java @@ -17,12 +17,14 @@ * under the License.
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; +import org.elasticsearch.index.analysis.AnalysisTestsHelper; +import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTokenStreamTestCase; @@ -37,7 +39,7 @@ public void testDefault() throws IOException { Settings settings = Settings.builder() .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("min_hash"); String source = "the quick brown fox"; Tokenizer tokenizer = new WhitespaceTokenizer(); @@ -58,7 +60,7 @@ public void testSettings() throws IOException { .put("index.analysis.filter.test_min_hash.with_rotation", false) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) .build(); - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("test_min_hash"); String source = "sushi"; Tokenizer tokenizer = new WhitespaceTokenizer(); diff --git a/core/src/test/resources/org/elasticsearch/index/analysis/keep_analysis.json b/modules/analysis-common/src/test/resources/org/elasticsearch/analysis/common/keep_analysis.json similarity index 100% rename from core/src/test/resources/org/elasticsearch/index/analysis/keep_analysis.json rename to modules/analysis-common/src/test/resources/org/elasticsearch/analysis/common/keep_analysis.json diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml index 1ec5060a25746..f2db3facb3d01 100644 --- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml @@ -993,3 +993,268 @@ - match: { tokens.8.token: に落 } - match: { tokens.9.token: 落ち } - match: { tokens.10.token: ちた } + +--- +"delimited_payload_filter": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_delimited_payload_filter: + type: delimited_payload_filter + delimiter: ^ + encoding: identity + - do: + indices.analyze: + index: test + body: + text: foo^bar + tokenizer: keyword + filter: [my_delimited_payload_filter] + - length: { tokens: 1 } + - match: { tokens.0.token: foo } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: foo|5 + tokenizer: keyword + filter: [delimited_payload_filter] + - length: { tokens: 1 } + - match: { tokens.0.token: foo } + +--- +"keep_filter": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_keep: + type: keep + keep_words: [foo,bar] + - do: + indices.analyze: + index: test + body: + text: foo bar baz + tokenizer: whitespace + filter: [my_keep] + - length: { 
tokens: 2 } + - match: { tokens.0.token: foo } + - match: { tokens.1.token: bar } + +--- +"keep_types_filter": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_keep_types: + type: keep_types + types: ["<NUM>"] + - do: + indices.analyze: + index: test + body: + text: foo 123 bar + tokenizer: standard + filter: [my_keep_types] + - length: { tokens: 1 } + - match: { tokens.0.token: "123" } + +--- +"classic": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_classic: + type: classic + - do: + indices.analyze: + index: test + body: + text: foo's bar + tokenizer: classic + filter: [my_classic] + - length: { tokens: 2 } + - match: { tokens.0.token: foo } + - match: { tokens.1.token: bar } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: foo's bar + tokenizer: classic + filter: [classic] + - length: { tokens: 2 } + - match: { tokens.0.token: foo } + - match: { tokens.1.token: bar } + +--- +"apostrophe": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_apostrophe: + type: apostrophe + - do: + indices.analyze: + index: test + body: + text: foo's bar + tokenizer: keyword + filter: [my_apostrophe] + - length: { tokens: 1 } + - match: { tokens.0.token: foo } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: foo's bar + tokenizer: keyword + filter: [apostrophe] + - length: { tokens: 1 } + - match: { tokens.0.token: foo } + +--- +"decimal_digit": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_decimal_digit: + type: decimal_digit + - do: + indices.analyze: + index: test + body: + text: ١٢٣٤ + tokenizer: keyword + filter: [my_decimal_digit] + - length: { tokens: 1 } + - match: { tokens.0.token: "1234" } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: ١٢٣٤ + tokenizer: keyword + filter: [decimal_digit] + - length: { tokens: 1 } + - match: { tokens.0.token: "1234" } + +--- +"fingerprint": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_fingerprint: + type: fingerprint + separator: _ + - do: + indices.analyze: + index: test + body: + text: A1 B2 A1 D4 C3 + tokenizer: whitespace + filter: [my_fingerprint] + - length: { tokens: 1 } + - match: { tokens.0.token: A1_B2_C3_D4 } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: A1 B2 A1 D4 C3 + tokenizer: whitespace + filter: [fingerprint] + - length: { tokens: 1 } + - match: { tokens.0.token: A1 B2 C3 D4 } + +--- +"min_hash": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_min_hash: + type: min_hash + - do: + indices.analyze: + index: test + body: + text: the quick brown fox + tokenizer: whitespace + filter: [my_min_hash] + - length: { tokens: 512 } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: the quick brown fox + tokenizer: whitespace + filter: [min_hash] + - length: { tokens: 512 } + +--- +"scandinavian_folding": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_scandinavian_folding: + type: scandinavian_folding + - do: + indices.analyze: + index: test + body: + text: räksmörgås + tokenizer: keyword + filter: [my_scandinavian_folding] + - length: { tokens: 1 } + - match: { tokens.0.token: raksmorgas } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: räksmörgås + tokenizer:
keyword + filter: [scandinavian_folding] + - length: { tokens: 1 } + - match: { tokens.0.token: raksmorgas } diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index c9c214560a3d8..3a04f86f34ecd 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -23,23 +23,16 @@ import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.analysis.util.TokenizerFactory; import org.elasticsearch.common.collect.MapBuilder; -import org.elasticsearch.index.analysis.ApostropheFilterFactory; import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory; import org.elasticsearch.index.analysis.BrazilianStemTokenFilterFactory; -import org.elasticsearch.index.analysis.ClassicFilterFactory; import org.elasticsearch.index.analysis.ClassicTokenizerFactory; import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory; -import org.elasticsearch.index.analysis.DecimalDigitFilterFactory; -import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory; import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory; import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory; import org.elasticsearch.index.analysis.HunspellTokenFilterFactory; -import org.elasticsearch.index.analysis.KeepTypesFilterFactory; -import org.elasticsearch.index.analysis.KeepWordFilterFactory; import org.elasticsearch.index.analysis.KeywordTokenizerFactory; import org.elasticsearch.index.analysis.LetterTokenizerFactory; import org.elasticsearch.index.analysis.LowerCaseTokenizerFactory; -import org.elasticsearch.index.analysis.MinHashTokenFilterFactory; import org.elasticsearch.index.analysis.MultiTermAwareComponent; import org.elasticsearch.index.analysis.NGramTokenizerFactory; import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory; @@ -47,7 +40,6 @@ import org.elasticsearch.index.analysis.PreConfiguredCharFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenizer; -import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory; import org.elasticsearch.index.analysis.ShingleTokenFilterFactory; import org.elasticsearch.index.analysis.StandardTokenFilterFactory; import org.elasticsearch.index.analysis.StandardTokenizerFactory; @@ -120,7 +112,7 @@ private static String toCamelCase(String s) { static final Map<String, Class<?>> KNOWN_TOKENFILTERS = new MapBuilder<String, Class<?>>() // exposed in ES - .put("apostrophe", ApostropheFilterFactory.class) + .put("apostrophe", MovedToAnalysisCommon.class) .put("arabicnormalization", MovedToAnalysisCommon.class) .put("arabicstem", ArabicStemTokenFilterFactory.class) .put("asciifolding", MovedToAnalysisCommon.class) @@ -128,12 +120,12 @@ private static String toCamelCase(String s) { .put("bulgarianstem", MovedToAnalysisCommon.class) .put("cjkbigram", MovedToAnalysisCommon.class) .put("cjkwidth", MovedToAnalysisCommon.class) - .put("classic", ClassicFilterFactory.class) + .put("classic", MovedToAnalysisCommon.class) .put("commongrams", MovedToAnalysisCommon.class) .put("commongramsquery", MovedToAnalysisCommon.class) .put("czechstem", CzechStemTokenFilterFactory.class) - .put("decimaldigit", DecimalDigitFilterFactory.class) - .put("delimitedpayload",
DelimitedPayloadTokenFilterFactory.class) + .put("decimaldigit", MovedToAnalysisCommon.class) + .put("delimitedpayload", MovedToAnalysisCommon.class) .put("dictionarycompoundword", MovedToAnalysisCommon.class) .put("edgengram", MovedToAnalysisCommon.class) .put("elision", MovedToAnalysisCommon.class) @@ -159,7 +151,7 @@ private static String toCamelCase(String s) { .put("irishlowercase", MovedToAnalysisCommon.class) .put("indonesianstem", MovedToAnalysisCommon.class) .put("italianlightstem", MovedToAnalysisCommon.class) - .put("keepword", KeepWordFilterFactory.class) + .put("keepword", MovedToAnalysisCommon.class) .put("keywordmarker", MovedToAnalysisCommon.class) .put("kstem", MovedToAnalysisCommon.class) .put("latvianstem", MovedToAnalysisCommon.class) @@ -178,11 +170,11 @@ private static String toCamelCase(String s) { .put("portugueseminimalstem", MovedToAnalysisCommon.class) .put("reversestring", MovedToAnalysisCommon.class) .put("russianlightstem", MovedToAnalysisCommon.class) - .put("scandinavianfolding", ScandinavianFoldingFilterFactory.class) + .put("scandinavianfolding", MovedToAnalysisCommon.class) .put("scandinaviannormalization", MovedToAnalysisCommon.class) .put("serbiannormalization", MovedToAnalysisCommon.class) .put("shingle", ShingleTokenFilterFactory.class) - .put("minhash", MinHashTokenFilterFactory.class) + .put("minhash", MovedToAnalysisCommon.class) .put("snowballporter", MovedToAnalysisCommon.class) .put("soraninormalization", MovedToAnalysisCommon.class) .put("soranistem", MovedToAnalysisCommon.class) @@ -196,7 +188,7 @@ private static String toCamelCase(String s) { .put("trim", MovedToAnalysisCommon.class) .put("truncate", MovedToAnalysisCommon.class) .put("turkishlowercase", MovedToAnalysisCommon.class) - .put("type", KeepTypesFilterFactory.class) + .put("type", MovedToAnalysisCommon.class) .put("uppercase", MovedToAnalysisCommon.class) .put("worddelimiter", MovedToAnalysisCommon.class) .put("worddelimitergraph", MovedToAnalysisCommon.class)
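
A few notes on behavior this change relies on, with hedged sketches; none of the following code is part of the diff itself. First, the relocated factories are only reachable because the analysis-common module's plugin registers them by name through the AnalysisPlugin extension point; that registration hunk is not shown in this excerpt. A minimal sketch of what such wiring looks like, assuming the AnalysisPlugin API of this era (the class name is illustrative, the put calls are an abridged guess at the real list, and the real CommonAnalysisPlugin additionally wraps settings-dependent factories such as keep/keep_types with AnalysisPlugin.requiresAnalysisSettings):

package org.elasticsearch.analysis.common;

import java.util.Map;
import java.util.TreeMap;

import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

// Illustrative sketch only, not the real CommonAnalysisPlugin.
public class CommonAnalysisPluginSketch extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        Map<String, AnalysisProvider<TokenFilterFactory>> filters = new TreeMap<>();
        // The map key is the "type" users write in index settings; the factory
        // constructors match AnalysisProvider#get(IndexSettings, Environment, String, Settings).
        filters.put("keep", KeepWordFilterFactory::new);
        filters.put("keep_types", KeepTypesFilterFactory::new);
        filters.put("fingerprint", FingerprintTokenFilterFactory::new);
        filters.put("min_hash", MinHashTokenFilterFactory::new);
        filters.put("scandinavian_folding", ScandinavianFoldingFilterFactory::new);
        return filters;
    }
}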
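
The delimited_payload_filter YAML case exercises the split-at-delimiter contract: the left side of each token stays as the term and the right side is stored as the token's payload, which is why the analyze assertions only see "foo". A self-contained sketch against the Lucene filter that the ES factory wraps, using the same '^' delimiter and identity encoding as the test (class name is illustrative):

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.IdentityEncoder;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;

public class DelimitedPayloadSketch {
    public static void main(String[] args) throws Exception {
        KeywordTokenizer tokenizer = new KeywordTokenizer();
        tokenizer.setReader(new StringReader("foo^bar"));
        // '^' and identity encoding mirror my_delimited_payload_filter in the YAML test.
        TokenStream stream = new DelimitedPayloadTokenFilter(tokenizer, '^', new IdentityEncoder());
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // Prints: foo payload=bar. The analyze API reports only the term,
            // which is why the test asserts just tokens.0.token.
            System.out.println(term + " payload=" + payload.getPayload().utf8ToString());
        }
        stream.end();
        stream.close();
    }
}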
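
The keep_types cases depend on the type labels the tokenizer attaches: StandardTokenizer marks "123" as <NUM> and words as <ALPHANUM>, so keeping type <NUM> leaves exactly one token in both the Java and YAML tests. A small sketch that prints those types, using only stock Lucene classes:

import java.io.StringReader;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class TokenTypeSketch {
    public static void main(String[] args) throws Exception {
        StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader("Hello 123 world"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            // Prints: Hello=<ALPHANUM>, 123=<NUM>, world=<ALPHANUM>
            System.out.println(term + "=" + type.type());
        }
        tokenizer.end();
        tokenizer.close();
    }
}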
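
The fingerprint YAML case encodes that filter's contract: collect all tokens, de-duplicate, sort, and join with the separator, so "A1 B2 A1 D4 C3" collapses to the single token "A1_B2_C3_D4" (or "A1 B2 C3 D4" with the default space separator). A sketch against Lucene's FingerprintFilter, which the ES factory wraps; 255 here assumes the DEFAULT_MAX_OUTPUT_SIZE from the provider, and the class name is illustrative:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class FingerprintFilterSketch {
    public static void main(String[] args) throws Exception {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("A1 B2 A1 D4 C3"));
        // max_output_size 255, separator '_' — mirrors my_fingerprint in the YAML test.
        TokenStream stream = new FingerprintFilter(tokenizer, 255, '_');
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString()); // prints: A1_B2_C3_D4
        }
        stream.end();
        stream.close();
    }
}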
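
Finally, the min_hash cases assert exactly 512 tokens with no settings because the factory defaults appear to be hash_count=1, bucket_count=512, hash_set_size=1, with_rotation=true (matching MinHashFilterFactoryTests above): with rotation on, every empty bucket is filled from its neighbor, so the output size equals bucket_count regardless of input length. A sketch against Lucene's MinHashFilter under those assumed defaults:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.minhash.MinHashFilter;

public class MinHashFilterSketch {
    public static void main(String[] args) throws Exception {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("the quick brown fox"));
        // Assumed defaults: 1 hash, 512 buckets, hash set size 1, rotation on.
        TokenStream stream = new MinHashFilter(tokenizer, 1, 512, 1, true);
        stream.reset();
        int count = 0;
        while (stream.incrementToken()) {
            count++;
        }
        stream.end();
        stream.close();
        System.out.println(count); // 512: rotation fills all 512 buckets
    }
}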