@@ -33,17 +33,19 @@
*/
public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> {

public static ParseField MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.MAX_OUTPUT_SIZE;
public static ParseField SEPARATOR = new ParseField("separator");
public static ParseField MAX_OUTPUT_SIZE = new ParseField("max_output_size");

public static int DEFAULT_MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE;
public static int DEFAULT_MAX_OUTPUT_SIZE = 255;
public static CharArraySet DEFAULT_STOP_WORDS = CharArraySet.EMPTY_SET;
public static final char DEFAULT_SEPARATOR = ' ';

private final FingerprintAnalyzer analyzer;

public FingerprintAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);

char separator = FingerprintTokenFilterFactory.parseSeparator(settings);
char separator = parseSeparator(settings);
int maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(),DEFAULT_MAX_OUTPUT_SIZE);
CharArraySet stopWords = Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, DEFAULT_STOP_WORDS);

@@ -54,4 +56,16 @@ public FingerprintAnalyzerProvider(IndexSettings indexSettings, Environment env,
public FingerprintAnalyzer get() {
return analyzer;
}

public static char parseSeparator(Settings settings) throws IllegalArgumentException {
String customSeparator = settings.get(SEPARATOR.getPreferredName());
if (customSeparator == null) {
return DEFAULT_SEPARATOR;
} else if (customSeparator.length() == 1) {
return customSeparator.charAt(0);
}

throw new IllegalArgumentException("Setting [separator] must be a single, non-null character. ["
+ customSeparator + "] was provided.");
}
}
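
The hunk above makes FingerprintAnalyzerProvider self-contained: it declares its own SEPARATOR and MAX_OUTPUT_SIZE parse fields, hard-codes DEFAULT_MAX_OUTPUT_SIZE as 255, and carries its own parseSeparator helper rather than reaching into FingerprintTokenFilterFactory. A minimal sketch of how the relocated parsing behaves; the setting values are hypothetical and the snippet only exercises members visible in the diff:

import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider;

public class FingerprintSettingsSketch {
    public static void main(String[] args) {
        // Hypothetical analyzer settings, for illustration only.
        Settings settings = Settings.builder()
                .put("separator", "+")
                .put("max_output_size", 100)
                .build();

        // Resolves to '+'; without a "separator" setting it falls back to DEFAULT_SEPARATOR (' ').
        char separator = FingerprintAnalyzerProvider.parseSeparator(settings);

        // Resolves to 100; without a "max_output_size" setting it falls back to 255.
        int maxOutputSize = settings.getAsInt(
                FingerprintAnalyzerProvider.MAX_OUTPUT_SIZE.getPreferredName(),
                FingerprintAnalyzerProvider.DEFAULT_MAX_OUTPUT_SIZE);

        // A multi-character value such as "--" makes parseSeparator throw IllegalArgumentException.
        System.out.println(separator + " " + maxOutputSize);
    }
}
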
@@ -29,7 +29,6 @@
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.AnalyzerProvider;
import org.elasticsearch.index.analysis.ApostropheFilterFactory;
import org.elasticsearch.index.analysis.ArabicAnalyzerProvider;
import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory;
import org.elasticsearch.index.analysis.ArmenianAnalyzerProvider;
@@ -41,19 +40,15 @@
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.ChineseAnalyzerProvider;
import org.elasticsearch.index.analysis.CjkAnalyzerProvider;
import org.elasticsearch.index.analysis.ClassicFilterFactory;
import org.elasticsearch.index.analysis.ClassicTokenizerFactory;
import org.elasticsearch.index.analysis.CzechAnalyzerProvider;
import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory;
import org.elasticsearch.index.analysis.DanishAnalyzerProvider;
import org.elasticsearch.index.analysis.DecimalDigitFilterFactory;
import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
import org.elasticsearch.index.analysis.DutchAnalyzerProvider;
import org.elasticsearch.index.analysis.DutchStemTokenFilterFactory;
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
import org.elasticsearch.index.analysis.EnglishAnalyzerProvider;
import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider;
import org.elasticsearch.index.analysis.FingerprintTokenFilterFactory;
import org.elasticsearch.index.analysis.FinnishAnalyzerProvider;
import org.elasticsearch.index.analysis.FrenchAnalyzerProvider;
import org.elasticsearch.index.analysis.FrenchStemTokenFilterFactory;
@@ -67,15 +62,12 @@
import org.elasticsearch.index.analysis.IndonesianAnalyzerProvider;
import org.elasticsearch.index.analysis.IrishAnalyzerProvider;
import org.elasticsearch.index.analysis.ItalianAnalyzerProvider;
import org.elasticsearch.index.analysis.KeepTypesFilterFactory;
import org.elasticsearch.index.analysis.KeepWordFilterFactory;
import org.elasticsearch.index.analysis.KeywordAnalyzerProvider;
import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
import org.elasticsearch.index.analysis.LatvianAnalyzerProvider;
import org.elasticsearch.index.analysis.LetterTokenizerFactory;
import org.elasticsearch.index.analysis.LithuanianAnalyzerProvider;
import org.elasticsearch.index.analysis.LowerCaseTokenizerFactory;
import org.elasticsearch.index.analysis.MinHashTokenFilterFactory;
import org.elasticsearch.index.analysis.NGramTokenizerFactory;
import org.elasticsearch.index.analysis.NorwegianAnalyzerProvider;
import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory;
@@ -89,7 +81,6 @@
import org.elasticsearch.index.analysis.RomanianAnalyzerProvider;
import org.elasticsearch.index.analysis.RussianAnalyzerProvider;
import org.elasticsearch.index.analysis.RussianStemTokenFilterFactory;
import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory;
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
import org.elasticsearch.index.analysis.SimpleAnalyzerProvider;
import org.elasticsearch.index.analysis.SnowballAnalyzerProvider;
@@ -181,26 +172,17 @@ private NamedRegistry<AnalysisProvider<TokenFilterFactory>> setupTokenFilters(Li
tokenFilters.register("stop", StopTokenFilterFactory::new);
tokenFilters.register("standard", StandardTokenFilterFactory::new);
tokenFilters.register("shingle", ShingleTokenFilterFactory::new);
tokenFilters.register("min_hash", MinHashTokenFilterFactory::new);
tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
tokenFilters.register("keep", requriesAnalysisSettings(KeepWordFilterFactory::new));
tokenFilters.register("keep_types", requriesAnalysisSettings(KeepTypesFilterFactory::new));
tokenFilters.register("arabic_stem", ArabicStemTokenFilterFactory::new);
tokenFilters.register("brazilian_stem", BrazilianStemTokenFilterFactory::new);
tokenFilters.register("czech_stem", CzechStemTokenFilterFactory::new);
tokenFilters.register("dutch_stem", DutchStemTokenFilterFactory::new);
tokenFilters.register("french_stem", FrenchStemTokenFilterFactory::new);
tokenFilters.register("german_stem", GermanStemTokenFilterFactory::new);
tokenFilters.register("russian_stem", RussianStemTokenFilterFactory::new);
tokenFilters.register("scandinavian_folding", ScandinavianFoldingFilterFactory::new);

tokenFilters.register("hunspell", requriesAnalysisSettings((indexSettings, env, name, settings) -> new HunspellTokenFilterFactory
(indexSettings, name, settings, hunspellService)));

tokenFilters.register("apostrophe", ApostropheFilterFactory::new);
tokenFilters.register("classic", ClassicFilterFactory::new);
tokenFilters.register("decimal_digit", DecimalDigitFilterFactory::new);
tokenFilters.register("fingerprint", FingerprintTokenFilterFactory::new);
tokenFilters.extractAndRegister(plugins, AnalysisPlugin::getTokenFilters);
return tokenFilters;
}
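
With the hunk above, these token filters are no longer wired up directly in AnalysisModule#setupTokenFilters; the unchanged extractAndRegister(plugins, AnalysisPlugin::getTokenFilters) call remains the hook through which an analysis module or plugin supplies them instead. A minimal sketch of that extension point, with hypothetical plugin and factory names (Lucene's ReverseStringFilter is used only as a stand-in token filter, not one of the filters this change relocates):

import java.util.Map;

import static java.util.Collections.singletonMap;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

// Hypothetical names throughout; a sketch of the extension point, not the factories moved by this PR.
public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {

    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        // Registered under a made-up name; AnalysisModule picks this map up via extractAndRegister.
        return singletonMap("my_reverse", MyReverseTokenFilterFactory::new);
    }

    public static class MyReverseTokenFilterFactory extends AbstractTokenFilterFactory {
        public MyReverseTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
            super(indexSettings, name, settings);
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new ReverseStringFilter(tokenStream);
        }
    }
}
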
@@ -319,14 +319,14 @@ public void testUnknown() throws IOException {
public void testNonPreBuildTokenFilter() throws IOException {
AnalyzeRequest request = new AnalyzeRequest();
request.tokenizer("whitespace");
request.addTokenFilter("min_hash");
request.addTokenFilter("stop"); // stop token filter is not prebuilt in AnalysisModule#setupPreConfiguredTokenFilters()
request.text("the quick brown fox");
AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
int default_hash_count = 1;
int default_bucket_size = 512;
int default_hash_set_size = 1;
assertEquals(default_hash_count * default_bucket_size * default_hash_set_size, tokens.size());
assertEquals(3, tokens.size());
assertEquals("quick", tokens.get(0).getTerm());
assertEquals("brown", tokens.get(1).getTerm());
assertEquals("fox", tokens.get(2).getTerm());
}

public void testNormalizerWithIndex() throws IOException {
@@ -19,17 +19,13 @@

package org.elasticsearch.action.termvectors;

import com.carrotsearch.hppc.ObjectIntHashMap;

import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.ActionFuture;
import org.elasticsearch.action.admin.cluster.shards.ClusterSearchShardsResponse;
import org.elasticsearch.action.admin.indices.alias.Alias;
@@ -374,171 +370,6 @@ public void testDuelESLucene() throws Exception {
}
}

public void testRandomPayloadWithDelimitedPayloadTokenFilter() throws IOException {
//create the test document
int encoding = randomIntBetween(0, 2);
String encodingString = "";
if (encoding == 0) {
encodingString = "float";
}
if (encoding == 1) {
encodingString = "int";
}
if (encoding == 2) {
encodingString = "identity";
}
String[] tokens = crateRandomTokens();
Map<String, List<BytesRef>> payloads = createPayloads(tokens, encoding);
String delimiter = createRandomDelimiter(tokens);
String queryString = createString(tokens, payloads, encoding, delimiter.charAt(0));
//create the mapping
XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties")
.startObject("field").field("type", "text").field("term_vector", "with_positions_offsets_payloads")
.field("analyzer", "payload_test").endObject().endObject().endObject().endObject();
assertAcked(prepareCreate("test").addMapping("type1", mapping).setSettings(
Settings.builder()
.put(indexSettings())
.put("index.analysis.analyzer.payload_test.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.payload_test.filter", "my_delimited_payload_filter")
.put("index.analysis.filter.my_delimited_payload_filter.delimiter", delimiter)
.put("index.analysis.filter.my_delimited_payload_filter.encoding", encodingString)
.put("index.analysis.filter.my_delimited_payload_filter.type", "delimited_payload_filter")));

client().prepareIndex("test", "type1", Integer.toString(1))
.setSource(jsonBuilder().startObject().field("field", queryString).endObject()).execute().actionGet();
refresh();
TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(1)).setPayloads(true).setOffsets(true)
.setPositions(true).setSelectedFields();
TermVectorsResponse response = resp.execute().actionGet();
assertThat("doc id 1 doesn't exists but should", response.isExists(), equalTo(true));
Fields fields = response.getFields();
assertThat(fields.size(), equalTo(1));
Terms terms = fields.terms("field");
TermsEnum iterator = terms.iterator();
while (iterator.next() != null) {
String term = iterator.term().utf8ToString();
PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
assertThat(docsAndPositions.nextDoc(), equalTo(0));
List<BytesRef> curPayloads = payloads.get(term);
assertThat(term, curPayloads, notNullValue());
assertNotNull(docsAndPositions);
for (int k = 0; k < docsAndPositions.freq(); k++) {
docsAndPositions.nextPosition();
if (docsAndPositions.getPayload()!=null){
String infoString = "\nterm: " + term + " has payload \n"+ docsAndPositions.getPayload().toString() + "\n but should have payload \n"+curPayloads.get(k).toString();
assertThat(infoString, docsAndPositions.getPayload(), equalTo(curPayloads.get(k)));
} else {
String infoString = "\nterm: " + term + " has no payload but should have payload \n"+curPayloads.get(k).toString();
assertThat(infoString, curPayloads.get(k).length, equalTo(0));
}
}
}
assertThat(iterator.next(), nullValue());
}

private String createRandomDelimiter(String[] tokens) {
String delimiter = "";
boolean isTokenOrWhitespace = true;
while(isTokenOrWhitespace) {
isTokenOrWhitespace = false;
delimiter = randomUnicodeOfLength(1);
for(String token:tokens) {
if(token.contains(delimiter)) {
isTokenOrWhitespace = true;
}
}
if(Character.isWhitespace(delimiter.charAt(0))) {
isTokenOrWhitespace = true;
}
}
return delimiter;
}

private String createString(String[] tokens, Map<String, List<BytesRef>> payloads, int encoding, char delimiter) {
String resultString = "";
ObjectIntHashMap<String> payloadCounter = new ObjectIntHashMap<>();
for (String token : tokens) {
if (!payloadCounter.containsKey(token)) {
payloadCounter.putIfAbsent(token, 0);
} else {
payloadCounter.put(token, payloadCounter.get(token) + 1);
}
resultString = resultString + token;
BytesRef payload = payloads.get(token).get(payloadCounter.get(token));
if (payload.length > 0) {
resultString = resultString + delimiter;
switch (encoding) {
case 0: {
resultString = resultString + Float.toString(PayloadHelper.decodeFloat(payload.bytes, payload.offset));
break;
}
case 1: {
resultString = resultString + Integer.toString(PayloadHelper.decodeInt(payload.bytes, payload.offset));
break;
}
case 2: {
resultString = resultString + payload.utf8ToString();
break;
}
default: {
throw new ElasticsearchException("unsupported encoding type");
}
}
}
resultString = resultString + " ";
}
return resultString;
}

private Map<String, List<BytesRef>> createPayloads(String[] tokens, int encoding) {
Map<String, List<BytesRef>> payloads = new HashMap<>();
for (String token : tokens) {
if (payloads.get(token) == null) {
payloads.put(token, new ArrayList<BytesRef>());
}
boolean createPayload = randomBoolean();
if (createPayload) {
switch (encoding) {
case 0: {
float theFloat = randomFloat();
payloads.get(token).add(new BytesRef(PayloadHelper.encodeFloat(theFloat)));
break;
}
case 1: {
payloads.get(token).add(new BytesRef(PayloadHelper.encodeInt(randomInt())));
break;
}
case 2: {
String payload = randomUnicodeOfLengthBetween(50, 100);
for (int c = 0; c < payload.length(); c++) {
if (Character.isWhitespace(payload.charAt(c))) {
payload = payload.replace(payload.charAt(c), 'w');
}
}
payloads.get(token).add(new BytesRef(payload));
break;
}
default: {
throw new ElasticsearchException("unsupported encoding type");
}
}
} else {
payloads.get(token).add(new BytesRef());
}
}
return payloads;
}

private String[] crateRandomTokens() {
String[] tokens = { "the", "quick", "brown", "fox" };
int numTokensWithDuplicates = randomIntBetween(3, 15);
String[] finalTokens = new String[numTokensWithDuplicates];
for (int i = 0; i < numTokensWithDuplicates; i++) {
finalTokens[i] = tokens[randomIntBetween(0, tokens.length - 1)];
}
return finalTokens;
}

// like testSimpleTermVectors but we create fields with no term vectors
public void testSimpleTermVectorsWithGenerate() throws IOException {
String[] fieldNames = new String[10];