Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,8 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.elasticsearch.Version;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.analysis.NamedAnalyzer;
Expand All @@ -36,6 +34,7 @@
import java.util.Map;

import static org.elasticsearch.common.xcontent.support.XContentMapValues.nodeIntegerValue;
import static org.elasticsearch.common.xcontent.support.XContentMapValues.nodeBooleanValue;
import static org.elasticsearch.index.mapper.TypeParsers.parseField;

/**
Expand All @@ -47,10 +46,12 @@ public class TokenCountFieldMapper extends FieldMapper {

/** Default settings for {@code token_count} fields. */
public static class Defaults {
// Token counts use the integer number field type.
public static final MappedFieldType FIELD_TYPE = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER);
// Position increments are counted unless the mapping sets enable_position_increments to false.
public static final boolean DEFAULT_POSITION_INCREMENTS = true;
}

public static class Builder extends FieldMapper.Builder<Builder, TokenCountFieldMapper> {
private NamedAnalyzer analyzer;
private boolean enablePositionIncrements = Defaults.DEFAULT_POSITION_INCREMENTS;

public Builder(String name) {
super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
Expand All @@ -66,18 +67,26 @@ public NamedAnalyzer analyzer() {
return analyzer;
}

/**
 * Sets whether position increments are counted.
 * @param enablePositionIncrements value handed to the mapper created by {@code build}
 * @return this builder, so calls can be chained
 */
public Builder enablePositionIncrements(boolean enablePositionIncrements) {
this.enablePositionIncrements = enablePositionIncrements;
return this;
}

/**
 * Returns the position-increment setting currently held by this builder.
 * @return {@code true} if position increments will be counted
 */
public boolean enablePositionIncrements() {
return enablePositionIncrements;
}

/**
 * Creates the mapper from this builder's state.
 * Calls {@code setupFieldType(context)} first, then passes the field types, index settings,
 * analyzer, position-increment flag, multi-fields and copy-to settings to the mapper constructor.
 * @param context builder context supplying the index settings
 * @return the new {@code TokenCountFieldMapper}
 */
@Override
public TokenCountFieldMapper build(BuilderContext context) {
setupFieldType(context);
return new TokenCountFieldMapper(name, fieldType, defaultFieldType,
context.indexSettings(), analyzer, multiFieldsBuilder.build(this, context), copyTo);
context.indexSettings(), analyzer, enablePositionIncrements, multiFieldsBuilder.build(this, context), copyTo);
}
}

public static class TypeParser implements Mapper.TypeParser {
@Override
@SuppressWarnings("unchecked")
public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
TokenCountFieldMapper.Builder builder = new TokenCountFieldMapper.Builder(name);
for (Iterator<Map.Entry<String, Object>> iterator = node.entrySet().iterator(); iterator.hasNext();) {
Map.Entry<String, Object> entry = iterator.next();
Expand All @@ -93,6 +102,9 @@ public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext
}
builder.analyzer(analyzer);
iterator.remove();
} else if (propName.equals("enable_position_increments")) {
builder.enablePositionIncrements(nodeBooleanValue(propNode));
iterator.remove();
}
}
parseField(builder, name, node, parserContext);
Expand All @@ -104,11 +116,13 @@ public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext
}

private NamedAnalyzer analyzer;
private boolean enablePositionIncrements;

/**
 * Creates a token count mapper. All arguments except {@code analyzer} and
 * {@code enablePositionIncrements} are forwarded to the {@code FieldMapper} constructor.
 * @param analyzer analyzer kept on this mapper and used to count tokens
 * @param enablePositionIncrements whether position increments are counted
 */
protected TokenCountFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
Settings indexSettings, NamedAnalyzer analyzer, MultiFields multiFields, CopyTo copyTo) {
Settings indexSettings, NamedAnalyzer analyzer, boolean enablePositionIncrements, MultiFields multiFields, CopyTo copyTo) {
super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
this.analyzer = analyzer;
this.enablePositionIncrements = enablePositionIncrements;
}

@Override
Expand All @@ -124,7 +138,7 @@ protected void parseCreateField(ParseContext context, List<IndexableField> field
if (value == null) {
tokenCount = (Integer) fieldType().nullValue();
} else {
tokenCount = countPositions(analyzer, name(), value);
tokenCount = countPositions(analyzer, name(), value, enablePositionIncrements);
}

boolean indexed = fieldType().indexOptions() != IndexOptions.NONE;
Expand All @@ -138,19 +152,26 @@ protected void parseCreateField(ParseContext context, List<IndexableField> field
* @param analyzer analyzer to create token stream
* @param fieldName field name to pass to analyzer
* @param fieldValue field value to pass to analyzer
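* @param enablePositionIncrements if {@code true}, sum the position increments (including the final one after the stream ends); if {@code false}, each token adds at most one and the final increment is ignored
* @return the sum of position increments, or, when {@code enablePositionIncrements} is {@code false}, the sum of {@code min(1, increment)} over the tokens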
* @throws IOException if tokenStream throws it
*/
static int countPositions(Analyzer analyzer, String fieldName, String fieldValue) throws IOException {
static int countPositions(Analyzer analyzer, String fieldName, String fieldValue, boolean enablePositionIncrements) throws IOException {
// try-with-resources closes the token stream on every exit path.
try (TokenStream tokenStream = analyzer.tokenStream(fieldName, fieldValue)) {
int count = 0;
PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
count += position.getPositionIncrement();
if (enablePositionIncrements) {
// Add the raw increment, so a token with an increment above one adds more than one.
count += position.getPositionIncrement();
} else {
// Each token adds at most one; a token with a zero increment adds nothing.
count += Math.min(1, position.getPositionIncrement());
}
}
tokenStream.end();
count += position.getPositionIncrement();
// After end() the attribute is read once more for the stream's final increment;
// it is only added when increments are being counted.
if (enablePositionIncrements) {
count += position.getPositionIncrement();
}
return count;
}
}
Expand All @@ -163,6 +184,14 @@ public String analyzer() {
return analyzer.name();
}

/**
 * Indicates whether position increments are counted by this mapper.
 * @return {@code true} if position increments are counted
 */
public boolean enablePositionIncrements() {
return enablePositionIncrements;
}

@Override
protected String contentType() {
return CONTENT_TYPE;
Expand All @@ -172,12 +201,16 @@ protected String contentType() {
protected void doMerge(Mapper mergeWith, boolean updateAllTypes) {
    super.doMerge(mergeWith, updateAllTypes);
    // After the generic merge, adopt the token-count specific settings of the incoming mapper.
    final TokenCountFieldMapper incoming = (TokenCountFieldMapper) mergeWith;
    this.analyzer = incoming.analyzer;
    this.enablePositionIncrements = incoming.enablePositionIncrements;
}

/**
 * Writes the mapper-specific mapping fields after the inherited ones.
 * The analyzer name is always written; {@code enable_position_increments} is written only
 * when defaults are requested or the value differs from the default.
 */
@Override
protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException {
    super.doXContentBody(builder, includeDefaults, params);
    builder.field("analyzer", analyzer());
    // Nothing more to emit when the flag still has its default and defaults were not requested.
    if (!includeDefaults && enablePositionIncrements() == Defaults.DEFAULT_POSITION_INCREMENTS) {
        return;
    }
    builder.field("enable_position_increments", enablePositionIncrements());
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,12 @@ private void init() throws IOException {
.field("analyzer", "standard")
.field("doc_values", true)
.endObject()
.startObject("token_count_without_position_increments")
.field("type", "token_count")
.field("analyzer", "english")
.field("enable_position_increments", false)
.field("store", true)
.endObject()
.endObject()
.endObject()
.endObject()
Expand Down Expand Up @@ -169,6 +175,7 @@ private SearchRequestBuilder searchByNumericRange(int low, int high) {
private SearchRequestBuilder prepareSearch() {
SearchRequestBuilder request = client().prepareSearch("test").setTypes("test");
request.addStoredField("foo.token_count");
request.addStoredField("foo.token_count_without_position_increments");
if (loadCountedFields) {
request.addStoredField("foo");
}
Expand All @@ -186,32 +193,38 @@ private void assertSearchReturns(SearchResponse result, String... ids) {
for (SearchHit hit : result.getHits()) {
String id = hit.getId();
if (id.equals("single")) {
assertSearchHit(hit, 4);
assertSearchHit(hit, new int[]{4}, new int[]{4});
} else if (id.equals("bulk1")) {
assertSearchHit(hit, 3);
assertSearchHit(hit, new int[]{3}, new int[]{3});
} else if (id.equals("bulk2")) {
assertSearchHit(hit, 5);
assertSearchHit(hit, new int[]{5}, new int[]{4});
} else if (id.equals("multi")) {
assertSearchHit(hit, 2, 7);
assertSearchHit(hit, new int[]{2, 7}, new int[]{2, 7});
} else if (id.equals("multibulk1")) {
assertSearchHit(hit, 1, 8);
assertSearchHit(hit, new int[]{1, 8}, new int[]{1, 8});
} else if (id.equals("multibulk2")) {
assertSearchHit(hit, 6, 10);
assertSearchHit(hit, new int[]{6, 10}, new int[]{3, 9});
} else {
throw new ElasticsearchException("Unexpected response!");
}
}
}

/**
 * Checks the stored token counts returned for one hit.
 * @param standardTermCounts expected values of {@code foo.token_count}, in order
 * @param englishTermCounts expected values of {@code foo.token_count_without_position_increments}, in order
 */
private void assertSearchHit(SearchHit hit, int... termCounts) {
private void assertSearchHit(SearchHit hit, int[] standardTermCounts, int[] englishTermCounts) {
assertThat(hit.field("foo.token_count"), not(nullValue()));
assertThat(hit.field("foo.token_count").getValues().size(), equalTo(termCounts.length));
for (int i = 0; i < termCounts.length; i++) {
assertThat((Integer) hit.field("foo.token_count").getValues().get(i), equalTo(termCounts[i]));
assertThat(hit.field("foo.token_count").getValues().size(), equalTo(standardTermCounts.length));
for (int i = 0; i < standardTermCounts.length; i++) {
assertThat((Integer) hit.field("foo.token_count").getValues().get(i), equalTo(standardTermCounts[i]));
}

assertThat(hit.field("foo.token_count_without_position_increments"), not(nullValue()));
assertThat(hit.field("foo.token_count_without_position_increments").getValues().size(), equalTo(englishTermCounts.length));
for (int i = 0; i < englishTermCounts.length; i++) {
assertThat((Integer) hit.field("foo.token_count_without_position_increments").getValues().get(i), equalTo(englishTermCounts[i]));
}

// The counted field itself is only checked when it was both stored and requested.
if (loadCountedFields && storeCountedFields) {
assertThat(hit.field("foo").getValues().size(), equalTo(termCounts.length));
assertThat(hit.field("foo").getValues().size(), equalTo(standardTermCounts.length));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,23 +24,18 @@
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESSingleNodeTestCase;
import org.elasticsearch.test.InternalSettingsPlugin;
import org.elasticsearch.test.VersionUtils;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;

import static com.carrotsearch.randomizedtesting.RandomizedTest.getRandom;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;

Expand Down Expand Up @@ -80,26 +75,49 @@ public void testMerge() throws IOException {
assertThat(((TokenCountFieldMapper) stage2.mappers().smartNameFieldMapper("tc")).analyzer(), equalTo("standard"));
}

public void testCountPositions() throws IOException {
// We're looking to make sure that we:
Token t1 = new Token(); // Don't count tokens without an increment
/**
 * With position increments counted, the result must:
 * <ul>
 * <li>not count tokens without an increment</li>
 * <li>count normal tokens with one increment</li>
 * <li>count funny tokens with more than one increment</li>
 * <li>count the final token increments on the rare token streams that have them</li>
 * </ul>
 */
public void testCountPositionsWithIncrements() throws IOException {
    int counted = TokenCountFieldMapper.countPositions(createMockAnalyzer(), "", "", true);
    assertThat(counted, equalTo(7));
}

/**
 * With position increments not counted (only positions are counted), the result must:
 * <ul>
 * <li>not count tokens without an increment</li>
 * <li>count normal tokens with one increment</li>
 * <li>count funny tokens with more than one increment as only one</li>
 * <li>not count the final token increments on the rare token streams that have them</li>
 * </ul>
 */
public void testCountPositionsWithoutIncrements() throws IOException {
    int counted = TokenCountFieldMapper.countPositions(createMockAnalyzer(), "", "", false);
    assertThat(counted, equalTo(2));
}

/**
 * Builds an analyzer whose token stream is a canned stream of three tokens with position
 * increments 0, 1 and 2, in random order, followed by a final position increment of 4.
 * The field name and text passed to the analyzer are ignored.
 */
private Analyzer createMockAnalyzer() {
Token t1 = new Token(); // Token without an increment
t1.setPositionIncrement(0);
Token t2 = new Token();
t2.setPositionIncrement(1); // Count normal tokens with one increment
t2.setPositionIncrement(1); // Normal token with one increment
Token t3 = new Token();
t2.setPositionIncrement(2); // Count funny tokens with more than one increment
int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them
// Set the increment on t3, not t2: otherwise t2's increment of 1 is overwritten and t3 is never configured.
t3.setPositionIncrement(2); // Funny token with more than one increment
int finalTokenIncrement = 4; // Final token increment
Token[] tokens = new Token[] {t1, t2, t3};
// Arrays.asList is a view over the array, so shuffling it reorders the tokens themselves.
Collections.shuffle(Arrays.asList(tokens), random());
final TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
// TODO: we have no CannedAnalyzer?
Analyzer analyzer = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String fieldName) {
return new TokenStreamComponents(new MockTokenizer(), tokenStream);
}
};
assertThat(TokenCountFieldMapper.countPositions(analyzer, "", ""), equalTo(7));
@Override
public TokenStreamComponents createComponents(String fieldName) {
return new TokenStreamComponents(new MockTokenizer(), tokenStream);
}
};
return analyzer;
}

@Override
Expand Down
12 changes: 6 additions & 6 deletions docs/reference/mapping/types/token-count.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,6 @@ GET my_index/_search
<2> The `name.length` field is a `token_count` <<multi-fields,multi-field>> which will index the number of tokens in the `name` field.
<3> This query matches only the document containing `Rachel Alice Williams`, as it contains three tokens.

[NOTE]
===================================================================
Technically the `token_count` type sums position increments rather than
counting tokens. This means that even if the analyzer filters out stop
words they are included in the count.
===================================================================

[[token-count-params]]
==== Parameters for `token_count` fields
Expand All @@ -68,6 +62,12 @@ The following parameters are accepted by `token_count` fields:
value. Required. For best performance, use an analyzer without token
filters.

`enable_position_increments`::

Indicates if position increments should be counted.
Set to `false` if you don't want to count tokens removed by analyzer filters (like <<analysis-stop-tokenfilter,`stop`>>).
Defaults to `true`.

<<mapping-boost,`boost`>>::

Mapping field-level query time boosting. Accepts a floating point number, defaults
Expand Down