From 6db89aeaf314c4754748f7cf30ae7f0b149e7c90 Mon Sep 17 00:00:00 2001
From: fbaligand
Date: Wed, 26 Apr 2017 17:54:33 +0200
Subject: [PATCH] token_count type: add an option to count tokens (fix #23227)

Add option "enable_position_increments" with default value true.
If the option is set to false, the indexed value is the number of positions
(position increments are not counted).
---
 .../mapper/core/TokenCountFieldMapper.java    | 51 +++++++++++++++---
 .../TokenCountFieldMapperIntegrationIT.java   | 35 +++++++++----
 .../core/TokenCountFieldMapperTests.java      | 52 +++++++++++++------
 .../mapping/types/token-count.asciidoc        | 12 ++---
 4 files changed, 110 insertions(+), 40 deletions(-)

diff --git a/core/src/main/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapper.java b/core/src/main/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapper.java
index 85df5ea3d3ba7..055feeeacafe7 100644
--- a/core/src/main/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapper.java
+++ b/core/src/main/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapper.java
@@ -43,6 +43,7 @@

 import static org.apache.lucene.index.IndexOptions.NONE;
 import static org.elasticsearch.common.xcontent.support.XContentMapValues.nodeIntegerValue;
+import static org.elasticsearch.common.xcontent.support.XContentMapValues.nodeBooleanValue;
 import static org.elasticsearch.index.mapper.MapperBuilders.tokenCountField;
 import static org.elasticsearch.index.mapper.core.TypeParsers.parseNumberField;

@@ -54,11 +55,13 @@ public class TokenCountFieldMapper extends IntegerFieldMapper {
     public static final String CONTENT_TYPE = "token_count";

     public static class Defaults extends IntegerFieldMapper.Defaults {
+        public static final boolean DEFAULT_POSITION_INCREMENTS = true;
     }

     public static class Builder extends NumberFieldMapper.Builder {
         private NamedAnalyzer analyzer;
+        private boolean enablePositionIncrements = Defaults.DEFAULT_POSITION_INCREMENTS;

         public Builder(String name) {
             super(name, Defaults.FIELD_TYPE, Defaults.PRECISION_STEP_32_BIT);
@@ -74,12 +77,21 @@ public NamedAnalyzer analyzer() {
             return analyzer;
         }

+        public Builder enablePositionIncrements(boolean enablePositionIncrements) {
+            this.enablePositionIncrements = enablePositionIncrements;
+            return this;
+        }
+
+        public boolean enablePositionIncrements() {
+            return enablePositionIncrements;
+        }
+
         @Override
         public TokenCountFieldMapper build(BuilderContext context) {
             setupFieldType(context);
             TokenCountFieldMapper fieldMapper = new TokenCountFieldMapper(name, fieldType, defaultFieldType, ignoreMalformed(context), coerce(context), context.indexSettings(),
-                    analyzer, multiFieldsBuilder.build(this, context), copyTo);
+                    analyzer, enablePositionIncrements, multiFieldsBuilder.build(this, context), copyTo);
             return (TokenCountFieldMapper) fieldMapper.includeInAll(includeInAll);
         }
@@ -96,8 +108,7 @@ protected int maxPrecisionStep() {
     public static class TypeParser implements Mapper.TypeParser {
         @Override
-        @SuppressWarnings("unchecked")
-        public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
+        public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
             TokenCountFieldMapper.Builder builder = tokenCountField(name);
             for (Iterator<Map.Entry<String, Object>> iterator = node.entrySet().iterator(); iterator.hasNext();) {
                 Map.Entry<String, Object> entry = iterator.next();
@@ -113,6 +124,9 @@ public Mapper.Builder parse(String name, Map node, ParserContext
                     }
                     builder.analyzer(analyzer);
                     iterator.remove();
+                } else if (propName.equals("enable_position_increments")) {
+                    builder.enablePositionIncrements(nodeBooleanValue(propNode));
+                    iterator.remove();
                 }
             }
             parseNumberField(builder, name, node, parserContext);
@@ -124,11 +138,13 @@ public Mapper.Builder parse(String name, Map node, ParserContext
     }

     private NamedAnalyzer analyzer;
+    private boolean enablePositionIncrements;

     protected TokenCountFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType, Explicit<Boolean> ignoreMalformed,
-            Explicit<Boolean> coerce, Settings indexSettings, NamedAnalyzer analyzer, MultiFields multiFields, CopyTo copyTo) {
+            Explicit<Boolean> coerce, Settings indexSettings, NamedAnalyzer analyzer, boolean enablePositionIncrements, MultiFields multiFields, CopyTo copyTo) {
         super(simpleName, fieldType, defaultFieldType, ignoreMalformed, coerce, indexSettings, multiFields, copyTo);
         this.analyzer = analyzer;
+        this.enablePositionIncrements = enablePositionIncrements;
     }
@@ -143,7 +159,7 @@ protected void parseCreateField(ParseContext context, List fields) throws
         if (valueAndBoost.value() == null) {
             count = fieldType().nullValue();
         } else {
-            count = countPositions(analyzer, simpleName(), valueAndBoost.value());
+            count = countPositions(analyzer, simpleName(), valueAndBoost.value(), enablePositionIncrements);
         }

         addIntegerFields(context, fields, count, valueAndBoost.boost());
@@ -154,19 +170,26 @@ protected void parseCreateField(ParseContext context, List fields) throws
      * @param analyzer analyzer to create token stream
      * @param fieldName field name to pass to analyzer
      * @param fieldValue field value to pass to analyzer
+     * @param enablePositionIncrements should we count position increments?
      * @return number of position increments in a token stream
      * @throws IOException if tokenStream throws it
      */
-    static int countPositions(Analyzer analyzer, String fieldName, String fieldValue) throws IOException {
+    static int countPositions(Analyzer analyzer, String fieldName, String fieldValue, boolean enablePositionIncrements) throws IOException {
         try (TokenStream tokenStream = analyzer.tokenStream(fieldName, fieldValue)) {
             int count = 0;
             PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
             tokenStream.reset();
             while (tokenStream.incrementToken()) {
-                count += position.getPositionIncrement();
+                if (enablePositionIncrements) {
+                    count += position.getPositionIncrement();
+                } else {
+                    count += Math.min(1, position.getPositionIncrement());
+                }
             }
             tokenStream.end();
-            count += position.getPositionIncrement();
+            if (enablePositionIncrements) {
+                count += position.getPositionIncrement();
+            }
             return count;
         }
     }
@@ -179,6 +202,14 @@ public String analyzer() {
         return analyzer.name();
     }

+    /**
+     * Indicates if position increments are counted.
+     * @return true if position increments are counted
+     */
+    public boolean enablePositionIncrements() {
+        return enablePositionIncrements;
+    }
+
     @Override
     protected String contentType() {
         return CONTENT_TYPE;
     }
@@ -188,6 +219,7 @@ protected void doMerge(Mapper mergeWith, boolean updateAllTypes) {
         super.doMerge(mergeWith, updateAllTypes);
         this.analyzer = ((TokenCountFieldMapper) mergeWith).analyzer;
+        this.enablePositionIncrements = ((TokenCountFieldMapper) mergeWith).enablePositionIncrements;
     }

     @Override
@@ -195,6 +227,9 @@ protected void doXContentBody(XContentBuilder builder, boolean includeDefaults,
         super.doXContentBody(builder, includeDefaults, params);

         builder.field("analyzer", analyzer());
+        if (includeDefaults || enablePositionIncrements() != Defaults.DEFAULT_POSITION_INCREMENTS) {
+            builder.field("enable_position_increments", enablePositionIncrements());
+        }
     }

     @Override
diff --git a/core/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperIntegrationIT.java b/core/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperIntegrationIT.java
index 613cdde97e61e..50246891f95c9 100644
--- a/core/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperIntegrationIT.java
+++ b/core/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperIntegrationIT.java
@@ -136,6 +136,12 @@ private void init() throws IOException {
                         .field("format", "doc_values")
                     .endObject()
                 .endObject()
+                .startObject("token_count_without_position_increments")
+                    .field("type", "token_count")
+                    .field("analyzer", "english")
+                    .field("enable_position_increments", false)
+                    .field("store", true)
+                .endObject()
             .endObject()
         .endObject()
     .endObject()
@@ -173,6 +179,7 @@ private SearchRequestBuilder searchByNumericRange(int low, int high) {
     private SearchRequestBuilder prepareSearch() {
         SearchRequestBuilder request = client().prepareSearch("test").setTypes("test");
         request.addField("foo.token_count");
+        request.addField("foo.token_count_without_position_increments");
         if (loadCountedFields) {
             request.addField("foo");
         }
@@ -190,32 +197,38 @@ private void assertSearchReturns(SearchResponse result, String... ids) {
         for (SearchHit hit : result.getHits()) {
             String id = hit.id();
             if (id.equals("single")) {
-                assertSearchHit(hit, 4);
+                assertSearchHit(hit, new int[]{4}, new int[]{4});
             } else if (id.equals("bulk1")) {
-                assertSearchHit(hit, 3);
+                assertSearchHit(hit, new int[]{3}, new int[]{3});
             } else if (id.equals("bulk2")) {
-                assertSearchHit(hit, 5);
+                assertSearchHit(hit, new int[]{5}, new int[]{4});
             } else if (id.equals("multi")) {
-                assertSearchHit(hit, 2, 7);
+                assertSearchHit(hit, new int[]{2, 7}, new int[]{2, 7});
             } else if (id.equals("multibulk1")) {
-                assertSearchHit(hit, 1, 8);
+                assertSearchHit(hit, new int[]{1, 8}, new int[]{1, 8});
             } else if (id.equals("multibulk2")) {
-                assertSearchHit(hit, 6, 10);
+                assertSearchHit(hit, new int[]{6, 10}, new int[]{3, 9});
             } else {
                 throw new ElasticsearchException("Unexpected response!");
             }
         }
     }

-    private void assertSearchHit(SearchHit hit, int... termCounts) {
+    private void assertSearchHit(SearchHit hit, int[] standardTermCounts, int[] englishTermCounts) {
         assertThat(hit.field("foo.token_count"), not(nullValue()));
-        assertThat(hit.field("foo.token_count").values().size(), equalTo(termCounts.length));
-        for (int i = 0; i < termCounts.length; i++) {
-            assertThat((Integer) hit.field("foo.token_count").values().get(i), equalTo(termCounts[i]));
+        assertThat(hit.field("foo.token_count").values().size(), equalTo(standardTermCounts.length));
+        for (int i = 0; i < standardTermCounts.length; i++) {
+            assertThat((Integer) hit.field("foo.token_count").values().get(i), equalTo(standardTermCounts[i]));
+        }
+
+        assertThat(hit.field("foo.token_count_without_position_increments"), not(nullValue()));
+        assertThat(hit.field("foo.token_count_without_position_increments").values().size(), equalTo(englishTermCounts.length));
+        for (int i = 0; i < englishTermCounts.length; i++) {
+            assertThat((Integer) hit.field("foo.token_count_without_position_increments").values().get(i), equalTo(englishTermCounts[i]));
         }

         if (loadCountedFields && storeCountedFields) {
-            assertThat(hit.field("foo").values().size(), equalTo(termCounts.length));
+            assertThat(hit.field("foo").values().size(), equalTo(standardTermCounts.length));
         }
     }
 }
diff --git a/core/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperTests.java b/core/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperTests.java
index a4a87eda10d8f..94da353e78e91 100644
--- a/core/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperTests.java
+++ b/core/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperTests.java
@@ -27,7 +27,6 @@
 import org.elasticsearch.common.compress.CompressedXContent;
 import org.elasticsearch.common.xcontent.XContentFactory;
 import org.elasticsearch.index.mapper.DocumentMapper;
-import org.elasticsearch.index.mapper.DocumentMapperParser;
 import org.elasticsearch.index.mapper.MapperService;
 import org.elasticsearch.test.ESSingleNodeTestCase;
 import org.junit.Test;
@@ -73,26 +72,49 @@ public void testMerge() throws IOException {
         assertThat(((TokenCountFieldMapper) stage2.mappers().smartNameFieldMapper("tc")).analyzer(), equalTo("standard"));
     }

-    @Test
-    public void testCountPositions() throws IOException {
-        // We're looking to make sure that we:
-        Token t1 = new Token(); // Don't count tokens without an increment
+    /**
+     * When position increments are counted, we're looking to make sure that we:
+     - don't count tokens without an increment
+     - count normal tokens with one increment
+     - count funny tokens with more than one increment
+     - count the final token increments on the rare token streams that have them
+     */
+    public void testCountPositionsWithIncrements() throws IOException {
+        Analyzer analyzer = createMockAnalyzer();
+        assertThat(TokenCountFieldMapper.countPositions(analyzer, "", "", true), equalTo(7));
+    }
+
+    /**
+     * When position increments are not counted (only positions are counted), we're looking to make sure that we:
+     - don't count tokens without an increment
+     - count normal tokens with one increment
+     - count funny tokens with more than one increment as only one
+     - don't count the final token increments on the rare token streams that have them
+     */
+    public void testCountPositionsWithoutIncrements() throws IOException {
+        Analyzer analyzer = createMockAnalyzer();
+        assertThat(TokenCountFieldMapper.countPositions(analyzer, "", "", false), equalTo(2));
+    }
+
+    private Analyzer createMockAnalyzer() {
+        Token t1 = new Token(); // Token without an increment
         t1.setPositionIncrement(0);
         Token t2 = new Token();
-        t2.setPositionIncrement(1); // Count normal tokens with one increment
+        t2.setPositionIncrement(1); // Normal token with one increment
         Token t3 = new Token();
-        t2.setPositionIncrement(2); // Count funny tokens with more than one increment
-        int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them
+        t2.setPositionIncrement(2); // Funny token with more than one increment
+        int finalTokenIncrement = 4; // Final token increment
         Token[] tokens = new Token[] {t1, t2, t3};
-        Collections.shuffle(Arrays.asList(tokens), getRandom());
+        Collections.shuffle(Arrays.asList(tokens), random());
         final TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
         // TODO: we have no CannedAnalyzer?
         Analyzer analyzer = new Analyzer() {
-                @Override
-                public TokenStreamComponents createComponents(String fieldName) {
-                    return new TokenStreamComponents(new MockTokenizer(), tokenStream);
-                }
-            };
-        assertThat(TokenCountFieldMapper.countPositions(analyzer, "", ""), equalTo(7));
+            @Override
+            public TokenStreamComponents createComponents(String fieldName) {
+                return new TokenStreamComponents(new MockTokenizer(), tokenStream);
+            }
+        };
+        return analyzer;
     }
+
 }
diff --git a/docs/reference/mapping/types/token-count.asciidoc b/docs/reference/mapping/types/token-count.asciidoc
index ec02a647a6f67..e4a235b3196c3 100644
--- a/docs/reference/mapping/types/token-count.asciidoc
+++ b/docs/reference/mapping/types/token-count.asciidoc
@@ -48,12 +48,6 @@ GET my_index/_search
 <2> The `name.length` field is a `token_count` <> which will index the number of tokens in the `name` field.
 <3> This query matches only the document containing `Rachel Alice Williams`, as it contains three tokens.

-[NOTE]
-===================================================================
-Technically the `token_count` type sums position increments rather than
-counting tokens. This means that even if the analyzer filters out stop
-words they are included in the count.
-===================================================================

 [[token-count-params]]
 ==== Parameters for `token_count` fields
@@ -68,6 +62,12 @@ The following parameters are accepted by `token_count` fields:
     value. Required. For best performance, use an analyzer without token
     filters.

+`enable_position_increments`::
+
+    Indicates if position increments should be counted.
+    Set to `false` if you don't want to count tokens removed by analyzer filters (like <>).
+    Defaults to `true`.
+
 <>::

     Field-level index time boosting. Accepts a floating point number, defaults
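
To make the behavioral change concrete, here is a minimal standalone sketch that mirrors the patched countPositions() logic outside the mapper. It is not part of the patch: the class name, field name, and sample sentence are illustrative, and it assumes Lucene's EnglishAnalyzer (lucene-analyzers-common) is on the classpath.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class TokenCountDemo {

    // Same counting logic as the patched TokenCountFieldMapper.countPositions().
    static int countPositions(Analyzer analyzer, String field, String value,
                              boolean enablePositionIncrements) throws IOException {
        try (TokenStream tokenStream = analyzer.tokenStream(field, value)) {
            int count = 0;
            PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                if (enablePositionIncrements) {
                    count += position.getPositionIncrement();
                } else {
                    // Cap each token's contribution at 1 so removed tokens' gaps are ignored.
                    count += Math.min(1, position.getPositionIncrement());
                }
            }
            tokenStream.end();
            if (enablePositionIncrements) {
                count += position.getPositionIncrement();
            }
            return count;
        }
    }

    public static void main(String[] args) throws IOException {
        // "the" is a stop word for the english analyzer: it is removed from the
        // token stream but still leaves a position increment of 2 on "quick".
        Analyzer english = new EnglishAnalyzer();
        String text = "the quick brown fox";
        System.out.println(countPositions(english, "foo", text, true));  // 4: the gap left by "the" is counted
        System.out.println(countPositions(english, "foo", text, false)); // 3: only emitted tokens are counted
    }
}

This is the same asymmetry the integration test encodes: its new token_count_without_position_increments field pairs the english analyzer with enable_position_increments set to false, so documents whose text contains stop words (bulk2, multibulk2) index lower counts than the existing token_count field.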