From 6db89aeaf314c4754748f7cf30ae7f0b149e7c90 Mon Sep 17 00:00:00 2001
From: fbaligand
Date: Wed, 26 Apr 2017 17:54:33 +0200
Subject: [PATCH] token_count type: add an option to count tokens (fix #23227)

Add option "enable_position_increments" with default value true.
If the option is set to false, the indexed value is the number of positions
(position increments are not counted).
---
 .../mapper/core/TokenCountFieldMapper.java    | 51 +++++++++++++++---
 .../TokenCountFieldMapperIntegrationIT.java   | 35 +++++++++----
 .../core/TokenCountFieldMapperTests.java      | 52 +++++++++++++------
 .../mapping/types/token-count.asciidoc        | 12 ++---
 4 files changed, 110 insertions(+), 40 deletions(-)

diff --git a/core/src/main/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapper.java b/core/src/main/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapper.java
index 85df5ea3d3ba7..055feeeacafe7 100644
--- a/core/src/main/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapper.java
+++ b/core/src/main/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapper.java
@@ -43,6 +43,7 @@

 import static org.apache.lucene.index.IndexOptions.NONE;
 import static org.elasticsearch.common.xcontent.support.XContentMapValues.nodeIntegerValue;
+import static org.elasticsearch.common.xcontent.support.XContentMapValues.nodeBooleanValue;
 import static org.elasticsearch.index.mapper.MapperBuilders.tokenCountField;
 import static org.elasticsearch.index.mapper.core.TypeParsers.parseNumberField;

@@ -54,11 +55,13 @@ public class TokenCountFieldMapper extends IntegerFieldMapper {
     public static final String CONTENT_TYPE = "token_count";

     public static class Defaults extends IntegerFieldMapper.Defaults {
+        public static final boolean DEFAULT_POSITION_INCREMENTS = true;
     }

     public static class Builder extends NumberFieldMapper.Builder {
         private NamedAnalyzer analyzer;
+        private boolean enablePositionIncrements = Defaults.DEFAULT_POSITION_INCREMENTS;

         public Builder(String name) {
             super(name, Defaults.FIELD_TYPE, Defaults.PRECISION_STEP_32_BIT);
@@ -74,12 +77,21 @@ public NamedAnalyzer analyzer() {
             return analyzer;
         }

+        public Builder enablePositionIncrements(boolean enablePositionIncrements) {
+            this.enablePositionIncrements = enablePositionIncrements;
+            return this;
+        }
+
+        public boolean enablePositionIncrements() {
+            return enablePositionIncrements;
+        }
+
         @Override
         public TokenCountFieldMapper build(BuilderContext context) {
             setupFieldType(context);
             TokenCountFieldMapper fieldMapper = new TokenCountFieldMapper(name, fieldType, defaultFieldType, ignoreMalformed(context), coerce(context), context.indexSettings(),
-                    analyzer, multiFieldsBuilder.build(this, context), copyTo);
+                    analyzer, enablePositionIncrements, multiFieldsBuilder.build(this, context), copyTo);
             return (TokenCountFieldMapper) fieldMapper.includeInAll(includeInAll);
         }
@@ -96,8 +108,7 @@ protected int maxPrecisionStep() {
     public static class TypeParser implements Mapper.TypeParser {
         @Override
-        @SuppressWarnings("unchecked")
-        public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
+        public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
             TokenCountFieldMapper.Builder builder = tokenCountField(name);
             for (Iterator<Map.Entry<String, Object>> iterator = node.entrySet().iterator(); iterator.hasNext();) {
                 Map.Entry<String, Object> entry = iterator.next();
@@ -113,6 +124,9 @@ public Mapper.Builder parse(String name, Map node, ParserContext
                     }
                     builder.analyzer(analyzer);
                     iterator.remove();
+                } else if (propName.equals("enable_position_increments")) {
+                    builder.enablePositionIncrements(nodeBooleanValue(propNode));
+                    iterator.remove();
                 }
             }
             parseNumberField(builder, name, node, parserContext);
@@ -124,11 +138,13 @@ public Mapper.Builder parse(String name, Map node, ParserContext
     }

     private NamedAnalyzer analyzer;
+    private boolean enablePositionIncrements;

     protected TokenCountFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType, Explicit<Boolean> ignoreMalformed,
-            Explicit<Boolean> coerce, Settings indexSettings, NamedAnalyzer analyzer, MultiFields multiFields, CopyTo copyTo) {
+            Explicit<Boolean> coerce, Settings indexSettings, NamedAnalyzer analyzer, boolean enablePositionIncrements, MultiFields multiFields, CopyTo copyTo) {
         super(simpleName, fieldType, defaultFieldType, ignoreMalformed, coerce, indexSettings, multiFields, copyTo);
         this.analyzer = analyzer;
+        this.enablePositionIncrements = enablePositionIncrements;
     }
@@ -143,7 +159,7 @@ protected void parseCreateField(ParseContext context, List fields) throws
         if (valueAndBoost.value() == null) {
             count = fieldType().nullValue();
         } else {
-            count = countPositions(analyzer, simpleName(), valueAndBoost.value());
+            count = countPositions(analyzer, simpleName(), valueAndBoost.value(), enablePositionIncrements);
         }

         addIntegerFields(context, fields, count, valueAndBoost.boost());
@@ -154,19 +170,26 @@ protected void parseCreateField(ParseContext context, List fields) throws
      * @param analyzer analyzer to create token stream
      * @param fieldName field name to pass to analyzer
      * @param fieldValue field value to pass to analyzer
+     * @param enablePositionIncrements should we count position increments?
      * @return number of position increments in a token stream
      * @throws IOException if tokenStream throws it
      */
-    static int countPositions(Analyzer analyzer, String fieldName, String fieldValue) throws IOException {
+    static int countPositions(Analyzer analyzer, String fieldName, String fieldValue, boolean enablePositionIncrements) throws IOException {
         try (TokenStream tokenStream = analyzer.tokenStream(fieldName, fieldValue)) {
             int count = 0;
             PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
             tokenStream.reset();
             while (tokenStream.incrementToken()) {
-                count += position.getPositionIncrement();
+                if (enablePositionIncrements) {
+                    count += position.getPositionIncrement();
+                } else {
+                    count += Math.min(1, position.getPositionIncrement());
+                }
             }
             tokenStream.end();
-            count += position.getPositionIncrement();
+            if (enablePositionIncrements) {
+                count += position.getPositionIncrement();
+            }
             return count;
         }
     }
@@ -179,6 +202,14 @@ public String analyzer() {
         return analyzer.name();
     }

+    /**
+     * Indicates if position increments are counted.
+     * @return true if position increments are counted
+     */
+    public boolean enablePositionIncrements() {
+        return enablePositionIncrements;
+    }
+
     @Override
     protected String contentType() {
         return CONTENT_TYPE;
     }
@@ -188,6 +219,7 @@ protected void doMerge(Mapper mergeWith, boolean updateAllTypes) {
         super.doMerge(mergeWith, updateAllTypes);
         this.analyzer = ((TokenCountFieldMapper) mergeWith).analyzer;
+        this.enablePositionIncrements = ((TokenCountFieldMapper) mergeWith).enablePositionIncrements;
     }

     @Override
@@ -195,6 +227,9 @@ protected void doXContentBody(XContentBuilder builder, boolean includeDefaults,
         super.doXContentBody(builder, includeDefaults, params);

         builder.field("analyzer", analyzer());
+        if (includeDefaults || enablePositionIncrements() != Defaults.DEFAULT_POSITION_INCREMENTS) {
+            builder.field("enable_position_increments", enablePositionIncrements());
+        }
     }

     @Override
diff --git a/core/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperIntegrationIT.java b/core/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperIntegrationIT.java
index 613cdde97e61e..50246891f95c9 100644
--- a/core/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperIntegrationIT.java
+++ b/core/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperIntegrationIT.java
@@ -136,6 +136,12 @@ private void init() throws IOException {
                         .field("format", "doc_values")
                     .endObject()
                 .endObject()
+                .startObject("token_count_without_position_increments")
+                    .field("type", "token_count")
+                    .field("analyzer", "english")
+                    .field("enable_position_increments", false)
+                    .field("store", true)
+                .endObject()
             .endObject()
         .endObject()
     .endObject()
@@ -173,6 +179,7 @@ private SearchRequestBuilder searchByNumericRange(int low, int high) {
     private SearchRequestBuilder prepareSearch() {
         SearchRequestBuilder request = client().prepareSearch("test").setTypes("test");
         request.addField("foo.token_count");
+        request.addField("foo.token_count_without_position_increments");
         if (loadCountedFields) {
             request.addField("foo");
         }
@@ -190,32 +197,38 @@ private void assertSearchReturns(SearchResponse result, String... ids) {
         for (SearchHit hit : result.getHits()) {
             String id = hit.id();
             if (id.equals("single")) {
-                assertSearchHit(hit, 4);
+                assertSearchHit(hit, new int[]{4}, new int[]{4});
             } else if (id.equals("bulk1")) {
-                assertSearchHit(hit, 3);
+                assertSearchHit(hit, new int[]{3}, new int[]{3});
             } else if (id.equals("bulk2")) {
-                assertSearchHit(hit, 5);
+                assertSearchHit(hit, new int[]{5}, new int[]{4});
             } else if (id.equals("multi")) {
-                assertSearchHit(hit, 2, 7);
+                assertSearchHit(hit, new int[]{2, 7}, new int[]{2, 7});
             } else if (id.equals("multibulk1")) {
-                assertSearchHit(hit, 1, 8);
+                assertSearchHit(hit, new int[]{1, 8}, new int[]{1, 8});
             } else if (id.equals("multibulk2")) {
-                assertSearchHit(hit, 6, 10);
+                assertSearchHit(hit, new int[]{6, 10}, new int[]{3, 9});
             } else {
                 throw new ElasticsearchException("Unexpected response!");
             }
         }
     }

-    private void assertSearchHit(SearchHit hit, int... termCounts) {
+    private void assertSearchHit(SearchHit hit, int[] standardTermCounts, int[] englishTermCounts) {
         assertThat(hit.field("foo.token_count"), not(nullValue()));
-        assertThat(hit.field("foo.token_count").values().size(), equalTo(termCounts.length));
-        for (int i = 0; i < termCounts.length; i++) {
-            assertThat((Integer) hit.field("foo.token_count").values().get(i), equalTo(termCounts[i]));
+        assertThat(hit.field("foo.token_count").values().size(), equalTo(standardTermCounts.length));
+        for (int i = 0; i < standardTermCounts.length; i++) {
+            assertThat((Integer) hit.field("foo.token_count").values().get(i), equalTo(standardTermCounts[i]));
+        }
+
+        assertThat(hit.field("foo.token_count_without_position_increments"), not(nullValue()));
+        assertThat(hit.field("foo.token_count_without_position_increments").values().size(), equalTo(englishTermCounts.length));
+        for (int i = 0; i < englishTermCounts.length; i++) {
+            assertThat((Integer) hit.field("foo.token_count_without_position_increments").values().get(i), equalTo(englishTermCounts[i]));
         }

         if (loadCountedFields && storeCountedFields) {
-            assertThat(hit.field("foo").values().size(), equalTo(termCounts.length));
+            assertThat(hit.field("foo").values().size(), equalTo(standardTermCounts.length));
         }
     }
 }
diff --git a/core/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperTests.java b/core/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperTests.java
index a4a87eda10d8f..94da353e78e91 100644
--- a/core/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperTests.java
+++ b/core/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperTests.java
@@ -27,7 +27,6 @@
 import org.elasticsearch.common.compress.CompressedXContent;
 import org.elasticsearch.common.xcontent.XContentFactory;
 import org.elasticsearch.index.mapper.DocumentMapper;
-import org.elasticsearch.index.mapper.DocumentMapperParser;
 import org.elasticsearch.index.mapper.MapperService;
 import org.elasticsearch.test.ESSingleNodeTestCase;
 import org.junit.Test;
@@ -73,26 +72,49 @@ public void testMerge() throws IOException {
         assertThat(((TokenCountFieldMapper) stage2.mappers().smartNameFieldMapper("tc")).analyzer(), equalTo("standard"));
     }

-    @Test
-    public void testCountPositions() throws IOException {
-        // We're looking to make sure that we:
-        Token t1 = new Token(); // Don't count tokens without an increment
+    /**
+     * When position increments are counted, we're looking to make sure that we:
+     - don't count tokens without an increment
+     - count normal tokens with one increment
+     - count funny tokens with more than one increment
+     - count the final token increments on the rare token streams that have them
+     */
+    public void testCountPositionsWithIncrements() throws IOException {
+        Analyzer analyzer = createMockAnalyzer();
+        assertThat(TokenCountFieldMapper.countPositions(analyzer, "", "", true), equalTo(7));
+    }
+
+    /**
+     * When position increments are not counted (only positions are counted), we're looking to make sure that we:
+     - don't count tokens without an increment
+     - count normal tokens with one increment
+     - count funny tokens with more than one increment as only one
+     - don't count the final token increments on the rare token streams that have them
+     */
+    public void testCountPositionsWithoutIncrements() throws IOException {
+        Analyzer analyzer = createMockAnalyzer();
+        assertThat(TokenCountFieldMapper.countPositions(analyzer, "", "", false), equalTo(2));
+    }
+
+    private Analyzer createMockAnalyzer() {
+        Token t1 = new Token(); // Token without an increment
         t1.setPositionIncrement(0);
         Token t2 = new Token();
-        t2.setPositionIncrement(1); // Count normal tokens with one increment
+        t2.setPositionIncrement(1); // Normal token with one increment
         Token t3 = new Token();
-        t2.setPositionIncrement(2); // Count funny tokens with more than one increment
-        int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them
+        t2.setPositionIncrement(2); // Funny token with more than one increment
+        int finalTokenIncrement = 4; // Final token increment
         Token[] tokens = new Token[] {t1, t2, t3};
-        Collections.shuffle(Arrays.asList(tokens), getRandom());
+        Collections.shuffle(Arrays.asList(tokens), random());
         final TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
         // TODO: we have no CannedAnalyzer?
         Analyzer analyzer = new Analyzer() {
-                @Override
-                public TokenStreamComponents createComponents(String fieldName) {
-                    return new TokenStreamComponents(new MockTokenizer(), tokenStream);
-                }
-            };
-        assertThat(TokenCountFieldMapper.countPositions(analyzer, "", ""), equalTo(7));
+            @Override
+            public TokenStreamComponents createComponents(String fieldName) {
+                return new TokenStreamComponents(new MockTokenizer(), tokenStream);
+            }
+        };
+        return analyzer;
     }
+
 }
diff --git a/docs/reference/mapping/types/token-count.asciidoc b/docs/reference/mapping/types/token-count.asciidoc
index ec02a647a6f67..e4a235b3196c3 100644
--- a/docs/reference/mapping/types/token-count.asciidoc
+++ b/docs/reference/mapping/types/token-count.asciidoc
@@ -48,12 +48,6 @@ GET my_index/_search
 <2> The `name.length` field is a `token_count` <> which will index the number of tokens in the `name` field.
 <3> This query matches only the document containing `Rachel Alice Williams`, as it contains three tokens.

-[NOTE]
-===================================================================
-Technically the `token_count` type sums position increments rather than
-counting tokens. This means that even if the analyzer filters out stop
-words they are included in the count.
-===================================================================

 [[token-count-params]]
 ==== Parameters for `token_count` fields
@@ -68,6 +62,12 @@ The following parameters are accepted by `token_count` fields:
     value. Required. For best performance, use an analyzer without token
     filters.

+`enable_position_increments`::
+
+    Indicates if position increments should be counted.
+    Set to `false` if you don't want to count tokens removed by analyzer filters (like <>).
+    Defaults to `true`.
+
 <>::

     Field-level index time boosting. Accepts a floating point number, defaults
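
To make the behavioral change concrete, here is a minimal standalone sketch that mirrors the patched countPositions() logic outside the mapper. It is not part of the patch: the class name, field name, and sample sentence are illustrative, and it assumes Lucene's EnglishAnalyzer (lucene-analyzers-common) is on the classpath.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class TokenCountDemo {

    // Same counting logic as the patched TokenCountFieldMapper.countPositions().
    static int countPositions(Analyzer analyzer, String field, String value,
                              boolean enablePositionIncrements) throws IOException {
        try (TokenStream tokenStream = analyzer.tokenStream(field, value)) {
            int count = 0;
            PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                if (enablePositionIncrements) {
                    count += position.getPositionIncrement();
                } else {
                    // Cap each token's contribution at 1 so removed tokens' gaps are ignored.
                    count += Math.min(1, position.getPositionIncrement());
                }
            }
            tokenStream.end();
            if (enablePositionIncrements) {
                count += position.getPositionIncrement();
            }
            return count;
        }
    }

    public static void main(String[] args) throws IOException {
        // "the" is a stop word for the english analyzer: it is removed from the
        // token stream but still leaves a position increment of 2 on "quick".
        Analyzer english = new EnglishAnalyzer();
        String text = "the quick brown fox";
        System.out.println(countPositions(english, "foo", text, true));  // 4: the gap left by "the" is counted
        System.out.println(countPositions(english, "foo", text, false)); // 3: only emitted tokens are counted
    }
}

This is the same asymmetry the integration test encodes: its new token_count_without_position_increments field pairs the english analyzer with enable_position_increments set to false, so documents whose text contains stop words (bulk2, multibulk2) index lower counts than the existing token_count field.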