Prevent SigTerms/SigText from running on fields they do not support (#52851)

polyfractal · web-flow · commit ec4c699defbd · 2020-03-23T15:33:17.000-04:00
* Prevent SigTerms/SigText from running on fields they do not support

SigTerms cannot run on fields that are not searchable, and SigText
cannot run on fields that do not have analyzers.  Both of these
situations fail today with an esoteric exception, so this just formalizes
the constraint by throwing an IllegalArgumentException up front.

In practice, the only affected field seems to be the `binary` field,
which is neither searchable or has a default analyzer (e.g. even numeric
and keyword fields have a default analyzer despite not being tokenized)

Adds supported-type tests, and makes some changes to the test itself
to allow testing sigtext (indexing _source).

Also a few tweaks to the test to avoid bad randomization (negative
numbers, etc).

* Tweak exception text
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/NumberFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/NumberFieldMapper.java
@@ -20,7 +20,6 @@
 package org.elasticsearch.index.mapper;
 
 import com.fasterxml.jackson.core.JsonParseException;
-
 import org.apache.lucene.document.DoublePoint;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FloatPoint;
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java
@@ -88,7 +88,12 @@ public SignificantTermsAggregatorFactory(String name,
                                              Map<String, Object> metaData) throws IOException {
         super(name, config, queryShardContext, parent, subFactoriesBuilder, metaData);
 
-        if (!config.unmapped()) {
+        if (config.unmapped() == false) {
+            if (config.fieldContext().fieldType().isSearchable() == false) {
+                throw new IllegalArgumentException("SignificantText aggregation requires fields to be searchable, but ["
+                    + config.fieldContext().fieldType().name() + "] is not");
+            }
+
             this.fieldType = config.fieldContext().fieldType();
             this.indexedFieldName = fieldType.name();
         }
@@ -129,6 +134,10 @@ private FilterableTermsEnum getTermsEnum(String field) throws IOException {
     }
 
     private long getBackgroundFrequency(String value) throws IOException {
+        // fieldType can be null if the field is unmapped, but theoretically this method should only be called
+        // when constructing buckets.  Assert to ensure this is the case
+        // TODO this is a bad setup and it should be refactored
+        assert fieldType != null;
         Query query = fieldType.termQuery(value, queryShardContext);
         if (query instanceof TermQuery) {
             // for types that use the inverted index, we prefer using a caching terms
diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorFactory.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorFactory.java
@@ -83,6 +83,10 @@ public SignificantTextAggregatorFactory(String name,
         // Note that if the field is unmapped (its field type is null), we don't fail,
         // and just use the given field name as a placeholder.
         this.fieldType = queryShardContext.fieldMapper(fieldName);
+        if (fieldType != null && fieldType.indexAnalyzer() == null) {
+            throw new IllegalArgumentException("Field [" + fieldType.name() + "] has no analyzer, but SignificantText " +
+                "requires an analyzed field");
+        }
         this.indexedFieldName = fieldType != null ? fieldType.name() : fieldName;
         this.sourceFieldNames = sourceFieldNames == null
             ? new String[] { indexedFieldName }
@@ -124,6 +128,10 @@ private FilterableTermsEnum getTermsEnum(String field) throws IOException {
     }
 
     private long getBackgroundFrequency(String value) throws IOException {
+        // fieldType can be null if the field is unmapped, but theoretically this method should only be called
+        // when constructing buckets.  Assert to ensure this is the case
+        // TODO this is a bad setup and it should be refactored
+        assert fieldType != null;
         Query query = fieldType.termQuery(value, queryShardContext);
         if (query instanceof TermQuery) {
             // for types that use the inverted index, we prefer using a caching terms
diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorTests.java
@@ -36,6 +36,7 @@
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.index.analysis.AnalyzerScope;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.index.mapper.BinaryFieldMapper;
 import org.elasticsearch.index.mapper.KeywordFieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.NumberFieldMapper;
@@ -46,10 +47,14 @@
 import org.elasticsearch.index.mapper.TextFieldMapper.TextFieldType;
 import org.elasticsearch.index.query.QueryBuilder;
 import org.elasticsearch.index.query.QueryBuilders;
+import org.elasticsearch.search.aggregations.AggregationBuilder;
 import org.elasticsearch.search.aggregations.AggregationExecutionException;
 import org.elasticsearch.search.aggregations.AggregatorTestCase;
 import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsAggregatorFactory.ExecutionMode;
 import org.elasticsearch.search.aggregations.bucket.terms.IncludeExclude;
+import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
+import org.elasticsearch.search.aggregations.support.ValueType;
+import org.elasticsearch.search.aggregations.support.ValuesSourceType;
 import org.junit.Before;
 
 import java.io.IOException;
@@ -75,6 +80,27 @@ public void setUpTest() throws Exception {
         fieldType.setName("field");
     }
 
+    @Override
+    protected AggregationBuilder createAggBuilderForTypeTest(MappedFieldType fieldType, String fieldName) {
+        return new SignificantTermsAggregationBuilder("foo", ValueType.STRING).field(fieldName);
+    }
+
+    @Override
+    protected List<ValuesSourceType> getSupportedValuesSourceTypes() {
+        return List.of(CoreValuesSourceType.NUMERIC,
+            CoreValuesSourceType.BYTES);
+    }
+
+    @Override
+    protected List<String> unsupportedMappedFieldTypes() {
+        return List.of(
+            NumberFieldMapper.NumberType.DOUBLE.typeName(), // floating points are not supported at all
+            NumberFieldMapper.NumberType.FLOAT.typeName(),
+            NumberFieldMapper.NumberType.HALF_FLOAT.typeName(),
+            BinaryFieldMapper.CONTENT_TYPE // binary fields are not supported because they cannot be searched
+        );
+    }
+
     /**
      * For each provided field type, we also register an alias with name {@code <field>-alias}.
      */
diff --git a/server/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java b/server/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java
@@ -34,12 +34,17 @@
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.index.analysis.AnalyzerScope;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.index.mapper.BinaryFieldMapper;
+import org.elasticsearch.index.mapper.GeoPointFieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.TextFieldMapper.TextFieldType;
+import org.elasticsearch.search.aggregations.AggregationBuilder;
 import org.elasticsearch.search.aggregations.AggregatorTestCase;
 import org.elasticsearch.search.aggregations.bucket.sampler.InternalSampler;
 import org.elasticsearch.search.aggregations.bucket.sampler.SamplerAggregationBuilder;
 import org.elasticsearch.search.aggregations.support.AggregationInspectionHelper;
+import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
+import org.elasticsearch.search.aggregations.support.ValuesSourceType;
 
 import java.io.IOException;
 import java.util.Arrays;
@@ -63,6 +68,28 @@ protected Map<String, MappedFieldType> getFieldAliases(MappedFieldType... fieldT
             Function.identity()));
     }
 
+    @Override
+    protected AggregationBuilder createAggBuilderForTypeTest(MappedFieldType fieldType, String fieldName) {
+        return new SignificantTextAggregationBuilder("foo", fieldName);
+    }
+
+    @Override
+    protected List<ValuesSourceType> getSupportedValuesSourceTypes() {
+        // TODO it is likely accidental that SigText supports anything other than Bytes, and then only text fields
+        return List.of(CoreValuesSourceType.NUMERIC,
+            CoreValuesSourceType.BYTES,
+            CoreValuesSourceType.RANGE,
+            CoreValuesSourceType.GEOPOINT);
+    }
+
+    @Override
+    protected List<String> unsupportedMappedFieldTypes() {
+        return List.of(
+            BinaryFieldMapper.CONTENT_TYPE, // binary fields are not supported because they do not have analyzers
+            GeoPointFieldMapper.CONTENT_TYPE // geopoint fields cannot use term queries
+        );
+    }
+
     /**
      * Uses the significant text aggregation to find the keywords in text fields
      */
diff --git a/test/framework/src/main/java/org/elasticsearch/search/aggregations/AggregatorTestCase.java b/test/framework/src/main/java/org/elasticsearch/search/aggregations/AggregatorTestCase.java