elastic · jpountz · May 2, 2018 · Apr 23, 2018 · Apr 25, 2018 · Apr 25, 2018
diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc
@@ -71,8 +71,12 @@ written to by an older Elasticsearch after writing to it with a newer Elasticsea
 [[release-notes-6.4.0]]
 == {es} 6.4.0
 
-//[float]
-//=== New Features
+[float]
+=== New Features
+
+The new <<mapping-ignored-field,`_ignored`>> field makes it possible to know
+which fields were ignored at index time because of the
+<<ignore-malformed,`ignore_malformed`>> option. ({pull}30140[#29658])
 
 [float]
 === Enhancements

diff --git a/docs/reference/mapping/fields.asciidoc b/docs/reference/mapping/fields.asciidoc
@@ -40,6 +40,14 @@ can be customised when a mapping type is created.
 
     All fields in the document which contain non-null values.
 
+[float]
+=== Indexing meta-fields
+
+<<mapping-ignored-field,`_ignored`>>::
+
+    All fields in the document that have been ignored at index time because of
+    <<ignore-malformed,`ignore_malformed`>>.
+
 [float]
 === Routing meta-field
 
@@ -57,6 +65,8 @@ can be customised when a mapping type is created.
 
 include::fields/field-names-field.asciidoc[]
 
+include::fields/ignored-field.asciidoc[]
+
 include::fields/id-field.asciidoc[]
 
 include::fields/index-field.asciidoc[]

diff --git a/docs/reference/mapping/fields/ignored-field.asciidoc b/docs/reference/mapping/fields/ignored-field.asciidoc
@@ -0,0 +1,45 @@
+[[mapping-ignored-field]]
+=== `_ignored` field
+
+added[6.4.0]
+
+The `_ignored` field indexes and stores the name of every field in a document
+that was ignored because it was malformed and
+<<ignore-malformed,`ignore_malformed`>> was turned on.
+
+This field is searchable with <<query-dsl-term-query,`term`>>,
+<<query-dsl-terms-query,`terms`>> and <<query-dsl-exists-query,`exists`>>
+queries, and is returned as part of the search hits.
+
+For instance, the query below matches all documents that have one or more
+fields that were ignored:
+
+[source,js]
+--------------------------------------------------
+GET _search
+{
+  "query": {
+    "exists": {
+      "field": "_ignored"
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+
+Similarly, the query below finds all documents whose `@timestamp` field was
+ignored at index time:
+
+[source,js]
+--------------------------------------------------
+GET _search
+{
+  "query": {
+    "term": {
+      "_ignored": "@timestamp"
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+
diff --git a/docs/reference/mapping/params/ignore-malformed.asciidoc b/docs/reference/mapping/params/ignore-malformed.asciidoc
@@ -85,3 +85,13 @@ PUT my_index
 
 <1> The `number_one` field inherits the index-level setting.
 <2> The `number_two` field overrides the index-level setting to turn off `ignore_malformed`.
+
+==== Dealing with malformed fields
+
+Malformed fields are silently ignored at indexing time when `ignore_malformed`
+is turned on. Whenever possible, it is recommended to keep the number of
+documents that have a malformed field low, or queries on this field will
+become meaningless. Elasticsearch makes it easy to check how many documents
+have malformed fields by using `exists` or `term` queries on the special
+<<mapping-ignored-field,`_ignored`>> field.
+
diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search/200_ignore_malformed.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search/200_ignore_malformed.yml
@@ -0,0 +1,92 @@
+---
+setup:
+  - skip:
+      version: " - 6.99.99" # TODO: change to 6.3.99 after backport to 6.4
+      reason: _ignored was added in 6.4.0
+
+  - do:
+      indices.create:
+          index:  test
+          body:
+            mappings:
+              _doc:
+                properties:
+                  my_date:
+                    type: date
+                    ignore_malformed: true
+                    store: true # stored so that a test below can request it through stored_fields
+                  my_ip:
+                    type: ip
+                    ignore_malformed: true
+
+  - do:
+      index:
+        index:  test
+        type:   _doc
+        id:     1
+        body:   { "my_date": "2018-05-11", "my_ip": ":::1" } # only my_ip is malformed
+
+  - do:
+      index:
+        index:  test
+        type:   _doc
+        id:     2
+        body:   { "my_date": "bar", "my_ip": "192.168.1.42" } # only my_date is malformed
+
+  - do:
+      index:
+        index:  test
+        type:   _doc
+        id:     3
+        body:   { "my_date": "bar", "my_ip": "quux" } # both fields are malformed
+
+  - do:
+      indices.refresh: {}
+
+---
+"Exists on _ignored":
+
+  - do:
+      search:
+        body: { query: { exists: { "field": "_ignored" } } }
+
+  - length:   { hits.hits: 3  } # every document indexed in setup has at least one malformed field
+
+---
+"Search on _ignored with term":
+
+  - do:
+      search:
+        body: { query: { term: { "_ignored": "my_date" } } }
+
+  - length:   { hits.hits: 2  } # documents 2 and 3 have a malformed my_date
+
+---
+"Search on _ignored with terms":
+
+  - do:
+      search:
+        body: { query: { terms: { "_ignored": [ "my_date", "my_ip" ] } } }
+
+  - length:   { hits.hits: 3  } # each document has my_date or my_ip malformed
+
+---
+"_ignored is returned by default":
+
+  - do:
+      search:
+        body: { query: { ids: { "values": [ "3" ] } } }
+
+  - length:   { hits.hits: 1  }
+  - length:   { hits.hits.0._ignored: 2} # document 3: both my_date and my_ip
+
+---
+"_ignored is still returned with explicit list of stored fields":
+
+  - do:
+      search:
+        stored_fields: [ "my_date" ] # _ignored is deliberately not listed here
+        body: { query: { ids: { "values": [ "3" ] } } }
+
+  - length:   { hits.hits: 1  }
+  - is_true:  hits.hits.0._ignored
diff --git a/server/src/main/java/org/elasticsearch/index/fieldvisitor/FieldsVisitor.java b/server/src/main/java/org/elasticsearch/index/fieldvisitor/FieldsVisitor.java
@@ -24,6 +24,7 @@
 import org.elasticsearch.common.bytes.BytesArray;
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.index.mapper.IdFieldMapper;
+import org.elasticsearch.index.mapper.IgnoredFieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.MapperService;
 import org.elasticsearch.index.mapper.RoutingFieldMapper;
@@ -69,6 +70,12 @@ public Status needsField(FieldInfo fieldInfo) throws IOException {
         if (requiredFields.remove(fieldInfo.name)) {
             return Status.YES;
         }
+        // Always load _ignored to be explicit about ignored fields
+        // This works because _ignored is added as the first metadata mapper,
+        // so its stored fields always appear first in the list.
+        if (IgnoredFieldMapper.NAME.equals(fieldInfo.name)) {
+            return Status.YES;
+        }
         // All these fields are single-valued so we can stop when the set is
         // empty
         return requiredFields.isEmpty()

diff --git a/server/src/main/java/org/elasticsearch/index/mapper/DateFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/DateFieldMapper.java
@@ -446,6 +446,7 @@ protected void parseCreateField(ParseContext context, List<IndexableField> field
             timestamp = fieldType().parse(dateAsString);
         } catch (IllegalArgumentException e) {
             if (ignoreMalformed.value()) {
+                context.addIgnoredField(fieldType.name());
                 return;
             } else {
                 throw e;

diff --git a/server/src/main/java/org/elasticsearch/index/mapper/GeoPointFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/GeoPointFieldMapper.java
@@ -305,6 +305,7 @@ public Mapper parse(ParseContext context) throws IOException {
                             if (ignoreMalformed.value() == false) {
                                 throw e;
                             }
+                            context.addIgnoredField(fieldType.name());
                         }
                         token = context.parser().nextToken();
                     }
@@ -352,6 +353,7 @@ public Mapper parse(ParseContext context) throws IOException {
                     if (ignoreMalformed.value() == false) {
                         throw e;
                     }
+                    context.addIgnoredField(fieldType.name());
                 }
             }
         }

diff --git a/server/src/main/java/org/elasticsearch/index/mapper/GeoShapeFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/GeoShapeFieldMapper.java
@@ -506,6 +506,7 @@ public Mapper parse(ParseContext context) throws IOException {
             if (ignoreMalformed.value() == false) {
                 throw new MapperParsingException("failed to parse [" + fieldType().name() + "]", e);
             }
+            context.addIgnoredField(fieldType.name());
         }
         return null;
     }

diff --git a/server/src/main/java/org/elasticsearch/index/mapper/IgnoredFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/IgnoredFieldMapper.java
@@ -0,0 +1,154 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermRangeQuery;
+import org.elasticsearch.common.lucene.Lucene;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.index.query.QueryShardContext;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A metadata field mapper that indexes and stores, under {@code _ignored}, the names of fields ignored as malformed.
+ */
+public final class IgnoredFieldMapper extends MetadataFieldMapper {
+
+    public static final String NAME = "_ignored"; // name of the metadata field
+
+    public static final String CONTENT_TYPE = "_ignored"; // returned by typeName() and contentType()
+
+    public static class Defaults {
+        public static final String NAME = IgnoredFieldMapper.NAME;
+
+        public static final MappedFieldType FIELD_TYPE = new IgnoredFieldType();
+
+        static {
+            FIELD_TYPE.setIndexOptions(IndexOptions.DOCS); // documents only: no frequencies or positions
+            FIELD_TYPE.setTokenized(false);
+            FIELD_TYPE.setStored(true); // stored as well as indexed
+            FIELD_TYPE.setOmitNorms(true);
+            FIELD_TYPE.setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
+            FIELD_TYPE.setSearchAnalyzer(Lucene.KEYWORD_ANALYZER);
+            FIELD_TYPE.setName(NAME);
+            FIELD_TYPE.freeze();
+        }
+    }
+
+    public static class Builder extends MetadataFieldMapper.Builder<Builder, IgnoredFieldMapper> {
+
+        public Builder(MappedFieldType existing) { // existing may be null: Defaults.FIELD_TYPE is used then
+            super(Defaults.NAME, existing == null ? Defaults.FIELD_TYPE : existing, Defaults.FIELD_TYPE);
+        }
+
+        @Override
+        public IgnoredFieldMapper build(BuilderContext context) {
+            return new IgnoredFieldMapper(context.indexSettings()); // the mapper constructor always uses Defaults.FIELD_TYPE
+        }
+    }
+
+    public static class TypeParser implements MetadataFieldMapper.TypeParser {
+        @Override
+        public MetadataFieldMapper.Builder<?,?> parse(String name, Map<String, Object> node,
+                ParserContext parserContext) throws MapperParsingException {
+            return new Builder(parserContext.mapperService().fullName(NAME)); // name and node are not read here
+        }
+
+        @Override
+        public MetadataFieldMapper getDefault(MappedFieldType fieldType, ParserContext context) {
+            final Settings indexSettings = context.mapperService().getIndexSettings().getSettings();
+            return new IgnoredFieldMapper(indexSettings); // fieldType is not read here
+        }
+    }
+
+    public static final class IgnoredFieldType extends TermBasedFieldType {
+
+        public IgnoredFieldType() {
+        }
+
+        protected IgnoredFieldType(IgnoredFieldType ref) { // copy constructor, used by clone()
+            super(ref);
+        }
+
+        @Override
+        public IgnoredFieldType clone() {
+            return new IgnoredFieldType(this);
+        }
+
+        @Override
+        public String typeName() {
+            return CONTENT_TYPE;
+        }
+
+        @Override
+        public Query existsQuery(QueryShardContext context) {
+            // An open-ended range (null bounds) matches every document that has at
+            // least one term in this field. This query is not performance sensitive
+            // as it only helps assess the quality of the data, and the number of
+            // unique terms is bounded by the number of fields in the mappings.
+            return new TermRangeQuery(name(), null, null, true, true);
+        }
+
+    }
+
+    private IgnoredFieldMapper(Settings indexSettings) {
+        super(NAME, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE, indexSettings);
+    }
+
+    @Override
+    public void preParse(ParseContext context) throws IOException { // no-op: the work is done in postParse
+    }
+
+    @Override
+    public void postParse(ParseContext context) throws IOException {
+        super.parse(context); // not this.parse(), which is a no-op; presumably reaches parseCreateField - confirm in the superclass
+    }
+
+    @Override
+    public Mapper parse(ParseContext context) throws IOException {
+        // nothing to do here: done in post-parse
+        return null;
+    }
+
+    @Override
+    protected void parseCreateField(ParseContext context, List<IndexableField> fields) throws IOException {
+        for (String field : context.getIgnoredFields()) { // one _ignored value per ignored field name
+            context.doc().add(new Field(NAME, field, fieldType())); // added to the document directly; the fields list is not used
+        }
+    }
+
+    @Override
+    protected String contentType() {
+        return CONTENT_TYPE;
+    }
+
+    @Override
+    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+        return builder; // the builder is returned unchanged: nothing is written for this mapper
+    }
+
+}