From 5df2994b7e60b1a95f0638df87b85519c6670dbc Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Wed, 14 Dec 2016 16:55:41 +0100 Subject: [PATCH 1/3] Date detection should not rely on a hardcoded set of characters. Currently we only apply date detection on strings that contain either `:`, `-` or `/`. This commit inverses the heuristic in order to only apply date detection on strings that are not parseable as a number, so that more date formats can be used as dynamic dates formats. Closes #1694 --- .../index/mapper/DocumentParser.java | 77 ++++++++++--------- 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/index/mapper/DocumentParser.java b/core/src/main/java/org/elasticsearch/index/mapper/DocumentParser.java index 72d021c0e3e0d..1829dce1ecd49 100644 --- a/core/src/main/java/org/elasticsearch/index/mapper/DocumentParser.java +++ b/core/src/main/java/org/elasticsearch/index/mapper/DocumentParser.java @@ -22,7 +22,6 @@ import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexableField; import org.elasticsearch.Version; -import org.elasticsearch.common.Strings; import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.common.joda.FormatDateTimeFormatter; import org.elasticsearch.common.xcontent.XContentHelper; @@ -650,47 +649,55 @@ private static Mapper.Builder createBuilderFromFieldType(final ParseContext private static Mapper.Builder createBuilderFromDynamicValue(final ParseContext context, XContentParser.Token token, String currentFieldName) throws IOException { if (token == XContentParser.Token.VALUE_STRING) { - if (context.root().dateDetection()) { - String text = context.parser().text(); - // a safe check since "1" gets parsed as well - if (Strings.countOccurrencesOf(text, ":") > 1 || Strings.countOccurrencesOf(text, "-") > 1 || Strings.countOccurrencesOf(text, "/") > 1) { - for (FormatDateTimeFormatter dateTimeFormatter : context.root().dynamicDateTimeFormatters()) { - try { - dateTimeFormatter.parser().parseMillis(text); - Mapper.Builder builder = context.root().findTemplateBuilder(context, currentFieldName, XContentFieldType.DATE); - if (builder == null) { - builder = newDateBuilder(currentFieldName, dateTimeFormatter, Version.indexCreated(context.indexSettings())); - } - return builder; - } catch (Exception e) { - // failure to parse this, continue - } - } - } + String text = context.parser().text(); + + boolean parseableAsLong = false; + try { + Long.parseLong(text); + parseableAsLong = true; + } catch (NumberFormatException e) { + // not a long number } - if (context.root().numericDetection()) { - String text = context.parser().text(); - try { - Long.parseLong(text); - Mapper.Builder builder = context.root().findTemplateBuilder(context, currentFieldName, XContentFieldType.LONG); - if (builder == null) { - builder = newLongBuilder(currentFieldName, Version.indexCreated(context.indexSettings())); - } - return builder; - } catch (NumberFormatException e) { - // not a long number + + boolean parseableAsDouble = false; + try { + Double.parseDouble(text); + parseableAsDouble = true; + } catch (NumberFormatException e) { + // not a double number + } + + if (parseableAsLong && context.root().numericDetection()) { + Mapper.Builder builder = context.root().findTemplateBuilder(context, currentFieldName, XContentFieldType.LONG); + if (builder == null) { + builder = newLongBuilder(currentFieldName, Version.indexCreated(context.indexSettings())); } - try { - Double.parseDouble(text); - Mapper.Builder builder = context.root().findTemplateBuilder(context, currentFieldName, XContentFieldType.DOUBLE); + return builder; + } else if (parseableAsDouble && context.root().numericDetection()) { + Mapper.Builder builder = context.root().findTemplateBuilder(context, currentFieldName, XContentFieldType.DOUBLE); + if (builder == null) { + builder = newFloatBuilder(currentFieldName, Version.indexCreated(context.indexSettings())); + } + return builder; + } else if (parseableAsLong == false && parseableAsDouble == false && context.root().dateDetection()) { + // We refuse to match pure numbers, which are too likely to be + // false positives with date formats that include eg. + // `epoch_millis` or `YYYY` + for (FormatDateTimeFormatter dateTimeFormatter : context.root().dynamicDateTimeFormatters()) { + try { + dateTimeFormatter.parser().parseMillis(text); + } catch (IllegalArgumentException e) { + // failure to parse this, continue + continue; + } + Mapper.Builder builder = context.root().findTemplateBuilder(context, currentFieldName, XContentFieldType.DATE); if (builder == null) { - builder = newFloatBuilder(currentFieldName, Version.indexCreated(context.indexSettings())); + builder = newDateBuilder(currentFieldName, dateTimeFormatter, Version.indexCreated(context.indexSettings())); } return builder; - } catch (NumberFormatException e) { - // not a long number } } + Mapper.Builder builder = context.root().findTemplateBuilder(context, currentFieldName, XContentFieldType.STRING); if (builder == null) { builder = new TextFieldMapper.Builder(currentFieldName) From 07cbcc84a9631cd455044eb4ce96d07d87ecc6c9 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Wed, 14 Dec 2016 16:59:19 +0100 Subject: [PATCH 2/3] iter --- .../org/elasticsearch/common/Strings.java | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/common/Strings.java b/core/src/main/java/org/elasticsearch/common/Strings.java index 1ef13e3bc7057..1689054e043c3 100644 --- a/core/src/main/java/org/elasticsearch/common/Strings.java +++ b/core/src/main/java/org/elasticsearch/common/Strings.java @@ -276,26 +276,6 @@ public static boolean substringMatch(CharSequence str, int index, CharSequence s return true; } - /** - * Count the occurrences of the substring in string s. - * - * @param str string to search in. Return 0 if this is null. - * @param sub string to search for. Return 0 if this is null. - */ - public static int countOccurrencesOf(String str, String sub) { - if (str == null || sub == null || str.length() == 0 || sub.length() == 0) { - return 0; - } - int count = 0; - int pos = 0; - int idx; - while ((idx = str.indexOf(sub, pos)) != -1) { - ++count; - pos = idx + sub.length(); - } - return count; - } - /** * Replace all occurrences of a substring within a string with * another string. From c1fa7e332393eb10b1e95a530cb5423479aace01 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 22 Dec 2016 11:42:43 +0100 Subject: [PATCH 3/3] tests --- .../index/mapper/DocumentParserTests.java | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/core/src/test/java/org/elasticsearch/index/mapper/DocumentParserTests.java b/core/src/test/java/org/elasticsearch/index/mapper/DocumentParserTests.java index f5a92d3f9797e..f964492ea6644 100644 --- a/core/src/test/java/org/elasticsearch/index/mapper/DocumentParserTests.java +++ b/core/src/test/java/org/elasticsearch/index/mapper/DocumentParserTests.java @@ -49,6 +49,7 @@ import static org.elasticsearch.test.StreamsUtils.copyToStringFromClasspath; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.instanceOf; +import static org.hamcrest.Matchers.not; // TODO: make this a real unit test public class DocumentParserTests extends ESSingleNodeTestCase { @@ -1260,4 +1261,49 @@ public void testIncludeInAllPropagation() throws IOException { } assertEquals(new HashSet<>(Arrays.asList("b", "d")), values); } + + public void testDynamicDateDetectionDisabledOnNumbers() throws IOException { + DocumentMapperParser mapperParser = createIndex("test").mapperService().documentMapperParser(); + String mapping = XContentFactory.jsonBuilder().startObject().startObject("type") + .startArray("dynamic_date_formats") + .value("yyyy") + .endArray().endObject().endObject().string(); + DocumentMapper mapper = mapperParser.parse("type", new CompressedXContent(mapping)); + + BytesReference bytes = XContentFactory.jsonBuilder() + .startObject() + .field("foo", "2016") + .endObject().bytes(); + + // Even though we matched the dynamic format, we do not match on numbers, + // which are too likely to be false positives + ParsedDocument doc = mapper.parse("test", "type", "1", bytes); + Mapping update = doc.dynamicMappingsUpdate(); + assertNotNull(update); + Mapper dateMapper = update.root().getMapper("foo"); + assertNotNull(dateMapper); + assertThat(dateMapper, not(instanceOf(DateFieldMapper.class))); + } + + public void testDynamicDateDetectionEnabledWithNoSpecialCharacters() throws IOException { + DocumentMapperParser mapperParser = createIndex("test").mapperService().documentMapperParser(); + String mapping = XContentFactory.jsonBuilder().startObject().startObject("type") + .startArray("dynamic_date_formats") + .value("yyyy MM") + .endArray().endObject().endObject().string(); + DocumentMapper mapper = mapperParser.parse("type", new CompressedXContent(mapping)); + + BytesReference bytes = XContentFactory.jsonBuilder() + .startObject() + .field("foo", "2016 12") + .endObject().bytes(); + + // We should have generated a date field + ParsedDocument doc = mapper.parse("test", "type", "1", bytes); + Mapping update = doc.dynamicMappingsUpdate(); + assertNotNull(update); + Mapper dateMapper = update.root().getMapper("foo"); + assertNotNull(dateMapper); + assertThat(dateMapper, instanceOf(DateFieldMapper.class)); + } }