From 8c153fce9f965e4968c48aeee93697ba8c755a06 Mon Sep 17 00:00:00 2001 From: David Roberts Date: Mon, 3 Sep 2018 15:33:28 +0100 Subject: [PATCH 1/2] [ML] Minor improvements to categorization Grok pattern creation 1. The TOMCAT_DATESTAMP format needs to be checked before TIMESTAMP_ISO8601, otherwise TIMESTAMP_ISO8601 will match the start of the Tomcat datestamp. 2. Exclude more characters before and after numbers. For example, in 1.2.3 we don't want to match 1.2 as a float. --- .../job/categorization/GrokPatternCreator.java | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java index 04280261b2634..a0e00ebf73353 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java @@ -25,14 +25,15 @@ */ public final class GrokPatternCreator { - private static String PREFACE = "preface"; - private static String EPILOGUE = "epilogue"; + private static final String PREFACE = "preface"; + private static final String EPILOGUE = "epilogue"; /** * The first match in this list will be chosen, so it needs to be ordered * such that more generic patterns come after more specific patterns. */ private static final List ORDERED_CANDIDATE_GROK_PATTERNS = Arrays.asList( + new GrokPatternCandidate("TOMCAT_DATESTAMP", "timestamp"), new GrokPatternCandidate("TIMESTAMP_ISO8601", "timestamp"), new GrokPatternCandidate("DATESTAMP_RFC822", "timestamp"), new GrokPatternCandidate("DATESTAMP_RFC2822", "timestamp"), @@ -41,7 +42,6 @@ public final class GrokPatternCreator { new GrokPatternCandidate("SYSLOGTIMESTAMP", "timestamp"), new GrokPatternCandidate("HTTPDATE", "timestamp"), new GrokPatternCandidate("CATALINA_DATESTAMP", "timestamp"), - new GrokPatternCandidate("TOMCAT_DATESTAMP", "timestamp"), new GrokPatternCandidate("CISCOTIMESTAMP", "timestamp"), new GrokPatternCandidate("DATE", "date"), new GrokPatternCandidate("TIME", "time"), @@ -56,12 +56,10 @@ public final class GrokPatternCreator { new GrokPatternCandidate("IP", "ipaddress"), // This already includes pre/post break conditions new GrokPatternCandidate("QUOTEDSTRING", "field", "", ""), - // Can't use \b as the break before, because it doesn't work for negative numbers (the - // minus sign is not a "word" character) - new GrokPatternCandidate("NUMBER", "field", "(? Date: Mon, 3 Sep 2018 17:36:51 +0100 Subject: [PATCH 2/2] Add unit tests --- .../GrokPatternCreatorTests.java | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java index 4189dc35f0caa..381a02e4580ad 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java @@ -76,6 +76,40 @@ public void testAppendBestGrokMatchForStringsGivenTimestampsAndLogLevels() { assertEquals(".+?%{TIMESTAMP_ISO8601:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString()); } + public void testAppendBestGrokMatchForStringsGivenTomcatDatestamps() { + + // The first part of the Tomcat datestamp can match as an ISO8601 + // timestamp if the ordering of candidate patterns is wrong + Collection mustMatchStrings = Arrays.asList("2018-09-03 17:03:28,269 +0100 | ERROR | ", + "2018-09-03 17:04:27,279 +0100 | DEBUG | ", + "2018-09-03 17:05:26,289 +0100 | ERROR | "); + + Map fieldNameCountStore = new HashMap<>(); + StringBuilder overallGrokPatternBuilder = new StringBuilder(); + + GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + + assertEquals(".*?%{TOMCAT_DATESTAMP:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString()); + } + + public void testAppendBestGrokMatchForStringsGivenTrappyFloatCandidates() { + + // If we're not careful then we might detect the first part of these strings as a + // number, e.g. 1.2 in the first example, but this is inappropriate given the + // trailing dot and digit + Collection mustMatchStrings = Arrays.asList("1.2.3", + "-2.3.4", + "4.5.6.7", + "-9.8.7.6.5"); + + Map fieldNameCountStore = new HashMap<>(); + StringBuilder overallGrokPatternBuilder = new StringBuilder(); + + GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + + assertEquals(".+?", overallGrokPatternBuilder.toString()); + } + public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() { Collection mustMatchStrings = Arrays.asList("(-2)",