diff --git a/client/rest-high-level/src/test/java/org/elasticsearch/client/MachineLearningIT.java b/client/rest-high-level/src/test/java/org/elasticsearch/client/MachineLearningIT.java index 9eeb0cb6602cf..d2b18e422ace8 100644 --- a/client/rest-high-level/src/test/java/org/elasticsearch/client/MachineLearningIT.java +++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/MachineLearningIT.java @@ -2895,8 +2895,17 @@ public void testFindFileStructure() throws IOException { FindFileStructureRequest request = new FindFileStructureRequest(); request.setSample(sample.getBytes(StandardCharsets.UTF_8)); - FindFileStructureResponse response = - execute(request, machineLearningClient::findFileStructure, machineLearningClient::findFileStructureAsync); + FindFileStructureResponse response = execute( + request, + machineLearningClient::findFileStructure, + machineLearningClient::findFileStructureAsync, + RequestOptions.DEFAULT + .toBuilder() + .setWarningsHandler( + warnings -> Collections.singletonList( + "[POST /_ml/find_file_structure] is deprecated! Use [POST /_text_structure/find_structure] instead." 
+ ).equals(warnings) == false + ).build()); FileStructure structure = response.getFileStructure(); diff --git a/client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java b/client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java index bbbf3cd042344..c795914aff0d4 100644 --- a/client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java +++ b/client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java @@ -1870,8 +1870,17 @@ public void testFindFileStructure() throws Exception { // end::find-file-structure-request-options // tag::find-file-structure-execute - FindFileStructureResponse findFileStructureResponse = - client.machineLearning().findFileStructure(findFileStructureRequest, RequestOptions.DEFAULT); + FindFileStructureResponse findFileStructureResponse = client + .machineLearning() + .findFileStructure( + findFileStructureRequest, + RequestOptions.DEFAULT + .toBuilder() + .setWarningsHandler( + warnings -> Collections.singletonList( + "[POST /_ml/find_file_structure] is deprecated! Use [POST /_text_structure/find_structure] instead." + ).equals(warnings) == false + ).build()); // end::find-file-structure-execute // tag::find-file-structure-response diff --git a/docs/reference/ml/anomaly-detection/apis/find-file-structure.asciidoc b/docs/reference/ml/anomaly-detection/apis/find-file-structure.asciidoc index a1c7516c01824..edfc3bf86e8e2 100644 --- a/docs/reference/ml/anomaly-detection/apis/find-file-structure.asciidoc +++ b/docs/reference/ml/anomaly-detection/apis/find-file-structure.asciidoc @@ -278,6 +278,7 @@ POST _ml/find_file_structure {"name": "The Left Hand of Darkness", "author": "Ursula K. Le Guin", "release_date": "1969-06-01", "page_count": 304} {"name": "The Moon is a Harsh Mistress", "author": "Robert A. 
Heinlein", "release_date": "1966-04-01", "page_count": 288} ---- +// TEST[warning:[POST /_ml/find_file_structure] is deprecated! Use [POST /_text_structure/find_structure] instead.] If the request does not encounter errors, you receive the following result: diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackClientPlugin.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackClientPlugin.java index ab5396529e890..908b093cff72a 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackClientPlugin.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackClientPlugin.java @@ -90,7 +90,7 @@ import org.elasticsearch.xpack.core.ml.action.EvaluateDataFrameAction; import org.elasticsearch.xpack.core.ml.action.ExplainDataFrameAnalyticsAction; import org.elasticsearch.xpack.core.ml.action.FinalizeJobExecutionAction; -import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction; +import org.elasticsearch.xpack.core.textstructure.action.FindFileStructureAction; import org.elasticsearch.xpack.core.ml.action.FlushJobAction; import org.elasticsearch.xpack.core.ml.action.ForecastJobAction; import org.elasticsearch.xpack.core.ml.action.GetBucketsAction; @@ -316,7 +316,6 @@ public List> getClientActions() { GetCalendarEventsAction.INSTANCE, PostCalendarEventsAction.INSTANCE, PersistJobAction.INSTANCE, - FindFileStructureAction.INSTANCE, SetUpgradeModeAction.INSTANCE, PutDataFrameAnalyticsAction.INSTANCE, GetDataFrameAnalyticsAction.INSTANCE, @@ -408,7 +407,9 @@ public List> getClientActions() { // Async Search SubmitAsyncSearchAction.INSTANCE, GetAsyncSearchAction.INSTANCE, - DeleteAsyncResultAction.INSTANCE + DeleteAsyncResultAction.INSTANCE, + // Text Structure + FindFileStructureAction.INSTANCE )); // rollupV2 diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java 
b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/textstructure/action/FindFileStructureAction.java similarity index 99% rename from x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java rename to x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/textstructure/action/FindFileStructureAction.java index 757760ec3ec9b..aaef615e12fd4 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/textstructure/action/FindFileStructureAction.java @@ -3,7 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -package org.elasticsearch.xpack.core.ml.action; +package org.elasticsearch.xpack.core.textstructure.action; import org.elasticsearch.action.ActionRequest; import org.elasticsearch.action.ActionRequestValidationException; @@ -18,7 +18,7 @@ import org.elasticsearch.common.xcontent.StatusToXContentObject; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.rest.RestStatus; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import java.io.IOException; import java.util.Arrays; diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStats.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/textstructure/structurefinder/FieldStats.java similarity index 99% rename from x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStats.java rename to x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/textstructure/structurefinder/FieldStats.java index 8bc7113c2c3a3..cbd5c36412f9a 100644 --- 
a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStats.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/textstructure/structurefinder/FieldStats.java @@ -3,7 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -package org.elasticsearch.xpack.core.ml.filestructurefinder; +package org.elasticsearch.xpack.core.textstructure.structurefinder; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.Strings; diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/textstructure/structurefinder/FileStructure.java similarity index 99% rename from x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java rename to x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/textstructure/structurefinder/FileStructure.java index 31098223d4b7b..fc2e91bbc9981 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/textstructure/structurefinder/FileStructure.java @@ -3,7 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -package org.elasticsearch.xpack.core.ml.filestructurefinder; +package org.elasticsearch.xpack.core.textstructure.structurefinder; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.io.stream.StreamInput; @@ -25,7 +25,7 @@ import java.util.TreeMap; /** - * Stores the file format determined by Machine Learning. + * Stores the determined file format. 
*/ public class FileStructure implements ToXContentObject, Writeable { diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/security/authz/store/ReservedRolesStoreTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/security/authz/store/ReservedRolesStoreTests.java index 15ab0e1491e65..0fbb79db08634 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/security/authz/store/ReservedRolesStoreTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/security/authz/store/ReservedRolesStoreTests.java @@ -73,7 +73,7 @@ import org.elasticsearch.xpack.core.ml.action.EvaluateDataFrameAction; import org.elasticsearch.xpack.core.ml.action.ExplainDataFrameAnalyticsAction; import org.elasticsearch.xpack.core.ml.action.FinalizeJobExecutionAction; -import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction; +import org.elasticsearch.xpack.core.textstructure.action.FindFileStructureAction; import org.elasticsearch.xpack.core.ml.action.FlushJobAction; import org.elasticsearch.xpack.core.ml.action.ForecastJobAction; import org.elasticsearch.xpack.core.ml.action.GetBucketsAction; diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/textstructure/action/FindFileStructureActionRequestTests.java similarity index 97% rename from x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java rename to x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/textstructure/action/FindFileStructureActionRequestTests.java index 557a044d27b21..d60c04c0537f7 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/textstructure/action/FindFileStructureActionRequestTests.java 
@@ -3,13 +3,13 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -package org.elasticsearch.xpack.core.ml.action; +package org.elasticsearch.xpack.core.textstructure.action; import org.elasticsearch.action.ActionRequestValidationException; import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.test.AbstractWireSerializingTestCase; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import java.util.Arrays; diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionResponseTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/textstructure/action/FindFileStructureActionResponseTests.java similarity index 78% rename from x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionResponseTests.java rename to x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/textstructure/action/FindFileStructureActionResponseTests.java index 0c8970fd35b4b..83fea7fba34e6 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionResponseTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/textstructure/action/FindFileStructureActionResponseTests.java @@ -3,11 +3,12 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.core.ml.action; +package org.elasticsearch.xpack.core.textstructure.action; import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.test.AbstractWireSerializingTestCase; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructureTests; +import org.elasticsearch.xpack.core.textstructure.action.FindFileStructureAction; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructureTests; public class FindFileStructureActionResponseTests extends AbstractWireSerializingTestCase { diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStatsTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/textstructure/structurefinder/FieldStatsTests.java similarity index 97% rename from x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStatsTests.java rename to x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/textstructure/structurefinder/FieldStatsTests.java index 889eae19387fc..37c1aa7fdc9bf 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FieldStatsTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/textstructure/structurefinder/FieldStatsTests.java @@ -3,7 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.core.ml.filestructurefinder; +package org.elasticsearch.xpack.core.textstructure.structurefinder; import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.common.xcontent.XContentParser; diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/textstructure/structurefinder/FileStructureTests.java similarity index 98% rename from x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java rename to x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/textstructure/structurefinder/FileStructureTests.java index d1493f2fe4dae..a186e3372bdc9 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/textstructure/structurefinder/FileStructureTests.java @@ -3,7 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.core.ml.filestructurefinder; +package org.elasticsearch.xpack.core.textstructure.structurefinder; import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.common.xcontent.ToXContent; diff --git a/x-pack/plugin/ml/build.gradle b/x-pack/plugin/ml/build.gradle index 64a673b264d16..fd2bb739acf92 100644 --- a/x-pack/plugin/ml/build.gradle +++ b/x-pack/plugin/ml/build.gradle @@ -58,7 +58,6 @@ dependencies { // ml deps api project(':libs:elasticsearch-grok') - api "com.ibm.icu:icu4j:${versions.icu4j}" api "net.sf.supercsv:super-csv:${versions.supercsv}" nativeBundle("org.elasticsearch.ml:ml-cpp:${project.version}@zip") { changing = true diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java index 70a4b6382d7e8..96c9b72b854a4 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java @@ -94,7 +94,6 @@ import org.elasticsearch.xpack.core.ml.action.EvaluateDataFrameAction; import org.elasticsearch.xpack.core.ml.action.ExplainDataFrameAnalyticsAction; import org.elasticsearch.xpack.core.ml.action.FinalizeJobExecutionAction; -import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction; import org.elasticsearch.xpack.core.ml.action.FlushJobAction; import org.elasticsearch.xpack.core.ml.action.ForecastJobAction; import org.elasticsearch.xpack.core.ml.action.GetBucketsAction; @@ -173,7 +172,6 @@ import org.elasticsearch.xpack.ml.action.TransportEvaluateDataFrameAction; import org.elasticsearch.xpack.ml.action.TransportExplainDataFrameAnalyticsAction; import org.elasticsearch.xpack.ml.action.TransportFinalizeJobExecutionAction; -import org.elasticsearch.xpack.ml.action.TransportFindFileStructureAction; import org.elasticsearch.xpack.ml.action.TransportFlushJobAction; import 
org.elasticsearch.xpack.ml.action.TransportForecastJobAction; import org.elasticsearch.xpack.ml.action.TransportGetBucketsAction; @@ -279,7 +277,6 @@ import org.elasticsearch.xpack.ml.process.NativeController; import org.elasticsearch.xpack.ml.process.NativeStorageProvider; import org.elasticsearch.xpack.ml.rest.RestDeleteExpiredDataAction; -import org.elasticsearch.xpack.ml.rest.RestFindFileStructureAction; import org.elasticsearch.xpack.ml.rest.RestMlInfoAction; import org.elasticsearch.xpack.ml.rest.RestSetUpgradeModeAction; import org.elasticsearch.xpack.ml.rest.calendar.RestDeleteCalendarAction; @@ -922,7 +919,6 @@ public List getRestHandlers(Settings settings, RestController restC new RestPutCalendarJobAction(), new RestGetCalendarEventsAction(), new RestPostCalendarEventAction(), - new RestFindFileStructureAction(), new RestSetUpgradeModeAction(), new RestGetDataFrameAnalyticsAction(), new RestGetDataFrameAnalyticsStatsAction(), @@ -1005,7 +1001,6 @@ public List getRestHandlers(Settings settings, RestController restC new ActionHandler<>(GetCalendarEventsAction.INSTANCE, TransportGetCalendarEventsAction.class), new ActionHandler<>(PostCalendarEventsAction.INSTANCE, TransportPostCalendarEventsAction.class), new ActionHandler<>(PersistJobAction.INSTANCE, TransportPersistJobAction.class), - new ActionHandler<>(FindFileStructureAction.INSTANCE, TransportFindFileStructureAction.class), new ActionHandler<>(SetUpgradeModeAction.INSTANCE, TransportSetUpgradeModeAction.class), new ActionHandler<>(GetDataFrameAnalyticsAction.INSTANCE, TransportGetDataFrameAnalyticsAction.class), new ActionHandler<>(GetDataFrameAnalyticsStatsAction.INSTANCE, TransportGetDataFrameAnalyticsStatsAction.class), diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureTestCase.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureTestCase.java deleted file mode 100644 index f47a3582c66a9..0000000000000 
--- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureTestCase.java +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License; - * you may not use this file except in compliance with the Elastic License. - */ -package org.elasticsearch.xpack.ml.filestructurefinder; - -import org.apache.logging.log4j.LogManager; -import org.elasticsearch.test.ESTestCase; -import org.junit.After; -import org.junit.Before; - -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Locale; -import java.util.stream.Collectors; - -public abstract class FileStructureTestCase extends ESTestCase { - - protected static final List POSSIBLE_CHARSETS = Collections.unmodifiableList(Charset.availableCharsets().keySet().stream() - .filter(name -> FileStructureFinderManager.FILEBEAT_SUPPORTED_ENCODINGS.contains(name.toLowerCase(Locale.ROOT))) - .collect(Collectors.toList())); - - protected static final String CSV_SAMPLE = "time,id,value\n" + - "2018-05-17T16:23:40,key1,42.0\n" + - "2018-05-17T16:24:11,\"key with spaces\",42.0\n"; - - protected static final String NDJSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," + - "\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 1\",\"class\":\"ml\"," + - "\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n" + - "{\"logger\":\"controller\",\"timestamp\":1478261151445," + - "\"level\":\"INFO\",\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 2\",\"class\":\"ml\"," + - "\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n"; - - protected static final String PIPE_DELIMITED_SAMPLE = "2018-01-06 16:56:14.295748|INFO |VirtualServer |1 |" + - "listening on 0.0.0.0:9987, :::9987\n" + - 
"2018-01-06 17:19:44.465252|INFO |VirtualServer |1 |client " + - "'User1'(id:2) changed default admin channelgroup to 'Guest'(id:8)\n" + - "2018-01-06 17:21:25.764368|INFO |VirtualServer |1 |client " + - "'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel 'Default Channel'(id:1)"; - - protected static final String SEMI_COLON_DELIMITED_SAMPLE = "\"pos_id\";\"trip_id\";\"latitude\";\"longitude\";\"altitude\";" + - "\"timestamp\"\n" + - "\"1\";\"3\";\"4703.7815\";\"1527.4713\";\"359.9\";\"2017-01-19 16:19:04.742113\"\n" + - "\"2\";\"3\";\"4703.7815\";\"1527.4714\";\"359.9\";\"2017-01-19 16:19:05.741890\"\n" + - "\"3\";\"3\";\"4703.7816\";\"1527.4716\";\"360.3\";\"2017-01-19 16:19:06.738842\""; - - protected static final String TEXT_SAMPLE = "[2018-05-11T17:07:29,461][INFO ][o.e.n.Node ] [node-0] initializing ...\n" + - "[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], " + - "net usable_space [223.4gb], net total_space [464.7gb], types [hfs]\n" + - "[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [3.9gb], " + - "compressed ordinary object pointers [true]\n" + - "[2018-05-11T17:07:29,556][INFO ][o.e.n.Node ] [node-0] node name [node-0], node ID [tJ9u8HcaTbWxRtnlfz1RQA]\n"; - - protected static final String TSV_SAMPLE = "time\tid\tvalue\n" + - "2018-05-17T16:23:40\tkey1\t42.0\n" + - "2018-05-17T16:24:11\t\"key with spaces\"\t42.0\n"; - - protected static final String XML_SAMPLE = "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n"; - - // This doesn't need closing because it has an infinite timeout - protected static final TimeoutChecker NOOP_TIMEOUT_CHECKER = new TimeoutChecker("unit test", null, null); - - protected List explanation; - - @Before - public void initExplanation() { - explanation = new ArrayList<>(); - } - - @After - public void printExplanation() { - 
LogManager.getLogger(getClass()).info("Explanation:\n" + String.join("\n", explanation)); - } - - protected Boolean randomHasByteOrderMarker(String charset) { - return charset.toUpperCase(Locale.ROOT).startsWith("UTF") ? randomBoolean() : null; - } -} diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/api/ml.find_file_structure.json b/x-pack/plugin/src/test/resources/rest-api-spec/api/text_structure.find_structure.json similarity index 97% rename from x-pack/plugin/src/test/resources/rest-api-spec/api/ml.find_file_structure.json rename to x-pack/plugin/src/test/resources/rest-api-spec/api/text_structure.find_structure.json index 1eae145d0d4a5..ec28e8b6ba6e9 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/api/ml.find_file_structure.json +++ b/x-pack/plugin/src/test/resources/rest-api-spec/api/text_structure.find_structure.json @@ -1,5 +1,5 @@ { - "ml.find_file_structure":{ + "text_structure.find_structure":{ "documentation":{ "url":"https://www.elastic.co/guide/en/elasticsearch/reference/current/ml-find-file-structure.html", "description":"Finds the structure of a text file. The text file must contain data that is suitable to be ingested into Elasticsearch." 
@@ -13,7 +13,7 @@ "url":{ "paths":[ { - "path":"/_ml/find_file_structure", + "path":"/_text_structure/find_structure", "methods":[ "POST" ] diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/text_structure/find_file_structure.yml similarity index 98% rename from x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml rename to x-pack/plugin/src/test/resources/rest-api-spec/test/text_structure/find_file_structure.yml index 756f8b43ed02c..0d983616368c1 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/text_structure/find_file_structure.yml @@ -8,7 +8,7 @@ setup: # This is to stop the usual content type randomization, which # would obviously ruin the results for this particular test Content-Type: "application/json" - ml.find_file_structure: + text_structure.find_structure: lines_to_sample: 3 line_merge_size_limit: 1234 timeout: 10s @@ -67,7 +67,7 @@ setup: # This is to stop the usual content type randomization, which # would obviously ruin the results for this particular test Content-Type: "application/json" - ml.find_file_structure: + text_structure.find_structure: charset: UTF-8 format: ndjson timestamp_field: time diff --git a/x-pack/plugin/text-structure/build.gradle b/x-pack/plugin/text-structure/build.gradle new file mode 100644 index 0000000000000..0b312279d25e0 --- /dev/null +++ b/x-pack/plugin/text-structure/build.gradle @@ -0,0 +1,20 @@ +apply plugin: 'elasticsearch.esplugin' +esplugin { + name 'x-pack-text-structure' + description 'Elasticsearch Expanded Pack Plugin - Text Structure' + classname 'org.elasticsearch.xpack.textstructure.TextStructurePlugin' + extendedPlugins = ['x-pack-core'] +} +archivesBaseName = 'x-pack-text-structure' + +dependencies { + compileOnly project(path: xpackModule('core'), configuration: 'default') + 
testImplementation project(path: xpackModule('core'), configuration: 'testArtifacts') + api project(':libs:elasticsearch-grok') + api "com.ibm.icu:icu4j:${versions.icu4j}" + api "net.sf.supercsv:super-csv:${versions.supercsv}" +} + +addQaCheckDependencies() + +tasks.named("testingConventions").configure { enabled = false } diff --git a/x-pack/plugin/ml/licenses/icu4j-62.1.jar.sha1 b/x-pack/plugin/text-structure/licenses/icu4j-62.1.jar.sha1 similarity index 100% rename from x-pack/plugin/ml/licenses/icu4j-62.1.jar.sha1 rename to x-pack/plugin/text-structure/licenses/icu4j-62.1.jar.sha1 diff --git a/x-pack/plugin/ml/licenses/icu4j-LICENSE.txt b/x-pack/plugin/text-structure/licenses/icu4j-LICENSE.txt similarity index 100% rename from x-pack/plugin/ml/licenses/icu4j-LICENSE.txt rename to x-pack/plugin/text-structure/licenses/icu4j-LICENSE.txt diff --git a/x-pack/plugin/ml/licenses/icu4j-NOTICE.txt b/x-pack/plugin/text-structure/licenses/icu4j-NOTICE.txt similarity index 100% rename from x-pack/plugin/ml/licenses/icu4j-NOTICE.txt rename to x-pack/plugin/text-structure/licenses/icu4j-NOTICE.txt diff --git a/x-pack/plugin/text-structure/licenses/super-csv-2.4.0.jar.sha1 b/x-pack/plugin/text-structure/licenses/super-csv-2.4.0.jar.sha1 new file mode 100644 index 0000000000000..a0b402133090d --- /dev/null +++ b/x-pack/plugin/text-structure/licenses/super-csv-2.4.0.jar.sha1 @@ -0,0 +1 @@ +017f8708c929029dde48bc298deaf3c7ae2452d3 \ No newline at end of file diff --git a/x-pack/plugin/text-structure/licenses/super-csv-LICENSE.txt b/x-pack/plugin/text-structure/licenses/super-csv-LICENSE.txt new file mode 100644 index 0000000000000..9e0ad072b2527 --- /dev/null +++ b/x-pack/plugin/text-structure/licenses/super-csv-LICENSE.txt @@ -0,0 +1,203 @@ +/* + * Apache License + * Version 2.0, January 2004 + * http://www.apache.org/licenses/ + * + * TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + * + * 1. Definitions. 
+ * + * "License" shall mean the terms and conditions for use, reproduction, + * and distribution as defined by Sections 1 through 9 of this document. + * + * "Licensor" shall mean the copyright owner or entity authorized by + * the copyright owner that is granting the License. + * + * "Legal Entity" shall mean the union of the acting entity and all + * other entities that control, are controlled by, or are under common + * control with that entity. For the purposes of this definition, + * "control" means (i) the power, direct or indirect, to cause the + * direction or management of such entity, whether by contract or + * otherwise, or (ii) ownership of fifty percent (50%) or more of the + * outstanding shares, or (iii) beneficial ownership of such entity. + * + * "You" (or "Your") shall mean an individual or Legal Entity + * exercising permissions granted by this License. + * + * "Source" form shall mean the preferred form for making modifications, + * including but not limited to software source code, documentation + * source, and configuration files. + * + * "Object" form shall mean any form resulting from mechanical + * transformation or translation of a Source form, including but + * not limited to compiled object code, generated documentation, + * and conversions to other media types. + * + * "Work" shall mean the work of authorship, whether in Source or + * Object form, made available under the License, as indicated by a + * copyright notice that is included in or attached to the work + * (an example is provided in the Appendix below). + * + * "Derivative Works" shall mean any work, whether in Source or Object + * form, that is based on (or derived from) the Work and for which the + * editorial revisions, annotations, elaborations, or other modifications + * represent, as a whole, an original work of authorship. 
For the purposes + * of this License, Derivative Works shall not include works that remain + * separable from, or merely link (or bind by name) to the interfaces of, + * the Work and Derivative Works thereof. + * + * "Contribution" shall mean any work of authorship, including + * the original version of the Work and any modifications or additions + * to that Work or Derivative Works thereof, that is intentionally + * submitted to Licensor for inclusion in the Work by the copyright owner + * or by an individual or Legal Entity authorized to submit on behalf of + * the copyright owner. For the purposes of this definition, "submitted" + * means any form of electronic, verbal, or written communication sent + * to the Licensor or its representatives, including but not limited to + * communication on electronic mailing lists, source code control systems, + * and issue tracking systems that are managed by, or on behalf of, the + * Licensor for the purpose of discussing and improving the Work, but + * excluding communication that is conspicuously marked or otherwise + * designated in writing by the copyright owner as "Not a Contribution." + * + * "Contributor" shall mean Licensor and any individual or Legal Entity + * on behalf of whom a Contribution has been received by Licensor and + * subsequently incorporated within the Work. + * + * 2. Grant of Copyright License. Subject to the terms and conditions of + * this License, each Contributor hereby grants to You a perpetual, + * worldwide, non-exclusive, no-charge, royalty-free, irrevocable + * copyright license to reproduce, prepare Derivative Works of, + * publicly display, publicly perform, sublicense, and distribute the + * Work and such Derivative Works in Source or Object form. + * + * 3. Grant of Patent License. 
Subject to the terms and conditions of + * this License, each Contributor hereby grants to You a perpetual, + * worldwide, non-exclusive, no-charge, royalty-free, irrevocable + * (except as stated in this section) patent license to make, have made, + * use, offer to sell, sell, import, and otherwise transfer the Work, + * where such license applies only to those patent claims licensable + * by such Contributor that are necessarily infringed by their + * Contribution(s) alone or by combination of their Contribution(s) + * with the Work to which such Contribution(s) was submitted. If You + * institute patent litigation against any entity (including a + * cross-claim or counterclaim in a lawsuit) alleging that the Work + * or a Contribution incorporated within the Work constitutes direct + * or contributory patent infringement, then any patent licenses + * granted to You under this License for that Work shall terminate + * as of the date such litigation is filed. + * + * 4. Redistribution. 
You may reproduce and distribute copies of the + * Work or Derivative Works thereof in any medium, with or without + * modifications, and in Source or Object form, provided that You + * meet the following conditions: + * + * (a) You must give any other recipients of the Work or + * Derivative Works a copy of this License; and + * + * (b) You must cause any modified files to carry prominent notices + * stating that You changed the files; and + * + * (c) You must retain, in the Source form of any Derivative Works + * that You distribute, all copyright, patent, trademark, and + * attribution notices from the Source form of the Work, + * excluding those notices that do not pertain to any part of + * the Derivative Works; and + * + * (d) If the Work includes a "NOTICE" text file as part of its + * distribution, then any Derivative Works that You distribute must + * include a readable copy of the attribution notices contained + * within such NOTICE file, excluding those notices that do not + * pertain to any part of the Derivative Works, in at least one + * of the following places: within a NOTICE text file distributed + * as part of the Derivative Works; within the Source form or + * documentation, if provided along with the Derivative Works; or, + * within a display generated by the Derivative Works, if and + * wherever such third-party notices normally appear. The contents + * of the NOTICE file are for informational purposes only and + * do not modify the License. You may add Your own attribution + * notices within Derivative Works that You distribute, alongside + * or as an addendum to the NOTICE text from the Work, provided + * that such additional attribution notices cannot be construed + * as modifying the License. 
+ * + * You may add Your own copyright statement to Your modifications and + * may provide additional or different license terms and conditions + * for use, reproduction, or distribution of Your modifications, or + * for any such Derivative Works as a whole, provided Your use, + * reproduction, and distribution of the Work otherwise complies with + * the conditions stated in this License. + * + * 5. Submission of Contributions. Unless You explicitly state otherwise, + * any Contribution intentionally submitted for inclusion in the Work + * by You to the Licensor shall be under the terms and conditions of + * this License, without any additional terms or conditions. + * Notwithstanding the above, nothing herein shall supersede or modify + * the terms of any separate license agreement you may have executed + * with Licensor regarding such Contributions. + * + * 6. Trademarks. This License does not grant permission to use the trade + * names, trademarks, service marks, or product names of the Licensor, + * except as required for reasonable and customary use in describing the + * origin of the Work and reproducing the content of the NOTICE file. + * + * 7. Disclaimer of Warranty. Unless required by applicable law or + * agreed to in writing, Licensor provides the Work (and each + * Contributor provides its Contributions) on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied, including, without limitation, any warranties or conditions + * of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + * PARTICULAR PURPOSE. You are solely responsible for determining the + * appropriateness of using or redistributing the Work and assume any + * risks associated with Your exercise of permissions under this License. + * + * 8. Limitation of Liability. 
In no event and under no legal theory, + * whether in tort (including negligence), contract, or otherwise, + * unless required by applicable law (such as deliberate and grossly + * negligent acts) or agreed to in writing, shall any Contributor be + * liable to You for damages, including any direct, indirect, special, + * incidental, or consequential damages of any character arising as a + * result of this License or out of the use or inability to use the + * Work (including but not limited to damages for loss of goodwill, + * work stoppage, computer failure or malfunction, or any and all + * other commercial damages or losses), even if such Contributor + * has been advised of the possibility of such damages. + * + * 9. Accepting Warranty or Additional Liability. While redistributing + * the Work or Derivative Works thereof, You may choose to offer, + * and charge a fee for, acceptance of support, warranty, indemnity, + * or other liability obligations and/or rights consistent with this + * License. However, in accepting such obligations, You may act only + * on Your own behalf and on Your sole responsibility, not on behalf + * of any other Contributor, and only if You agree to indemnify, + * defend, and hold each Contributor harmless for any liability + * incurred by, or claims asserted against, such Contributor by reason + * of your accepting any such warranty or additional liability. + * + * END OF TERMS AND CONDITIONS + * + * APPENDIX: How to apply the Apache License to your work. + * + * To apply the Apache License to your work, attach the following + * boilerplate notice, with the fields enclosed by brackets "[]" + * replaced with your own identifying information. (Don't include + * the brackets!) The text should be enclosed in the appropriate + * comment syntax for the file format. 
We also recommend that a + * file or class name and description of purpose be included on the + * same "printed page" as the copyright notice for easier + * identification within third-party archives. + * + * Copyright 2007 Kasper B. Graversen + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ diff --git a/x-pack/plugin/text-structure/licenses/super-csv-NOTICE.txt b/x-pack/plugin/text-structure/licenses/super-csv-NOTICE.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/TextStructurePlugin.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/TextStructurePlugin.java new file mode 100644 index 0000000000000..ad44ce7b11718 --- /dev/null +++ b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/TextStructurePlugin.java @@ -0,0 +1,55 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ + +package org.elasticsearch.xpack.textstructure; + +import org.elasticsearch.action.ActionRequest; +import org.elasticsearch.action.ActionResponse; +import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver; +import org.elasticsearch.cluster.node.DiscoveryNodes; +import org.elasticsearch.common.settings.ClusterSettings; +import org.elasticsearch.common.settings.IndexScopedSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.settings.SettingsFilter; +import org.elasticsearch.plugins.ActionPlugin; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.rest.RestController; +import org.elasticsearch.rest.RestHandler; +import org.elasticsearch.xpack.core.textstructure.action.FindFileStructureAction; +import org.elasticsearch.xpack.textstructure.rest.RestFindFileStructureAction; +import org.elasticsearch.xpack.textstructure.transport.TransportFindFileStructureAction; + +import java.util.Arrays; +import java.util.List; +import java.util.function.Supplier; + +/** + * This plugin provides APIs for text structure analysis. 
+ * + */ +public class TextStructurePlugin extends Plugin implements ActionPlugin { + + public static final String BASE_PATH = "/_text_structure/"; + + @Override + public List getRestHandlers( + Settings settings, + RestController restController, + ClusterSettings clusterSettings, + IndexScopedSettings indexScopedSettings, + SettingsFilter settingsFilter, + IndexNameExpressionResolver indexNameExpressionResolver, + Supplier nodesInCluster + ) { + return Arrays.asList(new RestFindFileStructureAction()); + } + + @Override + public List> getActions() { + return Arrays.asList(new ActionHandler<>(FindFileStructureAction.INSTANCE, TransportFindFileStructureAction.class)); + } + +} diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/rest/RestFindFileStructureAction.java similarity index 61% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/rest/RestFindFileStructureAction.java index 66e97a9bac9ce..835ca7b40bbd6 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java +++ b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/rest/RestFindFileStructureAction.java @@ -3,7 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.rest; +package org.elasticsearch.xpack.textstructure.rest; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.client.node.NodeClient; @@ -11,18 +11,17 @@ import org.elasticsearch.rest.BaseRestHandler; import org.elasticsearch.rest.RestRequest; import org.elasticsearch.rest.action.RestToXContentListener; -import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; -import org.elasticsearch.xpack.ml.MachineLearning; -import org.elasticsearch.xpack.ml.filestructurefinder.FileStructureFinderManager; +import org.elasticsearch.xpack.core.textstructure.action.FindFileStructureAction; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; +import org.elasticsearch.xpack.textstructure.structurefinder.FileStructureFinderManager; -import java.io.IOException; import java.util.Collections; import java.util.List; import java.util.Set; import java.util.concurrent.TimeUnit; import static org.elasticsearch.rest.RestRequest.Method.POST; +import static org.elasticsearch.xpack.textstructure.TextStructurePlugin.BASE_PATH; public class RestFindFileStructureAction extends BaseRestHandler { @@ -30,34 +29,51 @@ public class RestFindFileStructureAction extends BaseRestHandler { @Override public List routes() { - return Collections.singletonList( - new Route(POST, MachineLearning.BASE_PATH + "find_file_structure") - ); + return Collections.emptyList(); + } + + @Override + public List replacedRoutes() { + return Collections.singletonList(new ReplacedRoute(POST, BASE_PATH + "find_structure", POST, "/_ml/find_file_structure")); } @Override public String getName() { - return "ml_find_file_structure_action"; + return "text_structure_find_structure_action"; } @Override - protected RestChannelConsumer prepareRequest(RestRequest restRequest, NodeClient client) throws IOException { + protected RestChannelConsumer 
prepareRequest(RestRequest restRequest, NodeClient client) { FindFileStructureAction.Request request = new FindFileStructureAction.Request(); - request.setLinesToSample(restRequest.paramAsInt(FindFileStructureAction.Request.LINES_TO_SAMPLE.getPreferredName(), - FileStructureFinderManager.DEFAULT_IDEAL_SAMPLE_LINE_COUNT)); - request.setLineMergeSizeLimit(restRequest.paramAsInt(FindFileStructureAction.Request.LINE_MERGE_SIZE_LIMIT.getPreferredName(), - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT)); - request.setTimeout(TimeValue.parseTimeValue(restRequest.param(FindFileStructureAction.Request.TIMEOUT.getPreferredName()), - DEFAULT_TIMEOUT, FindFileStructureAction.Request.TIMEOUT.getPreferredName())); + request.setLinesToSample( + restRequest.paramAsInt( + FindFileStructureAction.Request.LINES_TO_SAMPLE.getPreferredName(), + FileStructureFinderManager.DEFAULT_IDEAL_SAMPLE_LINE_COUNT + ) + ); + request.setLineMergeSizeLimit( + restRequest.paramAsInt( + FindFileStructureAction.Request.LINE_MERGE_SIZE_LIMIT.getPreferredName(), + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT + ) + ); + request.setTimeout( + TimeValue.parseTimeValue( + restRequest.param(FindFileStructureAction.Request.TIMEOUT.getPreferredName()), + DEFAULT_TIMEOUT, + FindFileStructureAction.Request.TIMEOUT.getPreferredName() + ) + ); request.setCharset(restRequest.param(FindFileStructureAction.Request.CHARSET.getPreferredName())); request.setFormat(restRequest.param(FindFileStructureAction.Request.FORMAT.getPreferredName())); request.setColumnNames(restRequest.paramAsStringArray(FindFileStructureAction.Request.COLUMN_NAMES.getPreferredName(), null)); request.setHasHeaderRow(restRequest.paramAsBoolean(FindFileStructureAction.Request.HAS_HEADER_ROW.getPreferredName(), null)); request.setDelimiter(restRequest.param(FindFileStructureAction.Request.DELIMITER.getPreferredName())); request.setQuote(restRequest.param(FindFileStructureAction.Request.QUOTE.getPreferredName())); - 
request.setShouldTrimFields(restRequest.paramAsBoolean(FindFileStructureAction.Request.SHOULD_TRIM_FIELDS.getPreferredName(), - null)); + request.setShouldTrimFields( + restRequest.paramAsBoolean(FindFileStructureAction.Request.SHOULD_TRIM_FIELDS.getPreferredName(), null) + ); request.setGrokPattern(restRequest.param(FindFileStructureAction.Request.GROK_PATTERN.getPreferredName())); request.setTimestampFormat(restRequest.param(FindFileStructureAction.Request.TIMESTAMP_FORMAT.getPreferredName())); request.setTimestampField(restRequest.param(FindFileStructureAction.Request.TIMESTAMP_FIELD.getPreferredName())); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedFileStructureFinder.java similarity index 74% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedFileStructureFinder.java index e267705b8a641..d91980b7f567f 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java +++ b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedFileStructureFinder.java @@ -3,12 +3,12 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import org.apache.logging.log4j.message.ParameterizedMessage; import org.elasticsearch.common.collect.Tuple; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FieldStats; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import org.supercsv.exception.SuperCsvException; import org.supercsv.io.CsvListReader; import org.supercsv.prefs.CsvPreference; @@ -38,11 +38,16 @@ public class DelimitedFileStructureFinder implements FileStructureFinder { private final List sampleMessages; private final FileStructure structure; - static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List explanation, String sample, String charsetName, - Boolean hasByteOrderMarker, CsvPreference csvPreference, - boolean trimFields, FileStructureOverrides overrides, - TimeoutChecker timeoutChecker) - throws IOException { + static DelimitedFileStructureFinder makeDelimitedFileStructureFinder( + List explanation, + String sample, + String charsetName, + Boolean hasByteOrderMarker, + CsvPreference csvPreference, + boolean trimFields, + FileStructureOverrides overrides, + TimeoutChecker timeoutChecker + ) throws IOException { Tuple>, List> parsed = readRows(sample, csvPreference, timeoutChecker); List> rows = parsed.v1(); @@ -58,8 +63,15 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List overriddenColumnNames = overrides.getColumnNames(); if (overriddenColumnNames != null) { if (overriddenColumnNames.size() != header.length) { - throw new IllegalArgumentException("[" + overriddenColumnNames.size() + "] column names were specified [" + - String.join(",", overriddenColumnNames) + "] but there are [" + header.length + "] columns in the sample"); + 
throw new IllegalArgumentException( + "[" + + overriddenColumnNames.size() + + "] column names were specified [" + + String.join(",", overriddenColumnNames) + + "] but there are [" + + header.length + + "] columns in the sample" + ); } columnNames = overriddenColumnNames.toArray(new String[0]); } else { @@ -86,8 +98,11 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List sampleRecord = new LinkedHashMap<>(); - Util.filterListToMap(sampleRecord, columnNames, - trimFields ? row.stream().map(field -> (field == null) ? null : field.trim()).collect(Collectors.toList()) : row); + Util.filterListToMap( + sampleRecord, + columnNames, + trimFields ? row.stream().map(field -> (field == null) ? null : field.trim()).collect(Collectors.toList()) : row + ); sampleRecords.add(sampleRecord); sampleMessages.add(String.join("\n", sampleLines.subList(prevMessageEndLineNumber, lineNumber))); maxLinesPerMessage = Math.max(maxLinesPerMessage, lineNumber - prevMessageEndLineNumber); @@ -99,8 +114,8 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List, SortedMap> mappingsAndFieldStats = - FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords, timeoutChecker); + Tuple, SortedMap> mappingsAndFieldStats = FileStructureUtils + .guessMappingsAndCalculateFieldStats(explanation, sampleRecords, timeoutChecker); SortedMap fieldMappings = mappingsAndFieldStats.v1(); @@ -110,8 +125,7 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List csvProcessorSettings = makeCsvProcessorSettings("message", columnNamesList, delimiter, quoteChar, trimFields); - FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.DELIMITED) - .setCharset(charsetName) + FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.DELIMITED).setCharset(charsetName) .setHasByteOrderMarker(hasByteOrderMarker) .setSampleStart(preamble) 
.setNumLinesAnalyzed(lineNumbers.get(lineNumbers.size() - 1)) @@ -125,21 +139,29 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List - optQuotePattern + column.replace(quote, twoQuotes).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1") + optQuotePattern) - .collect(Collectors.joining(delimiterPattern))); + structureBuilder.setExcludeLinesPattern( + "^" + + Arrays.stream(header) + .map( + column -> optQuotePattern + column.replace(quote, twoQuotes).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1") + + optQuotePattern + ) + .collect(Collectors.joining(delimiterPattern)) + ); } if (trimFields) { structureBuilder.setShouldTrimFields(true); } - Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides, - timeoutChecker); + Tuple timeField = FileStructureUtils.guessTimestampField( + explanation, + sampleRecords, + overrides, + timeoutChecker + ); if (timeField != null) { boolean needClientTimeZone = timeField.v2().hasTimezoneDependentParsing(); @@ -148,28 +170,66 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List>, List> readRows(String sample, CsvPrefe return new Tuple<>(rows, lineNumbers); } - static Tuple findHeaderFromSample(List explanation, List> rows, - FileStructureOverrides overrides) { + static Tuple findHeaderFromSample( + List explanation, + List> rows, + FileStructureOverrides overrides + ) { assert rows.isEmpty() == false; @@ -250,8 +313,12 @@ static Tuple findHeaderFromSample(List explanation, L if (isHeaderInFile && overriddenColumnNames == null) { String duplicateValue = findDuplicateNonEmptyValues(firstRow); if (duplicateValue != null) { - throw new IllegalArgumentException("Sample specified to contain a header row, " + - "but the first row contains duplicate values: [" + duplicateValue + "]"); + throw new IllegalArgumentException( + "Sample specified to contain a header row, " + + "but the first row contains duplicate values: [" + + duplicateValue + + "]" + ); } } 
explanation.add("Sample specified to " + (isHeaderInFile ? "contain" : "not contain") + " a header row"); @@ -308,19 +375,22 @@ private static boolean isFirstRowUnusual(List explanation, List (double) otherRow.length()) + DoubleSummaryStatistics otherRowStats = otherRowStrs.stream() + .mapToDouble(otherRow -> (double) otherRow.length()) .collect(DoubleSummaryStatistics::new, DoubleSummaryStatistics::accept, DoubleSummaryStatistics::combine); double otherLengthRange = otherRowStats.getMax() - otherRowStats.getMin(); - if (firstRowLength < otherRowStats.getMin() - otherLengthRange / 10.0 || - firstRowLength > otherRowStats.getMax() + otherLengthRange / 10.0) { - explanation.add("First row is unusual based on length test: [" + firstRowLength + "] and [" + - toNiceString(otherRowStats) + "]"); + if (firstRowLength < otherRowStats.getMin() - otherLengthRange / 10.0 + || firstRowLength > otherRowStats.getMax() + otherLengthRange / 10.0) { + explanation.add( + "First row is unusual based on length test: [" + firstRowLength + "] and [" + toNiceString(otherRowStats) + "]" + ); return true; } - explanation.add("First row is not unusual based on length test: [" + firstRowLength + "] and [" + - toNiceString(otherRowStats) + "]"); + explanation.add( + "First row is not unusual based on length test: [" + firstRowLength + "] and [" + toNiceString(otherRowStats) + "]" + ); // Check edit distances between short fields @@ -329,7 +399,8 @@ private static boolean isFirstRowUnusual(List explanation, List (double) levenshteinFieldwiseCompareRows(firstRow, otherRow, shortFieldMask)) .collect(DoubleSummaryStatistics::new, DoubleSummaryStatistics::accept, DoubleSummaryStatistics::combine); @@ -339,28 +410,44 @@ private static boolean isFirstRowUnusual(List explanation, List otherRowStats.getAverage() * 1.2) { - explanation.add("First row is unusual based on Levenshtein test [" + toNiceString(firstRowStats) + - "] and [" + toNiceString(otherRowStats) + "]"); + explanation.add( + "First row 
is unusual based on Levenshtein test [" + + toNiceString(firstRowStats) + + "] and [" + + toNiceString(otherRowStats) + + "]" + ); return true; } - explanation.add("First row is not unusual based on Levenshtein test [" + toNiceString(firstRowStats) + - "] and [" + toNiceString(otherRowStats) + "]"); + explanation.add( + "First row is not unusual based on Levenshtein test [" + + toNiceString(firstRowStats) + + "] and [" + + toNiceString(otherRowStats) + + "]" + ); return false; } private static String toNiceString(DoubleSummaryStatistics stats) { - return String.format(Locale.ROOT, "count=%d, min=%f, average=%f, max=%f", stats.getCount(), stats.getMin(), stats.getAverage(), - stats.getMax()); + return String.format( + Locale.ROOT, + "count=%d, min=%f, average=%f, max=%f", + stats.getCount(), + stats.getMin(), + stats.getAverage(), + stats.getMax() + ); } /** @@ -376,8 +463,10 @@ static BitSet makeShortFieldMask(List> rows, int longFieldThreshold int maxLength = rows.stream().map(List::size).max(Integer::compareTo).get(); for (int index = 0; index < maxLength; ++index) { final int i = index; - shortFieldMask.set(i, - rows.stream().allMatch(row -> i >= row.size() || row.get(i) == null || row.get(i).length() < longFieldThreshold)); + shortFieldMask.set( + i, + rows.stream().allMatch(row -> i >= row.size() || row.get(i) == null || row.get(i).length() < longFieldThreshold) + ); } return shortFieldMask; @@ -410,8 +499,10 @@ static int levenshteinFieldwiseCompareRows(List firstRow, List s int result = 0; for (int index = fieldMask.nextSetBit(0); index >= 0; index = fieldMask.nextSetBit(index + 1)) { - result += levenshteinDistance((index < firstRow.size()) ? firstRow.get(index) : "", - (index < secondRow.size()) ? secondRow.get(index) : ""); + result += levenshteinDistance( + (index < firstRow.size()) ? firstRow.get(index) : "", + (index < secondRow.size()) ? 
secondRow.get(index) : "" + ); } return result; @@ -484,25 +575,36 @@ static boolean lineHasUnescapedQuote(String line, CsvPreference csvPreference) { char quote = csvPreference.getQuoteChar(); String lineWithEscapedQuotesRemoved = line.replace(String.valueOf(quote) + quote, ""); for (int index = 1; index < lineWithEscapedQuotesRemoved.length() - 1; ++index) { - if (lineWithEscapedQuotesRemoved.charAt(index) == quote && - lineWithEscapedQuotesRemoved.codePointAt(index - 1) != csvPreference.getDelimiterChar() && - lineWithEscapedQuotesRemoved.codePointAt(index + 1) != csvPreference.getDelimiterChar()) { + if (lineWithEscapedQuotesRemoved.charAt(index) == quote + && lineWithEscapedQuotesRemoved.codePointAt(index - 1) != csvPreference.getDelimiterChar() + && lineWithEscapedQuotesRemoved.codePointAt(index + 1) != csvPreference.getDelimiterChar()) { return true; } } return false; } - static boolean canCreateFromSample(List explanation, String sample, int minFieldsPerRow, CsvPreference csvPreference, - String formatName, double allowedFractionOfBadLines) { + static boolean canCreateFromSample( + List explanation, + String sample, + int minFieldsPerRow, + CsvPreference csvPreference, + String formatName, + double allowedFractionOfBadLines + ) { // Logstash's CSV parser won't tolerate fields where just part of the // value is quoted, whereas SuperCSV will, hence this extra check String[] sampleLines = sample.split("\n"); for (String sampleLine : sampleLines) { if (lineHasUnescapedQuote(sampleLine, csvPreference)) { - explanation.add("Not " + formatName + - " because a line has an unescaped quote that is not at the beginning or end of a field: [" + sampleLine + "]"); + explanation.add( + "Not " + + formatName + + " because a line has an unescaped quote that is not at the beginning or end of a field: [" + + sampleLine + + "]" + ); return false; } } @@ -524,8 +626,15 @@ static boolean canCreateFromSample(List explanation, String sample, int if (fieldsInFirstRow < 0) { 
fieldsInFirstRow = fieldsInThisRow; if (fieldsInFirstRow < minFieldsPerRow) { - explanation.add("Not " + formatName + " because the first row has fewer than [" + minFieldsPerRow + - "] fields: [" + fieldsInFirstRow + "]"); + explanation.add( + "Not " + + formatName + + " because the first row has fewer than [" + + minFieldsPerRow + + "] fields: [" + + fieldsInFirstRow + + "]" + ); return false; } fieldsInLastRow = fieldsInFirstRow; @@ -549,12 +658,16 @@ static boolean canCreateFromSample(List explanation, String sample, int // We should only allow a certain percentage of ill formatted rows // as it may have and down stream effects if (illFormattedRows.size() > Math.ceil(allowedFractionOfBadLines * totalNumberOfRows)) { - explanation.add(new ParameterizedMessage( - "Not {} because {} or more rows did not have the same number of fields as the first row ({}). Bad rows {}", - formatName, - illFormattedRows.size(), - fieldsInFirstRow, - illFormattedRows).getFormattedMessage()); + explanation.add( + new ParameterizedMessage( + "Not {} because {} or more rows did not have the same number of fields " + + "as the first row ({}). Bad rows {}", + formatName, + illFormattedRows.size(), + fieldsInFirstRow, + illFormattedRows + ).getFormattedMessage() + ); return false; } continue; @@ -564,8 +677,15 @@ static boolean canCreateFromSample(List explanation, String sample, int } if (fieldsInLastRow > fieldsInFirstRow) { - explanation.add("Not " + formatName + " because last row has more fields than first row: [" + fieldsInFirstRow + - "] and [" + fieldsInLastRow + "]"); + explanation.add( + "Not " + + formatName + + " because last row has more fields than first row: [" + + fieldsInFirstRow + + "] and [" + + fieldsInLastRow + + "]" + ); return false; } if (fieldsInLastRow < fieldsInFirstRow) { @@ -627,9 +747,16 @@ static Map makeCsvProcessorSettings(String field, List t * field. It doesn't work when fields prior to the chosen field contain newlines in some of the * records. 
*/ - static String makeMultilineStartPattern(List explanation, List columnNames, int maxLinesPerMessage, - String delimiterPattern, String quotePattern, Map fieldMappings, - String timeFieldName, TimestampFormatFinder timeFieldFormat) { + static String makeMultilineStartPattern( + List explanation, + List columnNames, + int maxLinesPerMessage, + String delimiterPattern, + String quotePattern, + Map fieldMappings, + String timeFieldName, + TimestampFormatFinder timeFieldFormat + ) { assert columnNames.isEmpty() == false; assert maxLinesPerMessage > 0; @@ -678,8 +805,13 @@ static String makeMultilineStartPattern(List explanation, List c break; } if (columnPattern != null) { - builder.append("(?:").append(columnPattern).append("|") - .append(quotePattern).append(columnPattern).append(quotePattern).append(")") + builder.append("(?:") + .append(columnPattern) + .append("|") + .append(quotePattern) + .append(columnPattern) + .append(quotePattern) + .append(")") .append(delimiterPattern); explanation.add("Created a multi-line start pattern based on [" + type + "] column [" + columnName + "]"); return builder.toString(); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedFileStructureFinderFactory.java similarity index 73% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedFileStructureFinderFactory.java index fc316dd283fcd..4ddb4324153d8 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java +++ 
b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedFileStructureFinderFactory.java @@ -3,9 +3,9 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import org.supercsv.prefs.CsvPreference; import java.io.IOException; @@ -28,8 +28,12 @@ public class DelimitedFileStructureFinderFactory implements FileStructureFinderF DelimitedFileStructureFinderFactory makeSimilar(Character quote, Boolean trimFields) { - return new DelimitedFileStructureFinderFactory((char) csvPreference.getDelimiterChar(), - (quote == null) ? csvPreference.getQuoteChar() : quote, minFieldsPerRow, (trimFields == null) ? this.trimFields : trimFields); + return new DelimitedFileStructureFinderFactory( + (char) csvPreference.getDelimiterChar(), + (quote == null) ? csvPreference.getQuoteChar() : quote, + minFieldsPerRow, + (trimFields == null) ? 
this.trimFields : trimFields + ); } @Override @@ -59,20 +63,36 @@ public boolean canCreateFromSample(List explanation, String sample, doub formatName = Character.getName(csvPreference.getDelimiterChar()).toLowerCase(Locale.ROOT) + " delimited values"; break; } - return DelimitedFileStructureFinder.canCreateFromSample(explanation, + return DelimitedFileStructureFinder.canCreateFromSample( + explanation, sample, minFieldsPerRow, csvPreference, formatName, - allowedFractionOfBadLines); + allowedFractionOfBadLines + ); } @Override - public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - int lineMergeSizeLimit, FileStructureOverrides overrides, TimeoutChecker timeoutChecker) - throws IOException { + public FileStructureFinder createFromSample( + List explanation, + String sample, + String charsetName, + Boolean hasByteOrderMarker, + int lineMergeSizeLimit, + FileStructureOverrides overrides, + TimeoutChecker timeoutChecker + ) throws IOException { CsvPreference adjustedCsvPreference = new CsvPreference.Builder(csvPreference).maxLinesPerRow(lineMergeSizeLimit).build(); - return DelimitedFileStructureFinder.makeDelimitedFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, - adjustedCsvPreference, trimFields, overrides, timeoutChecker); + return DelimitedFileStructureFinder.makeDelimitedFileStructureFinder( + explanation, + sample, + charsetName, + hasByteOrderMarker, + adjustedCsvPreference, + trimFields, + overrides, + timeoutChecker + ); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculator.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FieldStatsCalculator.java similarity index 87% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculator.java rename to 
x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FieldStatsCalculator.java index 39bf613165e2e..313dc564d2385 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculator.java +++ b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FieldStatsCalculator.java @@ -3,12 +3,12 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import org.elasticsearch.common.time.DateFormatter; import org.elasticsearch.common.time.DateFormatters; import org.elasticsearch.index.mapper.DateFieldMapper; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FieldStats; import java.time.Instant; import java.util.ArrayList; @@ -96,8 +96,10 @@ public void accept(Collection fieldValues) { } catch (NumberFormatException e) { // This should not happen in the usual context this class is used in within the file structure finder, // as "double" should be big enough to hold any value that the file structure finder considers numeric - throw new IllegalArgumentException("Field with numeric mapping [" + fieldValue + "] could not be parsed as type double", - e); + throw new IllegalArgumentException( + "Field with numeric mapping [" + fieldValue + "] could not be parsed as type double", + e + ); } } else { countsByStringValue.compute(fieldValue, (k, v) -> (v == null) ? 
1 : (1 + v)); @@ -129,8 +131,15 @@ public FieldStats calculate(int numTopHits) { return new FieldStats(count, 0, Collections.emptyList()); } else { assert count > 0; - return new FieldStats(count, countsByNumericValue.size(), countsByNumericValue.firstKey(), countsByNumericValue.lastKey(), - calculateMean(), calculateMedian(), findNumericTopHits(numTopHits)); + return new FieldStats( + count, + countsByNumericValue.size(), + countsByNumericValue.firstKey(), + countsByNumericValue.lastKey(), + calculateMean(), + calculateMedian(), + findNumericTopHits(numTopHits) + ); } } else { return new FieldStats(count, countsByStringValue.size(), earliestTimeString, latestTimeString, findStringTopHits(numTopHits)); @@ -221,8 +230,12 @@ Double calculateMedian() { List> findNumericTopHits(int numTopHits) { assert countsByNumericValue != null; - return findTopHits(numTopHits, countsByNumericValue, Comparator.comparing(Map.Entry::getKey), - FieldStats::toIntegerIfInteger); + return findTopHits( + numTopHits, + countsByNumericValue, + Comparator.comparing(Map.Entry::getKey), + FieldStats::toIntegerIfInteger + ); } List> findStringTopHits(int numTopHits) { @@ -232,13 +245,18 @@ List> findStringTopHits(int numTopHits) { /** * Order by descending count, with a secondary sort to ensure reproducibility of results. 
*/ - private static List> findTopHits(int numTopHits, Map countsByValue, - Comparator> secondarySort, - Function outputMapper) { - - List> sortedByCount = countsByValue.entrySet().stream() + private static List> findTopHits( + int numTopHits, + Map countsByValue, + Comparator> secondarySort, + Function outputMapper + ) { + + List> sortedByCount = countsByValue.entrySet() + .stream() .sorted(Comparator.comparing(Map.Entry::getValue, Comparator.reverseOrder()).thenComparing(secondarySort)) - .limit(numTopHits).collect(Collectors.toList()); + .limit(numTopHits) + .collect(Collectors.toList()); List> topHits = new ArrayList<>(sortedByCount.size()); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinder.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureFinder.java similarity index 81% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinder.java rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureFinder.java index c09978b6bcb0e..5815f3dc8c9f4 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinder.java +++ b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureFinder.java @@ -3,9 +3,9 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import java.util.List; diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureFinderFactory.java similarity index 85% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureFinderFactory.java index 1fc79e146e7e2..2b4db024d62c1 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java +++ b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureFinderFactory.java @@ -3,9 +3,9 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import java.util.List; @@ -46,7 +46,13 @@ public interface FileStructureFinderFactory { * @return A {@link FileStructureFinder} object suitable for determining the structure of the supplied sample. * @throws Exception if something goes wrong during creation. 
*/ - FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - int lineMergeSizeLimit, FileStructureOverrides overrides, - TimeoutChecker timeoutChecker) throws Exception; + FileStructureFinder createFromSample( + List explanation, + String sample, + String charsetName, + Boolean hasByteOrderMarker, + int lineMergeSizeLimit, + FileStructureOverrides overrides, + TimeoutChecker timeoutChecker + ) throws Exception; } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureFinderManager.java similarity index 52% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureFinderManager.java index 9f29519f662e4..22a3e113522bb 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java +++ b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureFinderManager.java @@ -3,7 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetMatch; @@ -11,7 +11,7 @@ import org.elasticsearch.ElasticsearchTimeoutException; import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.common.unit.TimeValue; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import java.io.BufferedInputStream; import java.io.BufferedReader; @@ -46,243 +46,245 @@ public final class FileStructureFinderManager { public static final int DEFAULT_LINE_MERGE_SIZE_LIMIT = 10000; static final Set FILEBEAT_SUPPORTED_ENCODINGS = Set.of( - "866", - "ansi_x3.4-1968", - "arabic", - "ascii", - "asmo-708", - "big5", - "big5-hkscs", - "chinese", - "cn-big5", - "cp1250", - "cp1251", - "cp1252", - "cp1253", - "cp1254", - "cp1255", - "cp1256", - "cp1257", - "cp1258", - "cp819", - "cp866", - "csbig5", - "cseuckr", - "cseucpkdfmtjapanese", - "csgb2312", - "csibm866", - "csiso2022jp", - "csiso2022kr", - "csiso58gb231280", - "csiso88596e", - "csiso88596i", - "csiso88598e", - "csiso88598i", - "csisolatin1", - "csisolatin2", - "csisolatin3", - "csisolatin4", - "csisolatin5", - "csisolatin6", - "csisolatin9", - "csisolatinarabic", - "csisolatincyrillic", - "csisolatingreek", - "csisolatinhebrew", - "cskoi8r", - "csksc56011987", - "csmacintosh", - "csshiftjis", - "cyrillic", - "dos-874", - "ecma-114", - "ecma-118", - "elot_928", - "euc-jp", - "euc-kr", - "gb18030", - "gb2312", - "gb_2312", - "gb_2312-80", - "gbk", - "greek", - "greek8", - "hebrew", - "hz-gb-2312", - "ibm819", - "ibm866", - "iso-2022-cn", - "iso-2022-cn-ext", - "iso-2022-jp", - "iso-2022-kr", - "iso-8859-1", - "iso-8859-10", - "iso-8859-11", - "iso-8859-13", - "iso-8859-14", - "iso-8859-15", - "iso-8859-16", - "iso-8859-2", - "iso-8859-3", - "iso-8859-4", - "iso-8859-5", - 
"iso-8859-6", - "iso-8859-6-e", - "iso-8859-6-i", - "iso-8859-7", - "iso-8859-8", - "iso-8859-8-e", - "iso-8859-8-i", - "iso-8859-9", - "iso-ir-100", - "iso-ir-101", - "iso-ir-109", - "iso-ir-110", - "iso-ir-126", - "iso-ir-127", - "iso-ir-138", - "iso-ir-144", - "iso-ir-148", - "iso-ir-149", - "iso-ir-157", - "iso-ir-58", - "iso8859-1", - "iso8859-10", - "iso8859-11", - "iso8859-13", - "iso8859-14", - "iso8859-15", - "iso8859-2", - "iso8859-3", - "iso8859-4", - "iso8859-5", - "iso8859-6", - "iso8859-6e", - "iso8859-6i", - "iso8859-7", - "iso8859-8", - "iso8859-8e", - "iso8859-8i", - "iso8859-9", - "iso88591", - "iso885910", - "iso885911", - "iso885913", - "iso885914", - "iso885915", - "iso88592", - "iso88593", - "iso88594", - "iso88595", - "iso88596", - "iso88597", - "iso88598", - "iso88599", - "iso_8859-1", - "iso_8859-15", - "iso_8859-1:1987", - "iso_8859-2", - "iso_8859-2:1987", - "iso_8859-3", - "iso_8859-3:1988", - "iso_8859-4", - "iso_8859-4:1988", - "iso_8859-5", - "iso_8859-5:1988", - "iso_8859-6", - "iso_8859-6:1987", - "iso_8859-7", - "iso_8859-7:1987", - "iso_8859-8", - "iso_8859-8:1988", - "iso_8859-9", - "iso_8859-9:1989", - "koi", - "koi8", - "koi8-r", - "koi8-ru", - "koi8-u", - "koi8_r", - "korean", - "ks_c_5601-1987", - "ks_c_5601-1989", - "ksc5601", - "ksc_5601", - "l1", - "l2", - "l3", - "l4", - "l5", - "l6", - "l9", - "latin1", - "latin2", - "latin3", - "latin4", - "latin5", - "latin6", - "logical", - "mac", - "macintosh", - "ms932", - "ms_kanji", - "shift-jis", - "shift_jis", - "sjis", - "sun_eu_greek", - "tis-620", - "unicode-1-1-utf-8", - "us-ascii", - "utf-16", - "utf-16-bom", - "utf-16be", - "utf-16be-bom", - "utf-16le", - "utf-16le-bom", - "utf-8", - "utf8", - "visual", - "windows-1250", - "windows-1251", - "windows-1252", - "windows-1253", - "windows-1254", - "windows-1255", - "windows-1256", - "windows-1257", - "windows-1258", - "windows-31j", - "windows-874", - "windows-949", - "x-cp1250", - "x-cp1251", - "x-cp1252", - "x-cp1253", - 
"x-cp1254", - "x-cp1255", - "x-cp1256", - "x-cp1257", - "x-cp1258", - "x-euc-jp", - "x-gbk", - "x-mac-cyrillic", - "x-mac-roman", - "x-mac-ukrainian", - "x-sjis", - "x-x-big5"); + "866", + "ansi_x3.4-1968", + "arabic", + "ascii", + "asmo-708", + "big5", + "big5-hkscs", + "chinese", + "cn-big5", + "cp1250", + "cp1251", + "cp1252", + "cp1253", + "cp1254", + "cp1255", + "cp1256", + "cp1257", + "cp1258", + "cp819", + "cp866", + "csbig5", + "cseuckr", + "cseucpkdfmtjapanese", + "csgb2312", + "csibm866", + "csiso2022jp", + "csiso2022kr", + "csiso58gb231280", + "csiso88596e", + "csiso88596i", + "csiso88598e", + "csiso88598i", + "csisolatin1", + "csisolatin2", + "csisolatin3", + "csisolatin4", + "csisolatin5", + "csisolatin6", + "csisolatin9", + "csisolatinarabic", + "csisolatincyrillic", + "csisolatingreek", + "csisolatinhebrew", + "cskoi8r", + "csksc56011987", + "csmacintosh", + "csshiftjis", + "cyrillic", + "dos-874", + "ecma-114", + "ecma-118", + "elot_928", + "euc-jp", + "euc-kr", + "gb18030", + "gb2312", + "gb_2312", + "gb_2312-80", + "gbk", + "greek", + "greek8", + "hebrew", + "hz-gb-2312", + "ibm819", + "ibm866", + "iso-2022-cn", + "iso-2022-cn-ext", + "iso-2022-jp", + "iso-2022-kr", + "iso-8859-1", + "iso-8859-10", + "iso-8859-11", + "iso-8859-13", + "iso-8859-14", + "iso-8859-15", + "iso-8859-16", + "iso-8859-2", + "iso-8859-3", + "iso-8859-4", + "iso-8859-5", + "iso-8859-6", + "iso-8859-6-e", + "iso-8859-6-i", + "iso-8859-7", + "iso-8859-8", + "iso-8859-8-e", + "iso-8859-8-i", + "iso-8859-9", + "iso-ir-100", + "iso-ir-101", + "iso-ir-109", + "iso-ir-110", + "iso-ir-126", + "iso-ir-127", + "iso-ir-138", + "iso-ir-144", + "iso-ir-148", + "iso-ir-149", + "iso-ir-157", + "iso-ir-58", + "iso8859-1", + "iso8859-10", + "iso8859-11", + "iso8859-13", + "iso8859-14", + "iso8859-15", + "iso8859-2", + "iso8859-3", + "iso8859-4", + "iso8859-5", + "iso8859-6", + "iso8859-6e", + "iso8859-6i", + "iso8859-7", + "iso8859-8", + "iso8859-8e", + "iso8859-8i", + "iso8859-9", + 
"iso88591", + "iso885910", + "iso885911", + "iso885913", + "iso885914", + "iso885915", + "iso88592", + "iso88593", + "iso88594", + "iso88595", + "iso88596", + "iso88597", + "iso88598", + "iso88599", + "iso_8859-1", + "iso_8859-15", + "iso_8859-1:1987", + "iso_8859-2", + "iso_8859-2:1987", + "iso_8859-3", + "iso_8859-3:1988", + "iso_8859-4", + "iso_8859-4:1988", + "iso_8859-5", + "iso_8859-5:1988", + "iso_8859-6", + "iso_8859-6:1987", + "iso_8859-7", + "iso_8859-7:1987", + "iso_8859-8", + "iso_8859-8:1988", + "iso_8859-9", + "iso_8859-9:1989", + "koi", + "koi8", + "koi8-r", + "koi8-ru", + "koi8-u", + "koi8_r", + "korean", + "ks_c_5601-1987", + "ks_c_5601-1989", + "ksc5601", + "ksc_5601", + "l1", + "l2", + "l3", + "l4", + "l5", + "l6", + "l9", + "latin1", + "latin2", + "latin3", + "latin4", + "latin5", + "latin6", + "logical", + "mac", + "macintosh", + "ms932", + "ms_kanji", + "shift-jis", + "shift_jis", + "sjis", + "sun_eu_greek", + "tis-620", + "unicode-1-1-utf-8", + "us-ascii", + "utf-16", + "utf-16-bom", + "utf-16be", + "utf-16be-bom", + "utf-16le", + "utf-16le-bom", + "utf-8", + "utf8", + "visual", + "windows-1250", + "windows-1251", + "windows-1252", + "windows-1253", + "windows-1254", + "windows-1255", + "windows-1256", + "windows-1257", + "windows-1258", + "windows-31j", + "windows-874", + "windows-949", + "x-cp1250", + "x-cp1251", + "x-cp1252", + "x-cp1253", + "x-cp1254", + "x-cp1255", + "x-cp1256", + "x-cp1257", + "x-cp1258", + "x-euc-jp", + "x-gbk", + "x-mac-cyrillic", + "x-mac-roman", + "x-mac-ukrainian", + "x-sjis", + "x-x-big5" + ); /** * These need to be ordered so that the more generic formats come after the more specific ones */ private static final List ORDERED_STRUCTURE_FACTORIES = List.of( - // NDJSON will often also be valid (although utterly weird) CSV, so NDJSON must come before CSV - new NdJsonFileStructureFinderFactory(), - new XmlFileStructureFinderFactory(), - new DelimitedFileStructureFinderFactory(',', '"', 2, false), - new 
DelimitedFileStructureFinderFactory('\t', '"', 2, false), - new DelimitedFileStructureFinderFactory(';', '"', 4, false), - new DelimitedFileStructureFinderFactory('|', '"', 5, true), - new TextLogFileStructureFinderFactory()); + // NDJSON will often also be valid (although utterly weird) CSV, so NDJSON must come before CSV + new NdJsonFileStructureFinderFactory(), + new XmlFileStructureFinderFactory(), + new DelimitedFileStructureFinderFactory(',', '"', 2, false), + new DelimitedFileStructureFinderFactory('\t', '"', 2, false), + new DelimitedFileStructureFinderFactory(';', '"', 4, false), + new DelimitedFileStructureFinderFactory('|', '"', 5, true), + new TextLogFileStructureFinderFactory() + ); private static final int BUFFER_SIZE = 8192; @@ -296,8 +298,8 @@ public FileStructureFinderManager(ScheduledExecutorService scheduler) { this.scheduler = Objects.requireNonNull(scheduler); } - public FileStructureFinder findFileStructure(Integer idealSampleLineCount, Integer lineMergeSizeLimit, - InputStream fromFile) throws Exception { + public FileStructureFinder findFileStructure(Integer idealSampleLineCount, Integer lineMergeSizeLimit, InputStream fromFile) + throws Exception { return findFileStructure(idealSampleLineCount, lineMergeSizeLimit, fromFile, FileStructureOverrides.EMPTY_OVERRIDES, null); } @@ -319,21 +321,47 @@ public FileStructureFinder findFileStructure(Integer idealSampleLineCount, Integ * @return A {@link FileStructureFinder} object from which the structure and messages can be queried. * @throws Exception A variety of problems could occur at various stages of the structure finding process. */ - public FileStructureFinder findFileStructure(Integer idealSampleLineCount, Integer lineMergeSizeLimit, InputStream fromFile, - FileStructureOverrides overrides, TimeValue timeout) throws Exception { - return findFileStructure(new ArrayList<>(), (idealSampleLineCount == null) ? DEFAULT_IDEAL_SAMPLE_LINE_COUNT : idealSampleLineCount, - (lineMergeSizeLimit == null) ? 
DEFAULT_LINE_MERGE_SIZE_LIMIT : lineMergeSizeLimit, fromFile, overrides, timeout); + public FileStructureFinder findFileStructure( + Integer idealSampleLineCount, + Integer lineMergeSizeLimit, + InputStream fromFile, + FileStructureOverrides overrides, + TimeValue timeout + ) throws Exception { + return findFileStructure( + new ArrayList<>(), + (idealSampleLineCount == null) ? DEFAULT_IDEAL_SAMPLE_LINE_COUNT : idealSampleLineCount, + (lineMergeSizeLimit == null) ? DEFAULT_LINE_MERGE_SIZE_LIMIT : lineMergeSizeLimit, + fromFile, + overrides, + timeout + ); } - public FileStructureFinder findFileStructure(List explanation, int idealSampleLineCount, int lineMergeSizeLimit, - InputStream fromFile) throws Exception { - return findFileStructure(explanation, idealSampleLineCount, lineMergeSizeLimit, fromFile, FileStructureOverrides.EMPTY_OVERRIDES, - null); + public FileStructureFinder findFileStructure( + List explanation, + int idealSampleLineCount, + int lineMergeSizeLimit, + InputStream fromFile + ) throws Exception { + return findFileStructure( + explanation, + idealSampleLineCount, + lineMergeSizeLimit, + fromFile, + FileStructureOverrides.EMPTY_OVERRIDES, + null + ); } - public FileStructureFinder findFileStructure(List explanation, int idealSampleLineCount, int lineMergeSizeLimit, - InputStream fromFile, FileStructureOverrides overrides, - TimeValue timeout) throws Exception { + public FileStructureFinder findFileStructure( + List explanation, + int idealSampleLineCount, + int lineMergeSizeLimit, + InputStream fromFile, + FileStructureOverrides overrides, + TimeValue timeout + ) throws Exception { try (TimeoutChecker timeoutChecker = new TimeoutChecker("structure analysis", timeout, scheduler)) { @@ -349,17 +377,30 @@ public FileStructureFinder findFileStructure(List explanation, int ideal sampleReader = charsetMatch.getReader(); } - Tuple sampleInfo = sampleFile(sampleReader, charsetName, MIN_SAMPLE_LINE_COUNT, - Math.max(MIN_SAMPLE_LINE_COUNT, 
idealSampleLineCount), timeoutChecker); - - return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2(), lineMergeSizeLimit, overrides, - timeoutChecker); + Tuple sampleInfo = sampleFile( + sampleReader, + charsetName, + MIN_SAMPLE_LINE_COUNT, + Math.max(MIN_SAMPLE_LINE_COUNT, idealSampleLineCount), + timeoutChecker + ); + + return makeBestStructureFinder( + explanation, + sampleInfo.v1(), + charsetName, + sampleInfo.v2(), + lineMergeSizeLimit, + overrides, + timeoutChecker + ); } catch (Exception e) { // Add a dummy exception containing the explanation so far - this can be invaluable for troubleshooting as incorrect // decisions made early on in the structure analysis can result in seemingly crazy decisions or timeouts later on if (explanation.isEmpty() == false) { e.addSuppressed( - new ElasticsearchException(explanation.stream().collect(Collectors.joining("]\n[", "Explanation so far:\n[", "]\n")))); + new ElasticsearchException(explanation.stream().collect(Collectors.joining("]\n[", "Explanation so far:\n[", "]\n"))) + ); } throw e; } @@ -409,15 +450,22 @@ CharsetMatch findCharset(List explanation, InputStream inputStream, Time timeoutChecker.check("character set detection"); if (pureAscii) { - // If the input is pure ASCII then many single byte character sets will match. We want to favour + // If the input is pure ASCII then many single byte character sets will match. We want to favour // UTF-8 in this case, as it avoids putting a bold declaration of a dubious character set choice // in the config files. 
Optional utf8CharsetMatch = Arrays.stream(charsetMatches) - .filter(charsetMatch -> StandardCharsets.UTF_8.name().equals(charsetMatch.getName())).findFirst(); + .filter(charsetMatch -> StandardCharsets.UTF_8.name().equals(charsetMatch.getName())) + .findFirst(); if (utf8CharsetMatch.isPresent()) { - explanation.add("Using character encoding [" + StandardCharsets.UTF_8.name() + - "], which matched the input with [" + utf8CharsetMatch.get().getConfidence() + "%] confidence - first [" + - (BUFFER_SIZE / 1024) + "kB] of input was pure ASCII"); + explanation.add( + "Using character encoding [" + + StandardCharsets.UTF_8.name() + + "], which matched the input with [" + + utf8CharsetMatch.get().getConfidence() + + "%] confidence - first [" + + (BUFFER_SIZE / 1024) + + "kB] of input was pure ASCII" + ); return utf8CharsetMatch.get(); } } @@ -429,13 +477,13 @@ CharsetMatch findCharset(List explanation, InputStream inputStream, Time String name = charsetMatch.getName(); if (Charset.isSupported(name) && FILEBEAT_SUPPORTED_ENCODINGS.contains(name.toLowerCase(Locale.ROOT))) { - // This extra test is to avoid trying to read binary files as text. Running the structure + // This extra test is to avoid trying to read binary files as text. Running the structure // finding algorithms on binary files is very slow as the binary files generally appear to // have very long lines. boolean spaceEncodingContainsZeroByte = false; Charset charset = Charset.forName(name); - // Some character sets cannot be encoded. These are extremely rare so it's likely that - // they've been chosen based on incorrectly provided binary data. Therefore, err on + // Some character sets cannot be encoded. These are extremely rare so it's likely that + // they've been chosen based on incorrectly provided binary data. Therefore, err on // the side of rejecting binary data. 
if (charset.canEncode()) { byte[] spaceBytes = " ".getBytes(charset); @@ -444,32 +492,67 @@ CharsetMatch findCharset(List explanation, InputStream inputStream, Time } } if (containsZeroBytes && spaceEncodingContainsZeroByte == false) { - explanation.add("Character encoding [" + name + "] matched the input with [" + charsetMatch.getConfidence() + - "%] confidence but was rejected as the input contains zero bytes and the [" + name + "] encoding does not"); + explanation.add( + "Character encoding [" + + name + + "] matched the input with [" + + charsetMatch.getConfidence() + + "%] confidence but was rejected as the input contains zero bytes and the [" + + name + + "] encoding does not" + ); } else if (containsZeroBytes && 3 * oddPosZeroCount > 2 * evenPosZeroCount && 3 * evenPosZeroCount > 2 * oddPosZeroCount) { - explanation.add("Character encoding [" + name + "] matched the input with [" + charsetMatch.getConfidence() + - "%] confidence but was rejected as the distribution of zero bytes between odd and even positions in the " + - "file is very close - [" + evenPosZeroCount + "] and [" + oddPosZeroCount + "] in the first [" + - (BUFFER_SIZE / 1024) + "kB] of input"); + explanation.add( + "Character encoding [" + + name + + "] matched the input with [" + + charsetMatch.getConfidence() + + "%] confidence but was rejected as the distribution of zero bytes between odd and even positions in the " + + "file is very close - [" + + evenPosZeroCount + + "] and [" + + oddPosZeroCount + + "] in the first [" + + (BUFFER_SIZE / 1024) + + "kB] of input" + ); } else { - explanation.add("Using character encoding [" + name + "], which matched the input with [" + - charsetMatch.getConfidence() + "%] confidence"); + explanation.add( + "Using character encoding [" + + name + + "], which matched the input with [" + + charsetMatch.getConfidence() + + "%] confidence" + ); return charsetMatch; } } else { - explanation.add("Character encoding [" + name + "] matched the input with [" + 
charsetMatch.getConfidence() + - "%] confidence but was rejected as it is not supported by [" + - (Charset.isSupported(name) ? "Filebeat" : "the JVM") + "]"); + explanation.add( + "Character encoding [" + + name + + "] matched the input with [" + + charsetMatch.getConfidence() + + "%] confidence but was rejected as it is not supported by [" + + (Charset.isSupported(name) ? "Filebeat" : "the JVM") + + "]" + ); } } - throw new IllegalArgumentException("Could not determine a usable character encoding for the input" + - (containsZeroBytes ? " - could it be binary data?" : "")); + throw new IllegalArgumentException( + "Could not determine a usable character encoding for the input" + (containsZeroBytes ? " - could it be binary data?" : "") + ); } - FileStructureFinder makeBestStructureFinder(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - int lineMergeSizeLimit, FileStructureOverrides overrides, - TimeoutChecker timeoutChecker) throws Exception { + FileStructureFinder makeBestStructureFinder( + List explanation, + String sample, + String charsetName, + Boolean hasByteOrderMarker, + int lineMergeSizeLimit, + FileStructureOverrides overrides, + TimeoutChecker timeoutChecker + ) throws Exception { Character delimiter = overrides.getDelimiter(); Character quote = overrides.getQuote(); @@ -481,15 +564,22 @@ FileStructureFinder makeBestStructureFinder(List explanation, String sam // If a precise delimiter is specified, we only need one structure finder // factory, and we'll tolerate as little as one column in the input - factories = Collections.singletonList(new DelimitedFileStructureFinderFactory(delimiter, (quote == null) ? '"' : quote, 1, - (shouldTrimFields == null) ? (delimiter == '|') : shouldTrimFields)); + factories = Collections.singletonList( + new DelimitedFileStructureFinderFactory( + delimiter, + (quote == null) ? '"' : quote, + 1, + (shouldTrimFields == null) ? 
(delimiter == '|') : shouldTrimFields + ) + ); } else if (quote != null || shouldTrimFields != null || FileStructure.Format.DELIMITED.equals(overrides.getFormat())) { allowedFractionOfBadLines = DelimitedFileStructureFinderFactory.FORMAT_OVERRIDDEN_ALLOWED_FRACTION_OF_BAD_LINES; // The delimiter is not specified, but some other aspect of delimited files is, // so clone our default delimited factories altering the overridden values - factories = ORDERED_STRUCTURE_FACTORIES.stream().filter(factory -> factory instanceof DelimitedFileStructureFinderFactory) + factories = ORDERED_STRUCTURE_FACTORIES.stream() + .filter(factory -> factory instanceof DelimitedFileStructureFinderFactory) .map(factory -> ((DelimitedFileStructureFinderFactory) factory).makeSimilar(quote, shouldTrimFields)) .collect(Collectors.toList()); @@ -497,20 +587,30 @@ FileStructureFinder makeBestStructureFinder(List explanation, String sam // We can use the default factories, but possibly filtered down to a specific format factories = ORDERED_STRUCTURE_FACTORIES.stream() - .filter(factory -> factory.canFindFormat(overrides.getFormat())).collect(Collectors.toList()); + .filter(factory -> factory.canFindFormat(overrides.getFormat())) + .collect(Collectors.toList()); } for (FileStructureFinderFactory factory : factories) { timeoutChecker.check("high level format detection"); if (factory.canCreateFromSample(explanation, sample, allowedFractionOfBadLines)) { - return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, lineMergeSizeLimit, overrides, - timeoutChecker); + return factory.createFromSample( + explanation, + sample, + charsetName, + hasByteOrderMarker, + lineMergeSizeLimit, + overrides, + timeoutChecker + ); } } - throw new IllegalArgumentException("Input did not match " + - ((overrides.getFormat() == null) ? 
"any known formats" : "the specified format [" + overrides.getFormat() + "]")); + throw new IllegalArgumentException( + "Input did not match " + + ((overrides.getFormat() == null) ? "any known formats" : "the specified format [" + overrides.getFormat() + "]") + ); } private Tuple sampleFile(Reader reader, String charsetName, int minLines, int maxLines, TimeoutChecker timeoutChecker) @@ -520,14 +620,13 @@ private Tuple sampleFile(Reader reader, String charsetName, int BufferedReader bufferedReader = new BufferedReader(reader); StringBuilder sample = new StringBuilder(); - // Don't include any byte-order-marker in the sample. (The logic to skip it works for both + // Don't include any byte-order-marker in the sample. (The logic to skip it works for both // UTF-8 and UTF-16 assuming the character set of the reader was correctly detected.) Boolean hasByteOrderMarker = null; if (charsetName.toUpperCase(Locale.ROOT).startsWith("UTF")) { int maybeByteOrderMarker = reader.read(); hasByteOrderMarker = ((char) maybeByteOrderMarker == '\uFEFF'); - if (maybeByteOrderMarker >= 0 && hasByteOrderMarker == false && (char) maybeByteOrderMarker != '\r') - { + if (maybeByteOrderMarker >= 0 && hasByteOrderMarker == false && (char) maybeByteOrderMarker != '\r') { sample.appendCodePoint(maybeByteOrderMarker); if ((char) maybeByteOrderMarker == '\n') { ++lineCount; diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureOverrides.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureOverrides.java similarity index 68% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureOverrides.java rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureOverrides.java index bf80c1076896f..2fbd9cab1b14f 100644 --- 
a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureOverrides.java +++ b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureOverrides.java @@ -3,10 +3,10 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; -import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.action.FindFileStructureAction; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import java.util.List; import java.util.Objects; @@ -36,14 +36,32 @@ public class FileStructureOverrides { public FileStructureOverrides(FindFileStructureAction.Request request) { - this(request.getCharset(), request.getFormat(), request.getColumnNames(), request.getHasHeaderRow(), request.getDelimiter(), - request.getQuote(), request.getShouldTrimFields(), request.getGrokPattern(), request.getTimestampFormat(), - request.getTimestampField()); - } - - private FileStructureOverrides(String charset, FileStructure.Format format, List columnNames, Boolean hasHeaderRow, - Character delimiter, Character quote, Boolean shouldTrimFields, String grokPattern, - String timestampFormat, String timestampField) { + this( + request.getCharset(), + request.getFormat(), + request.getColumnNames(), + request.getHasHeaderRow(), + request.getDelimiter(), + request.getQuote(), + request.getShouldTrimFields(), + request.getGrokPattern(), + request.getTimestampFormat(), + request.getTimestampField() + ); + } + + private FileStructureOverrides( + String charset, + FileStructure.Format format, + List columnNames, + Boolean hasHeaderRow, + Character delimiter, 
+ Character quote, + Boolean shouldTrimFields, + String grokPattern, + String timestampFormat, + String timestampField + ) { this.charset = charset; this.format = format; this.columnNames = (columnNames == null) ? null : List.copyOf(columnNames); @@ -103,8 +121,18 @@ public String getTimestampField() { @Override public int hashCode() { - return Objects.hash(charset, format, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern, timestampFormat, - timestampField); + return Objects.hash( + charset, + format, + columnNames, + hasHeaderRow, + delimiter, + quote, + shouldTrimFields, + grokPattern, + timestampFormat, + timestampField + ); } @Override @@ -119,16 +147,16 @@ public boolean equals(Object other) { } FileStructureOverrides that = (FileStructureOverrides) other; - return Objects.equals(this.charset, that.charset) && - Objects.equals(this.format, that.format) && - Objects.equals(this.columnNames, that.columnNames) && - Objects.equals(this.hasHeaderRow, that.hasHeaderRow) && - Objects.equals(this.delimiter, that.delimiter) && - Objects.equals(this.quote, that.quote) && - Objects.equals(this.shouldTrimFields, that.shouldTrimFields) && - Objects.equals(this.grokPattern, that.grokPattern) && - Objects.equals(this.timestampFormat, that.timestampFormat) && - Objects.equals(this.timestampField, that.timestampField); + return Objects.equals(this.charset, that.charset) + && Objects.equals(this.format, that.format) + && Objects.equals(this.columnNames, that.columnNames) + && Objects.equals(this.hasHeaderRow, that.hasHeaderRow) + && Objects.equals(this.delimiter, that.delimiter) + && Objects.equals(this.quote, that.quote) + && Objects.equals(this.shouldTrimFields, that.shouldTrimFields) + && Objects.equals(this.grokPattern, that.grokPattern) + && Objects.equals(this.timestampFormat, that.timestampFormat) + && Objects.equals(this.timestampField, that.timestampField); } public static class Builder { @@ -196,8 +224,18 @@ public Builder 
setTimestampField(String timestampField) { public FileStructureOverrides build() { - return new FileStructureOverrides(charset, format, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern, - timestampFormat, timestampField); + return new FileStructureOverrides( + charset, + format, + columnNames, + hasHeaderRow, + delimiter, + quote, + shouldTrimFields, + grokPattern, + timestampFormat, + timestampField + ); } } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureUtils.java similarity index 79% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureUtils.java index e74e144d1e00c..ce0398d636b5c 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java +++ b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureUtils.java @@ -3,7 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -11,7 +11,7 @@ import org.elasticsearch.common.util.set.Sets; import org.elasticsearch.grok.Grok; import org.elasticsearch.ingest.Pipeline; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FieldStats; import java.util.ArrayList; import java.util.Arrays; @@ -34,11 +34,11 @@ public final class FileStructureUtils { public static final String MAPPING_TYPE_SETTING = "type"; public static final String MAPPING_FORMAT_SETTING = "format"; public static final String MAPPING_PROPERTIES_SETTING = "properties"; - public static final Map DATE_MAPPING_WITHOUT_FORMAT = - Collections.singletonMap(MAPPING_TYPE_SETTING, "date"); + public static final Map DATE_MAPPING_WITHOUT_FORMAT = Collections.singletonMap(MAPPING_TYPE_SETTING, "date"); public static final String NANOSECOND_DATE_OUTPUT_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSSXXX"; - public static final Set CONVERTIBLE_TYPES = - Collections.unmodifiableSet(Sets.newHashSet("integer", "long", "float", "double", "boolean")); + public static final Set CONVERTIBLE_TYPES = Collections.unmodifiableSet( + Sets.newHashSet("integer", "long", "float", "double", "boolean") + ); private static final Map EXTENDED_PATTERNS; static { @@ -64,19 +64,26 @@ public final class FileStructureUtils { private static final int NUM_TOP_HITS = 10; // NUMBER Grok pattern doesn't support scientific notation, so we extend it - private static final Grok NUMBER_GROK = new Grok(Grok.BUILTIN_PATTERNS, "^%{NUMBER}(?:[eE][+-]?[0-3]?[0-9]{1,2})?$", - TimeoutChecker.watchdog, logger::warn); + private static final Grok NUMBER_GROK = new Grok( + Grok.BUILTIN_PATTERNS, + "^%{NUMBER}(?:[eE][+-]?[0-3]?[0-9]{1,2})?$", + TimeoutChecker.watchdog, + logger::warn + ); private static 
final Grok IP_GROK = new Grok(Grok.BUILTIN_PATTERNS, "^%{IP}$", TimeoutChecker.watchdog, logger::warn); private static final Grok GEO_POINT_WKT = new Grok(EXTENDED_PATTERNS, "^%{WKT_POINT}$", TimeoutChecker.watchdog, logger::warn); - private static final Grok GEO_WKT = new Grok(EXTENDED_PATTERNS, "^(?:%{WKT_ANY}|%{WKT_GEOMETRYCOLLECTION})$", TimeoutChecker.watchdog, - logger::warn); + private static final Grok GEO_WKT = new Grok( + EXTENDED_PATTERNS, + "^(?:%{WKT_ANY}|%{WKT_GEOMETRYCOLLECTION})$", + TimeoutChecker.watchdog, + logger::warn + ); private static final int KEYWORD_MAX_LEN = 256; private static final int KEYWORD_MAX_SPACES = 5; private static final String BEAT_TIMEZONE_FIELD = "event.timezone"; - private FileStructureUtils() { - } + private FileStructureUtils() {} /** * Given one or more sample records, find a timestamp field that is consistently present in them all. @@ -96,8 +103,12 @@ private FileStructureUtils() { * @return A tuple of (field name, timestamp format finder) if one can be found, or null if * there is no consistent timestamp. 
*/ - static Tuple guessTimestampField(List explanation, List> sampleRecords, - FileStructureOverrides overrides, TimeoutChecker timeoutChecker) { + static Tuple guessTimestampField( + List explanation, + List> sampleRecords, + FileStructureOverrides overrides, + TimeoutChecker timeoutChecker + ) { if (sampleRecords.isEmpty()) { return null; } @@ -115,11 +126,17 @@ static Tuple guessTimestampField(List exp Object fieldValue = sampleRecord.get(fieldName); if (fieldValue == null) { if (overrides.getTimestampField() != null) { - throw new IllegalArgumentException("Specified timestamp field [" + overrides.getTimestampField() + - "] is not present in record [" + sampleRecord + "]"); + throw new IllegalArgumentException( + "Specified timestamp field [" + + overrides.getTimestampField() + + "] is not present in record [" + + sampleRecord + + "]" + ); } - explanation.add("First sample match [" + fieldName + "] ruled out because record [" + sampleRecord + - "] doesn't have field"); + explanation.add( + "First sample match [" + fieldName + "] ruled out because record [" + sampleRecord + "] doesn't have field" + ); allGood = false; break; } @@ -131,23 +148,34 @@ static Tuple guessTimestampField(List exp } catch (IllegalArgumentException e) { if (overrides.getTimestampFormat() != null) { if (exceptionMsg == null) { - exceptionMsg = new StringBuilder("Specified timestamp format [" + overrides.getTimestampFormat() + - "] does not match"); + exceptionMsg = new StringBuilder( + "Specified timestamp format [" + overrides.getTimestampFormat() + "] does not match" + ); } else { exceptionMsg.append(", nor"); } exceptionMsg.append(" for record [").append(sampleRecord).append("] in field [").append(fieldName).append("]"); } - explanation.add("First sample match " + timestampFormatFinder.getRawJavaTimestampFormats() - + " ruled out because record [" + sampleRecord + "] does not match"); + explanation.add( + "First sample match " + + timestampFormatFinder.getRawJavaTimestampFormats() + + " 
ruled out because record [" + + sampleRecord + + "] does not match" + ); allGood = false; break; } } if (allGood) { - explanation.add(((overrides.getTimestampField() == null) ? "Guessing timestamp" : "Timestamp") + - " field is [" + fieldName + "] with format " + timestampFormatFinder.getJavaTimestampFormats()); + explanation.add( + ((overrides.getTimestampField() == null) ? "Guessing timestamp" : "Timestamp") + + " field is [" + + fieldName + + "] with format " + + timestampFormatFinder.getJavaTimestampFormats() + ); return candidate; } } @@ -159,17 +187,21 @@ static Tuple guessTimestampField(List exp return null; } - private static List> findCandidates(List explanation, List> sampleRecords, - FileStructureOverrides overrides, - TimeoutChecker timeoutChecker) { + private static List> findCandidates( + List explanation, + List> sampleRecords, + FileStructureOverrides overrides, + TimeoutChecker timeoutChecker + ) { assert sampleRecords.isEmpty() == false; Map firstRecord = sampleRecords.get(0); String onlyConsiderField = overrides.getTimestampField(); if (onlyConsiderField != null && firstRecord.get(onlyConsiderField) == null) { - throw new IllegalArgumentException("Specified timestamp field [" + overrides.getTimestampField() + - "] is not present in record [" + firstRecord + "]"); + throw new IllegalArgumentException( + "Specified timestamp field [" + overrides.getTimestampField() + "] is not present in record [" + firstRecord + "]" + ); } List> candidates = new ArrayList<>(); @@ -182,13 +214,24 @@ private static List> findCandidates(List(fieldName, timestampFormatFinder)); - explanation.add("First sample timestamp match " + timestampFormatFinder.getRawJavaTimestampFormats() - + " for field [" + fieldName + "]"); + explanation.add( + "First sample timestamp match " + + timestampFormatFinder.getRawJavaTimestampFormats() + + " for field [" + + fieldName + + "]" + ); } catch (IllegalArgumentException e) { // No possible timestamp format found in this particular field 
- not a problem } @@ -197,8 +240,9 @@ private static List> findCandidates(List> findCandidates(List, SortedMap> guessMappingsAndCalculateFieldStats( - List explanation, List> sampleRecords, TimeoutChecker timeoutChecker) { + List explanation, + List> sampleRecords, + TimeoutChecker timeoutChecker + ) { SortedMap mappings = new TreeMap<>(); SortedMap fieldStats = new TreeMap<>(); @@ -222,11 +269,17 @@ static Tuple, SortedMap> guessMapp for (String fieldName : uniqueFieldNames) { - List fieldValues = sampleRecords.stream().map(record -> record.get(fieldName)).filter(fieldValue -> fieldValue != null) + List fieldValues = sampleRecords.stream() + .map(record -> record.get(fieldName)) + .filter(fieldValue -> fieldValue != null) .collect(Collectors.toList()); - Tuple, FieldStats> mappingAndFieldStats = - guessMappingAndCalculateFieldStats(explanation, fieldName, fieldValues, timeoutChecker); + Tuple, FieldStats> mappingAndFieldStats = guessMappingAndCalculateFieldStats( + explanation, + fieldName, + fieldValues, + timeoutChecker + ); if (mappingAndFieldStats != null) { if (mappingAndFieldStats.v1() != null) { mappings.put(fieldName, mappingAndFieldStats.v1()); @@ -240,9 +293,12 @@ static Tuple, SortedMap> guessMapp return new Tuple<>(mappings, fieldStats); } - static Tuple, FieldStats> guessMappingAndCalculateFieldStats(List explanation, - String fieldName, List fieldValues, - TimeoutChecker timeoutChecker) { + static Tuple, FieldStats> guessMappingAndCalculateFieldStats( + List explanation, + String fieldName, + List fieldValues, + TimeoutChecker timeoutChecker + ) { if (fieldValues == null || fieldValues.isEmpty()) { // We can get here if all the records that contained a given field had a null value for it. // In this case it's best not to make any statement about what the mapping type should be. 
@@ -253,14 +309,19 @@ static Tuple, FieldStats> guessMappingAndCalculateFieldStats if (fieldValues.stream().allMatch(value -> value instanceof Map)) { return new Tuple<>(Collections.singletonMap(MAPPING_TYPE_SETTING, "object"), null); } - throw new IllegalArgumentException("Field [" + fieldName + - "] has both object and non-object values - this is not supported by Elasticsearch"); + throw new IllegalArgumentException( + "Field [" + fieldName + "] has both object and non-object values - this is not supported by Elasticsearch" + ); } if (fieldValues.stream().anyMatch(value -> value instanceof List || value instanceof Object[])) { // Elasticsearch fields can be either arrays or single values, but array values must all have the same type - return guessMappingAndCalculateFieldStats(explanation, fieldName, - fieldValues.stream().flatMap(FileStructureUtils::flatten).collect(Collectors.toList()), timeoutChecker); + return guessMappingAndCalculateFieldStats( + explanation, + fieldName, + fieldValues.stream().flatMap(FileStructureUtils::flatten).collect(Collectors.toList()), + timeoutChecker + ); } Collection fieldValuesAsStrings = fieldValues.stream().map(Object::toString).collect(Collectors.toList()); @@ -293,8 +354,11 @@ private static Stream flatten(Object value) { * @param timeoutChecker Will abort the operation if its timeout is exceeded. * @return The sub-section of the index mappings most appropriate for the field. 
*/ - static Map findTimestampMapping(List explanation, Collection fieldValues, - TimeoutChecker timeoutChecker) { + static Map findTimestampMapping( + List explanation, + Collection fieldValues, + TimeoutChecker timeoutChecker + ) { assert fieldValues.isEmpty() == false; TimestampFormatFinder timestampFormatFinder = new TimestampFormatFinder(explanation, true, true, true, timeoutChecker); @@ -316,8 +380,12 @@ static Map findTimestampMapping(List explanation, Collec * @return The sub-section of the index mappings most appropriate for the field, * for example { "type" : "keyword" }. */ - static Map guessScalarMapping(List explanation, String fieldName, Collection fieldValues, - TimeoutChecker timeoutChecker) { + static Map guessScalarMapping( + List explanation, + String fieldName, + Collection fieldValues, + TimeoutChecker timeoutChecker + ) { assert fieldValues.isEmpty() == false; if (fieldValues.stream().allMatch(value -> "true".equals(value) || "false".equals(value))) { @@ -344,10 +412,9 @@ static Map guessScalarMapping(List explanation, String f } catch (NumberFormatException e) { explanation.add("Rejecting type 'double' for field [" + fieldName + "] due to parse failure: [" + e.getMessage() + "]"); } - } - else if (fieldValues.stream().allMatch(IP_GROK::match)) { + } else if (fieldValues.stream().allMatch(IP_GROK::match)) { return Collections.singletonMap(MAPPING_TYPE_SETTING, "ip"); - // geo_point mapping MUST be checked before geo_shape as geo_shape also contains a matcher for geo_point + // geo_point mapping MUST be checked before geo_shape as geo_shape also contains a matcher for geo_point } else if (fieldValues.stream().allMatch(GEO_POINT_WKT::match)) { return Collections.singletonMap(MAPPING_TYPE_SETTING, "geo_point"); } else if (fieldValues.stream().allMatch(GEO_WKT::match)) { @@ -402,12 +469,16 @@ static boolean isMoreLikelyTextThanKeyword(String str) { * @param needNanosecondPrecision Does the timestamp have more than millisecond accuracy? 
* @return The ingest pipeline definition, or null if none is required. */ - public static Map makeIngestPipelineDefinition(String grokPattern, Map customGrokPatternDefinitions, - Map csvProcessorSettings, - Map mappingsForConversions, - String timestampField, List timestampFormats, - boolean needClientTimezone, - boolean needNanosecondPrecision) { + public static Map makeIngestPipelineDefinition( + String grokPattern, + Map customGrokPatternDefinitions, + Map csvProcessorSettings, + Map mappingsForConversions, + String timestampField, + List timestampFormats, + boolean needClientTimezone, + boolean needNanosecondPrecision + ) { if (grokPattern == null && csvProcessorSettings == null && timestampField == null) { return null; diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/GrokPatternCreator.java similarity index 81% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/GrokPatternCreator.java index 687c386395842..aeab640a8d320 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java +++ b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/GrokPatternCreator.java @@ -3,13 +3,13 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.grok.Grok; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FieldStats; import java.util.ArrayList; import java.util.Arrays; @@ -106,11 +106,11 @@ public final class GrokPatternCreator { new ValueOnlyGrokPatternCandidate("BASE16NUM", "keyword", "field", "(? explanation, Collection sampleMessages, Map mappings, - Map fieldStats, Map customGrokPatternDefinitions, - TimeoutChecker timeoutChecker) { + public GrokPatternCreator( + List explanation, + Collection sampleMessages, + Map mappings, + Map fieldStats, + Map customGrokPatternDefinitions, + TimeoutChecker timeoutChecker + ) { this.explanation = Objects.requireNonNull(explanation); this.sampleMessages = Collections.unmodifiableCollection(sampleMessages); this.mappings = mappings; @@ -186,8 +191,11 @@ public Tuple findFullLineGrokPattern(String timestampField) { */ public void validateFullLineGrokPattern(String grokPattern, String timestampField) { - FullMatchGrokPatternCandidate candidate = FullMatchGrokPatternCandidate.fromGrokPattern(grokPattern, timestampField, - grokPatternDefinitions); + FullMatchGrokPatternCandidate candidate = FullMatchGrokPatternCandidate.fromGrokPattern( + grokPattern, + timestampField, + grokPatternDefinitions + ); if (candidate.matchesAll(sampleMessages, timeoutChecker)) { candidate.processMatch(explanation, sampleMessages, mappings, fieldStats, timeoutChecker); } else { @@ -206,8 +214,12 @@ public String createGrokPatternFromExamples(String seedPatternName, Map snippets, - boolean ignoreKeyValueCandidateLeft, int ignoreValueOnlyCandidatesLeft, - boolean ignoreKeyValueCandidateRight, int ignoreValueOnlyCandidatesRight) { + 
private void processCandidateAndSplit( + GrokPatternCandidate chosenPattern, + boolean isLast, + Collection snippets, + boolean ignoreKeyValueCandidateLeft, + int ignoreValueOnlyCandidatesLeft, + boolean ignoreKeyValueCandidateRight, + int ignoreValueOnlyCandidatesRight + ) { Collection prefaces = new ArrayList<>(); Collection epilogues = new ArrayList<>(); - String patternBuilderContent = chosenPattern.processCaptures(explanation, fieldNameCountStore, snippets, prefaces, epilogues, - mappings, fieldStats, timeoutChecker); + String patternBuilderContent = chosenPattern.processCaptures( + explanation, + fieldNameCountStore, + snippets, + prefaces, + epilogues, + mappings, + fieldStats, + timeoutChecker + ); appendBestGrokMatchForStrings(false, prefaces, ignoreKeyValueCandidateLeft, ignoreValueOnlyCandidatesLeft); overallGrokPatternBuilder.append(patternBuilderContent); appendBestGrokMatchForStrings(isLast, epilogues, ignoreKeyValueCandidateRight, ignoreValueOnlyCandidatesRight); @@ -245,8 +271,12 @@ private void processCandidateAndSplit(GrokPatternCandidate chosenPattern, boolea * to use matches it best. Then append the appropriate Grok language to represent that finding onto * the supplied string builder. 
*/ - void appendBestGrokMatchForStrings(boolean isLast, Collection snippets, - boolean ignoreKeyValueCandidate, int ignoreValueOnlyCandidates) { + void appendBestGrokMatchForStrings( + boolean isLast, + Collection snippets, + boolean ignoreKeyValueCandidate, + int ignoreValueOnlyCandidates + ) { snippets = adjustForPunctuation(snippets); @@ -257,8 +287,10 @@ void appendBestGrokMatchForStrings(boolean isLast, Collection snippets, bestCandidate = kvCandidate; } else { ignoreKeyValueCandidate = true; - for (GrokPatternCandidate candidate : - ORDERED_CANDIDATE_GROK_PATTERNS.subList(ignoreValueOnlyCandidates, ORDERED_CANDIDATE_GROK_PATTERNS.size())) { + for (GrokPatternCandidate candidate : ORDERED_CANDIDATE_GROK_PATTERNS.subList( + ignoreValueOnlyCandidates, + ORDERED_CANDIDATE_GROK_PATTERNS.size() + )) { if (candidate.matchesAll(snippets)) { bestCandidate = candidate; break; @@ -275,8 +307,15 @@ void appendBestGrokMatchForStrings(boolean isLast, Collection snippets, addIntermediateRegex(snippets); } } else { - processCandidateAndSplit(bestCandidate, isLast, snippets, true, ignoreValueOnlyCandidates + (ignoreKeyValueCandidate ? 1 : 0), - ignoreKeyValueCandidate, ignoreValueOnlyCandidates); + processCandidateAndSplit( + bestCandidate, + isLast, + snippets, + true, + ignoreValueOnlyCandidates + (ignoreKeyValueCandidate ? 
1 : 0), + ignoreKeyValueCandidate, + ignoreValueOnlyCandidates + ); } } @@ -396,8 +435,8 @@ private void finalizeGrokPattern(Collection snippets) { char ch = driver.charAt(i); int driverIndex = i; Boolean punctuationOrSpaceNeedsEscaping = PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.get(ch); - if (punctuationOrSpaceNeedsEscaping != null && - others.stream().allMatch(other -> other.length() > driverIndex && other.charAt(driverIndex) == ch)) { + if (punctuationOrSpaceNeedsEscaping != null + && others.stream().allMatch(other -> other.length() > driverIndex && other.charAt(driverIndex) == ch)) { if (punctuationOrSpaceNeedsEscaping) { overallGrokPatternBuilder.append('\\'); } @@ -427,9 +466,16 @@ interface GrokPatternCandidate { * calculate field stats. * @return The string that needs to be incorporated into the overall Grok pattern for the line. */ - String processCaptures(List explanation, Map fieldNameCountStore, Collection snippets, - Collection prefaces, Collection epilogues, Map mappings, - Map fieldStats, TimeoutChecker timeoutChecker); + String processCaptures( + List explanation, + Map fieldNameCountStore, + Collection snippets, + Collection prefaces, + Collection epilogues, + Map mappings, + Map fieldStats, + TimeoutChecker timeoutChecker + ); } /** @@ -456,8 +502,14 @@ static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate { * @param fieldName Name of the field to extract from the match. 
*/ ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName) { - this(grokPatternName, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType), fieldName, - "\\b", "\\b", Grok.BUILTIN_PATTERNS); + this( + grokPatternName, + Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType), + fieldName, + "\\b", + "\\b", + Grok.BUILTIN_PATTERNS + ); } /** @@ -466,10 +518,20 @@ static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate { * @param fieldName Name of the field to extract from the match. * @param grokPatternDefinitions Definitions of Grok patterns to be used. */ - ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName, - Map grokPatternDefinitions) { - this(grokPatternName, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType), fieldName, - "\\b", "\\b", grokPatternDefinitions); + ValueOnlyGrokPatternCandidate( + String grokPatternName, + String mappingType, + String fieldName, + Map grokPatternDefinitions + ) { + this( + grokPatternName, + Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType), + fieldName, + "\\b", + "\\b", + grokPatternDefinitions + ); } /** @@ -480,8 +542,14 @@ static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate { * @param postBreak Only consider the match if it's broken from the following text by this. 
*/ ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName, String preBreak, String postBreak) { - this(grokPatternName, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType), fieldName, - preBreak, postBreak, Grok.BUILTIN_PATTERNS); + this( + grokPatternName, + Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType), + fieldName, + preBreak, + postBreak, + Grok.BUILTIN_PATTERNS + ); } /** @@ -492,16 +560,36 @@ static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate { * @param postBreak Only consider the match if it's broken from the following text by this. * @param grokPatternDefinitions Definitions of Grok patterns to be used. */ - ValueOnlyGrokPatternCandidate(String grokPatternName, Map mapping, String fieldName, String preBreak, - String postBreak, Map grokPatternDefinitions) { + ValueOnlyGrokPatternCandidate( + String grokPatternName, + Map mapping, + String fieldName, + String preBreak, + String postBreak, + Map grokPatternDefinitions + ) { this.grokPatternName = Objects.requireNonNull(grokPatternName); this.mapping = Collections.unmodifiableMap(mapping); this.fieldName = Objects.requireNonNull(fieldName); // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java - grok = new Grok(grokPatternDefinitions, - "(?m)%{DATA:" + PREFACE + "}" + Objects.requireNonNull(preBreak) + - "%{" + grokPatternName + ":" + VALUE + "}" + Objects.requireNonNull(postBreak) + "%{GREEDYDATA:" + EPILOGUE + "}", - TimeoutChecker.watchdog, logger::warn); + grok = new Grok( + grokPatternDefinitions, + "(?m)%{DATA:" + + PREFACE + + "}" + + Objects.requireNonNull(preBreak) + + "%{" + + grokPatternName + + ":" + + VALUE + + "}" + + Objects.requireNonNull(postBreak) + + "%{GREEDYDATA:" + + EPILOGUE + + "}", + TimeoutChecker.watchdog, + logger::warn + ); } @Override @@ -515,9 +603,16 @@ public boolean matchesAll(Collection snippets) { * bit that matches. 
*/ @Override - public String processCaptures(List explanation, Map fieldNameCountStore, Collection snippets, - Collection prefaces, Collection epilogues, Map mappings, - Map fieldStats, TimeoutChecker timeoutChecker) { + public String processCaptures( + List explanation, + Map fieldNameCountStore, + Collection snippets, + Collection prefaces, + Collection epilogues, + Map mappings, + Map fieldStats, + TimeoutChecker timeoutChecker + ) { Collection values = new ArrayList<>(); for (String snippet : snippets) { Map captures = timeoutChecker.grokCaptures(grok, snippet, "full message Grok pattern field extraction"); @@ -577,8 +672,9 @@ public boolean matchesAll(Collection snippets) { } isFirst = false; } else { - candidateNames.removeIf(candidateName -> - Pattern.compile("\\b" + candidateName + "=[\\w.-]+").matcher(snippet).find() == false); + candidateNames.removeIf( + candidateName -> Pattern.compile("\\b" + candidateName + "=[\\w.-]+").matcher(snippet).find() == false + ); } if (candidateNames.isEmpty()) { break; @@ -588,14 +684,25 @@ public boolean matchesAll(Collection snippets) { } @Override - public String processCaptures(List explanation, Map fieldNameCountStore, Collection snippets, - Collection prefaces, Collection epilogues, Map mappings, - Map fieldStats, TimeoutChecker timeoutChecker) { + public String processCaptures( + List explanation, + Map fieldNameCountStore, + Collection snippets, + Collection prefaces, + Collection epilogues, + Map mappings, + Map fieldStats, + TimeoutChecker timeoutChecker + ) { if (fieldName == null) { throw new IllegalStateException("Cannot process KV matches until a field name has been determined"); } - Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "(?m)%{DATA:" + PREFACE + "}\\b" + - fieldName + "=%{USER:" + VALUE + "}%{GREEDYDATA:" + EPILOGUE + "}", TimeoutChecker.watchdog, logger::warn); + Grok grok = new Grok( + Grok.BUILTIN_PATTERNS, + "(?m)%{DATA:" + PREFACE + "}\\b" + fieldName + "=%{USER:" + VALUE + "}%{GREEDYDATA:" + 
EPILOGUE + "}", + TimeoutChecker.watchdog, + logger::warn + ); Collection values = new ArrayList<>(); for (String snippet : snippets) { Map captures = grok.captures(snippet); @@ -626,15 +733,26 @@ public String processCaptures(List explanation, Map fie */ static class PrecalculatedMappingGrokPatternCandidate extends ValueOnlyGrokPatternCandidate { - PrecalculatedMappingGrokPatternCandidate(String grokPatternName, Map mapping, String fieldName, - Map grokPatternDefinitions) { + PrecalculatedMappingGrokPatternCandidate( + String grokPatternName, + Map mapping, + String fieldName, + Map grokPatternDefinitions + ) { super(grokPatternName, mapping, fieldName, "\\b", "\\b", grokPatternDefinitions); } @Override - public String processCaptures(List explanation, Map fieldNameCountStore, Collection snippets, - Collection prefaces, Collection epilogues, Map mappings, - Map fieldStats, TimeoutChecker timeoutChecker) { + public String processCaptures( + List explanation, + Map fieldNameCountStore, + Collection snippets, + Collection prefaces, + Collection epilogues, + Map mappings, + Map fieldStats, + TimeoutChecker timeoutChecker + ) { return super.processCaptures(explanation, fieldNameCountStore, snippets, prefaces, epilogues, null, fieldStats, timeoutChecker); } } @@ -652,8 +770,11 @@ static FullMatchGrokPatternCandidate fromGrokPatternName(String grokPatternName, return new FullMatchGrokPatternCandidate("%{" + grokPatternName + "}", timeField, Grok.BUILTIN_PATTERNS); } - static FullMatchGrokPatternCandidate fromGrokPatternName(String grokPatternName, String timeField, - Map grokPatternDefinitions) { + static FullMatchGrokPatternCandidate fromGrokPatternName( + String grokPatternName, + String timeField, + Map grokPatternDefinitions + ) { return new FullMatchGrokPatternCandidate("%{" + grokPatternName + "}", timeField, grokPatternDefinitions); } @@ -661,8 +782,11 @@ static FullMatchGrokPatternCandidate fromGrokPattern(String grokPattern, String return new 
FullMatchGrokPatternCandidate(grokPattern, timeField, Grok.BUILTIN_PATTERNS); } - static FullMatchGrokPatternCandidate fromGrokPattern(String grokPattern, String timeField, - Map grokPatternDefinitions) { + static FullMatchGrokPatternCandidate fromGrokPattern( + String grokPattern, + String timeField, + Map grokPatternDefinitions + ) { return new FullMatchGrokPatternCandidate(grokPattern, timeField, grokPatternDefinitions); } @@ -690,8 +814,13 @@ public boolean matchesAll(Collection sampleMessages, TimeoutChecker time * This must only be called if {@link #matchesAll} returns true. * @return A tuple of (time field name, Grok string). */ - public Tuple processMatch(List explanation, Collection sampleMessages, Map mappings, - Map fieldStats, TimeoutChecker timeoutChecker) { + public Tuple processMatch( + List explanation, + Collection sampleMessages, + Map mappings, + Map fieldStats, + TimeoutChecker timeoutChecker + ) { explanation.add("A full message Grok pattern [" + grokPattern.substring(2, grokPattern.length() - 1) + "] looks appropriate"); @@ -699,8 +828,11 @@ public Tuple processMatch(List explanation, Collection> valuesPerField = new HashMap<>(); for (String sampleMessage : sampleMessages) { - Map captures = timeoutChecker.grokCaptures(grok, sampleMessage, - "full message Grok pattern field extraction"); + Map captures = timeoutChecker.grokCaptures( + grok, + sampleMessage, + "full message Grok pattern field extraction" + ); // If the pattern doesn't match then captures will be null if (captures == null) { throw new IllegalStateException("[" + grokPattern + "] does not match snippet [" + sampleMessage + "]"); @@ -722,16 +854,22 @@ public Tuple processMatch(List explanation, Collection> valuesForField : valuesPerField.entrySet()) { String fieldName = valuesForField.getKey(); - Map mapping = - FileStructureUtils.guessScalarMapping(explanation, fieldName, valuesForField.getValue(), timeoutChecker); + Map mapping = FileStructureUtils.guessScalarMapping( + 
explanation, + fieldName, + valuesForField.getValue(), + timeoutChecker + ); timeoutChecker.check("mapping determination"); // Exclude the time field because that will be dropped and replaced with @timestamp if (mappings != null && fieldName.equals(timeField) == false) { mappings.put(fieldName, mapping); } if (fieldStats != null) { - fieldStats.put(fieldName, - FileStructureUtils.calculateFieldStats(mapping, valuesForField.getValue(), timeoutChecker)); + fieldStats.put( + fieldName, + FileStructureUtils.calculateFieldStats(mapping, valuesForField.getValue(), timeoutChecker) + ); } } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/NdJsonFileStructureFinder.java similarity index 60% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/NdJsonFileStructureFinder.java index b538143a561dd..1326c68b54a7b 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java +++ b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/NdJsonFileStructureFinder.java @@ -3,14 +3,14 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.common.xcontent.DeprecationHandler; import org.elasticsearch.common.xcontent.NamedXContentRegistry; import org.elasticsearch.common.xcontent.XContentParser; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FieldStats; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import java.io.IOException; import java.util.ArrayList; @@ -31,29 +31,40 @@ public class NdJsonFileStructureFinder implements FileStructureFinder { private final List sampleMessages; private final FileStructure structure; - static NdJsonFileStructureFinder makeNdJsonFileStructureFinder(List explanation, String sample, String charsetName, - Boolean hasByteOrderMarker, FileStructureOverrides overrides, - TimeoutChecker timeoutChecker) throws IOException { + static NdJsonFileStructureFinder makeNdJsonFileStructureFinder( + List explanation, + String sample, + String charsetName, + Boolean hasByteOrderMarker, + FileStructureOverrides overrides, + TimeoutChecker timeoutChecker + ) throws IOException { List> sampleRecords = new ArrayList<>(); List sampleMessages = Arrays.asList(sample.split("\n")); for (String sampleMessage : sampleMessages) { - XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY, DeprecationHandler.THROW_UNSUPPORTED_OPERATION, - sampleMessage); + XContentParser parser = jsonXContent.createParser( + NamedXContentRegistry.EMPTY, + DeprecationHandler.THROW_UNSUPPORTED_OPERATION, + sampleMessage + ); sampleRecords.add(parser.mapOrdered()); timeoutChecker.check("NDJSON parsing"); } - FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.NDJSON) - 
.setCharset(charsetName) + FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.NDJSON).setCharset(charsetName) .setHasByteOrderMarker(hasByteOrderMarker) .setSampleStart(sampleMessages.stream().limit(2).collect(Collectors.joining("\n", "", "\n"))) .setNumLinesAnalyzed(sampleMessages.size()) .setNumMessagesAnalyzed(sampleRecords.size()); - Tuple timeField = - FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides, timeoutChecker); + Tuple timeField = FileStructureUtils.guessTimestampField( + explanation, + sampleRecords, + overrides, + timeoutChecker + ); if (timeField != null) { boolean needClientTimeZone = timeField.v2().hasTimezoneDependentParsing(); @@ -61,15 +72,24 @@ static NdJsonFileStructureFinder makeNdJsonFileStructureFinder(List expl .setJodaTimestampFormats(timeField.v2().getJodaTimestampFormats()) .setJavaTimestampFormats(timeField.v2().getJavaTimestampFormats()) .setNeedClientTimezone(needClientTimeZone) - .setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), null, - // Note: no convert processors are added based on mappings for NDJSON input - // because it's reasonable that _source matches the supplied JSON precisely - Collections.emptyMap(), timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone, - timeField.v2().needNanosecondPrecision())); + .setIngestPipeline( + FileStructureUtils.makeIngestPipelineDefinition( + null, + Collections.emptyMap(), + null, + // Note: no convert processors are added based on mappings for NDJSON input + // because it's reasonable that _source matches the supplied JSON precisely + Collections.emptyMap(), + timeField.v1(), + timeField.v2().getJavaTimestampFormats(), + needClientTimeZone, + timeField.v2().needNanosecondPrecision() + ) + ); } - Tuple, SortedMap> mappingsAndFieldStats = - FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords, timeoutChecker); + Tuple, 
SortedMap> mappingsAndFieldStats = FileStructureUtils + .guessMappingsAndCalculateFieldStats(explanation, sampleRecords, timeoutChecker); Map fieldMappings = mappingsAndFieldStats.v1(); if (timeField != null) { @@ -80,10 +100,9 @@ static NdJsonFileStructureFinder makeNdJsonFileStructureFinder(List expl structureBuilder.setFieldStats(mappingsAndFieldStats.v2()); } - FileStructure structure = structureBuilder - .setMappings(Collections.singletonMap(FileStructureUtils.MAPPING_PROPERTIES_SETTING, fieldMappings)) - .setExplanation(explanation) - .build(); + FileStructure structure = structureBuilder.setMappings( + Collections.singletonMap(FileStructureUtils.MAPPING_PROPERTIES_SETTING, fieldMappings) + ).setExplanation(explanation).build(); return new NdJsonFileStructureFinder(sampleMessages, structure); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/NdJsonFileStructureFinderFactory.java similarity index 71% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/NdJsonFileStructureFinderFactory.java index b20ddac89228a..7b53554e06dc4 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderFactory.java +++ b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/NdJsonFileStructureFinderFactory.java @@ -3,12 +3,12 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import org.elasticsearch.common.xcontent.DeprecationHandler; import org.elasticsearch.common.xcontent.NamedXContentRegistry; import org.elasticsearch.common.xcontent.XContentParser; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import java.io.IOException; import java.io.StringReader; @@ -37,8 +37,13 @@ public boolean canCreateFromSample(List explanation, String sample, doub try { String[] sampleLines = sample.split("\n"); for (String sampleLine : sampleLines) { - try (XContentParser parser = jsonXContent.createParser(NamedXContentRegistry.EMPTY, - DeprecationHandler.THROW_UNSUPPORTED_OPERATION, new ContextPrintingStringReader(sampleLine))) { + try ( + XContentParser parser = jsonXContent.createParser( + NamedXContentRegistry.EMPTY, + DeprecationHandler.THROW_UNSUPPORTED_OPERATION, + new ContextPrintingStringReader(sampleLine) + ) + ) { if (parser.map().isEmpty()) { explanation.add("Not NDJSON because an empty object was parsed: [" + sampleLine + "]"); @@ -46,8 +51,9 @@ DeprecationHandler.THROW_UNSUPPORTED_OPERATION, new ContextPrintingStringReader( } ++completeDocCount; if (parser.nextToken() != null) { - explanation.add("Not newline delimited NDJSON because a line contained more than a single object: [" + - sampleLine + "]"); + explanation.add( + "Not newline delimited NDJSON because a line contained more than a single object: [" + sampleLine + "]" + ); return false; } } @@ -67,11 +73,23 @@ DeprecationHandler.THROW_UNSUPPORTED_OPERATION, new ContextPrintingStringReader( } @Override - public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - int lineMergeSizeLimit, FileStructureOverrides overrides, TimeoutChecker timeoutChecker) - throws IOException { - return 
NdJsonFileStructureFinder.makeNdJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides, - timeoutChecker); + public FileStructureFinder createFromSample( + List explanation, + String sample, + String charsetName, + Boolean hasByteOrderMarker, + int lineMergeSizeLimit, + FileStructureOverrides overrides, + TimeoutChecker timeoutChecker + ) throws IOException { + return NdJsonFileStructureFinder.makeNdJsonFileStructureFinder( + explanation, + sample, + charsetName, + hasByteOrderMarker, + overrides, + timeoutChecker + ); } private static class ContextPrintingStringReader extends StringReader { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/TextLogFileStructureFinder.java similarity index 63% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/TextLogFileStructureFinder.java index 5bee91180b045..ee1b0f0963f03 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java +++ b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/TextLogFileStructureFinder.java @@ -3,12 +3,12 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import org.elasticsearch.common.collect.Tuple; -import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.action.FindFileStructureAction; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FieldStats; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import java.util.ArrayList; import java.util.Collection; @@ -24,18 +24,26 @@ public class TextLogFileStructureFinder implements FileStructureFinder { private final List sampleMessages; private final FileStructure structure; - static TextLogFileStructureFinder makeTextLogFileStructureFinder(List explanation, String sample, String charsetName, - Boolean hasByteOrderMarker, int lineMergeSizeLimit, - FileStructureOverrides overrides, TimeoutChecker timeoutChecker) { + static TextLogFileStructureFinder makeTextLogFileStructureFinder( + List explanation, + String sample, + String charsetName, + Boolean hasByteOrderMarker, + int lineMergeSizeLimit, + FileStructureOverrides overrides, + TimeoutChecker timeoutChecker + ) { String[] sampleLines = sample.split("\n"); TimestampFormatFinder timestampFormatFinder = populateTimestampFormatFinder(explanation, sampleLines, overrides, timeoutChecker); switch (timestampFormatFinder.getNumMatchedFormats()) { case 0: // Is it appropriate to treat a file that is neither structured nor has - // a regular pattern of timestamps as a log file? Probably not... - throw new IllegalArgumentException("Could not find " + ((overrides.getTimestampFormat() == null) - ? "a timestamp" - : "the specified timestamp format") + " in the sample provided"); + // a regular pattern of timestamps as a log file? Probably not... 
+ throw new IllegalArgumentException( + "Could not find " + + ((overrides.getTimestampFormat() == null) ? "a timestamp" : "the specified timestamp format") + + " in the sample provided" + ); case 1: // Simple case break; @@ -44,16 +52,21 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex break; } - explanation.add(((overrides.getTimestampFormat() == null) ? "Most likely timestamp" : "Timestamp") + " format is " + - timestampFormatFinder.getJavaTimestampFormats()); + explanation.add( + ((overrides.getTimestampFormat() == null) ? "Most likely timestamp" : "Timestamp") + + " format is " + + timestampFormatFinder.getJavaTimestampFormats() + ); List sampleMessages = new ArrayList<>(); StringBuilder preamble = new StringBuilder(); int linesConsumed = 0; StringBuilder message = null; int linesInMessage = 0; - String multiLineRegex = createMultiLineMessageStartRegex(timestampFormatFinder.getPrefaces(), - timestampFormatFinder.getSimplePattern().pattern()); + String multiLineRegex = createMultiLineMessageStartRegex( + timestampFormatFinder.getPrefaces(), + timestampFormatFinder.getSimplePattern().pattern() + ); Pattern multiLinePattern = Pattern.compile(multiLineRegex); for (String sampleLine : sampleLines) { if (multiLinePattern.matcher(sampleLine).find()) { @@ -74,11 +87,21 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex long lengthAfterAppend = message.length() + 1L + sampleLine.length(); if (lengthAfterAppend > lineMergeSizeLimit) { assert linesInMessage > 0; - throw new IllegalArgumentException("Merging lines into messages resulted in an unacceptably long message. " - + "Merged message would have [" + (linesInMessage + 1) + "] lines and [" + lengthAfterAppend + "] " - + "characters (limit [" + lineMergeSizeLimit + "]). If you have messages this big please increase " - + "the value of [" + FindFileStructureAction.Request.LINE_MERGE_SIZE_LIMIT + "]. 
Otherwise it " - + "probably means the timestamp has been incorrectly detected, so try overriding that."); + throw new IllegalArgumentException( + "Merging lines into messages resulted in an unacceptably long message. " + + "Merged message would have [" + + (linesInMessage + 1) + + "] lines and [" + + lengthAfterAppend + + "] " + + "characters (limit [" + + lineMergeSizeLimit + + "]). If you have messages this big please increase " + + "the value of [" + + FindFileStructureAction.Request.LINE_MERGE_SIZE_LIMIT + + "]. Otherwise it " + + "probably means the timestamp has been incorrectly detected, so try overriding that." + ); } message.append('\n').append(sampleLine); ++linesInMessage; @@ -92,16 +115,19 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex // Don't add the last message, as it might be partial and mess up subsequent pattern finding if (sampleMessages.isEmpty()) { - throw new IllegalArgumentException("Failed to create more than one message from the sample lines provided. (The " - + "last is discarded in case the sample is incomplete.) If your sample does contain multiple messages the " - + "problem is probably that the primary timestamp format has been incorrectly detected, so try overriding it."); + throw new IllegalArgumentException( + "Failed to create more than one message from the sample lines provided. (The " + + "last is discarded in case the sample is incomplete.) If your sample does contain multiple messages the " + + "problem is probably that the primary timestamp format has been incorrectly detected, so try overriding it." 
+ ); } // null to allow GC before Grok pattern search sampleLines = null; - FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.SEMI_STRUCTURED_TEXT) - .setCharset(charsetName) + FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.SEMI_STRUCTURED_TEXT).setCharset( + charsetName + ) .setHasByteOrderMarker(hasByteOrderMarker) .setSampleStart(preamble.toString()) .setNumLinesAnalyzed(linesConsumed) @@ -117,8 +143,14 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex fieldStats.put("message", FileStructureUtils.calculateFieldStats(messageMapping, sampleMessages, timeoutChecker)); Map customGrokPatternDefinitions = timestampFormatFinder.getCustomGrokPatternDefinitions(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, fieldMappings, fieldStats, - customGrokPatternDefinitions, timeoutChecker); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + sampleMessages, + fieldMappings, + fieldStats, + customGrokPatternDefinitions, + timeoutChecker + ); // We can't parse directly into @timestamp using Grok, so parse to some other time field, which the date filter will then remove String interimTimestampField = overrides.getTimestampField(); String grokPattern = overrides.getGrokPattern(); @@ -128,8 +160,7 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex } grokPatternCreator.validateFullLineGrokPattern(grokPattern, interimTimestampField); } else { - Tuple timestampFieldAndFullMatchGrokPattern = - grokPatternCreator.findFullLineGrokPattern(interimTimestampField); + Tuple timestampFieldAndFullMatchGrokPattern = grokPatternCreator.findFullLineGrokPattern(interimTimestampField); if (timestampFieldAndFullMatchGrokPattern != null) { interimTimestampField = timestampFieldAndFullMatchGrokPattern.v1(); grokPattern = timestampFieldAndFullMatchGrokPattern.v2(); @@ -137,22 +168,33 @@ static 
TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex if (interimTimestampField == null) { interimTimestampField = "timestamp"; } - grokPattern = grokPatternCreator.createGrokPatternFromExamples(timestampFormatFinder.getGrokPatternName(), - timestampFormatFinder.getEsDateMappingTypeWithFormat(), interimTimestampField); + grokPattern = grokPatternCreator.createGrokPatternFromExamples( + timestampFormatFinder.getGrokPatternName(), + timestampFormatFinder.getEsDateMappingTypeWithFormat(), + interimTimestampField + ); } } boolean needClientTimeZone = timestampFormatFinder.hasTimezoneDependentParsing(); - FileStructure structure = structureBuilder - .setTimestampField(interimTimestampField) + FileStructure structure = structureBuilder.setTimestampField(interimTimestampField) .setJodaTimestampFormats(timestampFormatFinder.getJodaTimestampFormats()) .setJavaTimestampFormats(timestampFormatFinder.getJavaTimestampFormats()) .setNeedClientTimezone(needClientTimeZone) .setGrokPattern(grokPattern) - .setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(grokPattern, customGrokPatternDefinitions, null, - fieldMappings, interimTimestampField, timestampFormatFinder.getJavaTimestampFormats(), needClientTimeZone, - timestampFormatFinder.needNanosecondPrecision())) + .setIngestPipeline( + FileStructureUtils.makeIngestPipelineDefinition( + grokPattern, + customGrokPatternDefinitions, + null, + fieldMappings, + interimTimestampField, + timestampFormatFinder.getJavaTimestampFormats(), + needClientTimeZone, + timestampFormatFinder.needNanosecondPrecision() + ) + ) .setMappings(Collections.singletonMap(FileStructureUtils.MAPPING_PROPERTIES_SETTING, fieldMappings)) .setFieldStats(fieldStats) .setExplanation(explanation) @@ -176,10 +218,20 @@ public FileStructure getStructure() { return structure; } - static TimestampFormatFinder populateTimestampFormatFinder(List explanation, String[] sampleLines, - FileStructureOverrides overrides, TimeoutChecker timeoutChecker) { 
- TimestampFormatFinder timestampFormatFinder = - new TimestampFormatFinder(explanation, overrides.getTimestampFormat(), false, false, false, timeoutChecker); + static TimestampFormatFinder populateTimestampFormatFinder( + List explanation, + String[] sampleLines, + FileStructureOverrides overrides, + TimeoutChecker timeoutChecker + ) { + TimestampFormatFinder timestampFormatFinder = new TimestampFormatFinder( + explanation, + overrides.getTimestampFormat(), + false, + false, + false, + timeoutChecker + ); for (String sampleLine : sampleLines) { timestampFormatFinder.addSample(sampleLine); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/TextLogFileStructureFinderFactory.java similarity index 70% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/TextLogFileStructureFinderFactory.java index 54752d2dc0012..2e33a8eed6620 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java +++ b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/TextLogFileStructureFinderFactory.java @@ -3,9 +3,9 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import java.util.List; import java.util.regex.Pattern; @@ -40,9 +40,23 @@ public boolean canCreateFromSample(List explanation, String sample, doub } @Override - public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - int lineMergeSizeLimit, FileStructureOverrides overrides, TimeoutChecker timeoutChecker) { - return TextLogFileStructureFinder.makeTextLogFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, - lineMergeSizeLimit, overrides, timeoutChecker); + public FileStructureFinder createFromSample( + List explanation, + String sample, + String charsetName, + Boolean hasByteOrderMarker, + int lineMergeSizeLimit, + FileStructureOverrides overrides, + TimeoutChecker timeoutChecker + ) { + return TextLogFileStructureFinder.makeTextLogFileStructureFinder( + explanation, + sample, + charsetName, + hasByteOrderMarker, + lineMergeSizeLimit, + overrides, + timeoutChecker + ); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimeoutChecker.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/TimeoutChecker.java similarity index 96% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimeoutChecker.java rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/TimeoutChecker.java index 99ce19e3ad110..949cbb2ceda85 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimeoutChecker.java +++ 
b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/TimeoutChecker.java @@ -3,7 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import org.elasticsearch.ElasticsearchTimeoutException; import org.elasticsearch.common.unit.TimeValue; @@ -34,7 +34,7 @@ * {@link Grok#captures}) and this would lead to non-uniform exception types and * misleading error messages in the event that the interrupt was handled by one of * these methods. The code in the long running operation would still have to - * periodically call {@link Thread#interrupted}, so it is not much more of an + * periodically call {@link Thread#interrupted}, so it is not much more of an * inconvenience to have to periodically call this class's {@link #check} method. 
*/ public class TimeoutChecker implements Closeable { @@ -85,8 +85,9 @@ public synchronized void close() { */ public void check(String where) { if (timeoutExceeded) { - throw new ElasticsearchTimeoutException("Aborting " + operation + " during [" + where + - "] as it has taken longer than the timeout of [" + timeout + "]"); + throw new ElasticsearchTimeoutException( + "Aborting " + operation + " during [" + where + "] as it has taken longer than the timeout of [" + timeout + "]" + ); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/TimestampFormatFinder.java similarity index 79% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/TimestampFormatFinder.java index ff4a66a06f454..2845d6d82757e 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java +++ b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/TimestampFormatFinder.java @@ -3,7 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -53,8 +53,9 @@ public final class TimestampFormatFinder { private static final Logger logger = LogManager.getLogger(TimestampFormatFinder.class); private static final String PUNCTUATION_THAT_NEEDS_ESCAPING_IN_REGEX = "\\|()[]{}^$.*?"; private static final String FRACTIONAL_SECOND_SEPARATORS = ":.,"; - private static final Pattern FRACTIONAL_SECOND_INTERPRETER = - Pattern.compile("([" + FRACTIONAL_SECOND_SEPARATORS + "])(\\d{3,9})($|[Z+-])"); + private static final Pattern FRACTIONAL_SECOND_INTERPRETER = Pattern.compile( + "([" + FRACTIONAL_SECOND_SEPARATORS + "])(\\d{3,9})($|[Z+-])" + ); private static final char INDETERMINATE_FIELD_PLACEHOLDER = '?'; // The ? characters in this must match INDETERMINATE_FIELD_PLACEHOLDER // above, but they're literals in this regex to aid readability @@ -97,19 +98,39 @@ public final class TimestampFormatFinder { /** * Candidates for the special format strings (ISO8601, UNIX_MS, UNIX and TAI64N) */ - static final CandidateTimestampFormat ISO8601_CANDIDATE_FORMAT = - new CandidateTimestampFormat(CandidateTimestampFormat::iso8601FormatFromExample, - "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", "\\b%{TIMESTAMP_ISO8601}\\b", "TIMESTAMP_ISO8601", - "1111 11 11 11 11", 0, 19); - static final CandidateTimestampFormat UNIX_MS_CANDIDATE_FORMAT = - new CandidateTimestampFormat(example -> Collections.singletonList("UNIX_MS"), "\\b\\d{13}\\b", "\\b[12]\\d{12}\\b", "POSINT", - "1111111111111", 0, 0); - static final CandidateTimestampFormat UNIX_CANDIDATE_FORMAT = - new CandidateTimestampFormat(example -> Collections.singletonList("UNIX"), "\\b\\d{10}\\b", "\\b[12]\\d{9}(?:\\.\\d{3,9})?\\b", - "NUMBER", "1111111111", 0, 10); - static final CandidateTimestampFormat TAI64N_CANDIDATE_FORMAT = - new CandidateTimestampFormat(example -> 
Collections.singletonList("TAI64N"), "\\b[0-9A-Fa-f]{24}\\b", "\\b[0-9A-Fa-f]{24}\\b", - "BASE16NUM"); + static final CandidateTimestampFormat ISO8601_CANDIDATE_FORMAT = new CandidateTimestampFormat( + CandidateTimestampFormat::iso8601FormatFromExample, + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "\\b%{TIMESTAMP_ISO8601}\\b", + "TIMESTAMP_ISO8601", + "1111 11 11 11 11", + 0, + 19 + ); + static final CandidateTimestampFormat UNIX_MS_CANDIDATE_FORMAT = new CandidateTimestampFormat( + example -> Collections.singletonList("UNIX_MS"), + "\\b\\d{13}\\b", + "\\b[12]\\d{12}\\b", + "POSINT", + "1111111111111", + 0, + 0 + ); + static final CandidateTimestampFormat UNIX_CANDIDATE_FORMAT = new CandidateTimestampFormat( + example -> Collections.singletonList("UNIX"), + "\\b\\d{10}\\b", + "\\b[12]\\d{9}(?:\\.\\d{3,9})?\\b", + "NUMBER", + "1111111111", + 0, + 10 + ); + static final CandidateTimestampFormat TAI64N_CANDIDATE_FORMAT = new CandidateTimestampFormat( + example -> Collections.singletonList("TAI64N"), + "\\b[0-9A-Fa-f]{24}\\b", + "\\b[0-9A-Fa-f]{24}\\b", + "BASE16NUM" + ); /** * The first match in this list will be chosen, so it needs to be ordered @@ -119,75 +140,151 @@ public final class TimestampFormatFinder { // The TOMCAT_DATESTAMP format has to come before ISO8601 because it's basically ISO8601 but // with a space before the timezone, and because the timezone is optional in ISO8601 it will // be recognised as that with the timezone missed off if ISO8601 is checked first - new CandidateTimestampFormat(example -> CandidateTimestampFormat.iso8601LikeFormatFromExample(example, " ", " "), + new CandidateTimestampFormat( + example -> CandidateTimestampFormat.iso8601LikeFormatFromExample(example, " ", " "), "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}[:.,]\\d{3}", "\\b20\\d{2}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9} (?:Z|[+-]%{HOUR}%{MINUTE})\\b", - "TOMCAT_DATESTAMP", "1111 11 11 11 11 11 111", 0, 13), + "TOMCAT_DATESTAMP", + 
"1111 11 11 11 11 11 111", + 0, + 13 + ), ISO8601_CANDIDATE_FORMAT, new CandidateTimestampFormat( example -> Arrays.asList("EEE MMM dd yy HH:mm:ss zzz", "EEE MMM d yy HH:mm:ss zzz"), "\\b[A-Z]\\S{2} [A-Z]\\S{2} \\d{1,2} \\d{2} \\d{2}:\\d{2}:\\d{2}\\b", - "\\b%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}(?::(?:[0-5][0-9]|60)) %{TZ}\\b", "DATESTAMP_RFC822", - Arrays.asList(" 11 11 11 11 11", " 1 11 11 11 11"), 0, 5), + "\\b%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}(?::(?:[0-5][0-9]|60)) %{TZ}\\b", + "DATESTAMP_RFC822", + Arrays.asList(" 11 11 11 11 11", " 1 11 11 11 11"), + 0, + 5 + ), new CandidateTimestampFormat( example -> CandidateTimestampFormat.adjustTrailingTimezoneFromExample(example, "EEE, dd MMM yyyy HH:mm:ss XX"), "\\b[A-Z]\\S{2}, \\d{1,2} [A-Z]\\S{2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE}(?::(?:[0-5][0-9]|60)) (?:Z|[+-]%{HOUR}:?%{MINUTE})\\b", - "DATESTAMP_RFC2822", Arrays.asList(" 11 1111 11 11 11", " 1 1111 11 11 11"), 0, 7), + "DATESTAMP_RFC2822", + Arrays.asList(" 11 1111 11 11 11", " 1 1111 11 11 11"), + 0, + 7 + ), new CandidateTimestampFormat( example -> Arrays.asList("EEE MMM dd HH:mm:ss zzz yyyy", "EEE MMM d HH:mm:ss zzz yyyy"), "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", - "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}(?::(?:[0-5][0-9]|60)) %{TZ} %{YEAR}\\b", "DATESTAMP_OTHER", - Arrays.asList(" 11 11 11 11", " 1 11 11 11"), 12, 10), - new CandidateTimestampFormat(example -> Collections.singletonList("yyyyMMddHHmmss"), "\\b\\d{14}\\b", + "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}(?::(?:[0-5][0-9]|60)) %{TZ} %{YEAR}\\b", + "DATESTAMP_OTHER", + Arrays.asList(" 11 11 11 11", " 1 11 11 11"), + 12, + 10 + ), + new CandidateTimestampFormat( + example -> Collections.singletonList("yyyyMMddHHmmss"), + "\\b\\d{14}\\b", "\\b20\\d{2}%{MONTHNUM2}(?:(?:0[1-9])|(?:[12][0-9])|(?:3[01]))(?:2[0123]|[01][0-9])%{MINUTE}(?:[0-5][0-9]|60)\\b", - 
"DATESTAMP_EVENTLOG", "11111111111111", 0, 0), - new CandidateTimestampFormat(example -> Collections.singletonList("EEE MMM dd HH:mm:ss yyyy"), + "DATESTAMP_EVENTLOG", + "11111111111111", + 0, + 0 + ), + new CandidateTimestampFormat( + example -> Collections.singletonList("EEE MMM dd HH:mm:ss yyyy"), "\\b[A-Z]\\S{2} [A-Z]\\S{2} \\d{2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b", - "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{YEAR}\\b", "HTTPDERROR_DATE", - " 11 11 11 11 1111", 0, 0), + "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{YEAR}\\b", + "HTTPDERROR_DATE", + " 11 11 11 11 1111", + 0, + 0 + ), new CandidateTimestampFormat( example -> CandidateTimestampFormat.expandDayAndAdjustFractionalSecondsFromExample(example, "MMM dd HH:mm:ss"), "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", - "%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)(?:[:.,][0-9]{3,9})?\\b", "SYSLOGTIMESTAMP", - Arrays.asList(" 11 11 11 11", " 1 11 11 11"), 6, 10), - new CandidateTimestampFormat(example -> Collections.singletonList("dd/MMM/yyyy:HH:mm:ss XX"), + "%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)(?:[:.,][0-9]{3,9})?\\b", + "SYSLOGTIMESTAMP", + Arrays.asList(" 11 11 11 11", " 1 11 11 11"), + 6, + 10 + ), + new CandidateTimestampFormat( + example -> Collections.singletonList("dd/MMM/yyyy:HH:mm:ss XX"), "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", - "\\b%{MONTHDAY}/%{MONTH}/%{YEAR}:%{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) [+-]?%{HOUR}%{MINUTE}\\b", "HTTPDATE", - "11 1111 11 11 11", 0, 6), - new CandidateTimestampFormat(example -> Collections.singletonList("MMM dd, yyyy h:mm:ss a"), + "\\b%{MONTHDAY}/%{MONTH}/%{YEAR}:%{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) [+-]?%{HOUR}%{MINUTE}\\b", + "HTTPDATE", + "11 1111 11 11 11", + 0, + 6 + ), + new CandidateTimestampFormat( + example -> Collections.singletonList("MMM dd, yyyy h:mm:ss a"), "\\b[A-Z]\\S{2} \\d{2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", - "%{MONTH} %{MONTHDAY}, 
20\\d{2} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:AM|PM)\\b", "CATALINA_DATESTAMP", - Arrays.asList(" 11 1111 1 11 11", " 11 1111 11 11 11"), 0, 3), - new CandidateTimestampFormat(example -> Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), + "%{MONTH} %{MONTHDAY}, 20\\d{2} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:AM|PM)\\b", + "CATALINA_DATESTAMP", + Arrays.asList(" 11 1111 1 11 11", " 11 1111 11 11 11"), + 0, + 3 + ), + new CandidateTimestampFormat( + example -> Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), "\\b[A-Z]\\S{2} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", - "%{MONTH} +%{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b", "CISCOTIMESTAMP", - Arrays.asList(" 11 1111 11 11 11", " 1 1111 11 11 11"), 1, 0), - new CandidateTimestampFormat(CandidateTimestampFormat::indeterminateDayMonthFormatFromExample, - "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "\\b%{DATESTAMP}\\b", "DATESTAMP", + "%{MONTH} +%{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b", + "CISCOTIMESTAMP", + Arrays.asList(" 11 1111 11 11 11", " 1 1111 11 11 11"), + 1, + 0 + ), + new CandidateTimestampFormat( + CandidateTimestampFormat::indeterminateDayMonthFormatFromExample, + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", + "\\b%{DATESTAMP}\\b", + "DATESTAMP", // In DATESTAMP the month may be 1 or 2 digits, the year 2 or 4, but the day must be 2 // Also note the Grok pattern search space is set to start one character before a quick rule-out // match because we don't want 11 11 11 matching into 1111 11 11 with this pattern - Arrays.asList("11 11 1111 11 11 11", "1 11 1111 11 11 11", "11 1 1111 11 11 11", "11 11 11 11 11 11", "1 11 11 11 11 11", - "11 1 11 11 11 11"), 1, 10), - new CandidateTimestampFormat(CandidateTimestampFormat::indeterminateDayMonthFormatFromExample, - "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}\\b", "\\b%{DATE}\\b", "DATE", + 
Arrays.asList( + "11 11 1111 11 11 11", + "1 11 1111 11 11 11", + "11 1 1111 11 11 11", + "11 11 11 11 11 11", + "1 11 11 11 11 11", + "11 1 11 11 11 11" + ), + 1, + 10 + ), + new CandidateTimestampFormat( + CandidateTimestampFormat::indeterminateDayMonthFormatFromExample, + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}\\b", + "\\b%{DATE}\\b", + "DATE", // In DATE the month may be 1 or 2 digits, the year 2 or 4, but the day must be 2 // Also note the Grok pattern search space is set to start one character before a quick rule-out // match because we don't want 11 11 11 matching into 1111 11 11 with this pattern - Arrays.asList("11 11 1111", "11 1 1111", "1 11 1111", "11 11 11", "11 1 11", "1 11 11"), 1, 0), + Arrays.asList("11 11 1111", "11 1 1111", "1 11 1111", "11 11 11", "11 1 11", "1 11 11"), + 1, + 0 + ), UNIX_MS_CANDIDATE_FORMAT, UNIX_CANDIDATE_FORMAT, TAI64N_CANDIDATE_FORMAT, // This one is an ISO8601 date with no time, but the TIMESTAMP_ISO8601 Grok pattern doesn't cover it - new CandidateTimestampFormat(example -> Collections.singletonList("ISO8601"), - "\\b\\d{4}-\\d{2}-\\d{2}\\b", "\\b%{YEAR}-%{MONTHNUM2}-%{MONTHDAY}\\b", CUSTOM_TIMESTAMP_GROK_NAME, - "1111 11 11", 0, 0), + new CandidateTimestampFormat( + example -> Collections.singletonList("ISO8601"), + "\\b\\d{4}-\\d{2}-\\d{2}\\b", + "\\b%{YEAR}-%{MONTHNUM2}-%{MONTHDAY}\\b", + CUSTOM_TIMESTAMP_GROK_NAME, + "1111 11 11", + 0, + 0 + ), // The Kibana export format - new CandidateTimestampFormat(example -> Collections.singletonList("MMM dd, yyyy @ HH:mm:ss.SSS"), + new CandidateTimestampFormat( + example -> Collections.singletonList("MMM dd, yyyy @ HH:mm:ss.SSS"), "\\b[A-Z]\\S{2} \\d{2}, \\d{4} @ \\d{2}:\\d{2}:\\d{2}\\.\\d{3}\\b", - "\\b%{MONTH} %{MONTHDAY}, %{YEAR} @ %{HOUR}:%{MINUTE}:%{SECOND}\\b", CUSTOM_TIMESTAMP_GROK_NAME, - " 11 1111 11 11 11 111", 0, 0) + "\\b%{MONTH} %{MONTHDAY}, %{YEAR} @ %{HOUR}:%{MINUTE}:%{SECOND}\\b", + CUSTOM_TIMESTAMP_GROK_NAME, + " 11 1111 11 11 11 111", + 0, + 0 + ) ); /** 
@@ -215,8 +312,13 @@ public final class TimestampFormatFinder { * @param errorOnMultiplePatterns Should an exception be thrown if samples are uploaded that require different Grok patterns? * @param timeoutChecker Will abort the operation if its timeout is exceeded. */ - public TimestampFormatFinder(List explanation, boolean requireFullMatch, boolean errorOnNoTimestamp, - boolean errorOnMultiplePatterns, TimeoutChecker timeoutChecker) { + public TimestampFormatFinder( + List explanation, + boolean requireFullMatch, + boolean errorOnNoTimestamp, + boolean errorOnMultiplePatterns, + TimeoutChecker timeoutChecker + ) { this(explanation, null, requireFullMatch, errorOnNoTimestamp, errorOnMultiplePatterns, timeoutChecker); } @@ -233,8 +335,14 @@ public TimestampFormatFinder(List explanation, boolean requireFullMatch, * @param errorOnMultiplePatterns Should an exception be thrown if samples are uploaded that require different Grok patterns? * @param timeoutChecker Will abort the operation if its timeout is exceeded. 
*/ - public TimestampFormatFinder(List explanation, @Nullable String overrideFormat, boolean requireFullMatch, - boolean errorOnNoTimestamp, boolean errorOnMultiplePatterns, TimeoutChecker timeoutChecker) { + public TimestampFormatFinder( + List explanation, + @Nullable String overrideFormat, + boolean requireFullMatch, + boolean errorOnNoTimestamp, + boolean errorOnMultiplePatterns, + TimeoutChecker timeoutChecker + ) { this.explanation = Objects.requireNonNull(explanation); this.requireFullMatch = requireFullMatch; this.errorOnNoTimestamp = errorOnNoTimestamp; @@ -259,8 +367,9 @@ static Tuple overrideFormatToGrokAndRegex(String overrideFormat) } if (overrideFormat.indexOf(INDETERMINATE_FIELD_PLACEHOLDER) >= 0) { - throw new IllegalArgumentException("Timestamp format [" + overrideFormat + "] not supported because it contains [" - + INDETERMINATE_FIELD_PLACEHOLDER + "]"); + throw new IllegalArgumentException( + "Timestamp format [" + overrideFormat + "] not supported because it contains [" + INDETERMINATE_FIELD_PLACEHOLDER + "]" + ); } StringBuilder grokPatternBuilder = new StringBuilder(); @@ -286,8 +395,10 @@ static Tuple overrideFormatToGrokAndRegex(String overrideFormat) Tuple grokPatternAndRegexForGroup = VALID_LETTER_GROUPS.get(letterGroup); if (grokPatternAndRegexForGroup == null) { // Special case of fractional seconds - if (curChar != 'S' || FRACTIONAL_SECOND_SEPARATORS.indexOf(prevChar) == -1 || - "ss".equals(prevLetterGroup) == false || endPos - startPos > 9) { + if (curChar != 'S' + || FRACTIONAL_SECOND_SEPARATORS.indexOf(prevChar) == -1 + || "ss".equals(prevLetterGroup) == false + || endPos - startPos > 9) { String msg = "Letter group [" + letterGroup + "] in [" + overrideFormat + "] is not supported"; if (curChar == 'S') { msg += " because it is not preceded by [ss] and a separator from [" + FRACTIONAL_SECOND_SEPARATORS + "]"; @@ -367,26 +478,35 @@ static CandidateTimestampFormat makeCandidateFromOverrideFormat(String overrideF TimestampMatch match = 
checkCandidate(candidate, generatedTimestamp, null, true, timeoutChecker); if (match != null) { - return new CandidateTimestampFormat(example -> { - - // Modify the built-in candidate so it prefers to return the user supplied format - // if at all possible, and only falls back to standard logic for other situations - try { - // TODO consider support for overriding the locale too - // But since Grok only supports English and German date words ingest - // via Grok will fall down at an earlier stage for other languages... - javaTimeFormatter.parse(example); - return Collections.singletonList(overrideFormat); - } catch (DateTimeException e) { - return candidate.javaTimestampFormatSupplier.apply(example); - } - }, candidate.simplePattern.pattern(), candidate.strictGrokPattern, candidate.outputGrokPatternName); + return new CandidateTimestampFormat( + example -> { + + // Modify the built-in candidate so it prefers to return the user supplied format + // if at all possible, and only falls back to standard logic for other situations + try { + // TODO consider support for overriding the locale too + // But since Grok only supports English and German date words ingest + // via Grok will fall down at an earlier stage for other languages... 
+ javaTimeFormatter.parse(example); + return Collections.singletonList(overrideFormat); + } catch (DateTimeException e) { + return candidate.javaTimestampFormatSupplier.apply(example); + } + }, + candidate.simplePattern.pattern(), + candidate.strictGrokPattern, + candidate.outputGrokPatternName + ); } } // None of the out-of-the-box formats were close, so use the built Grok pattern and simple regex for the override - return new CandidateTimestampFormat(example -> Collections.singletonList(overrideFormat), - grokPatternAndRegex.v2(), grokPatternAndRegex.v1(), CUSTOM_TIMESTAMP_GROK_NAME); + return new CandidateTimestampFormat( + example -> Collections.singletonList(overrideFormat), + grokPatternAndRegex.v2(), + grokPatternAndRegex.v1(), + CUSTOM_TIMESTAMP_GROK_NAME + ); } /** @@ -399,11 +519,19 @@ static CandidateTimestampFormat makeCandidateFromOverrideFormat(String overrideF * @param timeoutChecker Will abort the operation if its timeout is exceeded. * @return The timestamp format, or null if none matches. 
*/ - private static TimestampMatch checkCandidate(CandidateTimestampFormat candidate, String text, @Nullable BitSet numberPosBitSet, - boolean requireFullMatch, TimeoutChecker timeoutChecker) { + private static TimestampMatch checkCandidate( + CandidateTimestampFormat candidate, + String text, + @Nullable BitSet numberPosBitSet, + boolean requireFullMatch, + TimeoutChecker timeoutChecker + ) { if (requireFullMatch) { - Map captures = timeoutChecker.grokCaptures(candidate.strictFullMatchGrok, text, - "timestamp format determination"); + Map captures = timeoutChecker.grokCaptures( + candidate.strictFullMatchGrok, + text, + "timestamp format determination" + ); if (captures != null) { return new TimestampMatch(candidate, "", text, ""); } @@ -415,8 +543,11 @@ private static TimestampMatch checkCandidate(CandidateTimestampFormat candidate, if (boundsForCandidate.v1() >= 0) { assert boundsForCandidate.v2() > boundsForCandidate.v1(); String matchIn = text.substring(boundsForCandidate.v1(), Math.min(boundsForCandidate.v2(), text.length())); - Map captures = timeoutChecker.grokCaptures(candidate.strictSearchGrok, matchIn, - "timestamp format determination"); + Map captures = timeoutChecker.grokCaptures( + candidate.strictSearchGrok, + matchIn, + "timestamp format determination" + ); if (captures != null) { StringBuilder prefaceBuilder = new StringBuilder(); if (boundsForCandidate.v1() > 0) { @@ -428,8 +559,12 @@ private static TimestampMatch checkCandidate(CandidateTimestampFormat candidate, if (boundsForCandidate.v2() < text.length()) { epilogueBuilder.append(text.subSequence(boundsForCandidate.v2(), text.length())); } - return new TimestampMatch(candidate, prefaceBuilder.toString(), text.substring(prefaceBuilder.length(), - text.length() - epilogueBuilder.length()), epilogueBuilder.toString()); + return new TimestampMatch( + candidate, + prefaceBuilder.toString(), + text.substring(prefaceBuilder.length(), text.length() - epilogueBuilder.length()), + 
epilogueBuilder.toString() + ); } } else { timeoutChecker.check("timestamp format determination"); @@ -472,8 +607,9 @@ public void addSample(String text) { } if (mustAdd) { if (errorOnMultiplePatterns && matchedFormats.isEmpty() == false) { - throw new IllegalArgumentException("Multiple timestamp formats found [" - + matchedFormats.get(0) + "] and [" + newFormat + "]"); + throw new IllegalArgumentException( + "Multiple timestamp formats found [" + matchedFormats.get(0) + "] and [" + newFormat + "]" + ); } matchedFormats.add(newFormat); } @@ -657,8 +793,10 @@ public List getPrefaces() { assert errorOnNoTimestamp == false; return Collections.emptyList(); } - return matches.stream().filter(match -> matchedFormats.size() < 2 || matchedFormats.get(0).canMergeWith(match.timestampFormat)) - .map(match -> match.preface).collect(Collectors.toList()); + return matches.stream() + .filter(match -> matchedFormats.size() < 2 || matchedFormats.get(0).canMergeWith(match.timestampFormat)) + .map(match -> match.preface) + .collect(Collectors.toList()); } /** @@ -697,11 +835,13 @@ public List getJavaTimestampFormats() { if (cachedJavaTimestampFormats != null) { return cachedJavaTimestampFormats; } - return determiniseJavaTimestampFormats(getRawJavaTimestampFormats(), + return determiniseJavaTimestampFormats( + getRawJavaTimestampFormats(), // With multiple formats, only consider the matches that correspond to the first // in the list (which is what we're returning information about via the getters). // With just one format it's most efficient not to bother checking formats. - (matchedFormats.size() > 1) ? matchedFormats.get(0) : null); + (matchedFormats.size() > 1) ? 
matchedFormats.get(0) : null + ); } /** @@ -714,7 +854,8 @@ public boolean needNanosecondPrecision() { assert errorOnNoTimestamp == false; return false; } - return matches.stream().filter(match -> matchedFormats.size() < 2 || matchedFormats.get(0).canMergeWith(match.timestampFormat)) + return matches.stream() + .filter(match -> matchedFormats.size() < 2 || matchedFormats.get(0).canMergeWith(match.timestampFormat)) .anyMatch(match -> match.hasNanosecondPrecision); } @@ -723,15 +864,18 @@ public boolean needNanosecondPrecision() { * return the corresponding pattern with the placeholders replaced with concrete * day/month formats. */ - private List determiniseJavaTimestampFormats(List rawJavaTimestampFormats, - @Nullable TimestampFormat onlyConsiderFormat) { + private List determiniseJavaTimestampFormats( + List rawJavaTimestampFormats, + @Nullable TimestampFormat onlyConsiderFormat + ) { // This method needs rework if the class is ever made thread safe if (rawJavaTimestampFormats.stream().anyMatch(format -> format.indexOf(INDETERMINATE_FIELD_PLACEHOLDER) >= 0)) { boolean isDayFirst = guessIsDayFirst(rawJavaTimestampFormats, onlyConsiderFormat, Locale.getDefault()); cachedJavaTimestampFormats = rawJavaTimestampFormats.stream() - .map(format -> determiniseJavaTimestampFormat(format, isDayFirst)).collect(Collectors.toList()); + .map(format -> determiniseJavaTimestampFormat(format, isDayFirst)) + .collect(Collectors.toList()); } else { cachedJavaTimestampFormats = rawJavaTimestampFormats; } @@ -743,8 +887,11 @@ private List determiniseJavaTimestampFormats(List rawJavaTimesta * or MM/dd/yyyy for example), make a guess about whether the day comes first. * @return true if the day comes first and false if the month comes first. 
*/ - private boolean guessIsDayFirst(List rawJavaTimestampFormats, @Nullable TimestampFormat onlyConsiderFormat, - Locale localeForFallback) { + private boolean guessIsDayFirst( + List rawJavaTimestampFormats, + @Nullable TimestampFormat onlyConsiderFormat, + Locale localeForFallback + ) { Boolean isDayFirst = guessIsDayFirstFromFormats(rawJavaTimestampFormats); if (isDayFirst != null) { @@ -796,11 +943,15 @@ Boolean guessIsDayFirstFromFormats(List rawJavaTimestampFormats) { if (isDayFirst != null) { if (isDayFirst) { - explanation.add("Guessing day precedes month in timestamps as all detected formats have a two digits in the first number " - + "and a single digit in the second number which is what the %{MONTHDAY} and %{MONTHNUM} Grok patterns permit"); + explanation.add( + "Guessing day precedes month in timestamps as all detected formats have a two digits in the first number " + + "and a single digit in the second number which is what the %{MONTHDAY} and %{MONTHNUM} Grok patterns permit" + ); } else { - explanation.add("Guessing month precedes day in timestamps as all detected formats have a single digit in the first number " - + "and two digits in the second number which is what the %{MONTHNUM} and %{MONTHDAY} Grok patterns permit"); + explanation.add( + "Guessing month precedes day in timestamps as all detected formats have a single digit in the first number " + + "and two digits in the second number which is what the %{MONTHNUM} and %{MONTHDAY} Grok patterns permit" + ); } } @@ -829,8 +980,11 @@ Boolean guessIsDayFirstFromMatches(@Nullable TimestampFormat onlyConsiderFormat) if (match.firstIndeterminateDateNumber > 0) { assert match.firstIndeterminateDateNumber <= 31; if (match.firstIndeterminateDateNumber > 12) { - explanation.add("Guessing day precedes month in timestamps as one sample had first number [" - + match.firstIndeterminateDateNumber + "]"); + explanation.add( + "Guessing day precedes month in timestamps as one sample had first number [" + + 
match.firstIndeterminateDateNumber + + "]" + ); return Boolean.TRUE; } firstIndeterminateNumbers.set(match.firstIndeterminateDateNumber); @@ -838,8 +992,11 @@ Boolean guessIsDayFirstFromMatches(@Nullable TimestampFormat onlyConsiderFormat) if (match.secondIndeterminateDateNumber > 0) { assert match.secondIndeterminateDateNumber <= 31; if (match.secondIndeterminateDateNumber > 12) { - explanation.add("Guessing month precedes day in timestamps as one sample had second number [" - + match.secondIndeterminateDateNumber + "]"); + explanation.add( + "Guessing month precedes day in timestamps as one sample had second number [" + + match.secondIndeterminateDateNumber + + "]" + ); return Boolean.FALSE; } secondIndeterminateNumbers.set(match.secondIndeterminateDateNumber); @@ -860,14 +1017,27 @@ Boolean guessIsDayFirstFromMatches(@Nullable TimestampFormat onlyConsiderFormat) // firstCardinality can be 0, but then secondCardinality should have been 0 too assert firstCardinality > 0; if (firstCardinality >= ratioForResult * secondCardinality) { - explanation.add("Guessing day precedes month in timestamps as there were [" - + firstCardinality + "] distinct values of the first number but only [" + secondCardinality + "] for the second"); + explanation.add( + "Guessing day precedes month in timestamps as there were [" + + firstCardinality + + "] distinct values of the first number but only [" + + secondCardinality + + "] for the second" + ); return Boolean.TRUE; } if (secondCardinality >= ratioForResult * firstCardinality) { - explanation.add("Guessing month precedes day in timestamps as there " + (firstCardinality == 1 ? "was" : "were") + " only [" - + firstCardinality + "] distinct " + (firstCardinality == 1 ? "value" : "values") - + " of the first number but [" + secondCardinality + "] for the second"); + explanation.add( + "Guessing month precedes day in timestamps as there " + + (firstCardinality == 1 ? 
"was" : "were") + + " only [" + + firstCardinality + + "] distinct " + + (firstCardinality == 1 ? "value" : "values") + + " of the first number but [" + + secondCardinality + + "] for the second" + ); return Boolean.FALSE; } @@ -886,12 +1056,14 @@ boolean guessIsDayFirstFromLocale(Locale locale) { // Can't use 1 as that occurs in 1970, so 3rd Feb is the earliest date that will reveal the server default. String feb3rd1970 = makeShortLocalizedDateTimeFormatterForLocale(locale).format(LocalDate.ofEpochDay(33)); if (feb3rd1970.indexOf('3') < feb3rd1970.indexOf('2')) { - explanation.add("Guessing day precedes month in timestamps based on server locale [" - + locale.getDisplayName(Locale.ROOT) + "]"); + explanation.add( + "Guessing day precedes month in timestamps based on server locale [" + locale.getDisplayName(Locale.ROOT) + "]" + ); return true; } else { - explanation.add("Guessing month precedes day in timestamps based on server locale [" - + locale.getDisplayName(Locale.ROOT) + "]"); + explanation.add( + "Guessing month precedes day in timestamps based on server locale [" + locale.getDisplayName(Locale.ROOT) + "]" + ); return false; } } @@ -945,8 +1117,11 @@ static String determiniseJavaTimestampFormat(String rawJavaTimestampFormat, bool */ public List getJodaTimestampFormats() { List javaTimestampFormats = getJavaTimestampFormats(); - return (javaTimestampFormats == null) ? null : javaTimestampFormats.stream() - .map(format -> format.replace("yy", "YY").replace("XXX", "ZZ").replace("XX", "Z")).collect(Collectors.toList()); + return (javaTimestampFormats == null) + ? 
null + : javaTimestampFormats.stream() + .map(format -> format.replace("yy", "YY").replace("XXX", "ZZ").replace("XX", "Z")) + .collect(Collectors.toList()); } /** @@ -959,7 +1134,8 @@ public boolean hasTimezoneDependentParsing() { assert errorOnNoTimestamp == false; return false; } - return matches.stream().filter(match -> matchedFormats.size() < 2 || matchedFormats.get(0).canMergeWith(match.timestampFormat)) + return matches.stream() + .filter(match -> matchedFormats.size() < 2 || matchedFormats.get(0).canMergeWith(match.timestampFormat)) .anyMatch(match -> match.hasTimezoneDependentParsing); } @@ -1043,8 +1219,9 @@ static Tuple findBoundsForCandidate(CandidateTimestampFormat c return new Tuple<>(-1, -1); } int lowerBound = Math.max(0, minFirstMatchStart - candidate.maxCharsBeforeQuickRuleOutMatch); - int upperBound = (Integer.MAX_VALUE - candidate.maxCharsAfterQuickRuleOutMatch - maxLastMatchEnd < 0) ? - Integer.MAX_VALUE : (maxLastMatchEnd + candidate.maxCharsAfterQuickRuleOutMatch); + int upperBound = (Integer.MAX_VALUE - candidate.maxCharsAfterQuickRuleOutMatch - maxLastMatchEnd < 0) + ? Integer.MAX_VALUE + : (maxLastMatchEnd + candidate.maxCharsAfterQuickRuleOutMatch); return new Tuple<>(lowerBound, upperBound); } @@ -1063,8 +1240,8 @@ static int findBitPattern(BitSet findIn, int beginIndex, BitSet toFind) { assert beginIndex >= 0; // Note that this only compares up to the highest bit that is set, so trailing non digit characters will not participate - // in the comparison. This is not currently a problem for this class, but is something to consider if this functionality - // is ever reused elsewhere. The solution would be to use a wrapper class containing a BitSet and a separate int to store + // in the comparison. This is not currently a problem for this class, but is something to consider if this functionality + // is ever reused elsewhere. 
The solution would be to use a wrapper class containing a BitSet and a separate int to store // the length to compare. int toFindLength = toFind.length(); int findInLength = findIn.length(); @@ -1155,8 +1332,13 @@ static final class TimestampFormat { */ final String prefacePunctuation; - TimestampFormat(List rawJavaTimestampFormats, Pattern simplePattern, String grokPatternName, - Map customGrokPatternDefinitions, String prefacePunctuation) { + TimestampFormat( + List rawJavaTimestampFormats, + Pattern simplePattern, + String grokPatternName, + Map customGrokPatternDefinitions, + String prefacePunctuation + ) { this.rawJavaTimestampFormats = Collections.unmodifiableList(rawJavaTimestampFormats); this.simplePattern = Objects.requireNonNull(simplePattern); this.grokPatternName = Objects.requireNonNull(grokPatternName); @@ -1170,11 +1352,11 @@ boolean canMergeWith(TimestampFormat other) { return true; } - return other != null && - this.simplePattern.pattern().equals(other.simplePattern.pattern()) && - this.grokPatternName.equals(other.grokPatternName) && - Objects.equals(this.customGrokPatternDefinitions, other.customGrokPatternDefinitions) && - this.prefacePunctuation.equals(other.prefacePunctuation); + return other != null + && this.simplePattern.pattern().equals(other.simplePattern.pattern()) + && this.grokPatternName.equals(other.grokPatternName) + && Objects.equals(this.customGrokPatternDefinitions, other.customGrokPatternDefinitions) + && this.prefacePunctuation.equals(other.prefacePunctuation); } TimestampFormat mergeWith(TimestampFormat other) { @@ -1184,8 +1366,13 @@ TimestampFormat mergeWith(TimestampFormat other) { // Do the merge like this to preserve ordering Set mergedJavaTimestampFormats = new LinkedHashSet<>(rawJavaTimestampFormats); if (mergedJavaTimestampFormats.addAll(other.rawJavaTimestampFormats)) { - return new TimestampFormat(new ArrayList<>(mergedJavaTimestampFormats), simplePattern, grokPatternName, - customGrokPatternDefinitions, 
prefacePunctuation); + return new TimestampFormat( + new ArrayList<>(mergedJavaTimestampFormats), + simplePattern, + grokPatternName, + customGrokPatternDefinitions, + prefacePunctuation + ); } } // The merged format is exactly the same as this format, so there's no need to create a new object @@ -1197,8 +1384,13 @@ TimestampFormat mergeWith(TimestampFormat other) { @Override public int hashCode() { - return Objects.hash(rawJavaTimestampFormats, simplePattern.pattern(), grokPatternName, customGrokPatternDefinitions, - prefacePunctuation); + return Objects.hash( + rawJavaTimestampFormats, + simplePattern.pattern(), + grokPatternName, + customGrokPatternDefinitions, + prefacePunctuation + ); } @Override @@ -1211,19 +1403,26 @@ public boolean equals(Object other) { } TimestampFormat that = (TimestampFormat) other; - return Objects.equals(this.rawJavaTimestampFormats, that.rawJavaTimestampFormats) && - Objects.equals(this.simplePattern.pattern(), that.simplePattern.pattern()) && - Objects.equals(this.grokPatternName, that.grokPatternName) && - Objects.equals(this.customGrokPatternDefinitions, that.customGrokPatternDefinitions) && - Objects.equals(this.prefacePunctuation, that.prefacePunctuation); + return Objects.equals(this.rawJavaTimestampFormats, that.rawJavaTimestampFormats) + && Objects.equals(this.simplePattern.pattern(), that.simplePattern.pattern()) + && Objects.equals(this.grokPatternName, that.grokPatternName) + && Objects.equals(this.customGrokPatternDefinitions, that.customGrokPatternDefinitions) + && Objects.equals(this.prefacePunctuation, that.prefacePunctuation); } @Override public String toString() { - return "Java timestamp formats = " + rawJavaTimestampFormats.stream().collect(Collectors.joining("', '", "[ '", "' ]")) - + ", simple pattern = '" + simplePattern.pattern() + "', grok pattern = '" + grokPatternName + "'" + return "Java timestamp formats = " + + rawJavaTimestampFormats.stream().collect(Collectors.joining("', '", "[ '", "' ]")) + + ", 
simple pattern = '" + + simplePattern.pattern() + + "', grok pattern = '" + + grokPatternName + + "'" + (customGrokPatternDefinitions.isEmpty() ? "" : ", custom grok pattern definitions = " + customGrokPatternDefinitions) - + ", preface punctuation = '" + prefacePunctuation + "'"; + + ", preface punctuation = '" + + prefacePunctuation + + "'"; } } @@ -1232,7 +1431,7 @@ public String toString() { */ static final class TimestampMatch { - // This picks out punctuation that is likely to represent a field separator. It deliberately + // This picks out punctuation that is likely to represent a field separator. It deliberately // leaves out punctuation that's most likely to vary between field values, such as dots. private static final Pattern NON_PUNCTUATION_PATTERN = Pattern.compile("[^\\\\/|~:;,<>()\\[\\]{}«»\t]+"); @@ -1266,17 +1465,21 @@ static final class TimestampMatch { TimestampMatch(CandidateTimestampFormat chosenTimestampFormat, String preface, String matchedDate, String epilogue) { this.preface = Objects.requireNonNull(preface); - this.timestampFormat = new TimestampFormat(chosenTimestampFormat.javaTimestampFormatSupplier.apply(matchedDate), - chosenTimestampFormat.simplePattern, chosenTimestampFormat.outputGrokPatternName, + this.timestampFormat = new TimestampFormat( + chosenTimestampFormat.javaTimestampFormatSupplier.apply(matchedDate), + chosenTimestampFormat.simplePattern, + chosenTimestampFormat.outputGrokPatternName, chosenTimestampFormat.customGrokPatternDefinitions(), - preface.isEmpty() ? preface : NON_PUNCTUATION_PATTERN.matcher(preface).replaceAll("")); + preface.isEmpty() ? 
preface : NON_PUNCTUATION_PATTERN.matcher(preface).replaceAll("") + ); int[] indeterminateDateNumbers = parseIndeterminateDateNumbers(matchedDate, timestampFormat.rawJavaTimestampFormats); this.firstIndeterminateDateNumber = indeterminateDateNumbers[0]; this.secondIndeterminateDateNumber = indeterminateDateNumbers[1]; - this.hasTimezoneDependentParsing = requiresTimezoneDependentParsing(timestampFormat.rawJavaTimestampFormats.get(0), - matchedDate); - this.hasNanosecondPrecision = matchHasNanosecondPrecision(timestampFormat.rawJavaTimestampFormats.get(0), - matchedDate); + this.hasTimezoneDependentParsing = requiresTimezoneDependentParsing( + timestampFormat.rawJavaTimestampFormats.get(0), + matchedDate + ); + this.hasNanosecondPrecision = matchHasNanosecondPrecision(timestampFormat.rawJavaTimestampFormats.get(0), matchedDate); this.epilogue = Objects.requireNonNull(epilogue); } @@ -1331,7 +1534,7 @@ static boolean matchHasNanosecondPrecision(String format, String matchedDate) { if (curChar == '\'') { // Literal single quotes are escaped by using two consecutive single quotes. // Technically this code does the wrong thing in this case, as it flips quoting - // from off to on or on to off and then back. However, since by definition there + // from off to on or on to off and then back. However, since by definition there // is nothing in between the consecutive single quotes in this case, the net // effect is correct and good enough for what this method is doing. 
notQuoted = !notQuoted; @@ -1395,8 +1598,14 @@ static int[] parseIndeterminateDateNumbers(String matchedDate, List rawJ @Override public int hashCode() { - return Objects.hash(preface, timestampFormat, firstIndeterminateDateNumber, secondIndeterminateDateNumber, - hasTimezoneDependentParsing, epilogue); + return Objects.hash( + preface, + timestampFormat, + firstIndeterminateDateNumber, + secondIndeterminateDateNumber, + hasTimezoneDependentParsing, + epilogue + ); } @Override @@ -1409,21 +1618,23 @@ public boolean equals(Object other) { } TimestampMatch that = (TimestampMatch) other; - return Objects.equals(this.preface, that.preface) && - Objects.equals(this.timestampFormat, that.timestampFormat) && - this.firstIndeterminateDateNumber == that.firstIndeterminateDateNumber && - this.secondIndeterminateDateNumber == that.secondIndeterminateDateNumber && - this.hasTimezoneDependentParsing == that.hasTimezoneDependentParsing && - Objects.equals(this.epilogue, that.epilogue); + return Objects.equals(this.preface, that.preface) + && Objects.equals(this.timestampFormat, that.timestampFormat) + && this.firstIndeterminateDateNumber == that.firstIndeterminateDateNumber + && this.secondIndeterminateDateNumber == that.secondIndeterminateDateNumber + && this.hasTimezoneDependentParsing == that.hasTimezoneDependentParsing + && Objects.equals(this.epilogue, that.epilogue); } @Override public String toString() { - return (preface.isEmpty() ? "" : "preface = '" + preface + "', ") + timestampFormat + return (preface.isEmpty() ? "" : "preface = '" + preface + "', ") + + timestampFormat + ((firstIndeterminateDateNumber > 0 || secondIndeterminateDateNumber > 0) ? ", indeterminate date numbers = (" + firstIndeterminateDateNumber + "," + secondIndeterminateDateNumber + ")" : "") - + ", has timezone-dependent parsing = " + hasTimezoneDependentParsing + + ", has timezone-dependent parsing = " + + hasTimezoneDependentParsing + (epilogue.isEmpty() ? 
"" : ", epilogue = '" + epilogue + "'"); } } @@ -1446,32 +1657,71 @@ static final class CandidateTimestampFormat { final int maxCharsBeforeQuickRuleOutMatch; final int maxCharsAfterQuickRuleOutMatch; - CandidateTimestampFormat(Function> javaTimestampFormatSupplier, String simpleRegex, String strictGrokPattern, - String outputGrokPatternName) { - this(javaTimestampFormatSupplier, simpleRegex, strictGrokPattern, outputGrokPatternName, Collections.emptyList(), - Integer.MAX_VALUE, Integer.MAX_VALUE); - } - - CandidateTimestampFormat(Function> javaTimestampFormatSupplier, String simpleRegex, String strictGrokPattern, - String outputGrokPatternName, String quickRuleOutPattern, int maxCharsBeforeQuickRuleOutMatch, - int maxCharsAfterQuickRuleOutMatch) { - this(javaTimestampFormatSupplier, simpleRegex, strictGrokPattern, outputGrokPatternName, - Collections.singletonList(quickRuleOutPattern), maxCharsBeforeQuickRuleOutMatch, maxCharsAfterQuickRuleOutMatch); - } - - CandidateTimestampFormat(Function> javaTimestampFormatSupplier, String simpleRegex, String strictGrokPattern, - String outputGrokPatternName, List quickRuleOutPatterns, int maxCharsBeforeQuickRuleOutMatch, - int maxCharsAfterQuickRuleOutMatch) { + CandidateTimestampFormat( + Function> javaTimestampFormatSupplier, + String simpleRegex, + String strictGrokPattern, + String outputGrokPatternName + ) { + this( + javaTimestampFormatSupplier, + simpleRegex, + strictGrokPattern, + outputGrokPatternName, + Collections.emptyList(), + Integer.MAX_VALUE, + Integer.MAX_VALUE + ); + } + + CandidateTimestampFormat( + Function> javaTimestampFormatSupplier, + String simpleRegex, + String strictGrokPattern, + String outputGrokPatternName, + String quickRuleOutPattern, + int maxCharsBeforeQuickRuleOutMatch, + int maxCharsAfterQuickRuleOutMatch + ) { + this( + javaTimestampFormatSupplier, + simpleRegex, + strictGrokPattern, + outputGrokPatternName, + Collections.singletonList(quickRuleOutPattern), + 
maxCharsBeforeQuickRuleOutMatch, + maxCharsAfterQuickRuleOutMatch + ); + } + + CandidateTimestampFormat( + Function> javaTimestampFormatSupplier, + String simpleRegex, + String strictGrokPattern, + String outputGrokPatternName, + List quickRuleOutPatterns, + int maxCharsBeforeQuickRuleOutMatch, + int maxCharsAfterQuickRuleOutMatch + ) { this.javaTimestampFormatSupplier = Objects.requireNonNull(javaTimestampFormatSupplier); this.simplePattern = Pattern.compile(simpleRegex, Pattern.MULTILINE); this.strictGrokPattern = Objects.requireNonNull(strictGrokPattern); // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java - this.strictSearchGrok = new Grok(Grok.BUILTIN_PATTERNS, "(?m)%{DATA:" + PREFACE + "}" + strictGrokPattern + - "%{GREEDYDATA:" + EPILOGUE + "}", TimeoutChecker.watchdog, logger::warn); - this.strictFullMatchGrok = new Grok(Grok.BUILTIN_PATTERNS, "^" + strictGrokPattern + "$", TimeoutChecker.watchdog, - logger::warn); + this.strictSearchGrok = new Grok( + Grok.BUILTIN_PATTERNS, + "(?m)%{DATA:" + PREFACE + "}" + strictGrokPattern + "%{GREEDYDATA:" + EPILOGUE + "}", + TimeoutChecker.watchdog, + logger::warn + ); + this.strictFullMatchGrok = new Grok( + Grok.BUILTIN_PATTERNS, + "^" + strictGrokPattern + "$", + TimeoutChecker.watchdog, + logger::warn + ); this.outputGrokPatternName = Objects.requireNonNull(outputGrokPatternName); - this.quickRuleOutBitSets = quickRuleOutPatterns.stream().map(TimestampFormatFinder::stringToNumberPosBitSet) + this.quickRuleOutBitSets = quickRuleOutPatterns.stream() + .map(TimestampFormatFinder::stringToNumberPosBitSet) .collect(Collectors.toList()); assert maxCharsBeforeQuickRuleOutMatch >= 0; this.maxCharsBeforeQuickRuleOutMatch = maxCharsBeforeQuickRuleOutMatch; @@ -1531,7 +1781,8 @@ static List iso8601LikeFormatFromExample(String example, String timeSepa static List adjustTrailingTimezoneFromExample(String example, String formatWithSecondsAndXX) { return Collections.singletonList( - 
TRAILING_OFFSET_WITHOUT_COLON_FINDER.matcher(example).find() ? formatWithSecondsAndXX : formatWithSecondsAndXX + "X"); + TRAILING_OFFSET_WITHOUT_COLON_FINDER.matcher(example).find() ? formatWithSecondsAndXX : formatWithSecondsAndXX + "X" + ); } private static String adjustFractionalSecondsFromEndOfExample(String example, String formatNoFraction) { @@ -1555,8 +1806,14 @@ static List indeterminateDayMonthFormatFromExample(String example) { // INDETERMINATE_FIELD_PLACEHOLDER here could represent either a day number (d) or month number (M) - it // will get changed later based on evidence from many examples - for (Character patternChar - : Arrays.asList(INDETERMINATE_FIELD_PLACEHOLDER, INDETERMINATE_FIELD_PLACEHOLDER, 'y', 'H', 'm', 's')) { + for (Character patternChar : Arrays.asList( + INDETERMINATE_FIELD_PLACEHOLDER, + INDETERMINATE_FIELD_PLACEHOLDER, + 'y', + 'H', + 'm', + 's' + )) { boolean foundDigit = false; while (examplePos < example.length() && Character.isDigit(example.charAt(examplePos))) { @@ -1584,8 +1841,11 @@ static List indeterminateDayMonthFormatFromExample(String example) { format = adjustFractionalSecondsFromEndOfExample(example, format); } - assert Character.isLetter(format.charAt(format.length() - 1)) - : "Unexpected format [" + format + "] from example [" + example + "]"; + assert Character.isLetter(format.charAt(format.length() - 1)) : "Unexpected format [" + + format + + "] from example [" + + example + + "]"; assert format.length() == example.length() : "Unexpected format [" + format + "] from example [" + example + "]"; return Collections.singletonList(format); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/XmlFileStructureFinder.java similarity index 82% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java 
rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/XmlFileStructureFinder.java index 640fc6f19fc2d..8fde2874893e7 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java +++ b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/XmlFileStructureFinder.java @@ -3,11 +3,11 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import org.elasticsearch.common.collect.Tuple; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FieldStats; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import org.w3c.dom.Document; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; @@ -37,10 +37,14 @@ public class XmlFileStructureFinder implements FileStructureFinder { private final List sampleMessages; private final FileStructure structure; - static XmlFileStructureFinder makeXmlFileStructureFinder(List explanation, String sample, String charsetName, - Boolean hasByteOrderMarker, FileStructureOverrides overrides, - TimeoutChecker timeoutChecker) - throws IOException, ParserConfigurationException, SAXException { + static XmlFileStructureFinder makeXmlFileStructureFinder( + List explanation, + String sample, + String charsetName, + Boolean hasByteOrderMarker, + FileStructureOverrides overrides, + TimeoutChecker timeoutChecker + ) throws IOException, ParserConfigurationException, SAXException { String messagePrefix; try (Scanner scanner = new Scanner(sample)) { @@ -85,16 +89,19 @@ static XmlFileStructureFinder 
makeXmlFileStructureFinder(List explanatio assert messagePrefix.charAt(0) == '<'; String topLevelTag = messagePrefix.substring(1); - FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.XML) - .setCharset(charsetName) + FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.XML).setCharset(charsetName) .setHasByteOrderMarker(hasByteOrderMarker) .setSampleStart(preamble.toString()) .setNumLinesAnalyzed(linesConsumed) .setNumMessagesAnalyzed(sampleRecords.size()) .setMultilineStartPattern("^\\s*<" + topLevelTag); - Tuple timeField = - FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides, timeoutChecker); + Tuple timeField = FileStructureUtils.guessTimestampField( + explanation, + sampleRecords, + overrides, + timeoutChecker + ); if (timeField != null) { boolean needClientTimeZone = timeField.v2().hasTimezoneDependentParsing(); @@ -102,13 +109,22 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List explanatio .setJodaTimestampFormats(timeField.v2().getJodaTimestampFormats()) .setJavaTimestampFormats(timeField.v2().getJavaTimestampFormats()) .setNeedClientTimezone(needClientTimeZone) - .setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), null, - Collections.emptyMap(), topLevelTag + "." + timeField.v1(), timeField.v2().getJavaTimestampFormats(), - needClientTimeZone, timeField.v2().needNanosecondPrecision())); + .setIngestPipeline( + FileStructureUtils.makeIngestPipelineDefinition( + null, + Collections.emptyMap(), + null, + Collections.emptyMap(), + topLevelTag + "." 
+ timeField.v1(), + timeField.v2().getJavaTimestampFormats(), + needClientTimeZone, + timeField.v2().needNanosecondPrecision() + ) + ); } - Tuple, SortedMap> mappingsAndFieldStats = - FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords, timeoutChecker); + Tuple, SortedMap> mappingsAndFieldStats = FileStructureUtils + .guessMappingsAndCalculateFieldStats(explanation, sampleRecords, timeoutChecker); if (mappingsAndFieldStats.v2() != null) { structureBuilder.setFieldStats(mappingsAndFieldStats.v2()); @@ -124,10 +140,9 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List explanatio outerFieldMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timeField.v2().getEsDateMappingTypeWithoutFormat()); } - FileStructure structure = structureBuilder - .setMappings(Collections.singletonMap(FileStructureUtils.MAPPING_PROPERTIES_SETTING, outerFieldMappings)) - .setExplanation(explanation) - .build(); + FileStructure structure = structureBuilder.setMappings( + Collections.singletonMap(FileStructureUtils.MAPPING_PROPERTIES_SETTING, outerFieldMappings) + ).setExplanation(explanation).build(); return new XmlFileStructureFinder(sampleMessages, structure); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/XmlFileStructureFinderFactory.java similarity index 75% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/XmlFileStructureFinderFactory.java index aff9b29819e7d..0407067291ecf 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java +++ 
b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/structurefinder/XmlFileStructureFinderFactory.java @@ -3,9 +3,9 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import org.xml.sax.SAXException; import javax.xml.parsers.ParserConfigurationException; @@ -53,7 +53,7 @@ public boolean canCreateFromSample(List explanation, String sample, doub // This processing is extremely complicated because it's necessary // to create a new XML stream reader per document, but each one // will read ahead so will potentially consume characters from the - // following document. We must therefore also recreate the string + // following document. We must therefore also recreate the string // reader for each document. 
while (mightBeAnotherDocument) { @@ -70,8 +70,14 @@ public boolean canCreateFromSample(List explanation, String sample, doub if (commonRootElementName == null) { commonRootElementName = rootElementName; } else if (commonRootElementName.equals(rootElementName) == false) { - explanation.add("Not XML because different documents have different root " + - "element names: [" + commonRootElementName + "] and [" + rootElementName + "]"); + explanation.add( + "Not XML because different documents have different root " + + "element names: [" + + commonRootElementName + + "] and [" + + rootElementName + + "]" + ); return false; } } @@ -94,8 +100,15 @@ public boolean canCreateFromSample(List explanation, String sample, doub for (int wholeLines = location.getLineNumber() - 1; wholeLines > 0; --wholeLines) { endPos = remainder.indexOf('\n', endPos) + 1; if (endPos == 0) { - explanation.add("Not XML because XML parser location is inconsistent: line [" + - location.getLineNumber() + "], column [" + location.getColumnNumber() + "] in [" + remainder + "]"); + explanation.add( + "Not XML because XML parser location is inconsistent: line [" + + location.getLineNumber() + + "], column [" + + location.getColumnNumber() + + "] in [" + + remainder + + "]" + ); return false; } } @@ -124,10 +137,22 @@ public boolean canCreateFromSample(List explanation, String sample, doub } @Override - public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, - int lineMergeSizeLimit, FileStructureOverrides overrides, TimeoutChecker timeoutChecker) - throws IOException, ParserConfigurationException, SAXException { - return XmlFileStructureFinder.makeXmlFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides, - timeoutChecker); + public FileStructureFinder createFromSample( + List explanation, + String sample, + String charsetName, + Boolean hasByteOrderMarker, + int lineMergeSizeLimit, + FileStructureOverrides 
overrides, + TimeoutChecker timeoutChecker + ) throws IOException, ParserConfigurationException, SAXException { + return XmlFileStructureFinder.makeXmlFileStructureFinder( + explanation, + sample, + charsetName, + hasByteOrderMarker, + overrides, + timeoutChecker + ); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/transport/TransportFindFileStructureAction.java similarity index 62% rename from x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java rename to x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/transport/TransportFindFileStructureAction.java index 63c292a9bebfd..2113369bf7586 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java +++ b/x-pack/plugin/text-structure/src/main/java/org/elasticsearch/xpack/textstructure/transport/TransportFindFileStructureAction.java @@ -3,7 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.action; +package org.elasticsearch.xpack.textstructure.transport; import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.support.ActionFilters; @@ -12,16 +12,18 @@ import org.elasticsearch.tasks.Task; import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.transport.TransportService; -import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction; -import org.elasticsearch.xpack.ml.MachineLearning; -import org.elasticsearch.xpack.ml.filestructurefinder.FileStructureFinder; -import org.elasticsearch.xpack.ml.filestructurefinder.FileStructureFinderManager; -import org.elasticsearch.xpack.ml.filestructurefinder.FileStructureOverrides; +import org.elasticsearch.xpack.core.textstructure.action.FindFileStructureAction; +import org.elasticsearch.xpack.textstructure.structurefinder.FileStructureFinder; +import org.elasticsearch.xpack.textstructure.structurefinder.FileStructureFinderManager; +import org.elasticsearch.xpack.textstructure.structurefinder.FileStructureOverrides; import java.io.InputStream; -public class TransportFindFileStructureAction - extends HandledTransportAction { +import static org.elasticsearch.threadpool.ThreadPool.Names.GENERIC; + +public class TransportFindFileStructureAction extends HandledTransportAction< + FindFileStructureAction.Request, + FindFileStructureAction.Response> { private final ThreadPool threadPool; @@ -32,12 +34,15 @@ public TransportFindFileStructureAction(TransportService transportService, Actio } @Override - protected void doExecute(Task task, FindFileStructureAction.Request request, - ActionListener listener) { + protected void doExecute( + Task task, + FindFileStructureAction.Request request, + ActionListener listener + ) { // As determining the file structure might take a while, we run // in a different thread to avoid blocking the network thread. 
- threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME).execute(() -> { + threadPool.executor(GENERIC).execute(() -> { try { listener.onResponse(buildFileStructureResponse(request)); } catch (Exception e) { @@ -51,8 +56,13 @@ private FindFileStructureAction.Response buildFileStructureResponse(FindFileStru FileStructureFinderManager structureFinderManager = new FileStructureFinderManager(threadPool.scheduler()); try (InputStream sampleStream = request.getSample().streamInput()) { - FileStructureFinder fileStructureFinder = structureFinderManager.findFileStructure(request.getLinesToSample(), - request.getLineMergeSizeLimit(), sampleStream, new FileStructureOverrides(request), request.getTimeout()); + FileStructureFinder fileStructureFinder = structureFinderManager.findFileStructure( + request.getLinesToSample(), + request.getLineMergeSizeLimit(), + sampleStream, + new FileStructureOverrides(request), + request.getTimeout() + ); return new FindFileStructureAction.Response(fileStructureFinder.getStructure()); } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedFileStructureFinderFactoryTests.java similarity index 98% rename from x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java rename to x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedFileStructureFinderFactoryTests.java index 9d761ead28ee3..80614afd7fc61 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java +++ b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedFileStructureFinderFactoryTests.java @@ -3,7 +3,7 @@ * or more contributor license 
agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; public class DelimitedFileStructureFinderFactoryTests extends FileStructureTestCase { diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedFileStructureFinderTests.java similarity index 63% rename from x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java rename to x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedFileStructureFinderTests.java index fd8edac9785d4..f390f4916d7b8 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java +++ b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedFileStructureFinderTests.java @@ -3,10 +3,10 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import org.elasticsearch.common.collect.Tuple; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import org.supercsv.prefs.CsvPreference; import java.io.IOException; @@ -20,8 +20,8 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import static org.elasticsearch.xpack.ml.filestructurefinder.DelimitedFileStructureFinder.levenshteinFieldwiseCompareRows; -import static org.elasticsearch.xpack.ml.filestructurefinder.DelimitedFileStructureFinder.levenshteinDistance; +import static org.elasticsearch.xpack.textstructure.structurefinder.DelimitedFileStructureFinder.levenshteinFieldwiseCompareRows; +import static org.elasticsearch.xpack.textstructure.structurefinder.TimestampFormatFinder.stringToNumberPosBitSet; import static org.hamcrest.Matchers.arrayContaining; import static org.hamcrest.Matchers.contains; import static org.hamcrest.Matchers.equalTo; @@ -34,15 +34,20 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { private final FileStructureFinderFactory tsvFactory = new DelimitedFileStructureFinderFactory('\t', '"', 3, false); public void testCreateConfigsGivenCompleteCsv() throws Exception { - String sample = "time,message\n" + - "2018-05-17T13:41:23,hello\n" + - "2018-05-17T13:41:32,hello again\n"; + String sample = "time,message\n" + "2018-05-17T13:41:23,hello\n" + "2018-05-17T13:41:32,hello again\n"; assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, 
FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = csvFactory.createFromSample( + explanation, + sample, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); @@ -67,28 +72,32 @@ public void testCreateConfigsGivenCompleteCsv() throws Exception { } public void testCreateConfigsGivenIncompleteCsv() throws Exception { - String sample = "time,message\n" + - "2018-05-17T13:41:23,hello\n" + - "badrow\n" + // REALLY bad row - "2018-05-17T13:41:25,hello\n" + - "2018-05-17T13:41:26,hello\n" + - "2018-05-17T13:41:27,hello\n" + - "2018-05-17T13:41:28,hello\n" + - "2018-05-17T13:41:29,hello\n" + - "2018-05-17T13:41:30,hello\n" + - "2018-05-17T13:41:31,hello\n" + - "2018-05-17T13:41:32,hello\n" + - "2018-05-17T13:41:35\n" + // Just missing the column + String sample = "time,message\n" + "2018-05-17T13:41:23,hello\n" + "badrow\n" + // REALLY bad row + "2018-05-17T13:41:25,hello\n" + + "2018-05-17T13:41:26,hello\n" + + "2018-05-17T13:41:27,hello\n" + + "2018-05-17T13:41:28,hello\n" + + "2018-05-17T13:41:29,hello\n" + + "2018-05-17T13:41:30,hello\n" + + "2018-05-17T13:41:31,hello\n" + + "2018-05-17T13:41:32,hello\n" + + "2018-05-17T13:41:35\n" + + // Just missing the column "2018-05-17T13:41:33,hello again\n"; assertFalse(csvFactory.canCreateFromSample(explanation, sample, 0.05)); - assertTrue("assertion failed. Explanation " + explanation, - csvFactory.canCreateFromSample(explanation, sample, 0.10)); + assertTrue("assertion failed. 
Explanation " + explanation, csvFactory.canCreateFromSample(explanation, sample, 0.10)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); - + FileStructureFinder structureFinder = csvFactory.createFromSample( + explanation, + sample, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); @@ -114,29 +123,32 @@ public void testCreateConfigsGivenIncompleteCsv() throws Exception { } public void testCreateConfigsGivenIncompleteCsvWithMultiLinedRows() throws Exception { - String sample = "time,message\n" + - "2018-05-17T13:41:23,\"hello\nnew line\"\n" + - "\"badrow\n\n\n\n\"\n" + // REALLY bad row - "2018-05-17T13:41:25,\"hello\nnew line\"\n" + - "2018-05-17T13:41:26,\"hello\nnew line\"\n" + - "2018-05-17T13:41:27,\"hello\nnew line\"\n" + - "2018-05-17T13:41:28,\"hello\nnew line\"\n" + - "2018-05-17T13:41:29,\"hello\nnew line\"\n" + - "2018-05-17T13:41:30,\"hello\nnew line\"\n" + - "2018-05-17T13:41:31,\"hello\nnew line\"\n" + - "2018-05-17T13:41:32,\"hello\nnew line\"\n" + - "2018-05-17T13:41:35\n" + // Just missing the column + String sample = "time,message\n" + "2018-05-17T13:41:23,\"hello\nnew line\"\n" + "\"badrow\n\n\n\n\"\n" + // REALLY bad row + "2018-05-17T13:41:25,\"hello\nnew line\"\n" + + "2018-05-17T13:41:26,\"hello\nnew line\"\n" + + "2018-05-17T13:41:27,\"hello\nnew line\"\n" + + "2018-05-17T13:41:28,\"hello\nnew line\"\n" + + "2018-05-17T13:41:29,\"hello\nnew line\"\n" + + "2018-05-17T13:41:30,\"hello\nnew line\"\n" + + "2018-05-17T13:41:31,\"hello\nnew line\"\n" + + 
"2018-05-17T13:41:32,\"hello\nnew line\"\n" + + "2018-05-17T13:41:35\n" + + // Just missing the column "2018-05-17T13:41:33,\"hello again\nnew line\"\n"; assertFalse(csvFactory.canCreateFromSample(explanation, sample, 0.05)); - assertTrue("assertion failed. Explanation " + explanation, - csvFactory.canCreateFromSample(explanation, sample, 0.10)); + assertTrue("assertion failed. Explanation " + explanation, csvFactory.canCreateFromSample(explanation, sample, 0.10)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureFinder structureFinder = csvFactory.createFromSample( + explanation, + sample, + charset, + hasByteOrderMarker, FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.builder().setQuote('"').build(), - NOOP_TIMEOUT_CHECKER); + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); @@ -165,15 +177,20 @@ public void testCreateConfigsGivenCompleteCsvAndColumnNamesOverride() throws Exc FileStructureOverrides overrides = FileStructureOverrides.builder().setColumnNames(Arrays.asList("my_time", "my_message")).build(); - String sample = "time,message\n" + - "2018-05-17T13:41:23,hello\n" + - "2018-05-17T13:41:32,hello again\n"; + String sample = "time,message\n" + "2018-05-17T13:41:23,hello\n" + "2018-05-17T13:41:32,hello again\n"; assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = csvFactory.createFromSample( + explanation, + sample, + 
charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + overrides, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); @@ -203,15 +220,20 @@ public void testCreateConfigsGivenCompleteCsvAndHasHeaderRowOverride() throws Ex // detection with the wrong choice the results will be completely changed FileStructureOverrides overrides = FileStructureOverrides.builder().setHasHeaderRow(false).build(); - String sample = "time,message\n" + - "2018-05-17T13:41:23,hello\n" + - "2018-05-17T13:41:32,hello again\n"; + String sample = "time,message\n" + "2018-05-17T13:41:23,hello\n" + "2018-05-17T13:41:32,hello again\n"; assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = csvFactory.createFromSample( + explanation, + sample, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + overrides, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); @@ -236,16 +258,21 @@ public void testCreateConfigsGivenCompleteCsvAndHasHeaderRowOverride() throws Ex } public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception { - String sample = "time,message,count\n" + - "2018-05-17T13:41:23,\"hello\n" + - "world\",1\n" + - "2019-01-18T14:46:57,\"hello again\n"; // note that this last record is truncated + // note that this last record is truncated + String sample = "time,message,count\n2018-05-17T13:41:23,\"hello\nworld\",1\n2019-01-18T14:46:57,\"hello again\n"; assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = 
randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = csvFactory.createFromSample( + explanation, + sample, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); @@ -270,18 +297,25 @@ public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception } public void testCreateConfigsGivenCsvWithTrailingNulls() throws Exception { - String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," + - "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + - "improvement_surcharge,total_amount,,\n" + - "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" + - "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" + - "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n"; + String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," + + "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + + "improvement_surcharge,total_amount,,\n" + + "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" + + "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" + + "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n"; 
assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = csvFactory.createFromSample( + explanation, + sample, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); @@ -292,18 +326,42 @@ public void testCreateConfigsGivenCsvWithTrailingNulls() throws Exception { } else { assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); } - assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," + - "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," + - "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?,\"?\"?,\"?\"?", - structure.getExcludeLinesPattern()); + assertEquals( + "^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," + + "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," + + "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?," + + "\"?improvement_surcharge\"?,\"?total_amount\"?,\"?\"?,\"?\"?", + structure.getExcludeLinesPattern() + ); assertNull(structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); 
assertNull(structure.getShouldTrimFields()); - assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance", - "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax", - "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "column18", "column19"), structure.getColumnNames()); + assertEquals( + Arrays.asList( + "VendorID", + "tpep_pickup_datetime", + "tpep_dropoff_datetime", + "passenger_count", + "trip_distance", + "RatecodeID", + "store_and_fwd_flag", + "PULocationID", + "DOLocationID", + "payment_type", + "fare_amount", + "extra", + "mta_tax", + "tip_amount", + "tolls_amount", + "improvement_surcharge", + "total_amount", + "column18", + "column19" + ), + structure.getColumnNames() + ); assertNull(structure.getGrokPattern()); assertEquals("tpep_pickup_datetime", structure.getTimestampField()); assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getJodaTimestampFormats()); @@ -316,18 +374,25 @@ public void testCreateConfigsGivenCsvWithTrailingNullsAndOverriddenTimeField() t // consistent timestamp format, so if we want the second we need an override FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("tpep_dropoff_datetime").build(); - String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," + - "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + - "improvement_surcharge,total_amount,,\n" + - "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" + - "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" + - "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n"; + String sample = 
"VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," + + "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + + "improvement_surcharge,total_amount,,\n" + + "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" + + "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" + + "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n"; assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = csvFactory.createFromSample( + explanation, + sample, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + overrides, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); @@ -338,18 +403,42 @@ public void testCreateConfigsGivenCsvWithTrailingNullsAndOverriddenTimeField() t } else { assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); } - assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," + - "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," + - "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?,\"?\"?,\"?\"?", - structure.getExcludeLinesPattern()); + assertEquals( + "^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," + + 
"\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," + + "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"" + + "?improvement_surcharge\"?,\"?total_amount\"?,\"?\"?,\"?\"?", + structure.getExcludeLinesPattern() + ); assertNull(structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); - assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance", - "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax", - "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", "column18", "column19"), structure.getColumnNames()); + assertEquals( + Arrays.asList( + "VendorID", + "tpep_pickup_datetime", + "tpep_dropoff_datetime", + "passenger_count", + "trip_distance", + "RatecodeID", + "store_and_fwd_flag", + "PULocationID", + "DOLocationID", + "payment_type", + "fare_amount", + "extra", + "mta_tax", + "tip_amount", + "tolls_amount", + "improvement_surcharge", + "total_amount", + "column18", + "column19" + ), + structure.getColumnNames() + ); assertNull(structure.getGrokPattern()); assertEquals("tpep_dropoff_datetime", structure.getTimestampField()); assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getJodaTimestampFormats()); @@ -357,18 +446,25 @@ public void testCreateConfigsGivenCsvWithTrailingNullsAndOverriddenTimeField() t } public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exception { - String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," + - "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + - 
"improvement_surcharge,total_amount\n" + - "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" + - "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" + - "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n"; + String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," + + "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + + "improvement_surcharge,total_amount\n" + + "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" + + "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" + + "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n"; assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = csvFactory.createFromSample( + explanation, + sample, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); @@ -379,18 +475,39 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exce } else { assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); } - assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," + - 
"\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," + - "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?", - structure.getExcludeLinesPattern()); + assertEquals( + "^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," + + "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," + + "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?", + structure.getExcludeLinesPattern() + ); assertNull(structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); - assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance", - "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax", - "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount"), structure.getColumnNames()); + assertEquals( + Arrays.asList( + "VendorID", + "tpep_pickup_datetime", + "tpep_dropoff_datetime", + "passenger_count", + "trip_distance", + "RatecodeID", + "store_and_fwd_flag", + "PULocationID", + "DOLocationID", + "payment_type", + "fare_amount", + "extra", + "mta_tax", + "tip_amount", + "tolls_amount", + "improvement_surcharge", + "total_amount" + ), + structure.getColumnNames() + ); assertNull(structure.getGrokPattern()); assertEquals("tpep_pickup_datetime", structure.getTimestampField()); assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getJodaTimestampFormats()); @@ -400,23 +517,48 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() 
throws Exce public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeaderAndColumnNamesOverride() throws Exception { FileStructureOverrides overrides = FileStructureOverrides.builder() - .setColumnNames(Arrays.asList("my_VendorID", "my_tpep_pickup_datetime", "my_tpep_dropoff_datetime", "my_passenger_count", - "my_trip_distance", "my_RatecodeID", "my_store_and_fwd_flag", "my_PULocationID", "my_DOLocationID", "my_payment_type", - "my_fare_amount", "my_extra", "my_mta_tax", "my_tip_amount", "my_tolls_amount", "my_improvement_surcharge", - "my_total_amount")).build(); - - String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," + - "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + - "improvement_surcharge,total_amount\n" + - "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" + - "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" + - "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n"; + .setColumnNames( + Arrays.asList( + "my_VendorID", + "my_tpep_pickup_datetime", + "my_tpep_dropoff_datetime", + "my_passenger_count", + "my_trip_distance", + "my_RatecodeID", + "my_store_and_fwd_flag", + "my_PULocationID", + "my_DOLocationID", + "my_payment_type", + "my_fare_amount", + "my_extra", + "my_mta_tax", + "my_tip_amount", + "my_tolls_amount", + "my_improvement_surcharge", + "my_total_amount" + ) + ) + .build(); + + String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," + + "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + + "improvement_surcharge,total_amount\n" + + "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" + + "1,2016-12-01 00:00:01,2016-12-01 
00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" + + "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n"; assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = csvFactory.createFromSample( + explanation, + sample, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + overrides, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); @@ -427,19 +569,39 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeaderAndColumnNames } else { assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); } - assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," + - "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," + - "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?", - structure.getExcludeLinesPattern()); + assertEquals( + "^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," + + "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," + + "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?", + structure.getExcludeLinesPattern() + ); assertNull(structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); assertEquals(Character.valueOf('"'), 
structure.getQuote()); assertTrue(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); - assertEquals(Arrays.asList("my_VendorID", "my_tpep_pickup_datetime", "my_tpep_dropoff_datetime", "my_passenger_count", - "my_trip_distance", "my_RatecodeID", "my_store_and_fwd_flag", "my_PULocationID", "my_DOLocationID", "my_payment_type", - "my_fare_amount", "my_extra", "my_mta_tax", "my_tip_amount", "my_tolls_amount", "my_improvement_surcharge", "my_total_amount"), - structure.getColumnNames()); + assertEquals( + Arrays.asList( + "my_VendorID", + "my_tpep_pickup_datetime", + "my_tpep_dropoff_datetime", + "my_passenger_count", + "my_trip_distance", + "my_RatecodeID", + "my_store_and_fwd_flag", + "my_PULocationID", + "my_DOLocationID", + "my_payment_type", + "my_fare_amount", + "my_extra", + "my_mta_tax", + "my_tip_amount", + "my_tolls_amount", + "my_improvement_surcharge", + "my_total_amount" + ), + structure.getColumnNames() + ); assertNull(structure.getGrokPattern()); assertEquals("my_tpep_pickup_datetime", structure.getTimestampField()); assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getJodaTimestampFormats()); @@ -447,15 +609,22 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeaderAndColumnNames } public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception { - String sample = "\"pos_id\",\"trip_id\",\"latitude\",\"longitude\",\"altitude\",\"timestamp\"\n" + - "\"1\",\"3\",\"4703.7815\",\"1527.4713\",\"359.9\",\"2017-01-19 16:19:04.742113\"\n" + - "\"2\",\"3\",\"4703.7815\",\"1527.4714\",\"359.9\",\"2017-01-19 16:19:05.741890\"\n"; + String sample = "\"pos_id\",\"trip_id\",\"latitude\",\"longitude\",\"altitude\",\"timestamp\"\n" + + "\"1\",\"3\",\"4703.7815\",\"1527.4713\",\"359.9\",\"2017-01-19 16:19:04.742113\"\n" + + "\"2\",\"3\",\"4703.7815\",\"1527.4714\",\"359.9\",\"2017-01-19 16:19:05.741890\"\n"; assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = 
randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = csvFactory.createFromSample( + explanation, + sample, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); @@ -466,8 +635,10 @@ public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception { } else { assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); } - assertEquals("^\"?pos_id\"?,\"?trip_id\"?,\"?latitude\"?,\"?longitude\"?,\"?altitude\"?,\"?timestamp\"?", - structure.getExcludeLinesPattern()); + assertEquals( + "^\"?pos_id\"?,\"?trip_id\"?,\"?latitude\"?,\"?longitude\"?,\"?altitude\"?,\"?timestamp\"?", + structure.getExcludeLinesPattern() + ); assertNull(structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); assertEquals(Character.valueOf('"'), structure.getQuote()); @@ -481,22 +652,29 @@ public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception { } public void testCreateConfigsGivenTsvWithSyslogLikeTimestamp() throws Exception { - String sample = "Latitude\tLongitude\tloc\tTimestamp\n" + - "25.78042\t18.441196\t\"25.7804200000,18.4411960000\"\tJun 30 2019 13:21:24\n" + - "25.743484\t18.443047\t\"25.7434840000,18.4430470000\"\tJun 30 2019 06:02:35\n" + - "25.744583\t18.442783\t\"25.7445830000,18.4427830000\"\tJun 30 2019 06:02:35\n" + - "25.754593\t18.431637\t\"25.7545930000,18.4316370000\"\tJul 1 2019 06:02:43\n" + - "25.768574\t18.433483\t\"25.7685740000,18.4334830000\"\tJul 1 2019 06:21:28\n" + - 
"25.757736\t18.438683\t\"25.7577360000,18.4386830000\"\tJul 1 2019 12:06:08\n" + - "25.76615\t18.436565\t\"25.7661500000,18.4365650000\"\tJul 1 2019 12:06:08\n" + - "25.76896\t18.43586\t\"25.7689600000,18.4358600000\"\tJul 1 2019 12:13:50\n" + - "25.76423\t18.43705\t\"25.7642300000,18.4370500000\"\tJul 1 2019 12:39:10\n"; + String sample = "Latitude\tLongitude\tloc\tTimestamp\n" + + "25.78042\t18.441196\t\"25.7804200000,18.4411960000\"\tJun 30 2019 13:21:24\n" + + "25.743484\t18.443047\t\"25.7434840000,18.4430470000\"\tJun 30 2019 06:02:35\n" + + "25.744583\t18.442783\t\"25.7445830000,18.4427830000\"\tJun 30 2019 06:02:35\n" + + "25.754593\t18.431637\t\"25.7545930000,18.4316370000\"\tJul 1 2019 06:02:43\n" + + "25.768574\t18.433483\t\"25.7685740000,18.4334830000\"\tJul 1 2019 06:21:28\n" + + "25.757736\t18.438683\t\"25.7577360000,18.4386830000\"\tJul 1 2019 12:06:08\n" + + "25.76615\t18.436565\t\"25.7661500000,18.4365650000\"\tJul 1 2019 12:06:08\n" + + "25.76896\t18.43586\t\"25.7689600000,18.4358600000\"\tJul 1 2019 12:13:50\n" + + "25.76423\t18.43705\t\"25.7642300000,18.4370500000\"\tJul 1 2019 12:39:10\n"; assertTrue(tsvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = tsvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = tsvFactory.createFromSample( + explanation, + sample, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); @@ -507,8 +685,7 @@ public void testCreateConfigsGivenTsvWithSyslogLikeTimestamp() throws Exception } else { 
assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); } - assertEquals("^\"?Latitude\"?\\t\"?Longitude\"?\\t\"?loc\"?\\t\"?Timestamp\"?", - structure.getExcludeLinesPattern()); + assertEquals("^\"?Latitude\"?\\t\"?Longitude\"?\\t\"?loc\"?\\t\"?Timestamp\"?", structure.getExcludeLinesPattern()); assertNull(structure.getMultilineStartPattern()); assertEquals(Character.valueOf('\t'), structure.getDelimiter()); assertEquals(Character.valueOf('"'), structure.getQuote()); @@ -517,21 +694,28 @@ public void testCreateConfigsGivenTsvWithSyslogLikeTimestamp() throws Exception assertEquals(Arrays.asList("Latitude", "Longitude", "loc", "Timestamp"), structure.getColumnNames()); assertNull(structure.getGrokPattern()); assertEquals("Timestamp", structure.getTimestampField()); - assertEquals(Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"), - structure.getJodaTimestampFormats()); + assertEquals( + Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"), + structure.getJodaTimestampFormats() + ); assertEquals(Collections.singleton("properties"), structure.getMappings().keySet()); } public void testCreateConfigsGivenDotInFieldName() throws Exception { - String sample = "time.iso8601,message\n" + - "2018-05-17T13:41:23,hello\n" + - "2018-05-17T13:41:32,hello again\n"; + String sample = "time.iso8601,message\n" + "2018-05-17T13:41:23,hello\n" + "2018-05-17T13:41:32,hello again\n"; assertTrue(csvFactory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = csvFactory.createFromSample( + explanation, + sample, + charset, + 
hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); @@ -557,29 +741,33 @@ public void testCreateConfigsGivenDotInFieldName() throws Exception { } public void testFindHeaderFromSampleGivenHeaderInSample() throws IOException { - String withHeader = "time,airline,responsetime,sourcetype\n" + - "2014-06-23 00:00:00Z,AAL,132.2046,farequote\n" + - "2014-06-23 00:00:00Z,JZA,990.4628,farequote\n" + - "2014-06-23 00:00:01Z,JBU,877.5927,farequote\n" + - "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n"; - - Tuple header = DelimitedFileStructureFinder.findHeaderFromSample(explanation, + String withHeader = "time,airline,responsetime,sourcetype\n" + + "2014-06-23 00:00:00Z,AAL,132.2046,farequote\n" + + "2014-06-23 00:00:00Z,JZA,990.4628,farequote\n" + + "2014-06-23 00:00:01Z,JBU,877.5927,farequote\n" + + "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n"; + + Tuple header = DelimitedFileStructureFinder.findHeaderFromSample( + explanation, DelimitedFileStructureFinder.readRows(withHeader, CsvPreference.EXCEL_PREFERENCE, NOOP_TIMEOUT_CHECKER).v1(), - FileStructureOverrides.EMPTY_OVERRIDES); + FileStructureOverrides.EMPTY_OVERRIDES + ); assertTrue(header.v1()); assertThat(header.v2(), arrayContaining("time", "airline", "responsetime", "sourcetype")); } public void testFindHeaderFromSampleGivenHeaderNotInSample() throws IOException { - String noHeader = "2014-06-23 00:00:00Z,AAL,132.2046,farequote\n" + - "2014-06-23 00:00:00Z,JZA,990.4628,farequote\n" + - "2014-06-23 00:00:01Z,JBU,877.5927,farequote\n" + - "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n"; + String noHeader = "2014-06-23 00:00:00Z,AAL,132.2046,farequote\n" + + "2014-06-23 00:00:00Z,JZA,990.4628,farequote\n" + + "2014-06-23 00:00:01Z,JBU,877.5927,farequote\n" + + "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n"; - Tuple header = 
DelimitedFileStructureFinder.findHeaderFromSample(explanation, + Tuple header = DelimitedFileStructureFinder.findHeaderFromSample( + explanation, DelimitedFileStructureFinder.readRows(noHeader, CsvPreference.EXCEL_PREFERENCE, NOOP_TIMEOUT_CHECKER).v1(), - FileStructureOverrides.EMPTY_OVERRIDES); + FileStructureOverrides.EMPTY_OVERRIDES + ); assertFalse(header.v1()); assertThat(header.v2(), arrayContaining("", "", "", "")); @@ -587,25 +775,25 @@ public void testFindHeaderFromSampleGivenHeaderNotInSample() throws IOException public void testLevenshteinDistance() { - assertEquals(0, levenshteinDistance("cat", "cat")); - assertEquals(3, levenshteinDistance("cat", "dog")); - assertEquals(5, levenshteinDistance("cat", "mouse")); - assertEquals(3, levenshteinDistance("cat", "")); - - assertEquals(3, levenshteinDistance("dog", "cat")); - assertEquals(0, levenshteinDistance("dog", "dog")); - assertEquals(4, levenshteinDistance("dog", "mouse")); - assertEquals(3, levenshteinDistance("dog", "")); - - assertEquals(5, levenshteinDistance("mouse", "cat")); - assertEquals(4, levenshteinDistance("mouse", "dog")); - assertEquals(0, levenshteinDistance("mouse", "mouse")); - assertEquals(5, levenshteinDistance("mouse", "")); - - assertEquals(3, levenshteinDistance("", "cat")); - assertEquals(3, levenshteinDistance("", "dog")); - assertEquals(5, levenshteinDistance("", "mouse")); - assertEquals(0, levenshteinDistance("", "")); + assertEquals(0, DelimitedFileStructureFinder.levenshteinDistance("cat", "cat")); + assertEquals(3, DelimitedFileStructureFinder.levenshteinDistance("cat", "dog")); + assertEquals(5, DelimitedFileStructureFinder.levenshteinDistance("cat", "mouse")); + assertEquals(3, DelimitedFileStructureFinder.levenshteinDistance("cat", "")); + + assertEquals(3, DelimitedFileStructureFinder.levenshteinDistance("dog", "cat")); + assertEquals(0, DelimitedFileStructureFinder.levenshteinDistance("dog", "dog")); + assertEquals(4, 
DelimitedFileStructureFinder.levenshteinDistance("dog", "mouse")); + assertEquals(3, DelimitedFileStructureFinder.levenshteinDistance("dog", "")); + + assertEquals(5, DelimitedFileStructureFinder.levenshteinDistance("mouse", "cat")); + assertEquals(4, DelimitedFileStructureFinder.levenshteinDistance("mouse", "dog")); + assertEquals(0, DelimitedFileStructureFinder.levenshteinDistance("mouse", "mouse")); + assertEquals(5, DelimitedFileStructureFinder.levenshteinDistance("mouse", "")); + + assertEquals(3, DelimitedFileStructureFinder.levenshteinDistance("", "cat")); + assertEquals(3, DelimitedFileStructureFinder.levenshteinDistance("", "dog")); + assertEquals(5, DelimitedFileStructureFinder.levenshteinDistance("", "mouse")); + assertEquals(0, DelimitedFileStructureFinder.levenshteinDistance("", "")); } public void testMakeShortFieldMask() { @@ -617,13 +805,13 @@ public void testMakeShortFieldMask() { rows.add(Arrays.asList(randomAlphaOfLength(5), randomAlphaOfLength(5), randomAlphaOfLength(80))); BitSet shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 110); - assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet("111"))); + assertThat(shortFieldMask, equalTo(stringToNumberPosBitSet("111"))); shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 80); - assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet("11 "))); + assertThat(shortFieldMask, equalTo(stringToNumberPosBitSet("11 "))); shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 50); - assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet(" 1 "))); + assertThat(shortFieldMask, equalTo(stringToNumberPosBitSet(" 1 "))); shortFieldMask = DelimitedFileStructureFinder.makeShortFieldMask(rows, 20); - assertThat(shortFieldMask, equalTo(TimestampFormatFinder.stringToNumberPosBitSet(" "))); + assertThat(shortFieldMask, equalTo(stringToNumberPosBitSet(" "))); } public void 
testLevenshteinCompareRows() { @@ -639,20 +827,58 @@ public void testLevenshteinCompareRows() { public void testLevenshteinCompareRowsWithMask() { - assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "dog"), - TimestampFormatFinder.stringToNumberPosBitSet(randomFrom(" ", "1 ", " 1", "11")))); - assertEquals(0, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("cat", "cat"), - TimestampFormatFinder.stringToNumberPosBitSet(randomFrom(" ", "1 ")))); - assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("dog", "cat"), - TimestampFormatFinder.stringToNumberPosBitSet(randomFrom(" 1", "1 ")))); - assertEquals(3, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat"), - TimestampFormatFinder.stringToNumberPosBitSet(" 1"))); - assertEquals(5, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "dog", "cat"), - TimestampFormatFinder.stringToNumberPosBitSet(" 11"))); - assertEquals(4, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "mouse", "mouse"), - TimestampFormatFinder.stringToNumberPosBitSet(" 11"))); - assertEquals(7, levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog", "mouse"), Arrays.asList("mouse", "cat", "dog"), - TimestampFormatFinder.stringToNumberPosBitSet(" 11"))); + assertEquals( + 0, + levenshteinFieldwiseCompareRows( + Arrays.asList("cat", "dog"), + Arrays.asList("cat", "dog"), + stringToNumberPosBitSet(randomFrom(" ", "1 ", " 1", "11")) + ) + ); + assertEquals( + 0, + levenshteinFieldwiseCompareRows( + Arrays.asList("cat", "dog"), + Arrays.asList("cat", "cat"), + stringToNumberPosBitSet(randomFrom(" ", "1 ")) + ) + ); + assertEquals( + 3, + levenshteinFieldwiseCompareRows( + Arrays.asList("cat", "dog"), + Arrays.asList("dog", "cat"), + stringToNumberPosBitSet(randomFrom(" 1", "1 ")) + ) + ); + assertEquals( + 3, + 
levenshteinFieldwiseCompareRows(Arrays.asList("cat", "dog"), Arrays.asList("mouse", "cat"), stringToNumberPosBitSet(" 1")) + ); + assertEquals( + 5, + levenshteinFieldwiseCompareRows( + Arrays.asList("cat", "dog", "mouse"), + Arrays.asList("mouse", "dog", "cat"), + stringToNumberPosBitSet(" 11") + ) + ); + assertEquals( + 4, + levenshteinFieldwiseCompareRows( + Arrays.asList("cat", "dog", "mouse"), + Arrays.asList("mouse", "mouse", "mouse"), + stringToNumberPosBitSet(" 11") + ) + ); + assertEquals( + 7, + levenshteinFieldwiseCompareRows( + Arrays.asList("cat", "dog", "mouse"), + Arrays.asList("mouse", "cat", "dog"), + stringToNumberPosBitSet(" 11") + ) + ); } public void testLineHasUnescapedQuote() { @@ -743,14 +969,28 @@ public void testMultilineStartPatternGivenNoMultiline() { if (columnName.equals(timeFieldName)) { mappings.put(columnName, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); } else { - mappings.put(columnName, - Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, - randomFrom("boolean", "long", "double", "text", "keyword"))); + mappings.put( + columnName, + Collections.singletonMap( + FileStructureUtils.MAPPING_TYPE_SETTING, + randomFrom("boolean", "long", "double", "text", "keyword") + ) + ); } } - assertNull(DelimitedFileStructureFinder.makeMultilineStartPattern(explanation, columnNames, 1, ",", "\"", mappings, timeFieldName, - timeFieldFormat)); + assertNull( + DelimitedFileStructureFinder.makeMultilineStartPattern( + explanation, + columnNames, + 1, + ",", + "\"", + mappings, + timeFieldName, + timeFieldFormat + ) + ); assertThat(explanation, contains("Not creating a multi-line start pattern as no sampled message spanned multiple lines")); } @@ -770,19 +1010,33 @@ public void testMultilineStartPatternFromTimeField() { } } - String expected = "^" + Stream.generate(() -> ".*?,").limit(timeFieldColumnIndex).collect(Collectors.joining()) + - "\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}"; - assertEquals(expected, 
DelimitedFileStructureFinder.makeMultilineStartPattern(explanation, columnNames, 2, ",", "\"", mappings, - timeFieldName, timeFieldFormat)); + String expected = "^" + + Stream.generate(() -> ".*?,").limit(timeFieldColumnIndex).collect(Collectors.joining()) + + "\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}"; + assertEquals( + expected, + DelimitedFileStructureFinder.makeMultilineStartPattern( + explanation, + columnNames, + 2, + ",", + "\"", + mappings, + timeFieldName, + timeFieldFormat + ) + ); assertThat(explanation, contains("Created a multi-line start pattern based on timestamp column [" + timeFieldName + "]")); } public void testMultilineStartPatternFromMappings() { int randomIndex = randomIntBetween(0, 2); - String type = new String[]{ "boolean", "long", "double" }[randomIndex]; - String expectedTypePattern = - new String[]{ "(?:true|false)", "[+-]?\\d+", "[+-]?(?:\\d+(?:\\.\\d+)?|\\.\\d+)(?:[eE][+-]?\\d+)?" }[randomIndex]; + String type = new String[] { "boolean", "long", "double" }[randomIndex]; + String expectedTypePattern = new String[] { + "(?:true|false)", + "[+-]?\\d+", + "[+-]?(?:\\d+(?:\\.\\d+)?|\\.\\d+)(?:[eE][+-]?\\d+)?" 
}[randomIndex]; List columnNames = Stream.generate(() -> randomAlphaOfLengthBetween(5, 10)).limit(10).collect(Collectors.toList()); int chosenFieldColumnIndex = randomIntBetween(0, columnNames.size() - 2); String chosenField = columnNames.get(chosenFieldColumnIndex); @@ -795,10 +1049,17 @@ public void testMultilineStartPatternFromMappings() { } } - String expected = "^" + Stream.generate(() -> ".*?,").limit(chosenFieldColumnIndex).collect(Collectors.joining()) + - "(?:" + expectedTypePattern + "|\"" + expectedTypePattern + "\"),"; - assertEquals(expected, DelimitedFileStructureFinder.makeMultilineStartPattern(explanation, columnNames, 2, ",", "\"", mappings, - null, null)); + String expected = "^" + + Stream.generate(() -> ".*?,").limit(chosenFieldColumnIndex).collect(Collectors.joining()) + + "(?:" + + expectedTypePattern + + "|\"" + + expectedTypePattern + + "\"),"; + assertEquals( + expected, + DelimitedFileStructureFinder.makeMultilineStartPattern(explanation, columnNames, 2, ",", "\"", mappings, null, null) + ); assertThat(explanation, contains("Created a multi-line start pattern based on [" + type + "] column [" + chosenField + "]")); } @@ -816,8 +1077,12 @@ public void testMultilineStartPatternDeterminationTooHard() { static Map randomCsvProcessorSettings() { String field = randomAlphaOfLength(10); - return DelimitedFileStructureFinder.makeCsvProcessorSettings(field, - Arrays.asList(generateRandomStringArray(10, field.length() - 1, false, false)), randomFrom(',', ';', '\t', '|'), - randomFrom('"', '\''), randomBoolean()); + return DelimitedFileStructureFinder.makeCsvProcessorSettings( + field, + Arrays.asList(generateRandomStringArray(10, field.length() - 1, false, false)), + randomFrom(',', ';', '\t', '|'), + randomFrom('"', '\''), + randomBoolean() + ); } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculatorTests.java 
b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/FieldStatsCalculatorTests.java similarity index 91% rename from x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculatorTests.java rename to x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/FieldStatsCalculatorTests.java index 4efaf64bd092c..0b6af12cc6c9c 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FieldStatsCalculatorTests.java +++ b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/FieldStatsCalculatorTests.java @@ -3,9 +3,9 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FieldStats; import java.util.Arrays; import java.util.Collections; @@ -110,8 +110,9 @@ public void testTopHitsString() { public void testCalculateGivenEmpty() { - FieldStatsCalculator calculator = - new FieldStatsCalculator(randomFrom(Arrays.asList(LONG, DOUBLE, KEYWORD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT))); + FieldStatsCalculator calculator = new FieldStatsCalculator( + randomFrom(Arrays.asList(LONG, DOUBLE, KEYWORD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT)) + ); calculator.accept(Collections.emptyList()); @@ -217,8 +218,16 @@ public void testGivenDateFieldWithoutFormat() { FieldStatsCalculator calculator = new FieldStatsCalculator(FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT); - calculator.accept(Arrays.asList("2018-10-08T10:49:16.642", "2018-10-08T10:49:16.642", "2018-10-08T10:49:16.642", - "2018-09-08T11:12:13.789", 
"2019-01-28T01:02:03.456", "2018-09-08T11:12:13.789")); + calculator.accept( + Arrays.asList( + "2018-10-08T10:49:16.642", + "2018-10-08T10:49:16.642", + "2018-10-08T10:49:16.642", + "2018-09-08T11:12:13.789", + "2019-01-28T01:02:03.456", + "2018-09-08T11:12:13.789" + ) + ); FieldStats stats = calculator.calculate(3); @@ -249,8 +258,16 @@ public void testGivenDateFieldWithFormat() { dateMapping.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "M/dd/yyyy h:mma"); FieldStatsCalculator calculator = new FieldStatsCalculator(dateMapping); - calculator.accept(Arrays.asList("10/08/2018 10:49AM", "10/08/2018 10:49AM", "10/08/2018 10:49AM", - "9/08/2018 11:12AM", "1/28/2019 1:02AM", "9/08/2018 11:12AM")); + calculator.accept( + Arrays.asList( + "10/08/2018 10:49AM", + "10/08/2018 10:49AM", + "10/08/2018 10:49AM", + "9/08/2018 11:12AM", + "1/28/2019 1:02AM", + "9/08/2018 11:12AM" + ) + ); FieldStats stats = calculator.calculate(3); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureFinderManagerTests.java similarity index 60% rename from x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java rename to x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureFinderManagerTests.java index 188bc9a628bd8..69cae0a70ddcd 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java +++ b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureFinderManagerTests.java @@ -3,13 +3,13 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import com.ibm.icu.text.CharsetMatch; import org.elasticsearch.ElasticsearchTimeoutException; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.threadpool.Scheduler; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import org.junit.After; import org.junit.Before; @@ -25,7 +25,8 @@ import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; -import static org.elasticsearch.xpack.ml.filestructurefinder.FileStructureOverrides.EMPTY_OVERRIDES; +import static org.elasticsearch.xpack.textstructure.structurefinder.FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT; +import static org.elasticsearch.xpack.textstructure.structurefinder.FileStructureOverrides.EMPTY_OVERRIDES; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.endsWith; import static org.hamcrest.Matchers.startsWith; @@ -50,15 +51,18 @@ public void shutdownScheduler() { public void testFindCharsetGivenCharacterWidths() throws Exception { for (Charset charset : Arrays.asList(StandardCharsets.UTF_8, StandardCharsets.UTF_16LE, StandardCharsets.UTF_16BE)) { - CharsetMatch charsetMatch = structureFinderManager.findCharset(explanation, - new ByteArrayInputStream(TEXT_SAMPLE.getBytes(charset)), NOOP_TIMEOUT_CHECKER); + CharsetMatch charsetMatch = structureFinderManager.findCharset( + explanation, + new ByteArrayInputStream(TEXT_SAMPLE.getBytes(charset)), + NOOP_TIMEOUT_CHECKER + ); assertEquals(charset.name(), charsetMatch.getName()); } } public void testFindCharsetGivenRandomBinary() throws Exception { - // This input should never match a single byte character set. ICU4J will sometimes decide + // This input should never match a single byte character set. 
ICU4J will sometimes decide // that it matches a double byte character set, hence the two assertion branches. int size = 1000; byte[] binaryBytes = randomByteArrayOfLength(size); @@ -67,8 +71,11 @@ public void testFindCharsetGivenRandomBinary() throws Exception { } try { - CharsetMatch charsetMatch = structureFinderManager.findCharset(explanation, new ByteArrayInputStream(binaryBytes), - NOOP_TIMEOUT_CHECKER); + CharsetMatch charsetMatch = structureFinderManager.findCharset( + explanation, + new ByteArrayInputStream(binaryBytes), + NOOP_TIMEOUT_CHECKER + ); assertThat(charsetMatch.getName(), startsWith("UTF-16")); } catch (IllegalArgumentException e) { assertEquals("Could not determine a usable character encoding for the input - could it be binary data?", e.getMessage()); @@ -77,7 +84,7 @@ public void testFindCharsetGivenRandomBinary() throws Exception { public void testFindCharsetGivenBinaryNearUtf16() throws Exception { - // This input should never match a single byte character set. ICU4J will probably decide + // This input should never match a single byte character set. ICU4J will probably decide // that it matches both UTF-16BE and UTF-16LE, but we should reject these as there's no // clear winner. 
ByteArrayOutputStream stream = new ByteArrayOutputStream(); @@ -92,18 +99,31 @@ public void testFindCharsetGivenBinaryNearUtf16() throws Exception { stream.write(randomAlphaOfLengthBetween(3, 4).getBytes(StandardCharsets.UTF_16BE)); } - IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> structureFinderManager.findCharset(explanation, new ByteArrayInputStream(stream.toByteArray()), NOOP_TIMEOUT_CHECKER)); + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> structureFinderManager.findCharset(explanation, new ByteArrayInputStream(stream.toByteArray()), NOOP_TIMEOUT_CHECKER) + ); assertEquals("Could not determine a usable character encoding for the input - could it be binary data?", e.getMessage()); - assertThat(explanation.toString(), - containsString("but was rejected as the distribution of zero bytes between odd and even positions in the file is very close")); + assertThat( + explanation.toString(), + containsString("but was rejected as the distribution of zero bytes between odd and even positions in the file is very close") + ); } public void testMakeBestStructureGivenNdJson() throws Exception { - assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(), - randomBoolean(), FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), - instanceOf(NdJsonFileStructureFinder.class)); + assertThat( + structureFinderManager.makeBestStructureFinder( + explanation, + NDJSON_SAMPLE, + StandardCharsets.UTF_8.name(), + randomBoolean(), + DEFAULT_LINE_MERGE_SIZE_LIMIT, + EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ), + instanceOf(NdJsonFileStructureFinder.class) + ); } public void testMakeBestStructureGivenNdJsonAndDelimitedOverride() throws Exception { @@ -111,60 +131,127 @@ public void testMakeBestStructureGivenNdJsonAndDelimitedOverride() throws Except // Need to change the quote character from the default of double 
quotes // otherwise the quotes in the NDJSON will stop it parsing as CSV FileStructureOverrides overrides = FileStructureOverrides.builder() - .setFormat(FileStructure.Format.DELIMITED).setQuote('\'').build(); - - assertThat(structureFinderManager.makeBestStructureFinder(explanation, NDJSON_SAMPLE, StandardCharsets.UTF_8.name(), - randomBoolean(), FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER), - instanceOf(DelimitedFileStructureFinder.class)); + .setFormat(FileStructure.Format.DELIMITED) + .setQuote('\'') + .build(); + + assertThat( + structureFinderManager.makeBestStructureFinder( + explanation, + NDJSON_SAMPLE, + StandardCharsets.UTF_8.name(), + randomBoolean(), + DEFAULT_LINE_MERGE_SIZE_LIMIT, + overrides, + NOOP_TIMEOUT_CHECKER + ), + instanceOf(DelimitedFileStructureFinder.class) + ); } public void testMakeBestStructureGivenXml() throws Exception { - assertThat(structureFinderManager.makeBestStructureFinder(explanation, XML_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), - instanceOf(XmlFileStructureFinder.class)); + assertThat( + structureFinderManager.makeBestStructureFinder( + explanation, + XML_SAMPLE, + StandardCharsets.UTF_8.name(), + randomBoolean(), + DEFAULT_LINE_MERGE_SIZE_LIMIT, + EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ), + instanceOf(XmlFileStructureFinder.class) + ); } public void testMakeBestStructureGivenXmlAndTextOverride() throws Exception { FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.SEMI_STRUCTURED_TEXT).build(); - assertThat(structureFinderManager.makeBestStructureFinder(explanation, XML_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER), - instanceOf(TextLogFileStructureFinder.class)); + assertThat( + 
structureFinderManager.makeBestStructureFinder( + explanation, + XML_SAMPLE, + StandardCharsets.UTF_8.name(), + randomBoolean(), + DEFAULT_LINE_MERGE_SIZE_LIMIT, + overrides, + NOOP_TIMEOUT_CHECKER + ), + instanceOf(TextLogFileStructureFinder.class) + ); } public void testMakeBestStructureGivenCsv() throws Exception { - assertThat(structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), - instanceOf(DelimitedFileStructureFinder.class)); + assertThat( + structureFinderManager.makeBestStructureFinder( + explanation, + CSV_SAMPLE, + StandardCharsets.UTF_8.name(), + randomBoolean(), + DEFAULT_LINE_MERGE_SIZE_LIMIT, + EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ), + instanceOf(DelimitedFileStructureFinder.class) + ); } public void testMakeBestStructureGivenCsvAndJsonOverride() { FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.NDJSON).build(); - IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER)); + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> structureFinderManager.makeBestStructureFinder( + explanation, + CSV_SAMPLE, + StandardCharsets.UTF_8.name(), + randomBoolean(), + DEFAULT_LINE_MERGE_SIZE_LIMIT, + overrides, + NOOP_TIMEOUT_CHECKER + ) + ); assertEquals("Input did not match the specified format [ndjson]", e.getMessage()); } public void testMakeBestStructureGivenText() throws Exception { - assertThat(structureFinderManager.makeBestStructureFinder(explanation, TEXT_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, 
EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER), - instanceOf(TextLogFileStructureFinder.class)); + assertThat( + structureFinderManager.makeBestStructureFinder( + explanation, + TEXT_SAMPLE, + StandardCharsets.UTF_8.name(), + randomBoolean(), + DEFAULT_LINE_MERGE_SIZE_LIMIT, + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ), + instanceOf(TextLogFileStructureFinder.class) + ); } public void testMakeBestStructureGivenTextAndDelimitedOverride() throws Exception { // Every line of the text sample has two colons, so colon delimited is possible, just very weird FileStructureOverrides overrides = FileStructureOverrides.builder() - .setFormat(FileStructure.Format.DELIMITED).setDelimiter(':').build(); - - assertThat(structureFinderManager.makeBestStructureFinder(explanation, TEXT_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER), - instanceOf(DelimitedFileStructureFinder.class)); + .setFormat(FileStructure.Format.DELIMITED) + .setDelimiter(':') + .build(); + + assertThat( + structureFinderManager.makeBestStructureFinder( + explanation, + TEXT_SAMPLE, + StandardCharsets.UTF_8.name(), + randomBoolean(), + DEFAULT_LINE_MERGE_SIZE_LIMIT, + overrides, + NOOP_TIMEOUT_CHECKER + ), + instanceOf(DelimitedFileStructureFinder.class) + ); } public void testFindFileStructureTimeout() throws IOException, InterruptedException { @@ -196,9 +283,17 @@ public void testFindFileStructureTimeout() throws IOException, InterruptedExcept junkProducer.start(); - ElasticsearchTimeoutException e = expectThrows(ElasticsearchTimeoutException.class, - () -> structureFinderManager.findFileStructure(explanation, FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, - linesOfJunk - 1, bigInput, EMPTY_OVERRIDES, timeout)); + ElasticsearchTimeoutException e = expectThrows( + ElasticsearchTimeoutException.class, + () -> structureFinderManager.findFileStructure( + explanation, + 
DEFAULT_LINE_MERGE_SIZE_LIMIT, + linesOfJunk - 1, + bigInput, + FileStructureOverrides.EMPTY_OVERRIDES, + timeout + ) + ); assertThat(e.getMessage(), startsWith("Aborting structure analysis during [")); assertThat(e.getMessage(), endsWith("] as it has taken longer than the timeout of [" + timeout + "]")); diff --git a/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureTestCase.java b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureTestCase.java new file mode 100644 index 0000000000000..9c9897ba2a50f --- /dev/null +++ b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureTestCase.java @@ -0,0 +1,93 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ +package org.elasticsearch.xpack.textstructure.structurefinder; + +import org.apache.logging.log4j.LogManager; +import org.elasticsearch.test.ESTestCase; +import org.junit.After; +import org.junit.Before; + +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Locale; +import java.util.stream.Collectors; + +public abstract class FileStructureTestCase extends ESTestCase { + + protected static final List POSSIBLE_CHARSETS = Collections.unmodifiableList( + Charset.availableCharsets() + .keySet() + .stream() + .filter(name -> FileStructureFinderManager.FILEBEAT_SUPPORTED_ENCODINGS.contains(name.toLowerCase(Locale.ROOT))) + .collect(Collectors.toList()) + ); + + protected static final String CSV_SAMPLE = "time,id,value\n" + + "2018-05-17T16:23:40,key1,42.0\n" + + "2018-05-17T16:24:11,\"key with spaces\",42.0\n"; + + protected static final String NDJSON_SAMPLE = "{\"logger\":\"controller\",\"timestamp\":1478261151445,\"level\":\"INFO\"," + + "\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 1\",\"class\":\"ml\"," + + "\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n" + + "{\"logger\":\"controller\",\"timestamp\":1478261151445," + + "\"level\":\"INFO\",\"pid\":42,\"thread\":\"0x7fff7d2a8000\",\"message\":\"message 2\",\"class\":\"ml\"," + + "\"method\":\"core::SomeNoiseMaker\",\"file\":\"Noisemaker.cc\",\"line\":333}\n"; + + protected static final String PIPE_DELIMITED_SAMPLE = "2018-01-06 16:56:14.295748|INFO |VirtualServer |1 |" + + "listening on 0.0.0.0:9987, :::9987\n" + + "2018-01-06 17:19:44.465252|INFO |VirtualServer |1 |client " + + "'User1'(id:2) changed default admin channelgroup to 'Guest'(id:8)\n" + + "2018-01-06 17:21:25.764368|INFO |VirtualServer |1 |client " + + "'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel 'Default Channel'(id:1)"; + + protected static final String 
SEMI_COLON_DELIMITED_SAMPLE = "\"pos_id\";\"trip_id\";\"latitude\";\"longitude\";\"altitude\";" + + "\"timestamp\"\n" + + "\"1\";\"3\";\"4703.7815\";\"1527.4713\";\"359.9\";\"2017-01-19 16:19:04.742113\"\n" + + "\"2\";\"3\";\"4703.7815\";\"1527.4714\";\"359.9\";\"2017-01-19 16:19:05.741890\"\n" + + "\"3\";\"3\";\"4703.7816\";\"1527.4716\";\"360.3\";\"2017-01-19 16:19:06.738842\""; + + protected static final String TEXT_SAMPLE = "[2018-05-11T17:07:29,461][INFO ][o.e.n.Node ] [node-0] initializing ...\n" + + "[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], " + + "net usable_space [223.4gb], net total_space [464.7gb], types [hfs]\n" + + "[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [3.9gb], " + + "compressed ordinary object pointers [true]\n" + + "[2018-05-11T17:07:29,556][INFO ][o.e.n.Node ] [node-0] node name [node-0], node ID [tJ9u8HcaTbWxRtnlfz1RQA]\n"; + + protected static final String TSV_SAMPLE = "time\tid\tvalue\n" + + "2018-05-17T16:23:40\tkey1\t42.0\n" + + "2018-05-17T16:24:11\t\"key with spaces\"\t42.0\n"; + + protected static final String XML_SAMPLE = "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n"; + + // This doesn't need closing because it has an infinite timeout + protected static final TimeoutChecker NOOP_TIMEOUT_CHECKER = new TimeoutChecker("unit test", null, null); + + protected List explanation; + + @Before + public void initExplanation() { + explanation = new ArrayList<>(); + } + + @After + public void printExplanation() { + LogManager.getLogger(getClass()).info("Explanation:\n" + String.join("\n", explanation)); + } + + protected Boolean randomHasByteOrderMarker(String charset) { + return charset.toUpperCase(Locale.ROOT).startsWith("UTF") ? 
randomBoolean() : null; + } +} diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureUtilsTests.java similarity index 76% rename from x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java rename to x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureUtilsTests.java index 5e690d739790a..f6e5e8ac62880 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java +++ b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/FileStructureUtilsTests.java @@ -3,10 +3,10 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import org.elasticsearch.common.collect.Tuple; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FieldStats; import java.util.Arrays; import java.util.Collections; @@ -16,8 +16,6 @@ import java.util.Map; import java.util.SortedMap; -import static org.elasticsearch.xpack.ml.filestructurefinder.FileStructureOverrides.EMPTY_OVERRIDES; -import static org.elasticsearch.xpack.ml.filestructurefinder.FileStructureUtils.MAPPING_TYPE_SETTING; import static org.hamcrest.Matchers.contains; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.instanceOf; @@ -37,8 +35,12 @@ public void testMoreLikelyGivenKeyword() { public void testGuessTimestampGivenSingleSampleSingleField() { Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), - EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + Tuple match = FileStructureUtils.guessTimestampField( + explanation, + Collections.singletonList(sample), + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); assertNotNull(match); assertEquals("field1", match.v1()); assertThat(match.v2().getJavaTimestampFormats(), contains("ISO8601")); @@ -50,8 +52,12 @@ public void testGuessTimestampGivenSingleSampleSingleFieldAndConsistentTimeField FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("field1").build(); Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), - overrides, NOOP_TIMEOUT_CHECKER); + Tuple match = FileStructureUtils.guessTimestampField( + explanation, + Collections.singletonList(sample), + overrides, + 
NOOP_TIMEOUT_CHECKER + ); assertNotNull(match); assertEquals("field1", match.v1()); assertThat(match.v2().getJavaTimestampFormats(), contains("ISO8601")); @@ -63,9 +69,10 @@ public void testGuessTimestampGivenSingleSampleSingleFieldAndImpossibleTimeField FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("field2").build(); Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); - IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides, - NOOP_TIMEOUT_CHECKER)); + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides, NOOP_TIMEOUT_CHECKER) + ); assertEquals("Specified timestamp field [field2] is not present in record [{field1=2018-05-24T17:28:31,735}]", e.getMessage()); } @@ -75,8 +82,12 @@ public void testGuessTimestampGivenSingleSampleSingleFieldAndConsistentTimeForma FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("ISO8601").build(); Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), - overrides, NOOP_TIMEOUT_CHECKER); + Tuple match = FileStructureUtils.guessTimestampField( + explanation, + Collections.singletonList(sample), + overrides, + NOOP_TIMEOUT_CHECKER + ); assertNotNull(match); assertEquals("field1", match.v1()); assertThat(match.v2().getJavaTimestampFormats(), contains("ISO8601")); @@ -88,19 +99,26 @@ public void testGuessTimestampGivenSingleSampleSingleFieldAndImpossibleTimeForma FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("EEE MMM dd HH:mm:ss yyyy").build(); Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); - 
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides, - NOOP_TIMEOUT_CHECKER)); + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides, NOOP_TIMEOUT_CHECKER) + ); - assertEquals("Specified timestamp format [EEE MMM dd HH:mm:ss yyyy] does not match for record [{field1=2018-05-24T17:28:31,735}]", - e.getMessage()); + assertEquals( + "Specified timestamp format [EEE MMM dd HH:mm:ss yyyy] does not match for record [{field1=2018-05-24T17:28:31,735}]", + e.getMessage() + ); } public void testGuessTimestampGivenSamplesWithSameSingleTimeField() { Map sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); Map sample2 = Collections.singletonMap("field1", "2018-05-24T17:33:39,406"); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), - EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + Tuple match = FileStructureUtils.guessTimestampField( + explanation, + Arrays.asList(sample1, sample2), + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); assertNotNull(match); assertEquals("field1", match.v1()); assertThat(match.v2().getJavaTimestampFormats(), contains("ISO8601")); @@ -110,16 +128,24 @@ public void testGuessTimestampGivenSamplesWithSameSingleTimeField() { public void testGuessTimestampGivenSamplesWithOneSingleTimeFieldDifferentFormat() { Map sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); Map sample2 = Collections.singletonMap("field1", "Thu May 24 17:33:39 2018"); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), - EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + Tuple match = FileStructureUtils.guessTimestampField( + explanation, + Arrays.asList(sample1, sample2), + 
FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); assertNull(match); } public void testGuessTimestampGivenSamplesWithDifferentSingleTimeField() { Map sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); Map sample2 = Collections.singletonMap("another_field", "2018-05-24T17:33:39,406"); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), - EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + Tuple match = FileStructureUtils.guessTimestampField( + explanation, + Arrays.asList(sample1, sample2), + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); assertNull(match); } @@ -128,8 +154,12 @@ public void testGuessTimestampGivenSingleSampleManyFieldsOneTimeFormat() { sample.put("foo", "not a time"); sample.put("time", "2018-05-24 17:28:31,735"); sample.put("bar", 42); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), - EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + Tuple match = FileStructureUtils.guessTimestampField( + explanation, + Collections.singletonList(sample), + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); assertNotNull(match); assertEquals("time", match.v1()); assertThat(match.v2().getJavaTimestampFormats(), contains("yyyy-MM-dd HH:mm:ss,SSS")); @@ -145,8 +175,12 @@ public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormat() { sample2.put("foo", "whatever"); sample2.put("time", "2018-05-29 11:53:02,837"); sample2.put("bar", 17); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), - EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + Tuple match = FileStructureUtils.guessTimestampField( + explanation, + Arrays.asList(sample1, sample2), + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); assertNotNull(match); assertEquals("time", match.v1()); assertThat(match.v2().getJavaTimestampFormats(), contains("yyyy-MM-dd HH:mm:ss,SSS")); 
@@ -162,8 +196,12 @@ public void testGuessTimestampGivenSamplesWithManyFieldsSameTimeFieldDifferentTi sample2.put("foo", "whatever"); sample2.put("time", "May 29 2018 11:53:02"); sample2.put("bar", 17); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), - EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + Tuple match = FileStructureUtils.guessTimestampField( + explanation, + Arrays.asList(sample1, sample2), + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); assertNull(match); } @@ -176,8 +214,12 @@ public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormatDist sample2.put("red_herring", "whatever"); sample2.put("time", "2018-05-29 11:53:02,837"); sample2.put("bar", 17); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), - EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + Tuple match = FileStructureUtils.guessTimestampField( + explanation, + Arrays.asList(sample1, sample2), + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); assertNotNull(match); assertEquals("time", match.v1()); assertThat(match.v2().getJavaTimestampFormats(), contains("yyyy-MM-dd HH:mm:ss,SSS")); @@ -193,8 +235,12 @@ public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormatDist sample2.put("foo", "whatever"); sample2.put("time", "May 29 2018 11:53:02"); sample2.put("red_herring", "17"); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), - EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + Tuple match = FileStructureUtils.guessTimestampField( + explanation, + Arrays.asList(sample1, sample2), + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); assertNotNull(match); assertEquals("time", match.v1()); assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss")); @@ -210,8 +256,12 @@ public void 
testGuessTimestampGivenSamplesWithManyFieldsInconsistentTimeFields() sample2.put("foo", "whatever"); sample2.put("time2", "May 29 2018 11:53:02"); sample2.put("bar", 42); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), - EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + Tuple match = FileStructureUtils.guessTimestampField( + explanation, + Arrays.asList(sample1, sample2), + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); assertNull(match); } @@ -226,8 +276,12 @@ public void testGuessTimestampGivenSamplesWithManyFieldsInconsistentAndConsisten sample2.put("time2", "May 10 2018 11:53:02"); sample2.put("time3", "Thu, May 10 2018 11:53:02"); sample2.put("bar", 42); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), - EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + Tuple match = FileStructureUtils.guessTimestampField( + explanation, + Arrays.asList(sample1, sample2), + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); assertNotNull(match); assertEquals("time2", match.v1()); assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss")); @@ -239,26 +293,26 @@ public void testGuessMappingGivenNothing() { } public void testGuessMappingGivenKeyword() { - Map expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "keyword"); + Map expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("ERROR", "INFO", "DEBUG"))); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("2018-06-11T13:26:47Z", "not a date"))); } public void testGuessMappingGivenText() { - Map expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "text"); + Map expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text"); assertEquals(expected, guessMapping(explanation, "foo", 
Arrays.asList("a", "the quick brown fox jumped over the lazy dog"))); } public void testGuessMappingGivenIp() { - Map expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "ip"); + Map expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "ip"); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("10.0.0.1", "172.16.0.1", "192.168.0.1"))); } public void testGuessMappingGivenDouble() { - Map expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "double"); + Map expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "double"); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("3.14159265359", "0", "-8"))); // 12345678901234567890 is too long for long @@ -268,7 +322,7 @@ public void testGuessMappingGivenDouble() { } public void testGuessMappingGivenLong() { - Map expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "long"); + Map expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("500", "3", "-3"))); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(500, 6, 0))); @@ -276,39 +330,47 @@ public void testGuessMappingGivenLong() { public void testGuessMappingGivenDate() { Map expected = new HashMap<>(); - expected.put(MAPPING_TYPE_SETTING, "date"); + expected.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); expected.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "iso8601"); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("2018-06-11T13:26:47Z", "2018-06-11T13:27:12Z"))); } public void testGuessMappingGivenBoolean() { - Map expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "boolean"); + Map expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "boolean"); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("false", "true"))); assertEquals(expected, guessMapping(explanation, "foo", 
Arrays.asList(true, false))); } public void testGuessMappingGivenArray() { - Map expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "long"); + Map expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(42, Arrays.asList(1, -99)))); - expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "keyword"); + expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"); - assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(new String[]{ "x", "y" }, "z"))); + assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList(new String[] { "x", "y" }, "z"))); } public void testGuessMappingGivenObject() { - Map expected = Collections.singletonMap(MAPPING_TYPE_SETTING, "object"); - - assertEquals(expected, guessMapping(explanation, "foo", - Arrays.asList(Collections.singletonMap("name", "value1"), Collections.singletonMap("name", "value2")))); + Map expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "object"); + + assertEquals( + expected, + guessMapping( + explanation, + "foo", + Arrays.asList(Collections.singletonMap("name", "value1"), Collections.singletonMap("name", "value2")) + ) + ); } public void testGuessMappingGivenObjectAndNonObject() { - RuntimeException e = expectThrows(RuntimeException.class, () -> guessMapping(explanation, - "foo", Arrays.asList(Collections.singletonMap("name", "value1"), "value2"))); + RuntimeException e = expectThrows( + RuntimeException.class, + () -> guessMapping(explanation, "foo", Arrays.asList(Collections.singletonMap("name", "value1"), "value2")) + ); assertEquals("Field [foo] has both object and non-object values - this is not supported by Elasticsearch", e.getMessage()); } @@ -325,47 +387,75 @@ public void testGuessMappingsAndCalculateFieldStats() { sample2.put("bar", 17); sample2.put("nothing", null); - Tuple, SortedMap> mappingsAndFieldStats = - 
FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, Arrays.asList(sample1, sample2), NOOP_TIMEOUT_CHECKER); + Tuple, SortedMap> mappingsAndFieldStats = FileStructureUtils + .guessMappingsAndCalculateFieldStats(explanation, Arrays.asList(sample1, sample2), NOOP_TIMEOUT_CHECKER); assertNotNull(mappingsAndFieldStats); Map mappings = mappingsAndFieldStats.v1(); assertNotNull(mappings); - assertEquals(Collections.singletonMap(MAPPING_TYPE_SETTING, "keyword"), mappings.get("foo")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("foo")); Map expectedTimeMapping = new HashMap<>(); - expectedTimeMapping.put(MAPPING_TYPE_SETTING, "date"); + expectedTimeMapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); expectedTimeMapping.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "yyyy-MM-dd HH:mm:ss,SSS"); assertEquals(expectedTimeMapping, mappings.get("time")); - assertEquals(Collections.singletonMap(MAPPING_TYPE_SETTING, "long"), mappings.get("bar")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("bar")); assertNull(mappings.get("nothing")); Map fieldStats = mappingsAndFieldStats.v2(); assertNotNull(fieldStats); assertEquals(3, fieldStats.size()); assertEquals(new FieldStats(2, 2, makeTopHits("not a time", 1, "whatever", 1)), fieldStats.get("foo")); - assertEquals(new FieldStats(2, 2, "2018-05-24 17:28:31,735", "2018-05-29 11:53:02,837", - makeTopHits("2018-05-24 17:28:31,735", 1, "2018-05-29 11:53:02,837", 1)), fieldStats.get("time")); + assertEquals( + new FieldStats( + 2, + 2, + "2018-05-24 17:28:31,735", + "2018-05-29 11:53:02,837", + makeTopHits("2018-05-24 17:28:31,735", 1, "2018-05-29 11:53:02,837", 1) + ), + fieldStats.get("time") + ); assertEquals(new FieldStats(2, 2, 17.0, 42.0, 29.5, 29.5, makeTopHits(17, 1, 42, 1)), fieldStats.get("bar")); assertNull(fieldStats.get("nothing")); } public void 
testMakeIngestPipelineDefinitionGivenNdJsonWithoutTimestamp() { - assertNull(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), null, Collections.emptyMap(), null, null, - false, false)); + assertNull( + FileStructureUtils.makeIngestPipelineDefinition( + null, + Collections.emptyMap(), + null, + Collections.emptyMap(), + null, + null, + false, + false + ) + ); } @SuppressWarnings("unchecked") public void testMakeIngestPipelineDefinitionGivenNdJsonWithTimestamp() { String timestampField = randomAlphaOfLength(10); - List timestampFormats = randomFrom(Collections.singletonList("ISO8601"), - Arrays.asList("EEE MMM dd HH:mm:ss yyyy", "EEE MMM d HH:mm:ss yyyy")); + List timestampFormats = randomFrom( + Collections.singletonList("ISO8601"), + Arrays.asList("EEE MMM dd HH:mm:ss yyyy", "EEE MMM d HH:mm:ss yyyy") + ); boolean needClientTimezone = randomBoolean(); boolean needNanosecondPrecision = randomBoolean(); - Map pipeline = FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), null, - Collections.emptyMap(), timestampField, timestampFormats, needClientTimezone, needNanosecondPrecision); + Map pipeline = FileStructureUtils.makeIngestPipelineDefinition( + null, + Collections.emptyMap(), + null, + Collections.emptyMap(), + timestampField, + timestampFormats, + needClientTimezone, + needNanosecondPrecision + ); assertNotNull(pipeline); assertEquals("Ingest pipeline created by file structure finder", pipeline.remove("description")); @@ -394,8 +484,16 @@ public void testMakeIngestPipelineDefinitionGivenDelimitedWithoutTimestamp() { Map csvProcessorSettings = DelimitedFileStructureFinderTests.randomCsvProcessorSettings(); - Map pipeline = FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), csvProcessorSettings, - Collections.emptyMap(), null, null, false, false); + Map pipeline = FileStructureUtils.makeIngestPipelineDefinition( + null, + Collections.emptyMap(), + csvProcessorSettings, + 
Collections.emptyMap(), + null, + null, + false, + false + ); assertNotNull(pipeline); assertEquals("Ingest pipeline created by file structure finder", pipeline.remove("description")); @@ -425,8 +523,16 @@ public void testMakeIngestPipelineDefinitionGivenDelimitedWithFieldInTargetField String firstTargetField = ((List) csvProcessorSettings.get("target_fields")).get(0); csvProcessorSettings.put("field", firstTargetField); - Map pipeline = FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), csvProcessorSettings, - Collections.emptyMap(), null, null, false, false); + Map pipeline = FileStructureUtils.makeIngestPipelineDefinition( + null, + Collections.emptyMap(), + csvProcessorSettings, + Collections.emptyMap(), + null, + null, + false, + false + ); assertNotNull(pipeline); assertEquals("Ingest pipeline created by file structure finder", pipeline.remove("description")); @@ -452,11 +558,21 @@ public void testMakeIngestPipelineDefinitionGivenDelimitedWithConversion() { boolean expectConversion = randomBoolean(); String mappingType = expectConversion ? 
randomFrom("long", "double", "boolean") : randomFrom("keyword", "text", "date"); String firstTargetField = ((List) csvProcessorSettings.get("target_fields")).get(0); - Map mappingsForConversions = - Collections.singletonMap(firstTargetField, Collections.singletonMap(MAPPING_TYPE_SETTING, mappingType)); + Map mappingsForConversions = Collections.singletonMap( + firstTargetField, + Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType) + ); - Map pipeline = FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), csvProcessorSettings, - mappingsForConversions, null, null, false, false); + Map pipeline = FileStructureUtils.makeIngestPipelineDefinition( + null, + Collections.emptyMap(), + csvProcessorSettings, + mappingsForConversions, + null, + null, + false, + false + ); assertNotNull(pipeline); assertEquals("Ingest pipeline created by file structure finder", pipeline.remove("description")); @@ -493,13 +609,23 @@ public void testMakeIngestPipelineDefinitionGivenDelimitedWithTimestamp() { Map csvProcessorSettings = DelimitedFileStructureFinderTests.randomCsvProcessorSettings(); String timestampField = randomAlphaOfLength(10); - List timestampFormats = randomFrom(Collections.singletonList("ISO8601"), - Arrays.asList("EEE MMM dd HH:mm:ss yyyy", "EEE MMM d HH:mm:ss yyyy")); + List timestampFormats = randomFrom( + Collections.singletonList("ISO8601"), + Arrays.asList("EEE MMM dd HH:mm:ss yyyy", "EEE MMM d HH:mm:ss yyyy") + ); boolean needClientTimezone = randomBoolean(); boolean needNanosecondPrecision = randomBoolean(); - Map pipeline = FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), csvProcessorSettings, - Collections.emptyMap(), timestampField, timestampFormats, needClientTimezone, needNanosecondPrecision); + Map pipeline = FileStructureUtils.makeIngestPipelineDefinition( + null, + Collections.emptyMap(), + csvProcessorSettings, + Collections.emptyMap(), + timestampField, + timestampFormats, + 
needClientTimezone, + needNanosecondPrecision + ); assertNotNull(pipeline); assertEquals("Ingest pipeline created by file structure finder", pipeline.remove("description")); @@ -538,13 +664,23 @@ public void testMakeIngestPipelineDefinitionGivenSemiStructured() { String grokPattern = randomAlphaOfLength(100); String timestampField = randomAlphaOfLength(10); - List timestampFormats = randomFrom(Collections.singletonList("ISO8601"), - Arrays.asList("EEE MMM dd HH:mm:ss yyyy", "EEE MMM d HH:mm:ss yyyy")); + List timestampFormats = randomFrom( + Collections.singletonList("ISO8601"), + Arrays.asList("EEE MMM dd HH:mm:ss yyyy", "EEE MMM d HH:mm:ss yyyy") + ); boolean needClientTimezone = randomBoolean(); boolean needNanosecondPrecision = randomBoolean(); - Map pipeline = FileStructureUtils.makeIngestPipelineDefinition(grokPattern, Collections.emptyMap(), null, - Collections.emptyMap(), timestampField, timestampFormats, needClientTimezone, needNanosecondPrecision); + Map pipeline = FileStructureUtils.makeIngestPipelineDefinition( + grokPattern, + Collections.emptyMap(), + null, + Collections.emptyMap(), + timestampField, + timestampFormats, + needClientTimezone, + needNanosecondPrecision + ); assertNotNull(pipeline); assertEquals("Ingest pipeline created by file structure finder", pipeline.remove("description")); @@ -584,7 +720,7 @@ public void testGuessGeoPoint() { Arrays.asList("POINT (-77.03653 38.897676)", "POINT (-50.03653 28.8973)"), NOOP_TIMEOUT_CHECKER ); - assertThat(mapping.get(MAPPING_TYPE_SETTING), equalTo("geo_point")); + assertThat(mapping.get(FileStructureUtils.MAPPING_TYPE_SETTING), equalTo("geo_point")); mapping = FileStructureUtils.guessScalarMapping( explanation, @@ -592,7 +728,7 @@ public void testGuessGeoPoint() { Arrays.asList("POINT (-77.03653 38.897676)", "bar"), NOOP_TIMEOUT_CHECKER ); - assertThat(mapping.get(MAPPING_TYPE_SETTING), equalTo("keyword")); + assertThat(mapping.get(FileStructureUtils.MAPPING_TYPE_SETTING), equalTo("keyword")); } 
public void testGuessGeoShape() { @@ -603,19 +739,19 @@ public void testGuessGeoShape() { "POINT (-77.03653 38.897676)", "LINESTRING (-77.03653 38.897676, -77.009051 38.889939)", "POLYGON ((100.0 0.0, 101.0 0.0, 101.0 1.0, 100.0 1.0, 100.0 0.0))", - "POLYGON ((100.0 0.0, 101.0 0.0, 101.0 1.0, 100.0 1.0, 100.0 0.0), " + - "(100.2 0.2, 100.8 0.2, 100.8 0.8, 100.2 0.8, 100.2 0.2))", + "POLYGON ((100.0 0.0, 101.0 0.0, 101.0 1.0, 100.0 1.0, 100.0 0.0), " + + "(100.2 0.2, 100.8 0.2, 100.8 0.8, 100.2 0.8, 100.2 0.2))", "MULTIPOINT (102.0 2.0, 103.0 2.0)", - "MULTILINESTRING ((102.0 2.0, 103.0 2.0, 103.0 3.0, 102.0 3.0), (100.0 0.0, 101.0 0.0, 101.0 1.0, 100.0 1.0)," + - " (100.2 0.2, 100.8 0.2, 100.8 0.8, 100.2 0.8))", - "MULTIPOLYGON (((102.0 2.0, 103.0 2.0, 103.0 3.0, 102.0 3.0, 102.0 2.0)), ((100.0 0.0, 101.0 0.0, 101.0 1.0, " + - "100.0 1.0, 100.0 0.0), (100.2 0.2, 100.8 0.2, 100.8 0.8, 100.2 0.8, 100.2 0.2)))", + "MULTILINESTRING ((102.0 2.0, 103.0 2.0, 103.0 3.0, 102.0 3.0), (100.0 0.0, 101.0 0.0, 101.0 1.0, 100.0 1.0)," + + " (100.2 0.2, 100.8 0.2, 100.8 0.8, 100.2 0.8))", + "MULTIPOLYGON (((102.0 2.0, 103.0 2.0, 103.0 3.0, 102.0 3.0, 102.0 2.0)), ((100.0 0.0, 101.0 0.0, 101.0 1.0, " + + "100.0 1.0, 100.0 0.0), (100.2 0.2, 100.8 0.2, 100.8 0.8, 100.2 0.8, 100.2 0.2)))", "GEOMETRYCOLLECTION (POINT (100.0 0.0), LINESTRING (101.0 0.0, 102.0 1.0))", "BBOX (100.0, 102.0, 2.0, 0.0)" ), NOOP_TIMEOUT_CHECKER ); - assertThat(mapping.get(MAPPING_TYPE_SETTING), equalTo("geo_shape")); + assertThat(mapping.get(FileStructureUtils.MAPPING_TYPE_SETTING), equalTo("geo_shape")); mapping = FileStructureUtils.guessScalarMapping( explanation, @@ -623,12 +759,16 @@ public void testGuessGeoShape() { Arrays.asList("POINT (-77.03653 38.897676)", "LINESTRING (-77.03653 38.897676, -77.009051 38.889939)", "bar"), NOOP_TIMEOUT_CHECKER ); - assertThat(mapping.get(MAPPING_TYPE_SETTING), equalTo("keyword")); + assertThat(mapping.get(FileStructureUtils.MAPPING_TYPE_SETTING), equalTo("keyword")); } 
private Map guessMapping(List explanation, String fieldName, List fieldValues) { - Tuple, FieldStats> mappingAndFieldStats = FileStructureUtils.guessMappingAndCalculateFieldStats(explanation, - fieldName, fieldValues, NOOP_TIMEOUT_CHECKER); + Tuple, FieldStats> mappingAndFieldStats = FileStructureUtils.guessMappingAndCalculateFieldStats( + explanation, + fieldName, + fieldValues, + NOOP_TIMEOUT_CHECKER + ); return (mappingAndFieldStats == null) ? null : mappingAndFieldStats.v1(); } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/GrokPatternCreatorTests.java similarity index 63% rename from x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java rename to x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/GrokPatternCreatorTests.java index 95db2f2e34b7f..58caf79665b13 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java +++ b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/GrokPatternCreatorTests.java @@ -3,10 +3,10 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import org.elasticsearch.common.collect.Tuple; -import org.elasticsearch.xpack.ml.filestructurefinder.GrokPatternCreator.ValueOnlyGrokPatternCandidate; +import org.elasticsearch.xpack.textstructure.structurefinder.GrokPatternCreator.ValueOnlyGrokPatternCandidate; import java.util.ArrayList; import java.util.Arrays; @@ -33,10 +33,12 @@ public void testBuildFieldName() { public void testPopulatePrefacesAndEpiloguesGivenTimestamp() { - Collection matchingStrings = Arrays.asList("[2018-01-25T15:33:23] DEBUG ", + Collection matchingStrings = Arrays.asList( + "[2018-01-25T15:33:23] DEBUG ", "[2018-01-24T12:33:23] ERROR ", "junk [2018-01-22T07:33:23] INFO ", - "[2018-01-21T03:33:23] DEBUG "); + "[2018-01-21T03:33:23] DEBUG " + ); ValueOnlyGrokPatternCandidate candidate = new ValueOnlyGrokPatternCandidate("TIMESTAMP_ISO8601", "date", "extra_timestamp"); Map fieldNameCountStore = new HashMap<>(); @@ -51,9 +53,7 @@ public void testPopulatePrefacesAndEpiloguesGivenTimestamp() { public void testPopulatePrefacesAndEpiloguesGivenEmailAddress() { - Collection matchingStrings = Arrays.asList("before alice@acme.com after", - "abc bob@acme.com xyz", - "carol@acme.com"); + Collection matchingStrings = Arrays.asList("before alice@acme.com after", "abc bob@acme.com xyz", "carol@acme.com"); ValueOnlyGrokPatternCandidate candidate = new ValueOnlyGrokPatternCandidate("EMAILADDRESS", "keyword", "email"); Map fieldNameCountStore = new HashMap<>(); @@ -68,28 +68,41 @@ public void testPopulatePrefacesAndEpiloguesGivenEmailAddress() { public void testAppendBestGrokMatchForStringsGivenTimestampsAndLogLevels() { - Collection snippets = Arrays.asList("[2018-01-25T15:33:23] DEBUG ", + Collection snippets = Arrays.asList( + "[2018-01-25T15:33:23] DEBUG ", "[2018-01-24T12:33:23] ERROR ", "junk [2018-01-22T07:33:23] INFO ", - "[2018-01-21T03:33:23] DEBUG "); + 
"[2018-01-21T03:33:23] DEBUG " + ); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + snippets, + null, + null, + Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER + ); grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0); - assertEquals(".*?\\[%{TIMESTAMP_ISO8601:extra_timestamp}\\] %{LOGLEVEL:loglevel} ", - grokPatternCreator.getOverallGrokPatternBuilder().toString()); + assertEquals( + ".*?\\[%{TIMESTAMP_ISO8601:extra_timestamp}\\] %{LOGLEVEL:loglevel} ", + grokPatternCreator.getOverallGrokPatternBuilder().toString() + ); } public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() { - Collection snippets = Arrays.asList("(-2)", - " (-3)", - " (4)", - " (-5) "); + Collection snippets = Arrays.asList("(-2)", " (-3)", " (4)", " (-5) "); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + snippets, + null, + null, + Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER + ); grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0); assertEquals(".*?\\(%{INT:field}\\).*?", grokPatternCreator.getOverallGrokPatternBuilder().toString()); @@ -97,12 +110,16 @@ public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() { public void testAppendBestGrokMatchForStringsGivenNegativeNumbersWithoutBreak() { - Collection snippets = Arrays.asList("before-2 ", - "prior to-3", - "-4"); + Collection snippets = Arrays.asList("before-2 ", "prior to-3", "-4"); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + 
explanation, + snippets, + null, + null, + Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER + ); grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0); // It seems sensible that we don't detect these suffices as either base 10 or base 16 numbers @@ -111,13 +128,16 @@ public void testAppendBestGrokMatchForStringsGivenNegativeNumbersWithoutBreak() public void testAppendBestGrokMatchForStringsGivenHexNumbers() { - Collection snippets = Arrays.asList(" abc", - " 123", - " -123", - "1f is hex"); + Collection snippets = Arrays.asList(" abc", " 123", " -123", "1f is hex"); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + snippets, + null, + null, + Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER + ); grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0); assertEquals(".*?%{BASE16NUM:field}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString()); @@ -125,11 +145,16 @@ public void testAppendBestGrokMatchForStringsGivenHexNumbers() { public void testAppendBestGrokMatchForStringsGivenHostnamesWithNumbers() { - Collection snippets = Arrays.asList(" snippets = Arrays.asList(" snippets = Arrays.asList("before alice@acme.com after", - "abc bob@acme.com xyz", - "carol@acme.com"); + Collection snippets = Arrays.asList("before alice@acme.com after", "abc bob@acme.com xyz", "carol@acme.com"); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + snippets, + null, + null, + Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER + ); grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0); assertEquals(".*?%{EMAILADDRESS:email}.*?", 
grokPatternCreator.getOverallGrokPatternBuilder().toString()); @@ -151,12 +180,20 @@ public void testAppendBestGrokMatchForStringsGivenEmailAddresses() { public void testAppendBestGrokMatchForStringsGivenUris() { - Collection snippets = Arrays.asList("main site https://www.elastic.co/ with trailing slash", + Collection snippets = Arrays.asList( + "main site https://www.elastic.co/ with trailing slash", "https://www.elastic.co/guide/en/x-pack/current/ml-configuring-categories.html#ml-configuring-categories is a section", - "download today from https://www.elastic.co/downloads"); + "download today from https://www.elastic.co/downloads" + ); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + snippets, + null, + null, + Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER + ); grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0); assertEquals(".*?%{URI:uri}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString()); @@ -164,12 +201,16 @@ public void testAppendBestGrokMatchForStringsGivenUris() { public void testAppendBestGrokMatchForStringsGivenPaths() { - Collection snippets = Arrays.asList("on Mac /Users/dave", - "on Windows C:\\Users\\dave", - "on Linux /home/dave"); + Collection snippets = Arrays.asList("on Mac /Users/dave", "on Windows C:\\Users\\dave", "on Linux /home/dave"); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + snippets, + null, + null, + Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER + ); grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0); assertEquals(".*? .*? 
%{PATH:path}", grokPatternCreator.getOverallGrokPatternBuilder().toString()); @@ -177,13 +218,21 @@ public void testAppendBestGrokMatchForStringsGivenPaths() { public void testAppendBestGrokMatchForStringsGivenKvPairs() { - Collection snippets = Arrays.asList("foo=1 and bar=a", + Collection snippets = Arrays.asList( + "foo=1 and bar=a", "something foo=2 bar=b something else", "foo=3 bar=c", - " foo=1 bar=a "); + " foo=1 bar=a " + ); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + snippets, + null, + null, + Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER + ); grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0); assertEquals(".*?\\bfoo=%{USER:foo} .*?\\bbar=%{USER:bar}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString()); @@ -195,16 +244,24 @@ public void testCreateGrokPatternFromExamplesGivenNamedLogs() { "Sep 8 11:55:06 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'elastic.slack.com/A/IN': 95.110.64.205#53", "Sep 8 11:55:08 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'slack-imgs.com/A/IN': 95.110.64.205#53", "Sep 8 11:55:35 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53", - "Sep 8 11:55:42 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'b.akamaiedge.net/A/IN': 95.110.64.205#53"); + "Sep 8 11:55:42 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'b.akamaiedge.net/A/IN': 95.110.64.205#53" + ); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + sampleMessages, + mappings, + null, + Collections.emptyMap(), + 
NOOP_TIMEOUT_CHECKER + ); - assertEquals("%{SYSLOGTIMESTAMP:timestamp} .*? .*?\\[%{INT:field}\\]: %{LOGLEVEL:loglevel} \\(.*? .*? .*?\\) .*? " + - "%{QUOTEDSTRING:field2}: %{IP:ipaddress}#%{INT:field3}", - grokPatternCreator.createGrokPatternFromExamples("SYSLOGTIMESTAMP", FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, - "timestamp")); + assertEquals( + "%{SYSLOGTIMESTAMP:timestamp} .*? .*?\\[%{INT:field}\\]: %{LOGLEVEL:loglevel} \\(.*? .*? .*?\\) .*? " + + "%{QUOTEDSTRING:field2}: %{IP:ipaddress}#%{INT:field3}", + grokPatternCreator.createGrokPatternFromExamples("SYSLOGTIMESTAMP", FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, "timestamp") + ); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel")); @@ -216,22 +273,34 @@ public void testCreateGrokPatternFromExamplesGivenNamedLogs() { public void testCreateGrokPatternFromExamplesGivenCatalinaLogs() { Collection sampleMessages = Arrays.asList( - "Aug 29, 2009 12:03:33 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " + - "Invalid chunk ignored.", - "Aug 29, 2009 12:03:40 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " + - "Invalid chunk ignored.", - "Aug 29, 2009 12:03:45 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " + - "Invalid chunk ignored.", - "Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " + - "Invalid chunk ignored."); + "Aug 29, 2009 12:03:33 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " + + "Invalid chunk ignored.", + "Aug 29, 2009 12:03:40 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " + + "Invalid chunk ignored.", + "Aug 29, 2009 12:03:45 AM 
org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " + + "Invalid chunk ignored.", + "Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " + + "Invalid chunk ignored." + ); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + sampleMessages, + mappings, + null, + Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER + ); - assertEquals("%{CATALINA_DATESTAMP:timestamp} .*? .*?\\n%{LOGLEVEL:loglevel}: .*", - grokPatternCreator.createGrokPatternFromExamples("CATALINA_DATESTAMP", FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, - "timestamp")); + assertEquals( + "%{CATALINA_DATESTAMP:timestamp} .*? .*?\\n%{LOGLEVEL:loglevel}: .*", + grokPatternCreator.createGrokPatternFromExamples( + "CATALINA_DATESTAMP", + FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, + "timestamp" + ) + ); assertEquals(1, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel")); } @@ -240,23 +309,35 @@ public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogs() { // Two timestamps: one local, one UTC Collection sampleMessages = Arrays.asList( - "559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp", - "559550912548986880\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp", - "559550912548986887\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp", - 
"559550912603512850\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp"); + "559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986880\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986887\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912603512850\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp" + ); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + sampleMessages, + mappings, + null, + Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER + ); - assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{TIMESTAMP_ISO8601:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" + - "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", - grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, - "timestamp")); + assertEquals( + "%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{TIMESTAMP_ISO8601:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" + + "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", + grokPatternCreator.createGrokPatternFromExamples( + "TIMESTAMP_ISO8601", + FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, + "timestamp" + ) + ); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); Map expectedDateMapping 
= new HashMap<>(); @@ -272,23 +353,35 @@ public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogsAndIndetermi // Two timestamps: one ISO8601, one indeterminate day/month Collection sampleMessages = Arrays.asList( - "559550912540598297\t2016-04-20T14:06:53\t20/04/2016 21:06:53,123456\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp", - "559550912548986880\t2016-04-20T14:06:53\t20/04/2016 21:06:53,123456\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp", - "559550912548986887\t2016-04-20T14:06:53\t20/04/2016 21:06:53,123456\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp", - "559550912603512850\t2016-04-20T14:06:53\t20/04/2016 21:06:53,123456\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp"); + "559550912540598297\t2016-04-20T14:06:53\t20/04/2016 21:06:53,123456\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986880\t2016-04-20T14:06:53\t20/04/2016 21:06:53,123456\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986887\t2016-04-20T14:06:53\t20/04/2016 21:06:53,123456\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912603512850\t2016-04-20T14:06:53\t20/04/2016 21:06:53,123456\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp" + ); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + sampleMessages, + mappings, + null, + Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER + ); - 
assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{DATESTAMP:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" + - "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", - grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, - "timestamp")); + assertEquals( + "%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{DATESTAMP:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" + + "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", + grokPatternCreator.createGrokPatternFromExamples( + "TIMESTAMP_ISO8601", + FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, + "timestamp" + ) + ); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); Map expectedDateMapping = new HashMap<>(); @@ -304,26 +397,34 @@ public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogsAndCustomDef // Two timestamps: one custom, one built-in Collection sampleMessages = Arrays.asList( - "559550912540598297\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp", - "559550912548986880\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp", - "559550912548986887\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp", - "559550912603512850\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp"); + "559550912540598297\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986880\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986887\t4/20/2016 
2:06PM\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912603512850\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp" + ); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + sampleMessages, + mappings, + null, Collections.singletonMap("CUSTOM_TIMESTAMP", "%{MONTHNUM}/%{MONTHDAY}/%{YEAR} %{HOUR}:%{MINUTE}(?:AM|PM)"), - NOOP_TIMEOUT_CHECKER); + NOOP_TIMEOUT_CHECKER + ); Map customMapping = new HashMap<>(); customMapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); customMapping.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "M/dd/yyyy h:mma"); - assertEquals("%{INT:field}\\t%{CUSTOM_TIMESTAMP:timestamp}\\t%{TIMESTAMP_ISO8601:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" + - "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", - grokPatternCreator.createGrokPatternFromExamples("CUSTOM_TIMESTAMP", customMapping, "timestamp")); + assertEquals( + "%{INT:field}\\t%{CUSTOM_TIMESTAMP:timestamp}\\t%{TIMESTAMP_ISO8601:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" + + "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", + grokPatternCreator.createGrokPatternFromExamples("CUSTOM_TIMESTAMP", customMapping, "timestamp") + ); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); Map expectedDateMapping = new HashMap<>(); @@ -339,23 +440,35 @@ public void testCreateGrokPatternFromExamplesGivenTimestampAndTimeWithoutDate() // Two timestamps: one with date, one without Collection sampleMessages = Arrays.asList( - "559550912540598297\t2016-04-20T14:06:53\t21:06:53.123456\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for 
sftp", - "559550912548986880\t2016-04-20T14:06:53\t21:06:53.123456\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp", - "559550912548986887\t2016-04-20T14:06:53\t21:06:53.123456\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp", - "559550912603512850\t2016-04-20T14:06:53\t21:06:53.123456\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp"); + "559550912540598297\t2016-04-20T14:06:53\t21:06:53.123456\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986880\t2016-04-20T14:06:53\t21:06:53.123456\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986887\t2016-04-20T14:06:53\t21:06:53.123456\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912603512850\t2016-04-20T14:06:53\t21:06:53.123456\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp" + ); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + sampleMessages, + mappings, + null, + Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER + ); - assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{TIME:time}\\t%{INT:field2}\\t.*?\\t" + - "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", - grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, - "timestamp")); + assertEquals( + "%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{TIME:time}\\t%{INT:field2}\\t.*?\\t" + + "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", + grokPatternCreator.createGrokPatternFromExamples( + 
"TIMESTAMP_ISO8601", + FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT, + "timestamp" + ) + ); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("time")); @@ -366,29 +479,38 @@ public void testCreateGrokPatternFromExamplesGivenTimestampAndTimeWithoutDate() public void testFindFullLineGrokPatternGivenApacheCombinedLogs() { Collection sampleMessages = Arrays.asList( - "83.149.9.216 - - [19/Jan/2016:08:13:42 +0000] " + - "\"GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1\" 200 203023 " + - "\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " + - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"", - "83.149.9.216 - - [19/Jan/2016:08:13:44 +0000] " + - "\"GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js HTTP/1.1\" 200 7697 " + - "\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " + - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"", - "83.149.9.216 - - [19/Jan/2016:08:13:44 +0000] " + - "\"GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1\" 200 26185 " + - "\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " + - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"", - "83.149.9.216 - - [19/Jan/2016:08:13:42 +0000] " + - "\"GET /presentations/logstash-monitorama-2013/images/sad-medic.png HTTP/1.1\" 200 430406 " + - "\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " + - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\""); + 
"83.149.9.216 - - [19/Jan/2016:08:13:42 +0000] " + + "\"GET /presentations/logstash-monitorama-2013/images/kibana-search.png HTTP/1.1\" 200 203023 " + + "\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " + + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"", + "83.149.9.216 - - [19/Jan/2016:08:13:44 +0000] " + + "\"GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js HTTP/1.1\" 200 7697 " + + "\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " + + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"", + "83.149.9.216 - - [19/Jan/2016:08:13:44 +0000] " + + "\"GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js HTTP/1.1\" 200 26185 " + + "\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " + + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"", + "83.149.9.216 - - [19/Jan/2016:08:13:42 +0000] " + + "\"GET /presentations/logstash-monitorama-2013/images/sad-medic.png HTTP/1.1\" 200 430406 " + + "\"http://semicomplete.com/presentations/logstash-monitorama-2013/\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) " + + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\"" + ); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + sampleMessages, + mappings, + null, + Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER + ); - assertEquals(new Tuple<>("timestamp", "%{COMBINEDAPACHELOG}"), - grokPatternCreator.findFullLineGrokPattern(randomBoolean() ? 
"timestamp" : null)); + assertEquals( + new Tuple<>("timestamp", "%{COMBINEDAPACHELOG}"), + grokPatternCreator.findFullLineGrokPattern(randomBoolean() ? "timestamp" : null) + ); assertEquals(10, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text"), mappings.get("agent")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("auth")); @@ -404,37 +526,51 @@ public void testFindFullLineGrokPatternGivenApacheCombinedLogs() { public void testAdjustForPunctuationGivenCommonPrefix() { Collection snippets = Arrays.asList( - "\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.33.212\",\"No-lookup\",\"192.168.33.132\"," + - "\"80\",\"46721\",\"/Common/Subnet_33\",\"TCP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" + - ",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\"", - "\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.143.244\",\"No-lookup\",\"192.168.33.106\"," + - "\"55025\",\"162\",\"/Common/Subnet_33\",\"UDP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" + - ",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\"", - "\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.33.3\",\"No-lookup\",\"224.0.0.102\"," + - "\"3222\",\"3222\",\"/Common/Subnet_33\",\"UDP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" + - ",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\"" - ); - - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + "\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.33.212\",\"No-lookup\",\"192.168.33.132\"," + + "\"80\",\"46721\",\"/Common/Subnet_33\",\"TCP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" + + 
",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\"", + "\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.143.244\",\"No-lookup\",\"192.168.33.106\"," + + "\"55025\",\"162\",\"/Common/Subnet_33\",\"UDP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" + + ",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\"", + "\",\"lab6.localhost\",\"Route Domain\",\"/Common/0\",\"No-lookup\",\"192.168.33.3\",\"No-lookup\",\"224.0.0.102\"," + + "\"3222\",\"3222\",\"/Common/Subnet_33\",\"UDP\",\"0\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Staged\",\"/Common/policy1\"" + + ",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\"" + ); + + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + snippets, + null, + null, + Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER + ); Collection adjustedSnippets = grokPatternCreator.adjustForPunctuation(snippets); assertEquals("\",", grokPatternCreator.getOverallGrokPatternBuilder().toString()); assertNotNull(adjustedSnippets); - assertThat(new ArrayList<>(adjustedSnippets), - containsInAnyOrder(snippets.stream().map(snippet -> snippet.substring(2)).toArray(String[]::new))); + assertThat( + new ArrayList<>(adjustedSnippets), + containsInAnyOrder(snippets.stream().map(snippet -> snippet.substring(2)).toArray(String[]::new)) + ); } public void testAdjustForPunctuationGivenNoCommonPrefix() { Collection snippets = Arrays.asList( "|client (id:2) was removed from servergroup 'Normal'(id:7) by client 'User1'(id:2)", "|servergroup 'GAME'(id:9) was added by 'User1'(id:2)", - "|permission 'i_group_auto_update_type'(id:146) with values (value:30, negated:0, skipchannel:0) " + - "was added by 'User1'(id:2) to servergroup 'GAME'(id:9)" + "|permission 'i_group_auto_update_type'(id:146) with values (value:30, negated:0, skipchannel:0) " + + "was added by 'User1'(id:2) to servergroup 'GAME'(id:9)" ); - GrokPatternCreator grokPatternCreator = new 
GrokPatternCreator(explanation, snippets, null, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + snippets, + null, + null, + Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER + ); Collection adjustedSnippets = grokPatternCreator.adjustForPunctuation(snippets); assertEquals("", grokPatternCreator.getOverallGrokPatternBuilder().toString()); @@ -444,24 +580,31 @@ public void testAdjustForPunctuationGivenNoCommonPrefix() { public void testValidateFullLineGrokPatternGivenValid() { String timestampField = "utc_timestamp"; - String grokPattern = "%{INT:serial_no}\\t%{TIMESTAMP_ISO8601:local_timestamp}\\t%{TIMESTAMP_ISO8601:utc_timestamp}\\t" + - "%{INT:user_id}\\t%{HOSTNAME:host}\\t%{IP:client_ip}\\t%{WORD:method}\\t%{LOGLEVEL:severity}\\t%{PROG:program}\\t" + - "%{GREEDYDATA:message}"; + String grokPattern = "%{INT:serial_no}\\t%{TIMESTAMP_ISO8601:local_timestamp}\\t%{TIMESTAMP_ISO8601:utc_timestamp}\\t" + + "%{INT:user_id}\\t%{HOSTNAME:host}\\t%{IP:client_ip}\\t%{WORD:method}\\t%{LOGLEVEL:severity}\\t%{PROG:program}\\t" + + "%{GREEDYDATA:message}"; // Two timestamps: one local, one UTC Collection sampleMessages = Arrays.asList( - "559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp", - "559550912548986880\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp", - "559550912548986887\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp", - "559550912603512850\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp"); + "559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + + 
"Info\tsshd\tsubsystem request for sftp", + "559550912548986880\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986887\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912603512850\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp" + ); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + sampleMessages, + mappings, + null, + Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER + ); grokPatternCreator.validateFullLineGrokPattern(grokPattern, timestampField); assertEquals(9, mappings.size()); @@ -482,25 +625,31 @@ public void testValidateFullLineGrokPatternGivenValid() { public void testValidateFullLineGrokPatternGivenValidAndCustomDefinition() { String timestampField = "local_timestamp"; - String grokPattern = "%{INT:serial_no}\\t%{CUSTOM_TIMESTAMP:local_timestamp}\\t%{TIMESTAMP_ISO8601:utc_timestamp}\\t" + - "%{INT:user_id}\\t%{HOSTNAME:host}\\t%{IP:client_ip}\\t%{WORD:method}\\t%{LOGLEVEL:severity}\\t%{PROG:program}\\t" + - "%{GREEDYDATA:message}"; + String grokPattern = "%{INT:serial_no}\\t%{CUSTOM_TIMESTAMP:local_timestamp}\\t%{TIMESTAMP_ISO8601:utc_timestamp}\\t" + + "%{INT:user_id}\\t%{HOSTNAME:host}\\t%{IP:client_ip}\\t%{WORD:method}\\t%{LOGLEVEL:severity}\\t%{PROG:program}\\t" + + "%{GREEDYDATA:message}"; // Two timestamps: one local, one UTC Collection sampleMessages = Arrays.asList( - "559550912540598297\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp", - 
"559550912548986880\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp", - "559550912548986887\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp", - "559550912603512850\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + - "Info\tsshd\tsubsystem request for sftp"); + "559550912540598297\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986880\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986887\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912603512850\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp" + ); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + sampleMessages, + mappings, + null, Collections.singletonMap("CUSTOM_TIMESTAMP", "%{MONTHNUM}/%{MONTHDAY}/%{YEAR} %{HOUR}:%{MINUTE}(?:AM|PM)"), - NOOP_TIMEOUT_CHECKER); + NOOP_TIMEOUT_CHECKER + ); grokPatternCreator.validateFullLineGrokPattern(grokPattern, timestampField); assertEquals(9, mappings.size()); @@ -521,22 +670,31 @@ public void testValidateFullLineGrokPatternGivenValidAndCustomDefinition() { public void testValidateFullLineGrokPatternGivenInvalid() { String timestampField = "utc_timestamp"; - String grokPattern = "%{INT:serial_no}\\t%{TIMESTAMP_ISO8601:local_timestamp}\\t%{TIMESTAMP_ISO8601:utc_timestamp}\\t" + - 
"%{INT:user_id}\\t%{HOSTNAME:host}\\t%{IP:client_ip}\\t%{WORD:method}\\t%{LOGLEVEL:severity}\\t%{PROG:program}\\t" + - "%{GREEDYDATA:message}"; + String grokPattern = "%{INT:serial_no}\\t%{TIMESTAMP_ISO8601:local_timestamp}\\t%{TIMESTAMP_ISO8601:utc_timestamp}\\t" + + "%{INT:user_id}\\t%{HOSTNAME:host}\\t%{IP:client_ip}\\t%{WORD:method}\\t%{LOGLEVEL:severity}\\t%{PROG:program}\\t" + + "%{GREEDYDATA:message}"; Collection sampleMessages = Arrays.asList( "Sep 8 11:55:06 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'elastic.slack.com/A/IN': 95.110.64.205#53", "Sep 8 11:55:08 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'slack-imgs.com/A/IN': 95.110.64.205#53", "Sep 8 11:55:35 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53", - "Sep 8 11:55:42 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'b.akamaiedge.net/A/IN': 95.110.64.205#53"); + "Sep 8 11:55:42 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'b.akamaiedge.net/A/IN': 95.110.64.205#53" + ); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, Collections.emptyMap(), - NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator( + explanation, + sampleMessages, + mappings, + null, + Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER + ); - IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> grokPatternCreator.validateFullLineGrokPattern(grokPattern, timestampField)); + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> grokPatternCreator.validateFullLineGrokPattern(grokPattern, timestampField) + ); assertEquals("Supplied Grok pattern [" + grokPattern + "] does not match sample messages", e.getMessage()); } diff --git 
a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdNdJsonFileStructureFinderFactoryTests.java b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/NdJsonFileStructureFinderFactoryTests.java similarity index 90% rename from x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdNdJsonFileStructureFinderFactoryTests.java rename to x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/NdJsonFileStructureFinderFactoryTests.java index 5736b0815ffe7..410d34920b91a 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdNdJsonFileStructureFinderFactoryTests.java +++ b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/NdJsonFileStructureFinderFactoryTests.java @@ -3,9 +3,9 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; -public class NdNdJsonFileStructureFinderFactoryTests extends FileStructureTestCase { +public class NdJsonFileStructureFinderFactoryTests extends FileStructureTestCase { private FileStructureFinderFactory factory = new NdJsonFileStructureFinderFactory(); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderTests.java b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/NdJsonFileStructureFinderTests.java similarity index 83% rename from x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderTests.java rename to x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/NdJsonFileStructureFinderTests.java index 475289e7dd38d..6043fe5ae7dd5 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinderTests.java +++ b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/NdJsonFileStructureFinderTests.java @@ -3,9 +3,9 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import java.util.Collections; @@ -18,8 +18,15 @@ public void testCreateConfigsGivenGoodJson() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, NDJSON_SAMPLE, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = factory.createFromSample( + explanation, + NDJSON_SAMPLE, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactoryTests.java b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/TextLogFileStructureFinderFactoryTests.java similarity index 92% rename from x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactoryTests.java rename to x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/TextLogFileStructureFinderFactoryTests.java index 33fa71e4e92ec..ccd4115c5edf2 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactoryTests.java +++ b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/TextLogFileStructureFinderFactoryTests.java @@ -3,7 +3,7 @@ * or more 
contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. */ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; public class TextLogFileStructureFinderFactoryTests extends FileStructureTestCase { diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/TextLogFileStructureFinderTests.java similarity index 69% rename from x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java rename to x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/TextLogFileStructureFinderTests.java index 2da503d646a05..9a650b711f0cd 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java +++ b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/TextLogFileStructureFinderTests.java @@ -3,11 +3,11 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import org.elasticsearch.common.util.set.Sets; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FieldStats; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import java.util.Collections; import java.util.Set; @@ -22,25 +22,37 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase { public void testCreateConfigsGivenLowLineMergeSizeLimit() { - String sample = "2019-05-16 16:56:14 line 1 abcdefghijklmnopqrstuvwxyz\n" + - "2019-05-16 16:56:14 line 2 abcdefghijklmnopqrstuvwxyz\n" + - "continuation line 2.1\n" + - "continuation line 2.2\n" + - "continuation line 2.3\n" + - "continuation line 2.4\n" + - "2019-05-16 16:56:14 line 3 abcdefghijklmnopqrstuvwxyz\n"; + String sample = "2019-05-16 16:56:14 line 1 abcdefghijklmnopqrstuvwxyz\n" + + "2019-05-16 16:56:14 line 2 abcdefghijklmnopqrstuvwxyz\n" + + "continuation line 2.1\n" + + "continuation line 2.2\n" + + "continuation line 2.3\n" + + "continuation line 2.4\n" + + "2019-05-16 16:56:14 line 3 abcdefghijklmnopqrstuvwxyz\n"; assertTrue(factory.canCreateFromSample(explanation, sample, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> factory.createFromSample(explanation, sample, charset, hasByteOrderMarker, 100, - FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER)); - - assertEquals("Merging lines into messages resulted in an unacceptably long message. Merged message would have [4] lines and " - + "[119] characters (limit [100]). If you have messages this big please increase the value of [line_merge_size_limit]. 
" - + "Otherwise it probably means the timestamp has been incorrectly detected, so try overriding that.", e.getMessage()); + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> factory.createFromSample( + explanation, + sample, + charset, + hasByteOrderMarker, + 100, + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ) + ); + + assertEquals( + "Merging lines into messages resulted in an unacceptably long message. Merged message would have [4] lines and " + + "[119] characters (limit [100]). If you have messages this big please increase the value of [line_merge_size_limit]. " + + "Otherwise it probably means the timestamp has been incorrectly detected, so try overriding that.", + e.getMessage() + ); } public void testCreateConfigsGivenElasticsearchLog() throws Exception { @@ -48,8 +60,15 @@ public void testCreateConfigsGivenElasticsearchLog() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = factory.createFromSample( + explanation, + TEXT_SAMPLE, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); @@ -79,10 +98,10 @@ public void testCreateConfigsGivenElasticsearchLog() throws Exception { public void testCreateConfigsGivenElasticsearchLogAndTimestampFormatOverride() throws Exception { - String sample = "12/31/2018 1:40PM INFO foo\n" + - "1/31/2019 11:40AM DEBUG bar\n" + - "2/1/2019 11:00PM INFO foo\n" + - "2/2/2019 1:23AM DEBUG bar\n"; + String sample = "12/31/2018 1:40PM INFO foo\n" + + 
"1/31/2019 11:40AM DEBUG bar\n" + + "2/1/2019 11:00PM INFO foo\n" + + "2/2/2019 1:23AM DEBUG bar\n"; FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("M/d/yyyy h:mma").build(); @@ -90,8 +109,15 @@ public void testCreateConfigsGivenElasticsearchLogAndTimestampFormatOverride() t String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = factory.createFromSample( + explanation, + sample, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + overrides, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); @@ -127,8 +153,15 @@ public void testCreateConfigsGivenElasticsearchLogAndTimestampFieldOverride() th String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = factory.createFromSample( + explanation, + TEXT_SAMPLE, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + overrides, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); @@ -158,15 +191,26 @@ public void testCreateConfigsGivenElasticsearchLogAndTimestampFieldOverride() th public void testCreateConfigsGivenElasticsearchLogAndGrokPatternOverride() throws Exception { - FileStructureOverrides overrides = FileStructureOverrides.builder().setGrokPattern("\\[%{TIMESTAMP_ISO8601:timestamp}\\]" + - "\\[%{LOGLEVEL:loglevel} 
*\\]\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}").build(); + FileStructureOverrides overrides = FileStructureOverrides.builder() + .setGrokPattern( + "\\[%{TIMESTAMP_ISO8601:timestamp}\\]" + + "\\[%{LOGLEVEL:loglevel} *\\]\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}" + ) + .build(); assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = factory.createFromSample( + explanation, + TEXT_SAMPLE, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + overrides, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure(); @@ -183,8 +227,11 @@ public void testCreateConfigsGivenElasticsearchLogAndGrokPatternOverride() throw assertNull(structure.getQuote()); assertNull(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); - assertEquals("\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} *\\]" + - "\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}", structure.getGrokPattern()); + assertEquals( + "\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} *\\]" + + "\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}", + structure.getGrokPattern() + ); assertEquals("timestamp", structure.getTimestampField()); assertEquals(Collections.singletonList("ISO8601"), structure.getJodaTimestampFormats()); FieldStats messageFieldStats = structure.getFieldStats().get("message"); @@ -200,63 +247,97 @@ public void testCreateConfigsGivenElasticsearchLogAndGrokPatternOverride() throw public void 
testCreateConfigsGivenElasticsearchLogAndImpossibleGrokPatternOverride() { // This Grok pattern cannot be matched against the messages in the sample because the fields are in the wrong order - FileStructureOverrides overrides = FileStructureOverrides.builder().setGrokPattern("\\[%{LOGLEVEL:loglevel} *\\]" + - "\\[%{HOSTNAME:node}\\]\\[%{TIMESTAMP_ISO8601:timestamp}\\] \\[%{JAVACLASS:class} *\\] %{JAVALOGMESSAGE:message}").build(); + FileStructureOverrides overrides = FileStructureOverrides.builder() + .setGrokPattern( + "\\[%{LOGLEVEL:loglevel} *\\]" + + "\\[%{HOSTNAME:node}\\]\\[%{TIMESTAMP_ISO8601:timestamp}\\] \\[%{JAVACLASS:class} *\\] %{JAVALOGMESSAGE:message}" + ) + .build(); assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE, 0.0)); String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, overrides, NOOP_TIMEOUT_CHECKER)); - - assertEquals("Supplied Grok pattern [\\[%{LOGLEVEL:loglevel} *\\]\\[%{HOSTNAME:node}\\]\\[%{TIMESTAMP_ISO8601:timestamp}\\] " + - "\\[%{JAVACLASS:class} *\\] %{JAVALOGMESSAGE:message}] does not match sample messages", e.getMessage()); + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> factory.createFromSample( + explanation, + TEXT_SAMPLE, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + overrides, + NOOP_TIMEOUT_CHECKER + ) + ); + + assertEquals( + "Supplied Grok pattern [\\[%{LOGLEVEL:loglevel} *\\]\\[%{HOSTNAME:node}\\]\\[%{TIMESTAMP_ISO8601:timestamp}\\] " + + "\\[%{JAVACLASS:class} *\\] %{JAVALOGMESSAGE:message}] does not match sample messages", + e.getMessage() + ); } public void testErrorOnIncorrectMessageFormation() { // This sample causes problems because the 
(very weird) primary timestamp format // is not detected but a secondary format that only occurs in one line is detected - String sample = "Day 21 Month 1 Year 2019 11:04 INFO [localhost] - starting\n" + - "Day 21 Month 1 Year 2019 11:04 INFO [localhost] - startup date [Mon Jan 21 11:04:19 CET 2019]\n" + - "Day 21 Month 1 Year 2019 11:04 DEBUG [localhost] - details\n" + - "Day 21 Month 1 Year 2019 11:04 DEBUG [localhost] - more details\n" + - "Day 21 Month 1 Year 2019 11:04 WARN [localhost] - something went wrong\n"; + String sample = "Day 21 Month 1 Year 2019 11:04 INFO [localhost] - starting\n" + + "Day 21 Month 1 Year 2019 11:04 INFO [localhost] - startup date [Mon Jan 21 11:04:19 CET 2019]\n" + + "Day 21 Month 1 Year 2019 11:04 DEBUG [localhost] - details\n" + + "Day 21 Month 1 Year 2019 11:04 DEBUG [localhost] - more details\n" + + "Day 21 Month 1 Year 2019 11:04 WARN [localhost] - something went wrong\n"; String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> factory.createFromSample(explanation, sample, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER)); - - assertEquals("Failed to create more than one message from the sample lines provided. (The last is discarded in " - + "case the sample is incomplete.) 
If your sample does contain multiple messages the problem is probably that " - + "the primary timestamp format has been incorrectly detected, so try overriding it.", e.getMessage()); + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> factory.createFromSample( + explanation, + sample, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ) + ); + + assertEquals( + "Failed to create more than one message from the sample lines provided. (The last is discarded in " + + "case the sample is incomplete.) If your sample does contain multiple messages the problem is probably that " + + "the primary timestamp format has been incorrectly detected, so try overriding it.", + e.getMessage() + ); } public void testCreateMultiLineMessageStartRegexGivenNoPrefaces() { for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) { String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern(); - assertEquals("^" + simpleDateRegex.replaceFirst("^\\\\b", ""), - TextLogFileStructureFinder.createMultiLineMessageStartRegex(Collections.emptySet(), simpleDateRegex)); + assertEquals( + "^" + simpleDateRegex.replaceFirst("^\\\\b", ""), + TextLogFileStructureFinder.createMultiLineMessageStartRegex(Collections.emptySet(), simpleDateRegex) + ); } } public void testCreateMultiLineMessageStartRegexGivenOneEmptyPreface() { for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) { String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern(); - assertEquals("^" + simpleDateRegex.replaceFirst("^\\\\b", ""), - TextLogFileStructureFinder.createMultiLineMessageStartRegex(Collections.singleton(""), simpleDateRegex)); + assertEquals( + "^" + simpleDateRegex.replaceFirst("^\\\\b", ""), + 
TextLogFileStructureFinder.createMultiLineMessageStartRegex(Collections.singleton(""), simpleDateRegex) + ); } } public void testCreateMultiLineMessageStartRegexGivenOneLogLevelPreface() { for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) { String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern(); - assertEquals("^\\[.*?\\] \\[" + simpleDateRegex, - TextLogFileStructureFinder.createMultiLineMessageStartRegex(Collections.singleton("[ERROR] ["), simpleDateRegex)); + assertEquals( + "^\\[.*?\\] \\[" + simpleDateRegex, + TextLogFileStructureFinder.createMultiLineMessageStartRegex(Collections.singleton("[ERROR] ["), simpleDateRegex) + ); } } @@ -264,8 +345,10 @@ public void testCreateMultiLineMessageStartRegexGivenManyLogLevelPrefaces() { for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) { Set prefaces = Sets.newHashSet("[ERROR] [", "[DEBUG] ["); String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern(); - assertEquals("^\\[.*?\\] \\[" + simpleDateRegex, - TextLogFileStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex)); + assertEquals( + "^\\[.*?\\] \\[" + simpleDateRegex, + TextLogFileStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex) + ); } } @@ -273,8 +356,10 @@ public void testCreateMultiLineMessageStartRegexGivenManyHostnamePrefaces() { for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) { Set prefaces = Sets.newHashSet("host-1.acme.com|", "my_host.elastic.co|"); String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern(); - assertEquals("^.*?\\|" + simpleDateRegex, - TextLogFileStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex)); + assertEquals( + "^.*?\\|" + simpleDateRegex, + 
TextLogFileStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex) + ); } } @@ -282,8 +367,7 @@ public void testCreateMultiLineMessageStartRegexGivenManyPrefacesIncludingEmpty( for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) { Set prefaces = Sets.newHashSet("", "[non-standard] "); String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern(); - assertEquals("^.*?" + simpleDateRegex, - TextLogFileStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex)); + assertEquals("^.*?" + simpleDateRegex, TextLogFileStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex)); } } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimeoutCheckerTests.java b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/TimeoutCheckerTests.java similarity index 78% rename from x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimeoutCheckerTests.java rename to x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/TimeoutCheckerTests.java index 0ebfc3616c0c0..e8121e315b56e 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimeoutCheckerTests.java +++ b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/TimeoutCheckerTests.java @@ -3,7 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import org.elasticsearch.ElasticsearchTimeoutException; import org.elasticsearch.common.unit.TimeValue; @@ -51,10 +51,14 @@ public void testCheckTimeoutExceeded() throws Exception { TimeValue timeout = TimeValue.timeValueMillis(10); try (TimeoutChecker timeoutChecker = new TimeoutChecker("timeout exceeded test", timeout, scheduler)) { assertBusy(() -> { - ElasticsearchTimeoutException e = expectThrows(ElasticsearchTimeoutException.class, - () -> timeoutChecker.check("should timeout")); - assertEquals("Aborting timeout exceeded test during [should timeout] as it has taken longer than the timeout of [" + - timeout + "]", e.getMessage()); + ElasticsearchTimeoutException e = expectThrows( + ElasticsearchTimeoutException.class, + () -> timeoutChecker.check("should timeout") + ); + assertEquals( + "Aborting timeout exceeded test during [should timeout] as it has taken longer than the timeout of [" + timeout + "]", + e.getMessage() + ); }); } } @@ -67,9 +71,7 @@ public void testWatchdog() throws Exception { watchdog.register(matcher); assertThat(watchdog.registry.get(Thread.currentThread()).matchers.size(), equalTo(1)); try { - assertBusy(() -> { - verify(matcher).interrupt(); - }); + assertBusy(() -> { verify(matcher).interrupt(); }); } finally { watchdog.unregister(matcher); assertThat(watchdog.registry.get(Thread.currentThread()).matchers.size(), equalTo(0)); @@ -83,10 +85,14 @@ public void testGrokCaptures() throws Exception { try (TimeoutChecker timeoutChecker = new TimeoutChecker("grok captures test", timeout, scheduler)) { assertBusy(() -> { - ElasticsearchTimeoutException e = expectThrows(ElasticsearchTimeoutException.class, - () -> timeoutChecker.grokCaptures(grok, randomAlphaOfLength(1000000), "should timeout")); - assertEquals("Aborting grok captures test during [should timeout] as it has taken longer than the timeout of [" + - timeout + 
"]", e.getMessage()); + ElasticsearchTimeoutException e = expectThrows( + ElasticsearchTimeoutException.class, + () -> timeoutChecker.grokCaptures(grok, randomAlphaOfLength(1000000), "should timeout") + ); + assertEquals( + "Aborting grok captures test during [should timeout] as it has taken longer than the timeout of [" + timeout + "]", + e.getMessage() + ); }); } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/TimestampFormatFinderTests.java similarity index 54% rename from x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java rename to x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/TimestampFormatFinderTests.java index bc455c2f370be..7c1fcbd981fc9 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java +++ b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/TimestampFormatFinderTests.java @@ -3,7 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.common.time.DateFormatter; @@ -26,167 +26,205 @@ public class TimestampFormatFinderTests extends FileStructureTestCase { private static final String EXCEPTION_TRACE_SAMPLE = - "[2018-02-28T14:49:40,517][DEBUG][o.e.a.b.TransportShardBulkAction] [an_index][2] failed to execute bulk item " + - "(index) BulkShardRequest [[an_index][2]] containing [33] requests\n" + - "java.lang.IllegalArgumentException: Document contains at least one immense term in field=\"message.keyword\" (whose UTF8 " + - "encoding is longer than the max length 32766), all of which were skipped. Please correct the analyzer to not produce " + - "such terms. The prefix of the first immense term is: '[60, 83, 79, 65, 80, 45, 69, 78, 86, 58, 69, 110, 118, 101, 108, " + - "111, 112, 101, 32, 120, 109, 108, 110, 115, 58, 83, 79, 65, 80, 45]...', original message: bytes can be at most 32766 " + - "in length; got 49023\n" + - "\tat org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:796) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:430) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:392) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:240) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat 
org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:496) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1729) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1464) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:1070) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.engine.InternalEngine.indexIntoLucene(InternalEngine.java:1012) " + - "~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:878) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:738) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperation(IndexShard.java:707) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperationOnPrimary(IndexShard.java:673) " + - "~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequestOnPrimary(TransportShardBulkAction.java:548) " + - "~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequest(TransportShardBulkAction.java:140) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeBulkItemRequest(TransportShardBulkAction.java:236) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.performOnPrimary(TransportShardBulkAction.java:123) " + - 
"[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:110) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:72) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" + - "(TransportReplicationAction.java:1034) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" + - "(TransportReplicationAction.java:1012) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:103) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" + - "(TransportReplicationAction.java:359) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" + - "(TransportReplicationAction.java:299) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" + - "(TransportReplicationAction.java:975) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" + - "(TransportReplicationAction.java:972) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.shard.IndexShardOperationPermits.acquire(IndexShardOperationPermits.java:238) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.shard.IndexShard.acquirePrimaryOperationPermit(IndexShard.java:2220) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat 
org.elasticsearch.action.support.replication.TransportReplicationAction.acquirePrimaryShardReference" + - "(TransportReplicationAction.java:984) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.access$500(TransportReplicationAction.java:98) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.doRun" + - "(TransportReplicationAction.java:320) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" + - ".messageReceived(TransportReplicationAction.java:295) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" + - ".messageReceived(TransportReplicationAction.java:282) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:66) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:656) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:635) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_144]\n" + - "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_144]\n" + - "\tat java.lang.Thread.run(Thread.java:748) [?:1.8.0_144]\n"; + 
"[2018-02-28T14:49:40,517][DEBUG][o.e.a.b.TransportShardBulkAction] [an_index][2] failed to execute bulk item " + + "(index) BulkShardRequest [[an_index][2]] containing [33] requests\n" + + "java.lang.IllegalArgumentException: Document contains at least one immense term in field=\"message.keyword\" (whose UTF8 " + + "encoding is longer than the max length 32766), all of which were skipped. Please correct the analyzer to not produce " + + "such terms. The prefix of the first immense term is: '[60, 83, 79, 65, 80, 45, 69, 78, 86, 58, 69, 110, 118, 101, 108, " + + "111, 112, 101, 32, 120, 109, 108, 110, 115, 58, 83, 79, 65, 80, 45]...', original message: bytes can be at most 32766 " + + "in length; got 49023\n" + + "\tat org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:796) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:430) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:392) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:240) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:496) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1729) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat 
org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1464) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:1070) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.engine.InternalEngine.indexIntoLucene(InternalEngine.java:1012) " + + "~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:878) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:738) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperation(IndexShard.java:707) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperationOnPrimary(IndexShard.java:673) " + + "~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequestOnPrimary(TransportShardBulkAction.java:548) " + + "~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequest(TransportShardBulkAction.java:140) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeBulkItemRequest(TransportShardBulkAction.java:236) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.performOnPrimary(TransportShardBulkAction.java:123) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:110) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:72) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat 
org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" + + "(TransportReplicationAction.java:1034) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" + + "(TransportReplicationAction.java:1012) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:103) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" + + "(TransportReplicationAction.java:359) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" + + "(TransportReplicationAction.java:299) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" + + "(TransportReplicationAction.java:975) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" + + "(TransportReplicationAction.java:972) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShardOperationPermits.acquire(IndexShardOperationPermits.java:238) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShard.acquirePrimaryOperationPermit(IndexShard.java:2220) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.acquirePrimaryShardReference" + + "(TransportReplicationAction.java:984) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.access$500(TransportReplicationAction.java:98) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat 
org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.doRun" + + "(TransportReplicationAction.java:320) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" + + ".messageReceived(TransportReplicationAction.java:295) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" + + ".messageReceived(TransportReplicationAction.java:282) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:66) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:656) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:635) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_144]\n" + + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_144]\n" + + "\tat java.lang.Thread.run(Thread.java:748) [?:1.8.0_144]\n"; public void testValidOverrideFormatToGrokAndRegex() { - assertEquals(new Tuple<>("%{YEAR}-%{MONTHNUM2}-%{MONTHDAY}T%{HOUR}:%{MINUTE}:%{SECOND}%{ISO8601_TIMEZONE}", - "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}(?:Z|[+-]\\d{4})\\b"), - TimestampFormatFinder.overrideFormatToGrokAndRegex("yyyy-MM-dd'T'HH:mm:ss,SSSXX")); - assertEquals(new 
Tuple<>("%{MONTHDAY}\\.%{MONTHNUM2}\\.%{YEAR} %{HOUR}:%{MINUTE} (?:AM|PM)", - "\\b\\d{2}\\.\\d{2}\\.\\d{2} \\d{1,2}:\\d{2} [AP]M\\b"), - TimestampFormatFinder.overrideFormatToGrokAndRegex("dd.MM.yy h:mm a")); - assertEquals(new Tuple<>("%{MONTHNUM2}/%{MONTHDAY}/%{YEAR} %{HOUR}:%{MINUTE}:%{SECOND} %{TZ}", - "\\b\\d{2}/\\d{2}/\\d{4} \\d{1,2}:\\d{2}:\\d{2} [A-Z]{3}\\b"), - TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyyy H:mm:ss zzz")); + assertEquals( + new Tuple<>( + "%{YEAR}-%{MONTHNUM2}-%{MONTHDAY}T%{HOUR}:%{MINUTE}:%{SECOND}%{ISO8601_TIMEZONE}", + "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}(?:Z|[+-]\\d{4})\\b" + ), + TimestampFormatFinder.overrideFormatToGrokAndRegex("yyyy-MM-dd'T'HH:mm:ss,SSSXX") + ); + assertEquals( + new Tuple<>( + "%{MONTHDAY}\\.%{MONTHNUM2}\\.%{YEAR} %{HOUR}:%{MINUTE} (?:AM|PM)", + "\\b\\d{2}\\.\\d{2}\\.\\d{2} \\d{1,2}:\\d{2} [AP]M\\b" + ), + TimestampFormatFinder.overrideFormatToGrokAndRegex("dd.MM.yy h:mm a") + ); + assertEquals( + new Tuple<>( + "%{MONTHNUM2}/%{MONTHDAY}/%{YEAR} %{HOUR}:%{MINUTE}:%{SECOND} %{TZ}", + "\\b\\d{2}/\\d{2}/\\d{4} \\d{1,2}:\\d{2}:\\d{2} [A-Z]{3}\\b" + ), + TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyyy H:mm:ss zzz") + ); } public void testInvalidOverrideFormatToGrokAndRegex() { - IllegalArgumentException e = expectThrows(IllegalArgumentException.class, - () -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyyy\nH:mm:ss zzz")); + IllegalArgumentException e = expectThrows( + IllegalArgumentException.class, + () -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyyy\nH:mm:ss zzz") + ); assertEquals("Multi-line timestamp formats [MM/dd/yyyy\nH:mm:ss zzz] not supported", e.getMessage()); - e = expectThrows(IllegalArgumentException.class, - () -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/YYYY H:mm:ss zzz")); + e = expectThrows( + IllegalArgumentException.class, + () -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/YYYY H:mm:ss 
zzz") + ); assertEquals("Letter group [YYYY] in [MM/dd/YYYY H:mm:ss zzz] is not supported", e.getMessage()); - e = expectThrows(IllegalArgumentException.class, - () -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyy H:mm:ss zzz")); + e = expectThrows(IllegalArgumentException.class, () -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyy H:mm:ss zzz")); assertEquals("Letter group [yyy] in [MM/dd/yyy H:mm:ss zzz] is not supported", e.getMessage()); - e = expectThrows(IllegalArgumentException.class, - () -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyyy H:mm:ss+SSSSSS")); - assertEquals("Letter group [SSSSSS] in [MM/dd/yyyy H:mm:ss+SSSSSS] is not supported" - + " because it is not preceded by [ss] and a separator from [:.,]", e.getMessage()); - e = expectThrows(IllegalArgumentException.class, - () -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyyy H:mm,SSSSSS")); - assertEquals("Letter group [SSSSSS] in [MM/dd/yyyy H:mm,SSSSSS] is not supported" - + " because it is not preceded by [ss] and a separator from [:.,]", e.getMessage()); - e = expectThrows(IllegalArgumentException.class, - () -> TimestampFormatFinder.overrideFormatToGrokAndRegex(" 'T' ")); + e = expectThrows( + IllegalArgumentException.class, + () -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyyy H:mm:ss+SSSSSS") + ); + assertEquals( + "Letter group [SSSSSS] in [MM/dd/yyyy H:mm:ss+SSSSSS] is not supported" + + " because it is not preceded by [ss] and a separator from [:.,]", + e.getMessage() + ); + e = expectThrows( + IllegalArgumentException.class, + () -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyyy H:mm,SSSSSS") + ); + assertEquals( + "Letter group [SSSSSS] in [MM/dd/yyyy H:mm,SSSSSS] is not supported" + + " because it is not preceded by [ss] and a separator from [:.,]", + e.getMessage() + ); + e = expectThrows(IllegalArgumentException.class, () -> TimestampFormatFinder.overrideFormatToGrokAndRegex(" 'T' ")); 
assertEquals("No time format letter groups in override format [ 'T' ]", e.getMessage()); } public void testMakeCandidateFromOverrideFormat() { // Override is a special format - assertSame(TimestampFormatFinder.ISO8601_CANDIDATE_FORMAT, - TimestampFormatFinder.makeCandidateFromOverrideFormat("ISO8601", NOOP_TIMEOUT_CHECKER)); - assertSame(TimestampFormatFinder.UNIX_MS_CANDIDATE_FORMAT, - TimestampFormatFinder.makeCandidateFromOverrideFormat("UNIX_MS", NOOP_TIMEOUT_CHECKER)); - assertSame(TimestampFormatFinder.UNIX_CANDIDATE_FORMAT, - TimestampFormatFinder.makeCandidateFromOverrideFormat("UNIX", NOOP_TIMEOUT_CHECKER)); - assertSame(TimestampFormatFinder.TAI64N_CANDIDATE_FORMAT, - TimestampFormatFinder.makeCandidateFromOverrideFormat("TAI64N", NOOP_TIMEOUT_CHECKER)); + assertSame( + TimestampFormatFinder.ISO8601_CANDIDATE_FORMAT, + TimestampFormatFinder.makeCandidateFromOverrideFormat("ISO8601", NOOP_TIMEOUT_CHECKER) + ); + assertSame( + TimestampFormatFinder.UNIX_MS_CANDIDATE_FORMAT, + TimestampFormatFinder.makeCandidateFromOverrideFormat("UNIX_MS", NOOP_TIMEOUT_CHECKER) + ); + assertSame( + TimestampFormatFinder.UNIX_CANDIDATE_FORMAT, + TimestampFormatFinder.makeCandidateFromOverrideFormat("UNIX", NOOP_TIMEOUT_CHECKER) + ); + assertSame( + TimestampFormatFinder.TAI64N_CANDIDATE_FORMAT, + TimestampFormatFinder.makeCandidateFromOverrideFormat("TAI64N", NOOP_TIMEOUT_CHECKER) + ); // Override is covered by a built-in format - TimestampFormatFinder.CandidateTimestampFormat candidate = - TimestampFormatFinder.makeCandidateFromOverrideFormat("yyyy-MM-dd'T'HH:mm:ss.SSS", NOOP_TIMEOUT_CHECKER); + TimestampFormatFinder.CandidateTimestampFormat candidate = TimestampFormatFinder.makeCandidateFromOverrideFormat( + "yyyy-MM-dd'T'HH:mm:ss.SSS", + NOOP_TIMEOUT_CHECKER + ); assertEquals(TimestampFormatFinder.ISO8601_CANDIDATE_FORMAT.outputGrokPatternName, candidate.outputGrokPatternName); assertEquals(TimestampFormatFinder.ISO8601_CANDIDATE_FORMAT.strictGrokPattern, 
candidate.strictGrokPattern); // Can't compare Grok objects as Grok doesn't implement equals() assertEquals(TimestampFormatFinder.ISO8601_CANDIDATE_FORMAT.simplePattern.pattern(), candidate.simplePattern.pattern()); // Exact format supplied is returned if it matches - assertEquals(Collections.singletonList("yyyy-MM-dd'T'HH:mm:ss.SSS"), - candidate.javaTimestampFormatSupplier.apply("2018-05-15T16:14:56.374")); + assertEquals( + Collections.singletonList("yyyy-MM-dd'T'HH:mm:ss.SSS"), + candidate.javaTimestampFormatSupplier.apply("2018-05-15T16:14:56.374") + ); // Other supported formats are returned if exact format doesn't match assertEquals(Collections.singletonList("ISO8601"), candidate.javaTimestampFormatSupplier.apply("2018-05-15T16:14:56,374")); // Override is supported but not covered by any built-in format - candidate = - TimestampFormatFinder.makeCandidateFromOverrideFormat("MM/dd/yyyy H:mm:ss zzz", NOOP_TIMEOUT_CHECKER); + candidate = TimestampFormatFinder.makeCandidateFromOverrideFormat("MM/dd/yyyy H:mm:ss zzz", NOOP_TIMEOUT_CHECKER); assertEquals(TimestampFormatFinder.CUSTOM_TIMESTAMP_GROK_NAME, candidate.outputGrokPatternName); assertEquals("%{MONTHNUM2}/%{MONTHDAY}/%{YEAR} %{HOUR}:%{MINUTE}:%{SECOND} %{TZ}", candidate.strictGrokPattern); assertEquals("\\b\\d{2}/\\d{2}/\\d{4} \\d{1,2}:\\d{2}:\\d{2} [A-Z]{3}\\b", candidate.simplePattern.pattern()); - assertEquals(Collections.singletonList("MM/dd/yyyy H:mm:ss zzz"), - candidate.javaTimestampFormatSupplier.apply("05/15/2018 16:14:56 UTC")); + assertEquals( + Collections.singletonList("MM/dd/yyyy H:mm:ss zzz"), + candidate.javaTimestampFormatSupplier.apply("05/15/2018 16:14:56 UTC") + ); - candidate = - TimestampFormatFinder.makeCandidateFromOverrideFormat("M/d/yyyy H:mm:ss zzz", NOOP_TIMEOUT_CHECKER); + candidate = TimestampFormatFinder.makeCandidateFromOverrideFormat("M/d/yyyy H:mm:ss zzz", NOOP_TIMEOUT_CHECKER); assertEquals(TimestampFormatFinder.CUSTOM_TIMESTAMP_GROK_NAME, 
candidate.outputGrokPatternName); assertEquals("%{MONTHNUM}/%{MONTHDAY}/%{YEAR} %{HOUR}:%{MINUTE}:%{SECOND} %{TZ}", candidate.strictGrokPattern); assertEquals("\\b\\d{1,2}/\\d{1,2}/\\d{4} \\d{1,2}:\\d{2}:\\d{2} [A-Z]{3}\\b", candidate.simplePattern.pattern()); - assertEquals(Collections.singletonList("M/d/yyyy H:mm:ss zzz"), - candidate.javaTimestampFormatSupplier.apply("5/15/2018 16:14:56 UTC")); + assertEquals( + Collections.singletonList("M/d/yyyy H:mm:ss zzz"), + candidate.javaTimestampFormatSupplier.apply("5/15/2018 16:14:56 UTC") + ); } public void testRequiresTimezoneDependentParsing() { @@ -200,23 +238,37 @@ public void testRequiresTimezoneDependentParsing() { assertFalse(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("UNIX", "1526400896")); assertFalse(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("TAI64N", "400000005afb078a164ac980")); - assertFalse(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("EEE, dd MMM yyyy HH:mm:ss XXX", - "Tue, 15 May 2018 17:14:56 +01:00")); + assertFalse( + TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing( + "EEE, dd MMM yyyy HH:mm:ss XXX", + "Tue, 15 May 2018 17:14:56 +01:00" + ) + ); assertTrue(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("yyyyMMddHHmmss", "20180515171456")); - assertFalse(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("EEE MMM dd yy HH:mm:ss zzz", - "Tue May 15 18 16:14:56 UTC")); - assertFalse(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("yyyy-MM-dd HH:mm:ss,SSS XX", - "2018-05-15 17:14:56,374 +0100")); + assertFalse( + TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing( + "EEE MMM dd yy HH:mm:ss zzz", + "Tue May 15 18 16:14:56 UTC" + ) + ); + assertFalse( + TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing( + "yyyy-MM-dd HH:mm:ss,SSS XX", + "2018-05-15 17:14:56,374 +0100" + ) + ); 
assertTrue(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("MMM dd HH:mm:ss.SSS", "May 15 17:14:56.725")); - assertTrue(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("yyyy.MM.dd'zXz'HH:mm:ss", - "2018.05.15zXz17:14:56")); - assertTrue(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("yyyy.MM.dd HH:mm:ss'z'", - "2018.05.15 17:14:56z")); - assertTrue(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("'XX'yyyy.MM.dd HH:mm:ss", - "XX2018.05.15 17:14:56")); - assertFalse(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("'XX'yyyy.MM.dd HH:mm:ssXX", - "XX2018.05.15 17:14:56Z")); + assertTrue( + TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("yyyy.MM.dd'zXz'HH:mm:ss", "2018.05.15zXz17:14:56") + ); + assertTrue(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("yyyy.MM.dd HH:mm:ss'z'", "2018.05.15 17:14:56z")); + assertTrue( + TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("'XX'yyyy.MM.dd HH:mm:ss", "XX2018.05.15 17:14:56") + ); + assertFalse( + TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("'XX'yyyy.MM.dd HH:mm:ssXX", "XX2018.05.15 17:14:56Z") + ); } public void testMatchHasNanosecondPrecision() { @@ -238,95 +290,121 @@ public void testMatchHasNanosecondPrecision() { assertFalse(TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision("UNIX", "1526400896")); assertTrue(TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision("TAI64N", "400000005afb078a164ac980")); - assertFalse(TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision("yyyy-MM-dd HH:mm:ss,SSS XX", - "2018-05-15 17:14:56,374 +0100")); - assertTrue(TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision("yyyy-MM-dd HH:mm:ss.SSSSSS XX", - "2018-05-15 17:14:56.374123 +0100")); - assertTrue(TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision("yyyy-MM-dd HH:mm:ss,SSSSSSSSS 
XX", - "2018-05-15 17:14:56,374123456 +0100")); - - assertFalse(TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision("'SSSS'yyyy.MM.dd HH:mm:ssXX", - "SSSS2018.05.15 17:14:56Z")); - assertFalse(TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision("yyyy.MM.dd HH:mm:ss,SSS'SSSS'", - "2018.05.15 17:14:56,374SSSS")); - assertTrue(TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision("yyyy.MM.dd HH:mm:ss,SSSS'SSSS'", - "2018.05.15 17:14:56,3741SSSS")); - assertFalse(TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision("yyyy.MM.dd'SSSS'HH:mm:ss.SSS", - "2018.05.15SSSS17:14:56.374")); - assertTrue(TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision("yyyy.MM.dd'SSSS'HH:mm:ss.SSSS", - "2018.05.15SSSS17:14:56.3741")); + assertFalse( + TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision("yyyy-MM-dd HH:mm:ss,SSS XX", "2018-05-15 17:14:56,374 +0100") + ); + assertTrue( + TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision( + "yyyy-MM-dd HH:mm:ss.SSSSSS XX", + "2018-05-15 17:14:56.374123 +0100" + ) + ); + assertTrue( + TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision( + "yyyy-MM-dd HH:mm:ss,SSSSSSSSS XX", + "2018-05-15 17:14:56,374123456 +0100" + ) + ); + + assertFalse( + TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision("'SSSS'yyyy.MM.dd HH:mm:ssXX", "SSSS2018.05.15 17:14:56Z") + ); + assertFalse( + TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision("yyyy.MM.dd HH:mm:ss,SSS'SSSS'", "2018.05.15 17:14:56,374SSSS") + ); + assertTrue( + TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision( + "yyyy.MM.dd HH:mm:ss,SSSS'SSSS'", + "2018.05.15 17:14:56,3741SSSS" + ) + ); + assertFalse( + TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision("yyyy.MM.dd'SSSS'HH:mm:ss.SSS", "2018.05.15SSSS17:14:56.374") + ); + assertTrue( + TimestampFormatFinder.TimestampMatch.matchHasNanosecondPrecision("yyyy.MM.dd'SSSS'HH:mm:ss.SSSS", 
"2018.05.15SSSS17:14:56.3741") + ); } public void testParseIndeterminateDateNumbers() { // Simplest case - nothing is indeterminate - int[] indeterminateDateNumbers = - TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers("2018-05-15T16:14:56,374Z", - Collections.singletonList("yyyy-MM-dd'T'HH:mm:ss,SSSXX")); + int[] indeterminateDateNumbers = TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers( + "2018-05-15T16:14:56,374Z", + Collections.singletonList("yyyy-MM-dd'T'HH:mm:ss,SSSXX") + ); assertEquals(2, indeterminateDateNumbers.length); assertEquals(-1, indeterminateDateNumbers[0]); assertEquals(-1, indeterminateDateNumbers[1]); // US with padding - indeterminateDateNumbers = - TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers("05/15/2018 16:14:56", - Collections.singletonList("??/??/yyyy HH:mm:ss")); + indeterminateDateNumbers = TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers( + "05/15/2018 16:14:56", + Collections.singletonList("??/??/yyyy HH:mm:ss") + ); assertEquals(2, indeterminateDateNumbers.length); assertEquals(5, indeterminateDateNumbers[0]); assertEquals(15, indeterminateDateNumbers[1]); // US with padding, 2 digit year - indeterminateDateNumbers = - TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers("05/15/18 16:14:56", - Collections.singletonList("??/??/yy HH:mm:ss")); + indeterminateDateNumbers = TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers( + "05/15/18 16:14:56", + Collections.singletonList("??/??/yy HH:mm:ss") + ); assertEquals(2, indeterminateDateNumbers.length); assertEquals(5, indeterminateDateNumbers[0]); assertEquals(15, indeterminateDateNumbers[1]); // US without padding - indeterminateDateNumbers = - TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers("5/15/2018 16:14:56", - Collections.singletonList("?/?/yyyy HH:mm:ss")); + indeterminateDateNumbers = TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers( + 
"5/15/2018 16:14:56", + Collections.singletonList("?/?/yyyy HH:mm:ss") + ); assertEquals(2, indeterminateDateNumbers.length); assertEquals(5, indeterminateDateNumbers[0]); assertEquals(15, indeterminateDateNumbers[1]); // US without padding, 2 digit year - indeterminateDateNumbers = - TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers("5/15/18 16:14:56", - Collections.singletonList("?/?/yy HH:mm:ss")); + indeterminateDateNumbers = TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers( + "5/15/18 16:14:56", + Collections.singletonList("?/?/yy HH:mm:ss") + ); assertEquals(2, indeterminateDateNumbers.length); assertEquals(5, indeterminateDateNumbers[0]); assertEquals(15, indeterminateDateNumbers[1]); // EU with padding - indeterminateDateNumbers = - TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers("15/05/2018 16:14:56", - Collections.singletonList("??/??/yyyy HH:mm:ss")); + indeterminateDateNumbers = TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers( + "15/05/2018 16:14:56", + Collections.singletonList("??/??/yyyy HH:mm:ss") + ); assertEquals(2, indeterminateDateNumbers.length); assertEquals(15, indeterminateDateNumbers[0]); assertEquals(5, indeterminateDateNumbers[1]); // EU with padding, 2 digit year - indeterminateDateNumbers = - TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers("15/05/18 16:14:56", - Collections.singletonList("??/??/yy HH:mm:ss")); + indeterminateDateNumbers = TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers( + "15/05/18 16:14:56", + Collections.singletonList("??/??/yy HH:mm:ss") + ); assertEquals(2, indeterminateDateNumbers.length); assertEquals(15, indeterminateDateNumbers[0]); assertEquals(5, indeterminateDateNumbers[1]); // EU without padding - indeterminateDateNumbers = - TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers("15/5/2018 16:14:56", - Collections.singletonList("?/?/yyyy HH:mm:ss")); + indeterminateDateNumbers = 
TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers( + "15/5/2018 16:14:56", + Collections.singletonList("?/?/yyyy HH:mm:ss") + ); assertEquals(2, indeterminateDateNumbers.length); assertEquals(15, indeterminateDateNumbers[0]); assertEquals(5, indeterminateDateNumbers[1]); // EU without padding, 2 digit year - indeterminateDateNumbers = - TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers("15/5/18 16:14:56", - Collections.singletonList("?/?/yy HH:mm:ss")); + indeterminateDateNumbers = TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers( + "15/5/18 16:14:56", + Collections.singletonList("?/?/yy HH:mm:ss") + ); assertEquals(2, indeterminateDateNumbers.length); assertEquals(15, indeterminateDateNumbers[0]); assertEquals(5, indeterminateDateNumbers[1]); @@ -504,10 +582,13 @@ public void testGuessIsDayFirstFromMatchesMultipleFormats() { // ISO8601 formats cause confusion - this test proves that they don't // DATESTAMP supports both 2 and 4 digit years, so each test is repeated for both lengths - TimestampFormatFinder.TimestampFormat expectedPrimaryFormat = - new TimestampFormatFinder.TimestampFormat(Collections.singletonList("??/??/yyyy HH:mm:ss"), - Pattern.compile("\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b"), "DATESTAMP", - Collections.emptyMap(), ""); + TimestampFormatFinder.TimestampFormat expectedPrimaryFormat = new TimestampFormatFinder.TimestampFormat( + Collections.singletonList("??/??/yyyy HH:mm:ss"), + Pattern.compile("\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b"), + "DATESTAMP", + Collections.emptyMap(), + "" + ); TimestampFormatFinder timestampFormatFinder = new TimestampFormatFinder(explanation, true, true, false, NOOP_TIMEOUT_CHECKER); @@ -735,25 +816,39 @@ public void testFindBitPattern() { public void testFindBoundsForCandidate() { - final TimestampFormatFinder.CandidateTimestampFormat httpdCandidateFormat = 
TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS - .stream().filter(candidate -> candidate.outputGrokPatternName.equals("HTTPDATE")).findAny().get(); - - BitSet numberPosBitSet = TimestampFormatFinder.stringToNumberPosBitSet("[2018-05-11T17:07:29,553][INFO ]" + - "[o.e.e.NodeEnvironment ] [node-0] heap size [3.9gb], compressed ordinary object pointers [true]"); - assertEquals(new Tuple<>(1, 36), - TimestampFormatFinder.findBoundsForCandidate(TimestampFormatFinder.ISO8601_CANDIDATE_FORMAT, numberPosBitSet)); + final TimestampFormatFinder.CandidateTimestampFormat httpdCandidateFormat = TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS.stream() + .filter(candidate -> candidate.outputGrokPatternName.equals("HTTPDATE")) + .findAny() + .get(); + + BitSet numberPosBitSet = TimestampFormatFinder.stringToNumberPosBitSet( + "[2018-05-11T17:07:29,553][INFO ]" + + "[o.e.e.NodeEnvironment ] [node-0] heap size [3.9gb], compressed ordinary object pointers [true]" + ); + assertEquals( + new Tuple<>(1, 36), + TimestampFormatFinder.findBoundsForCandidate(TimestampFormatFinder.ISO8601_CANDIDATE_FORMAT, numberPosBitSet) + ); assertEquals(new Tuple<>(-1, -1), TimestampFormatFinder.findBoundsForCandidate(httpdCandidateFormat, numberPosBitSet)); // TAI64N doesn't necessarily contain digits, so this functionality cannot guarantee that it won't match somewhere in the text - assertEquals(new Tuple<>(0, Integer.MAX_VALUE), - TimestampFormatFinder.findBoundsForCandidate(TimestampFormatFinder.TAI64N_CANDIDATE_FORMAT, numberPosBitSet)); - - numberPosBitSet = TimestampFormatFinder.stringToNumberPosBitSet("192.168.62.101 - - [29/Jun/2016:12:11:31 +0000] " + - "\"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384"); - assertEquals(new Tuple<>(-1, -1), - TimestampFormatFinder.findBoundsForCandidate(TimestampFormatFinder.ISO8601_CANDIDATE_FORMAT, numberPosBitSet)); + assertEquals( + new Tuple<>(0, Integer.MAX_VALUE), + 
TimestampFormatFinder.findBoundsForCandidate(TimestampFormatFinder.TAI64N_CANDIDATE_FORMAT, numberPosBitSet) + ); + + numberPosBitSet = TimestampFormatFinder.stringToNumberPosBitSet( + "192.168.62.101 - - [29/Jun/2016:12:11:31 +0000] " + + "\"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384" + ); + assertEquals( + new Tuple<>(-1, -1), + TimestampFormatFinder.findBoundsForCandidate(TimestampFormatFinder.ISO8601_CANDIDATE_FORMAT, numberPosBitSet) + ); assertEquals(new Tuple<>(20, 46), TimestampFormatFinder.findBoundsForCandidate(httpdCandidateFormat, numberPosBitSet)); - assertEquals(new Tuple<>(0, Integer.MAX_VALUE), - TimestampFormatFinder.findBoundsForCandidate(TimestampFormatFinder.TAI64N_CANDIDATE_FORMAT, numberPosBitSet)); + assertEquals( + new Tuple<>(0, Integer.MAX_VALUE), + TimestampFormatFinder.findBoundsForCandidate(TimestampFormatFinder.TAI64N_CANDIDATE_FORMAT, numberPosBitSet) + ); } public void testFindFormatGivenNoMatch() { @@ -771,123 +866,346 @@ public void testFindFormatGivenNoMatch() { public void testFindFormatGivenOnlyIso8601() { - validateTimestampMatch("2018-05-15T16:14:56,374Z", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "ISO8601", 1526400896374L); - validateTimestampMatch("2018-05-15T17:14:56,374+0100", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "ISO8601", 1526400896374L); - validateTimestampMatch("2018-05-15T17:14:56,374+01:00", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "ISO8601", 1526400896374L); - validateTimestampMatch("2018-05-15T17:14:56,374", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "ISO8601", 1526400896374L); - - validateTimestampMatch("2018-05-15T16:14:56Z", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "ISO8601", 1526400896000L); - validateTimestampMatch("2018-05-15T17:14:56+0100", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "ISO8601", 1526400896000L); - 
validateTimestampMatch("2018-05-15T17:14:56+01:00", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "ISO8601", 1526400896000L); - validateTimestampMatch("2018-05-15T17:14:56", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "ISO8601", 1526400896000L); - - validateTimestampMatch("2018-05-15T16:14Z", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "ISO8601", 1526400840000L); - validateTimestampMatch("2018-05-15T17:14+0100", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "ISO8601", 1526400840000L); - validateTimestampMatch("2018-05-15T17:14+01:00", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "ISO8601", 1526400840000L); - validateTimestampMatch("2018-05-15T17:14", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "ISO8601", 1526400840000L); + validateTimestampMatch( + "2018-05-15T16:14:56,374Z", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", + 1526400896374L + ); + validateTimestampMatch( + "2018-05-15T17:14:56,374+0100", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", + 1526400896374L + ); + validateTimestampMatch( + "2018-05-15T17:14:56,374+01:00", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", + 1526400896374L + ); + validateTimestampMatch( + "2018-05-15T17:14:56,374", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", + 1526400896374L + ); + + validateTimestampMatch( + "2018-05-15T16:14:56Z", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", + 1526400896000L + ); + validateTimestampMatch( + "2018-05-15T17:14:56+0100", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", + 1526400896000L + ); + validateTimestampMatch( + "2018-05-15T17:14:56+01:00", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", + 1526400896000L + ); + 
validateTimestampMatch( + "2018-05-15T17:14:56", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", + 1526400896000L + ); + + validateTimestampMatch( + "2018-05-15T16:14Z", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", + 1526400840000L + ); + validateTimestampMatch( + "2018-05-15T17:14+0100", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", + 1526400840000L + ); + validateTimestampMatch( + "2018-05-15T17:14+01:00", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", + 1526400840000L + ); + validateTimestampMatch( + "2018-05-15T17:14", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", + 1526400840000L + ); // TIMESTAMP_ISO8601 doesn't match ISO8601 if it's only a date with no time validateTimestampMatch("2018-05-15", "CUSTOM_TIMESTAMP", "\\b\\d{4}-\\d{2}-\\d{2}\\b", "ISO8601", 1526338800000L); - validateTimestampMatch("2018-05-15 16:14:56,374Z", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "yyyy-MM-dd HH:mm:ss,SSSXX", 1526400896374L); - validateTimestampMatch("2018-05-15 17:14:56,374+0100", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "yyyy-MM-dd HH:mm:ss,SSSXX", 1526400896374L); - validateTimestampMatch("2018-05-15 17:14:56,374+01:00", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "yyyy-MM-dd HH:mm:ss,SSSXXX", 1526400896374L); - validateTimestampMatch("2018-05-15 17:14:56,374", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "yyyy-MM-dd HH:mm:ss,SSS", 1526400896374L); - - validateTimestampMatch("2018-05-15 16:14:56Z", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "yyyy-MM-dd HH:mm:ssXX", 1526400896000L); - validateTimestampMatch("2018-05-15 17:14:56+0100", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "yyyy-MM-dd HH:mm:ssXX", 1526400896000L); - validateTimestampMatch("2018-05-15 
17:14:56+01:00", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "yyyy-MM-dd HH:mm:ssXXX", 1526400896000L); - validateTimestampMatch("2018-05-15 17:14:56", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "yyyy-MM-dd HH:mm:ss", 1526400896000L); - - validateTimestampMatch("2018-05-15 16:14Z", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "yyyy-MM-dd HH:mmXX", 1526400840000L); - validateTimestampMatch("2018-05-15 17:14+0100", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "yyyy-MM-dd HH:mmXX", 1526400840000L); - validateTimestampMatch("2018-05-15 17:14+01:00", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "yyyy-MM-dd HH:mmXXX", 1526400840000L); - validateTimestampMatch("2018-05-15 17:14", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", - "yyyy-MM-dd HH:mm", 1526400840000L); + validateTimestampMatch( + "2018-05-15 16:14:56,374Z", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ss,SSSXX", + 1526400896374L + ); + validateTimestampMatch( + "2018-05-15 17:14:56,374+0100", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ss,SSSXX", + 1526400896374L + ); + validateTimestampMatch( + "2018-05-15 17:14:56,374+01:00", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ss,SSSXXX", + 1526400896374L + ); + validateTimestampMatch( + "2018-05-15 17:14:56,374", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ss,SSS", + 1526400896374L + ); + + validateTimestampMatch( + "2018-05-15 16:14:56Z", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ssXX", + 1526400896000L + ); + validateTimestampMatch( + "2018-05-15 17:14:56+0100", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ssXX", + 1526400896000L + ); + validateTimestampMatch( + 
"2018-05-15 17:14:56+01:00", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ssXXX", + 1526400896000L + ); + validateTimestampMatch( + "2018-05-15 17:14:56", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ss", + 1526400896000L + ); + + validateTimestampMatch( + "2018-05-15 16:14Z", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mmXX", + 1526400840000L + ); + validateTimestampMatch( + "2018-05-15 17:14+0100", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mmXX", + 1526400840000L + ); + validateTimestampMatch( + "2018-05-15 17:14+01:00", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mmXXX", + 1526400840000L + ); + validateTimestampMatch( + "2018-05-15 17:14", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm", + 1526400840000L + ); } public void testFindFormatGivenOnlyKnownTimestampFormat() { // Note: some of the time formats give millisecond accuracy, some second accuracy and some minute accuracy - validateTimestampMatch("2018-05-15 17:14:56,374 +0100", "TOMCAT_DATESTAMP", - "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}[:.,]\\d{3}", "yyyy-MM-dd HH:mm:ss,SSS XX", 1526400896374L); - - validateTimestampMatch("Tue May 15 18 16:14:56 UTC", "DATESTAMP_RFC822", + validateTimestampMatch( + "2018-05-15 17:14:56,374 +0100", + "TOMCAT_DATESTAMP", + "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}[:.,]\\d{3}", + "yyyy-MM-dd HH:mm:ss,SSS XX", + 1526400896374L + ); + + validateTimestampMatch( + "Tue May 15 18 16:14:56 UTC", + "DATESTAMP_RFC822", "\\b[A-Z]\\S{2} [A-Z]\\S{2} \\d{1,2} \\d{2} \\d{2}:\\d{2}:\\d{2}\\b", - Arrays.asList("EEE MMM dd yy HH:mm:ss zzz", "EEE MMM d yy HH:mm:ss zzz"), 1526400896000L); - - validateTimestampMatch("Tue, 15 May 2018 17:14:56 +01:00", "DATESTAMP_RFC2822", - "\\b[A-Z]\\S{2}, \\d{1,2} [A-Z]\\S{2} \\d{4} 
\\d{2}:\\d{2}:\\d{2}\\b", "EEE, dd MMM yyyy HH:mm:ss XXX", 1526400896000L); - validateTimestampMatch("Tue, 15 May 2018 17:14:56 +0100", "DATESTAMP_RFC2822", - "\\b[A-Z]\\S{2}, \\d{1,2} [A-Z]\\S{2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", "EEE, dd MMM yyyy HH:mm:ss XX", 1526400896000L); - - validateTimestampMatch("Tue May 15 16:14:56 UTC 2018", "DATESTAMP_OTHER", + Arrays.asList("EEE MMM dd yy HH:mm:ss zzz", "EEE MMM d yy HH:mm:ss zzz"), + 1526400896000L + ); + + validateTimestampMatch( + "Tue, 15 May 2018 17:14:56 +01:00", + "DATESTAMP_RFC2822", + "\\b[A-Z]\\S{2}, \\d{1,2} [A-Z]\\S{2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", + "EEE, dd MMM yyyy HH:mm:ss XXX", + 1526400896000L + ); + validateTimestampMatch( + "Tue, 15 May 2018 17:14:56 +0100", + "DATESTAMP_RFC2822", + "\\b[A-Z]\\S{2}, \\d{1,2} [A-Z]\\S{2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", + "EEE, dd MMM yyyy HH:mm:ss XX", + 1526400896000L + ); + + validateTimestampMatch( + "Tue May 15 16:14:56 UTC 2018", + "DATESTAMP_OTHER", "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", - Arrays.asList("EEE MMM dd HH:mm:ss zzz yyyy", "EEE MMM d HH:mm:ss zzz yyyy"), 1526400896000L); + Arrays.asList("EEE MMM dd HH:mm:ss zzz yyyy", "EEE MMM d HH:mm:ss zzz yyyy"), + 1526400896000L + ); validateTimestampMatch("20180515171456", "DATESTAMP_EVENTLOG", "\\b\\d{14}\\b", "yyyyMMddHHmmss", 1526400896000L); - validateTimestampMatch("Tue May 15 17:14:56 2018", "HTTPDERROR_DATE", - "\\b[A-Z]\\S{2} [A-Z]\\S{2} \\d{2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b", "EEE MMM dd HH:mm:ss yyyy", 1526400896000L); - - validateTimestampMatch("May 15 17:14:56.725", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", - Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS"), 1526400896725L); - validateTimestampMatch("May 15 17:14:56", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", - Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss", "MMM d HH:mm:ss"), 1526400896000L); - - 
validateTimestampMatch("15/May/2018:17:14:56 +0100", "HTTPDATE", "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", - "dd/MMM/yyyy:HH:mm:ss XX", 1526400896000L); - - validateTimestampMatch("May 15, 2018 5:14:56 PM", "CATALINA_DATESTAMP", - "\\b[A-Z]\\S{2} \\d{2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "MMM dd, yyyy h:mm:ss a", 1526400896000L); - - validateTimestampMatch("May 15 2018 17:14:56", "CISCOTIMESTAMP", "\\b[A-Z]\\S{2} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", - Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), 1526400896000L); - - validateTimestampMatch("05/15/2018 17:14:56,374", "DATESTAMP", - "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "MM/dd/yyyy HH:mm:ss,SSS", 1526400896374L); - validateTimestampMatch("05-15-2018-17:14:56.374", "DATESTAMP", - "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "MM-dd-yyyy-HH:mm:ss.SSS", 1526400896374L); - validateTimestampMatch("15/05/2018 17:14:56.374", "DATESTAMP", - "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "dd/MM/yyyy HH:mm:ss.SSS", 1526400896374L); - validateTimestampMatch("15-05-2018-17:14:56,374", "DATESTAMP", - "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "dd-MM-yyyy-HH:mm:ss,SSS", 1526400896374L); - validateTimestampMatch("15.05.2018 17:14:56.374", "DATESTAMP", - "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "dd.MM.yyyy HH:mm:ss.SSS", 1526400896374L); - validateTimestampMatch("05/15/2018 17:14:56", "DATESTAMP", - "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "MM/dd/yyyy HH:mm:ss", 1526400896000L); - validateTimestampMatch("05-15-2018-17:14:56", "DATESTAMP", - "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "MM-dd-yyyy-HH:mm:ss", 1526400896000L); - validateTimestampMatch("15/05/2018 17:14:56", "DATESTAMP", - "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- 
]\\d{2}:\\d{2}:\\d{2}\\b", "dd/MM/yyyy HH:mm:ss", 1526400896000L); - validateTimestampMatch("15-05-2018-17:14:56", "DATESTAMP", - "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "dd-MM-yyyy-HH:mm:ss", 1526400896000L); - validateTimestampMatch("15.05.2018 17:14:56", "DATESTAMP", - "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "dd.MM.yyyy HH:mm:ss", 1526400896000L); + validateTimestampMatch( + "Tue May 15 17:14:56 2018", + "HTTPDERROR_DATE", + "\\b[A-Z]\\S{2} [A-Z]\\S{2} \\d{2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b", + "EEE MMM dd HH:mm:ss yyyy", + 1526400896000L + ); + + validateTimestampMatch( + "May 15 17:14:56.725", + "SYSLOGTIMESTAMP", + "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", + Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS"), + 1526400896725L + ); + validateTimestampMatch( + "May 15 17:14:56", + "SYSLOGTIMESTAMP", + "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", + Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss", "MMM d HH:mm:ss"), + 1526400896000L + ); + + validateTimestampMatch( + "15/May/2018:17:14:56 +0100", + "HTTPDATE", + "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", + "dd/MMM/yyyy:HH:mm:ss XX", + 1526400896000L + ); + + validateTimestampMatch( + "May 15, 2018 5:14:56 PM", + "CATALINA_DATESTAMP", + "\\b[A-Z]\\S{2} \\d{2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", + "MMM dd, yyyy h:mm:ss a", + 1526400896000L + ); + + validateTimestampMatch( + "May 15 2018 17:14:56", + "CISCOTIMESTAMP", + "\\b[A-Z]\\S{2} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", + Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), + 1526400896000L + ); + + validateTimestampMatch( + "05/15/2018 17:14:56,374", + "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", + "MM/dd/yyyy HH:mm:ss,SSS", + 1526400896374L + ); + validateTimestampMatch( + "05-15-2018-17:14:56.374", + "DATESTAMP", + 
"\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", + "MM-dd-yyyy-HH:mm:ss.SSS", + 1526400896374L + ); + validateTimestampMatch( + "15/05/2018 17:14:56.374", + "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", + "dd/MM/yyyy HH:mm:ss.SSS", + 1526400896374L + ); + validateTimestampMatch( + "15-05-2018-17:14:56,374", + "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", + "dd-MM-yyyy-HH:mm:ss,SSS", + 1526400896374L + ); + validateTimestampMatch( + "15.05.2018 17:14:56.374", + "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", + "dd.MM.yyyy HH:mm:ss.SSS", + 1526400896374L + ); + validateTimestampMatch( + "05/15/2018 17:14:56", + "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", + "MM/dd/yyyy HH:mm:ss", + 1526400896000L + ); + validateTimestampMatch( + "05-15-2018-17:14:56", + "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", + "MM-dd-yyyy-HH:mm:ss", + 1526400896000L + ); + validateTimestampMatch( + "15/05/2018 17:14:56", + "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", + "dd/MM/yyyy HH:mm:ss", + 1526400896000L + ); + validateTimestampMatch( + "15-05-2018-17:14:56", + "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", + "dd-MM-yyyy-HH:mm:ss", + 1526400896000L + ); + validateTimestampMatch( + "15.05.2018 17:14:56", + "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", + "dd.MM.yyyy HH:mm:ss", + 1526400896000L + ); validateTimestampMatch("05/15/2018", "DATE", "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}\\b", "MM/dd/yyyy", 1526338800000L); validateTimestampMatch("05-15-2018", "DATE", "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}\\b", "MM-dd-yyyy", 1526338800000L); @@ -896,8 +1214,13 @@ public void 
testFindFormatGivenOnlyKnownTimestampFormat() { validateTimestampMatch("15.05.2018", "DATE", "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}\\b", "dd.MM.yyyy", 1526338800000L); // The Kibana export format doesn't correspond to a built-in Grok pattern, so it has to be custom - validateTimestampMatch("May 15, 2018 @ 17:14:56.374", "CUSTOM_TIMESTAMP", - "\\b[A-Z]\\S{2} \\d{2}, \\d{4} @ \\d{2}:\\d{2}:\\d{2}\\.\\d{3}\\b", "MMM dd, yyyy @ HH:mm:ss.SSS", 1526400896374L); + validateTimestampMatch( + "May 15, 2018 @ 17:14:56.374", + "CUSTOM_TIMESTAMP", + "\\b[A-Z]\\S{2} \\d{2}, \\d{4} @ \\d{2}:\\d{2}:\\d{2}\\.\\d{3}\\b", + "MMM dd, yyyy @ HH:mm:ss.SSS", + 1526400896374L + ); } public void testFindFormatGivenOnlySystemDate() { @@ -921,8 +1244,14 @@ public void testCustomOverrideMatchingBuiltInFormat() { String expectedSimpleRegex = "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}"; String expectedGrokPatternName = "TIMESTAMP_ISO8601"; - TimestampFormatFinder strictTimestampFormatFinder = new TimestampFormatFinder(explanation, overrideFormat, true, true, true, - NOOP_TIMEOUT_CHECKER); + TimestampFormatFinder strictTimestampFormatFinder = new TimestampFormatFinder( + explanation, + overrideFormat, + true, + true, + true, + NOOP_TIMEOUT_CHECKER + ); strictTimestampFormatFinder.addSample(text); assertEquals(expectedGrokPatternName, strictTimestampFormatFinder.getGrokPatternName()); assertEquals(Collections.emptyMap(), strictTimestampFormatFinder.getCustomGrokPatternDefinitions()); @@ -930,8 +1259,14 @@ public void testCustomOverrideMatchingBuiltInFormat() { assertEquals(Collections.singletonList(overrideFormat), strictTimestampFormatFinder.getJavaTimestampFormats()); assertEquals(1, strictTimestampFormatFinder.getNumMatchedFormats()); - TimestampFormatFinder lenientTimestampFormatFinder = new TimestampFormatFinder(explanation, overrideFormat, false, false, false, - NOOP_TIMEOUT_CHECKER); + TimestampFormatFinder lenientTimestampFormatFinder = new TimestampFormatFinder( + explanation, + 
overrideFormat, + false, + false, + false, + NOOP_TIMEOUT_CHECKER + ); lenientTimestampFormatFinder.addSample(text); lenientTimestampFormatFinder.selectBestMatch(); assertEquals(expectedGrokPatternName, lenientTimestampFormatFinder.getGrokPatternName()); @@ -943,27 +1278,55 @@ public void testCustomOverrideMatchingBuiltInFormat() { public void testCustomOverridesNotMatchingBuiltInFormat() { - validateCustomOverrideNotMatchingBuiltInFormat("MM/dd HH.mm.ss,SSSSSS 'in' yyyy", "05/15 17.14.56,374946 in 2018", - "\\b\\d{2}/\\d{2} \\d{2}\\.\\d{2}\\.\\d{2},\\d{6} in \\d{4}\\b", "CUSTOM_TIMESTAMP", - Collections.singletonMap(TimestampFormatFinder.CUSTOM_TIMESTAMP_GROK_NAME, - "%{MONTHNUM2}/%{MONTHDAY} %{HOUR}\\.%{MINUTE}\\.%{SECOND} in %{YEAR}")); - - validateCustomOverrideNotMatchingBuiltInFormat("'some_prefix 'dd.MM.yyyy HH:mm:ss.SSSSSS", "some_prefix 06.01.2018 16:56:14.295748", - "some_prefix \\d{2}\\.\\d{2}\\.\\d{4} \\d{2}:\\d{2}:\\d{2}\\.\\d{6}\\b", "CUSTOM_TIMESTAMP", - Collections.singletonMap(TimestampFormatFinder.CUSTOM_TIMESTAMP_GROK_NAME, - "some_prefix %{MONTHDAY}\\.%{MONTHNUM2}\\.%{YEAR} %{HOUR}:%{MINUTE}:%{SECOND}")); - - validateCustomOverrideNotMatchingBuiltInFormat("dd.MM. yyyy HH:mm:ss.SSSSSS", "06.01. 2018 16:56:14.295748", - "\\b\\d{2}\\.\\d{2}\\. \\d{4} \\d{2}:\\d{2}:\\d{2}\\.\\d{6}\\b", "CUSTOM_TIMESTAMP", - Collections.singletonMap(TimestampFormatFinder.CUSTOM_TIMESTAMP_GROK_NAME, - "%{MONTHDAY}\\.%{MONTHNUM2}\\. 
%{YEAR} %{HOUR}:%{MINUTE}:%{SECOND}")); + validateCustomOverrideNotMatchingBuiltInFormat( + "MM/dd HH.mm.ss,SSSSSS 'in' yyyy", + "05/15 17.14.56,374946 in 2018", + "\\b\\d{2}/\\d{2} \\d{2}\\.\\d{2}\\.\\d{2},\\d{6} in \\d{4}\\b", + "CUSTOM_TIMESTAMP", + Collections.singletonMap( + TimestampFormatFinder.CUSTOM_TIMESTAMP_GROK_NAME, + "%{MONTHNUM2}/%{MONTHDAY} %{HOUR}\\.%{MINUTE}\\.%{SECOND} in %{YEAR}" + ) + ); + + validateCustomOverrideNotMatchingBuiltInFormat( + "'some_prefix 'dd.MM.yyyy HH:mm:ss.SSSSSS", + "some_prefix 06.01.2018 16:56:14.295748", + "some_prefix \\d{2}\\.\\d{2}\\.\\d{4} \\d{2}:\\d{2}:\\d{2}\\.\\d{6}\\b", + "CUSTOM_TIMESTAMP", + Collections.singletonMap( + TimestampFormatFinder.CUSTOM_TIMESTAMP_GROK_NAME, + "some_prefix %{MONTHDAY}\\.%{MONTHNUM2}\\.%{YEAR} %{HOUR}:%{MINUTE}:%{SECOND}" + ) + ); + + validateCustomOverrideNotMatchingBuiltInFormat( + "dd.MM. yyyy HH:mm:ss.SSSSSS", + "06.01. 2018 16:56:14.295748", + "\\b\\d{2}\\.\\d{2}\\. \\d{4} \\d{2}:\\d{2}:\\d{2}\\.\\d{6}\\b", + "CUSTOM_TIMESTAMP", + Collections.singletonMap( + TimestampFormatFinder.CUSTOM_TIMESTAMP_GROK_NAME, + "%{MONTHDAY}\\.%{MONTHNUM2}\\. 
%{YEAR} %{HOUR}:%{MINUTE}:%{SECOND}" + ) + ); } - private void validateCustomOverrideNotMatchingBuiltInFormat(String overrideFormat, String text, String expectedSimpleRegex, - String expectedGrokPatternName, - Map expectedCustomGrokPatternDefinitions) { - TimestampFormatFinder strictTimestampFormatFinder = new TimestampFormatFinder(explanation, overrideFormat, true, true, true, - NOOP_TIMEOUT_CHECKER); + private void validateCustomOverrideNotMatchingBuiltInFormat( + String overrideFormat, + String text, + String expectedSimpleRegex, + String expectedGrokPatternName, + Map expectedCustomGrokPatternDefinitions + ) { + TimestampFormatFinder strictTimestampFormatFinder = new TimestampFormatFinder( + explanation, + overrideFormat, + true, + true, + true, + NOOP_TIMEOUT_CHECKER + ); strictTimestampFormatFinder.addSample(text); assertEquals(expectedGrokPatternName, strictTimestampFormatFinder.getGrokPatternName()); assertEquals(expectedCustomGrokPatternDefinitions, strictTimestampFormatFinder.getCustomGrokPatternDefinitions()); @@ -971,8 +1334,14 @@ private void validateCustomOverrideNotMatchingBuiltInFormat(String overrideForma assertEquals(Collections.singletonList(overrideFormat), strictTimestampFormatFinder.getJavaTimestampFormats()); assertEquals(1, strictTimestampFormatFinder.getNumMatchedFormats()); - TimestampFormatFinder lenientTimestampFormatFinder = new TimestampFormatFinder(explanation, overrideFormat, false, false, false, - NOOP_TIMEOUT_CHECKER); + TimestampFormatFinder lenientTimestampFormatFinder = new TimestampFormatFinder( + explanation, + overrideFormat, + false, + false, + false, + NOOP_TIMEOUT_CHECKER + ); lenientTimestampFormatFinder.addSample(text); lenientTimestampFormatFinder.selectBestMatch(); assertEquals(expectedGrokPatternName, lenientTimestampFormatFinder.getGrokPatternName()); @@ -984,105 +1353,160 @@ private void validateCustomOverrideNotMatchingBuiltInFormat(String overrideForma public void testFindFormatGivenRealLogMessages() { - 
validateFindInFullMessage("[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment ] [node-0] " + - "heap size [3.9gb], compressed ordinary object pointers [true]", "[", "TIMESTAMP_ISO8601", - "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", "ISO8601"); - - validateFindInFullMessage("192.168.62.101 - - [29/Jun/2016:12:11:31 +0000] " + - "\"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384", "192.168.62.101 - - [", "HTTPDATE", - "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", "dd/MMM/yyyy:HH:mm:ss XX"); - - validateFindInFullMessage("Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters", "", - "CATALINA_DATESTAMP", "\\b[A-Z]\\S{2} \\d{2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "MMM dd, yyyy h:mm:ss a"); - - validateFindInFullMessage("Oct 19 17:04:44 esxi1.acme.com Vpxa: [3CB3FB90 verbose 'vpxavpxaInvtVm' " + - "opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed", "", "SYSLOGTIMESTAMP", + validateFindInFullMessage( + "[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment ] [node-0] " + + "heap size [3.9gb], compressed ordinary object pointers [true]", + "[", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601" + ); + + validateFindInFullMessage( + "192.168.62.101 - - [29/Jun/2016:12:11:31 +0000] " + + "\"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384", + "192.168.62.101 - - [", + "HTTPDATE", + "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", + "dd/MMM/yyyy:HH:mm:ss XX" + ); + + validateFindInFullMessage( + "Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters", + "", + "CATALINA_DATESTAMP", + "\\b[A-Z]\\S{2} \\d{2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", + "MMM dd, yyyy h:mm:ss a" + ); + + validateFindInFullMessage( + "Oct 19 17:04:44 esxi1.acme.com Vpxa: [3CB3FB90 verbose 'vpxavpxaInvtVm' " + + "opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed", + "", + "SYSLOGTIMESTAMP", 
"\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", - Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss", "MMM d HH:mm:ss")); - - validateFindInFullMessage("559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t" + - "192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp", "559550912540598297\t", "TIMESTAMP_ISO8601", - "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", "ISO8601"); - - validateFindInFullMessage("Sep 8 11:55:35 dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving " + - "'www.elastic.co/A/IN': 95.110.68.206#53", "", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", - Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss", "MMM d HH:mm:ss")); - - validateFindInFullMessage("10-28-2016 16:22:47.636 +0200 ERROR Network - " + - "Error encountered for connection from src=192.168.0.1:12345. Local side shutting down", "", "DATESTAMP", - "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "MM-dd-yyyy HH:mm:ss.SSS"); - - validateFindInFullMessage("2018-01-06 19:22:20.106822|INFO |VirtualServer |1 |client " + - " 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)", "", - "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", "yyyy-MM-dd HH:mm:ss.SSSSSS"); + Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss", "MMM d HH:mm:ss") + ); + + validateFindInFullMessage( + "559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t" + + "192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp", + "559550912540598297\t", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601" + ); + + validateFindInFullMessage( + "Sep 8 11:55:35 dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving " + "'www.elastic.co/A/IN': 95.110.68.206#53", + "", + "SYSLOGTIMESTAMP", + "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", + Arrays.asList("MMM 
dd HH:mm:ss", "MMM d HH:mm:ss", "MMM d HH:mm:ss") + ); + + validateFindInFullMessage( + "10-28-2016 16:22:47.636 +0200 ERROR Network - " + + "Error encountered for connection from src=192.168.0.1:12345. Local side shutting down", + "", + "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-](?:\\d{2}){1,2}[- ]\\d{2}:\\d{2}:\\d{2}\\b", + "MM-dd-yyyy HH:mm:ss.SSS" + ); + + validateFindInFullMessage( + "2018-01-06 19:22:20.106822|INFO |VirtualServer |1 |client " + + " 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)", + "", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ss.SSSSSS" + ); // Differs from the above as the required format is specified - validateFindInFullMessage("yyyy-MM-dd HH:mm:ss.SSSSSS", "2018-01-06 19:22:20.106822|INFO |VirtualServer |1 |client " + - " 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)", "", - "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", "yyyy-MM-dd HH:mm:ss.SSSSSS"); + validateFindInFullMessage( + "yyyy-MM-dd HH:mm:ss.SSSSSS", + "2018-01-06 19:22:20.106822|INFO |VirtualServer |1 |client " + + " 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)", + "", + "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ss.SSSSSS" + ); // Non-matching required format specified - TimestampFormatFinder timestampFormatFinder = new TimestampFormatFinder(explanation, - randomFrom("UNIX", "EEE MMM dd yyyy HH:mm zzz"), false, false, false, NOOP_TIMEOUT_CHECKER); - timestampFormatFinder.addSample("2018-01-06 19:22:20.106822|INFO |VirtualServer |1 |client " + - " 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)"); + TimestampFormatFinder timestampFormatFinder = new TimestampFormatFinder( + explanation, + 
randomFrom("UNIX", "EEE MMM dd yyyy HH:mm zzz"), + false, + false, + false, + NOOP_TIMEOUT_CHECKER + ); + timestampFormatFinder.addSample( + "2018-01-06 19:22:20.106822|INFO |VirtualServer |1 |client " + + " 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)" + ); assertEquals(Collections.emptyList(), timestampFormatFinder.getJavaTimestampFormats()); assertEquals(0, timestampFormatFinder.getNumMatchedFormats()); } - public void testSelectBestMatchGivenAllSame() { - String sample = "[2018-06-27T11:59:22,125][INFO ][o.e.n.Node ] [node-0] initializing ...\n" + - "[2018-06-27T11:59:22,201][INFO ][o.e.e.NodeEnvironment ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], " + - "net usable_space [216.1gb], net total_space [464.7gb], types [hfs]\n" + - "[2018-06-27T11:59:22,202][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [494.9mb], " + - "compressed ordinary object pointers [true]\n" + - "[2018-06-27T11:59:22,204][INFO ][o.e.n.Node ] [node-0] node name [node-0], node ID [Ha1gD8nNSDqjd6PIyu3DJA]\n" + - "[2018-06-27T11:59:22,204][INFO ][o.e.n.Node ] [node-0] version[6.4.0-SNAPSHOT], pid[2785], " + - "build[default/zip/3c60efa/2018-06-26T14:55:15.206676Z], OS[Mac OS X/10.12.6/x86_64], " + - "JVM[\"Oracle Corporation\"/Java HotSpot(TM) 64-Bit Server VM/10/10+46]\n" + - "[2018-06-27T11:59:22,205][INFO ][o.e.n.Node ] [node-0] JVM arguments [-Xms1g, -Xmx1g, " + - "-XX:+UseConcMarkSweepGC, -XX:CMSInitiatingOccupancyFraction=75, -XX:+UseCMSInitiatingOccupancyOnly, " + - "-XX:+AlwaysPreTouch, -Xss1m, -Djava.awt.headless=true, -Dfile.encoding=UTF-8, -Djna.nosys=true, " + - "-XX:-OmitStackTraceInFastThrow, -Dio.netty.noUnsafe=true, -Dio.netty.noKeySetOptimization=true, " + - "-Dio.netty.recycler.maxCapacityPerThread=0, -Dlog4j.shutdownHookEnabled=false, -Dlog4j2.disable.jmx=true, " + - "-Djava.io.tmpdir=/var/folders/k5/5sqcdlps5sg3cvlp783gcz740000h0/T/elasticsearch.nFUyeMH1, " + - 
"-XX:+HeapDumpOnOutOfMemoryError, -XX:HeapDumpPath=data, -XX:ErrorFile=logs/hs_err_pid%p.log, " + - "-Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecount=32,filesize=64m, " + - "-Djava.locale.providers=COMPAT, -Dio.netty.allocator.type=unpooled, -ea, -esa, -Xms512m, -Xmx512m, " + - "-Des.path.home=/Users/dave/elasticsearch/distribution/build/cluster/run node0/elasticsearch-6.4.0-SNAPSHOT, " + - "-Des.path.conf=/Users/dave/elasticsearch/distribution/build/cluster/run node0/elasticsearch-6.4.0-SNAPSHOT/config, " + - "-Des.distribution.flavor=default, -Des.distribution.type=zip]\n" + - "[2018-06-27T11:59:22,205][WARN ][o.e.n.Node ] [node-0] version [6.4.0-SNAPSHOT] is a pre-release version of " + - "Elasticsearch and is not suitable for production\n" + - "[2018-06-27T11:59:23,585][INFO ][o.e.p.PluginsService ] [node-0] loaded module [aggs-matrix-stats]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [analysis-common]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [ingest-common]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-expression]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-mustache]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-painless]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [mapper-extras]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [parent-join]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [percolator]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [rank-eval]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [reindex]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module 
[repository-url]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [transport-netty4]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-core]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-deprecation]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-graph]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-logstash]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-ml]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-monitoring]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-rollup]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-security]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-sql]\n" + - "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-upgrade]\n" + - "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-watcher]\n" + - "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] no plugins loaded\n"; - - TimestampFormatFinder timestampFormatFinder = TextLogFileStructureFinder.populateTimestampFormatFinder(explanation, - sample.split("\n"), FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + String sample = "[2018-06-27T11:59:22,125][INFO ][o.e.n.Node ] [node-0] initializing ...\n" + + "[2018-06-27T11:59:22,201][INFO ][o.e.e.NodeEnvironment ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], " + + "net usable_space [216.1gb], net total_space [464.7gb], types [hfs]\n" + + "[2018-06-27T11:59:22,202][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [494.9mb], " + + "compressed ordinary object pointers [true]\n" + 
+ "[2018-06-27T11:59:22,204][INFO ][o.e.n.Node ] [node-0] node name [node-0], node ID [Ha1gD8nNSDqjd6PIyu3DJA]\n" + + "[2018-06-27T11:59:22,204][INFO ][o.e.n.Node ] [node-0] version[6.4.0-SNAPSHOT], pid[2785], " + + "build[default/zip/3c60efa/2018-06-26T14:55:15.206676Z], OS[Mac OS X/10.12.6/x86_64], " + + "JVM[\"Oracle Corporation\"/Java HotSpot(TM) 64-Bit Server VM/10/10+46]\n" + + "[2018-06-27T11:59:22,205][INFO ][o.e.n.Node ] [node-0] JVM arguments [-Xms1g, -Xmx1g, " + + "-XX:+UseConcMarkSweepGC, -XX:CMSInitiatingOccupancyFraction=75, -XX:+UseCMSInitiatingOccupancyOnly, " + + "-XX:+AlwaysPreTouch, -Xss1m, -Djava.awt.headless=true, -Dfile.encoding=UTF-8, -Djna.nosys=true, " + + "-XX:-OmitStackTraceInFastThrow, -Dio.netty.noUnsafe=true, -Dio.netty.noKeySetOptimization=true, " + + "-Dio.netty.recycler.maxCapacityPerThread=0, -Dlog4j.shutdownHookEnabled=false, -Dlog4j2.disable.jmx=true, " + + "-Djava.io.tmpdir=/var/folders/k5/5sqcdlps5sg3cvlp783gcz740000h0/T/elasticsearch.nFUyeMH1, " + + "-XX:+HeapDumpOnOutOfMemoryError, -XX:HeapDumpPath=data, -XX:ErrorFile=logs/hs_err_pid%p.log, " + + "-Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecount=32,filesize=64m, " + + "-Djava.locale.providers=COMPAT, -Dio.netty.allocator.type=unpooled, -ea, -esa, -Xms512m, -Xmx512m, " + + "-Des.path.home=/Users/dave/elasticsearch/distribution/build/cluster/run node0/elasticsearch-6.4.0-SNAPSHOT, " + + "-Des.path.conf=/Users/dave/elasticsearch/distribution/build/cluster/run node0/elasticsearch-6.4.0-SNAPSHOT/config, " + + "-Des.distribution.flavor=default, -Des.distribution.type=zip]\n" + + "[2018-06-27T11:59:22,205][WARN ][o.e.n.Node ] [node-0] version [6.4.0-SNAPSHOT] is a pre-release version of " + + "Elasticsearch and is not suitable for production\n" + + "[2018-06-27T11:59:23,585][INFO ][o.e.p.PluginsService ] [node-0] loaded module [aggs-matrix-stats]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [analysis-common]\n" + 
+ "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [ingest-common]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-expression]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-mustache]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-painless]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [mapper-extras]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [parent-join]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [percolator]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [rank-eval]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [reindex]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [repository-url]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [transport-netty4]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-core]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-deprecation]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-graph]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-logstash]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-ml]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-monitoring]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-rollup]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-security]\n" + + "[2018-06-27T11:59:23,587][INFO 
][o.e.p.PluginsService ] [node-0] loaded module [x-pack-sql]\n" + + "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-upgrade]\n" + + "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-watcher]\n" + + "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] no plugins loaded\n"; + + TimestampFormatFinder timestampFormatFinder = TextLogFileStructureFinder.populateTimestampFormatFinder( + explanation, + sample.split("\n"), + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); timestampFormatFinder.selectBestMatch(); assertEquals(Collections.singletonList("ISO8601"), timestampFormatFinder.getJavaTimestampFormats()); assertEquals("TIMESTAMP_ISO8601", timestampFormatFinder.getGrokPatternName()); @@ -1095,8 +1519,12 @@ public void testSelectBestMatchGivenAllSame() { public void testSelectBestMatchGivenExceptionTrace() { - TimestampFormatFinder timestampFormatFinder = TextLogFileStructureFinder.populateTimestampFormatFinder(explanation, - EXCEPTION_TRACE_SAMPLE.split("\n"), FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + TimestampFormatFinder timestampFormatFinder = TextLogFileStructureFinder.populateTimestampFormatFinder( + explanation, + EXCEPTION_TRACE_SAMPLE.split("\n"), + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); // Even though many lines have a timestamp near the end (in the Lucene version information), // these are so far along the lines that the weight of the timestamp near the beginning of the @@ -1115,8 +1543,12 @@ public void testSelectBestMatchGivenExceptionTraceAndTimestampFormatOverride() { FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("yyyy-MM-dd HH:mm:ss").build(); - TimestampFormatFinder timestampFormatFinder = TextLogFileStructureFinder.populateTimestampFormatFinder(explanation, - EXCEPTION_TRACE_SAMPLE.split("\n"), overrides, NOOP_TIMEOUT_CHECKER); + TimestampFormatFinder 
timestampFormatFinder = TextLogFileStructureFinder.populateTimestampFormatFinder( + explanation, + EXCEPTION_TRACE_SAMPLE.split("\n"), + overrides, + NOOP_TIMEOUT_CHECKER + ); // The override should force the seemingly inferior choice of timestamp // TODO - this won't work any more :-( @@ -1126,8 +1558,12 @@ public void testSelectBestMatchGivenExceptionTraceAndImpossibleTimestampFormatOv FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("MMM dd HH:mm:ss").build(); - TimestampFormatFinder timestampFormatFinder = TextLogFileStructureFinder.populateTimestampFormatFinder(explanation, - EXCEPTION_TRACE_SAMPLE.split("\n"), overrides, NOOP_TIMEOUT_CHECKER); + TimestampFormatFinder timestampFormatFinder = TextLogFileStructureFinder.populateTimestampFormatFinder( + explanation, + EXCEPTION_TRACE_SAMPLE.split("\n"), + overrides, + NOOP_TIMEOUT_CHECKER + ); timestampFormatFinder.selectBestMatch(); assertEquals(Collections.emptyList(), timestampFormatFinder.getJavaTimestampFormats()); @@ -1143,22 +1579,42 @@ private void validateNoTimestampMatch(String text) { expectThrows(IllegalArgumentException.class, () -> strictTimestampFormatFinder.addSample(text)); assertEquals(0, strictTimestampFormatFinder.getNumMatchedFormats()); - TimestampFormatFinder lenientTimestampFormatFinder = new TimestampFormatFinder(explanation, false, false, false, - NOOP_TIMEOUT_CHECKER); + TimestampFormatFinder lenientTimestampFormatFinder = new TimestampFormatFinder( + explanation, + false, + false, + false, + NOOP_TIMEOUT_CHECKER + ); lenientTimestampFormatFinder.addSample(text); lenientTimestampFormatFinder.selectBestMatch(); assertNull(lenientTimestampFormatFinder.getGrokPatternName()); assertEquals(0, lenientTimestampFormatFinder.getNumMatchedFormats()); } - private void validateTimestampMatch(String text, String expectedGrokPatternName, String expectedSimpleRegex, - String expectedJavaTimestampFormat, long expectedEpochMs) { - validateTimestampMatch(text, 
expectedGrokPatternName, expectedSimpleRegex, Collections.singletonList(expectedJavaTimestampFormat), - expectedEpochMs); + private void validateTimestampMatch( + String text, + String expectedGrokPatternName, + String expectedSimpleRegex, + String expectedJavaTimestampFormat, + long expectedEpochMs + ) { + validateTimestampMatch( + text, + expectedGrokPatternName, + expectedSimpleRegex, + Collections.singletonList(expectedJavaTimestampFormat), + expectedEpochMs + ); } - private void validateTimestampMatch(String text, String expectedGrokPatternName, String expectedSimpleRegex, - List expectedJavaTimestampFormats, long expectedEpochMs) { + private void validateTimestampMatch( + String text, + String expectedGrokPatternName, + String expectedSimpleRegex, + List expectedJavaTimestampFormats, + long expectedEpochMs + ) { Pattern expectedSimplePattern = Pattern.compile(expectedSimpleRegex); assertTrue(expectedSimplePattern.matcher(text).find()); @@ -1171,8 +1627,13 @@ private void validateTimestampMatch(String text, String expectedGrokPatternName, assertEquals(expectedJavaTimestampFormats, strictTimestampFormatFinder.getJavaTimestampFormats()); assertEquals(1, strictTimestampFormatFinder.getNumMatchedFormats()); - TimestampFormatFinder lenientTimestampFormatFinder = new TimestampFormatFinder(explanation, false, false, false, - NOOP_TIMEOUT_CHECKER); + TimestampFormatFinder lenientTimestampFormatFinder = new TimestampFormatFinder( + explanation, + false, + false, + false, + NOOP_TIMEOUT_CHECKER + ); lenientTimestampFormatFinder.addSample(text); lenientTimestampFormatFinder.selectBestMatch(); assertEquals(expectedGrokPatternName, lenientTimestampFormatFinder.getGrokPatternName()); @@ -1181,34 +1642,77 @@ private void validateTimestampMatch(String text, String expectedGrokPatternName, assertEquals(1, lenientTimestampFormatFinder.getNumMatchedFormats()); } - private void validateFindInFullMessage(String message, String expectedPreface, String expectedGrokPatternName, - 
String expectedSimpleRegex, String expectedJavaTimestampFormat) { - validateFindInFullMessage(message, expectedPreface, expectedGrokPatternName, expectedSimpleRegex, - Collections.singletonList(expectedJavaTimestampFormat)); + private void validateFindInFullMessage( + String message, + String expectedPreface, + String expectedGrokPatternName, + String expectedSimpleRegex, + String expectedJavaTimestampFormat + ) { + validateFindInFullMessage( + message, + expectedPreface, + expectedGrokPatternName, + expectedSimpleRegex, + Collections.singletonList(expectedJavaTimestampFormat) + ); } - private void validateFindInFullMessage(String timestampFormatOverride, String message, String expectedPreface, - String expectedGrokPatternName, String expectedSimpleRegex, - String expectedJavaTimestampFormat) { - validateFindInFullMessage(timestampFormatOverride, message, expectedPreface, expectedGrokPatternName, expectedSimpleRegex, - Collections.singletonList(expectedJavaTimestampFormat)); + private void validateFindInFullMessage( + String timestampFormatOverride, + String message, + String expectedPreface, + String expectedGrokPatternName, + String expectedSimpleRegex, + String expectedJavaTimestampFormat + ) { + validateFindInFullMessage( + timestampFormatOverride, + message, + expectedPreface, + expectedGrokPatternName, + expectedSimpleRegex, + Collections.singletonList(expectedJavaTimestampFormat) + ); } - private void validateFindInFullMessage(String message, String expectedPreface, String expectedGrokPatternName, - String expectedSimpleRegex, List expectedJavaTimestampFormats) { - validateFindInFullMessage(null, message, expectedPreface, expectedGrokPatternName, expectedSimpleRegex, - expectedJavaTimestampFormats); + private void validateFindInFullMessage( + String message, + String expectedPreface, + String expectedGrokPatternName, + String expectedSimpleRegex, + List expectedJavaTimestampFormats + ) { + validateFindInFullMessage( + null, + message, + expectedPreface, + 
expectedGrokPatternName, + expectedSimpleRegex, + expectedJavaTimestampFormats + ); } - private void validateFindInFullMessage(String timestampFormatOverride, String message, String expectedPreface, - String expectedGrokPatternName, String expectedSimpleRegex, - List expectedJavaTimestampFormats) { + private void validateFindInFullMessage( + String timestampFormatOverride, + String message, + String expectedPreface, + String expectedGrokPatternName, + String expectedSimpleRegex, + List expectedJavaTimestampFormats + ) { Pattern expectedSimplePattern = Pattern.compile(expectedSimpleRegex); assertTrue(expectedSimplePattern.matcher(message).find()); - TimestampFormatFinder timestampFormatFinder = new TimestampFormatFinder(explanation, timestampFormatOverride, false, false, false, - NOOP_TIMEOUT_CHECKER); + TimestampFormatFinder timestampFormatFinder = new TimestampFormatFinder( + explanation, + timestampFormatOverride, + false, + false, + false, + NOOP_TIMEOUT_CHECKER + ); timestampFormatFinder.addSample(message); timestampFormatFinder.selectBestMatch(); assertEquals(expectedGrokPatternName, timestampFormatFinder.getGrokPatternName()); @@ -1254,11 +1758,11 @@ private void validateJavaTimestampFormats(List javaTimestampFormats, Str DateTimeFormatter parser = builder.toFormatter(Locale.ROOT); // This next line parses the textual date without any default timezone, so if // the text doesn't contain the timezone then the resulting temporal accessor - // will be incomplete (i.e. impossible to convert to an Instant). You would + // will be incomplete (i.e. impossible to convert to an Instant). You would // hope that it would be possible to specify a timezone to be used only in this // case, and in Java 9 and 10 it is, by adding withZone(zone) before the - // parse(text) call. However, with Java 8 this overrides any timezone parsed - // from the text. The solution is to parse twice, once without a default + // parse(text) call. 
However, with Java 8 this overrides any timezone parsed + // from the text. The solution is to parse twice, once without a default // timezone and then again with a default timezone if the first parse didn't // find one in the text. TemporalAccessor parsed = parser.parse(text); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactoryTests.java b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/XmlFileStructureFinderFactoryTests.java similarity index 95% rename from x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactoryTests.java rename to x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/XmlFileStructureFinderFactoryTests.java index aabcde85cc0dc..21c5454653377 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactoryTests.java +++ b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/XmlFileStructureFinderFactoryTests.java @@ -3,7 +3,7 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; public class XmlFileStructureFinderFactoryTests extends FileStructureTestCase { diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/XmlFileStructureFinderTests.java similarity index 84% rename from x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java rename to x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/XmlFileStructureFinderTests.java index 3892126674305..ed92b82352e66 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java +++ b/x-pack/plugin/text-structure/src/test/java/org/elasticsearch/xpack/textstructure/structurefinder/XmlFileStructureFinderTests.java @@ -3,9 +3,9 @@ * or more contributor license agreements. Licensed under the Elastic License; * you may not use this file except in compliance with the Elastic License. 
*/ -package org.elasticsearch.xpack.ml.filestructurefinder; +package org.elasticsearch.xpack.textstructure.structurefinder; -import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; +import org.elasticsearch.xpack.core.textstructure.structurefinder.FileStructure; import java.util.Collections; @@ -18,8 +18,15 @@ public void testCreateConfigsGivenGoodXml() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, XML_SAMPLE, charset, hasByteOrderMarker, - FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + FileStructureFinder structureFinder = factory.createFromSample( + explanation, + XML_SAMPLE, + charset, + hasByteOrderMarker, + FileStructureFinderManager.DEFAULT_LINE_MERGE_SIZE_LIMIT, + FileStructureOverrides.EMPTY_OVERRIDES, + NOOP_TIMEOUT_CHECKER + ); FileStructure structure = structureFinder.getStructure();