From 9e4a55355b42fcfd2ad305f26560507565947909 Mon Sep 17 00:00:00 2001 From: David Roberts Date: Wed, 12 Sep 2018 13:51:14 +0100 Subject: [PATCH 1/3] [ML] Allow overrides for some file structure detection decisions This change modifies the file structure detection functionality such that some of the decisions can be overridden with user supplied values. The fields that can be overridden are: - charset - format - has_header_row - column_names - delimiter - quote - should_trim_fields - grok_pattern - timestamp_field - timestamp_format If an override makes finding the file structure impossible then the endpoint will return an exception. --- .../ml/action/FindFileStructureAction.java | 201 ++++++++++++- .../ml/filestructurefinder/FileStructure.java | 83 ++++-- .../FindFileStructureActionRequestTests.java | 92 +++++- .../FileStructureTests.java | 1 + .../TransportFindFileStructureAction.java | 5 +- .../DelimitedFileStructureFinder.java | 89 ++++-- .../DelimitedFileStructureFinderFactory.java | 22 +- .../FileStructureFinderFactory.java | 18 +- .../FileStructureFinderManager.java | 85 +++++- .../FileStructureOverrides.java | 197 +++++++++++++ .../FileStructureUtils.java | 56 +++- .../GrokPatternCreator.java | 114 ++++--- .../JsonFileStructureFinder.java | 10 +- .../JsonFileStructureFinderFactory.java | 12 +- .../TextLogFileStructureFinder.java | 41 ++- .../TextLogFileStructureFinderFactory.java | 13 +- .../TimestampFormatFinder.java | 96 ++++-- .../XmlFileStructureFinder.java | 10 +- .../XmlFileStructureFinderFactory.java | 11 +- .../ml/rest/RestFindFileStructureAction.java | 11 + ...imitedFileStructureFinderFactoryTests.java | 8 +- .../DelimitedFileStructureFinderTests.java | 202 ++++++++++++- .../FileStructureFinderManagerTests.java | 62 +++- .../FileStructureUtilsTests.java | 94 ++++-- .../GrokPatternCreatorTests.java | 60 +++- .../JsonFileStructureFinderTests.java | 4 +- .../TextLogFileStructureFinderTests.java | 279 ++++++++++++------ 
.../XmlFileStructureFinderTests.java | 4 +- 28 files changed, 1550 insertions(+), 330 deletions(-) create mode 100644 x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureOverrides.java diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java index 9fda416b33bbe..d2b7e0cb6536c 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java @@ -22,6 +22,8 @@ import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; import java.io.IOException; +import java.util.Arrays; +import java.util.List; import java.util.Objects; import static org.elasticsearch.action.ValidateActions.addValidationError; @@ -109,8 +111,29 @@ public boolean equals(Object other) { public static class Request extends ActionRequest { public static final ParseField LINES_TO_SAMPLE = new ParseField("lines_to_sample"); + public static final ParseField CHARSET = FileStructure.CHARSET; + public static final ParseField FORMAT = FileStructure.FORMAT; + public static final ParseField COLUMN_NAMES = FileStructure.COLUMN_NAMES; + public static final ParseField HAS_HEADER_ROW = FileStructure.HAS_HEADER_ROW; + public static final ParseField DELIMITER = FileStructure.DELIMITER; + public static final ParseField QUOTE = FileStructure.QUOTE; + public static final ParseField SHOULD_TRIM_FIELDS = FileStructure.SHOULD_TRIM_FIELDS; + public static final ParseField GROK_PATTERN = FileStructure.GROK_PATTERN; + // This one is plural in FileStructure, but singular in FileStructureOverrides + public static final ParseField TIMESTAMP_FORMAT = new ParseField("timestamp_format"); + public static final ParseField TIMESTAMP_FIELD = FileStructure.TIMESTAMP_FIELD; 
private Integer linesToSample; + private String charset; + private FileStructure.Format format; + private List columnNames; + private Boolean hasHeaderRow; + private Character delimiter; + private Character quote; + private Boolean shouldTrimFields; + private String grokPattern; + private String timestampFormat; + private String timestampField; private BytesReference sample; public Request() { @@ -124,6 +147,114 @@ public void setLinesToSample(Integer linesToSample) { this.linesToSample = linesToSample; } + public String getCharset() { + return charset; + } + + public void setCharset(String charset) { + this.charset = (charset == null || charset.isEmpty()) ? null : charset; + } + + public FileStructure.Format getFormat() { + return format; + } + + public void setFormat(FileStructure.Format format) { + this.format = format; + } + + public void setFormat(String format) { + this.format = (format == null || format.isEmpty()) ? null : FileStructure.Format.fromString(format); + } + + public List getColumnNames() { + return columnNames; + } + + public void setColumnNames(List columnNames) { + this.columnNames = (columnNames == null || columnNames.isEmpty()) ? null : columnNames; + } + + public void setColumnNames(String[] columnNames) { + this.columnNames = (columnNames == null || columnNames.length == 0) ? 
null : Arrays.asList(columnNames); + } + + public Boolean getHasHeaderRow() { + return hasHeaderRow; + } + + public void setHasHeaderRow(Boolean hasHeaderRow) { + this.hasHeaderRow = hasHeaderRow; + } + + public Character getDelimiter() { + return delimiter; + } + + public void setDelimiter(Character delimiter) { + this.delimiter = delimiter; + } + + public void setDelimiter(String delimiter) { + if (delimiter == null || delimiter.isEmpty()) { + this.delimiter = null; + } else if (delimiter.length() == 1) { + this.delimiter = delimiter.charAt(0); + } else { + throw new IllegalArgumentException(DELIMITER.getPreferredName() + " must be a single character"); + } + } + + public Character getQuote() { + return quote; + } + + public void setQuote(Character quote) { + this.quote = quote; + } + + public void setQuote(String quote) { + if (quote == null || quote.isEmpty()) { + this.quote = null; + } else if (quote.length() == 1) { + this.quote = quote.charAt(0); + } else { + throw new IllegalArgumentException(QUOTE.getPreferredName() + " must be a single character"); + } + } + + public Boolean getShouldTrimFields() { + return shouldTrimFields; + } + + public void setShouldTrimFields(Boolean shouldTrimFields) { + this.shouldTrimFields = shouldTrimFields; + } + + public String getGrokPattern() { + return grokPattern; + } + + public void setGrokPattern(String grokPattern) { + this.grokPattern = (grokPattern == null || grokPattern.isEmpty()) ? null : grokPattern; + } + + public String getTimestampFormat() { + return timestampFormat; + } + + public void setTimestampFormat(String timestampFormat) { + this.timestampFormat = (timestampFormat == null || timestampFormat.isEmpty()) ? null : timestampFormat; + } + + public String getTimestampField() { + return timestampField; + } + + public void setTimestampField(String timestampField) { + this.timestampField = (timestampField == null || timestampField.isEmpty()) ? 
null : timestampField; + } + public BytesReference getSample() { return sample; } @@ -139,6 +270,34 @@ public ActionRequestValidationException validate() { validationException = addValidationError(LINES_TO_SAMPLE.getPreferredName() + " must be positive if specified", validationException); } + if (format != FileStructure.Format.DELIMITED) { + if (columnNames != null) { + validationException = addValidationError(COLUMN_NAMES.getPreferredName() + " may only be specified if " + + FORMAT.getPreferredName() + " is " + FileStructure.Format.DELIMITED, validationException); + } + if (hasHeaderRow != null) { + validationException = addValidationError(HAS_HEADER_ROW.getPreferredName() + " may only be specified if " + + FORMAT.getPreferredName() + " is " + FileStructure.Format.DELIMITED, validationException); + } + if (delimiter != null) { + validationException = addValidationError(DELIMITER.getPreferredName() + " may only be specified if " + + FORMAT.getPreferredName() + " is " + FileStructure.Format.DELIMITED, validationException); + } + if (quote != null) { + validationException = addValidationError(QUOTE.getPreferredName() + " may only be specified if " + + FORMAT.getPreferredName() + " is " + FileStructure.Format.DELIMITED, validationException); + } + if (shouldTrimFields != null) { + validationException = addValidationError(SHOULD_TRIM_FIELDS.getPreferredName() + " may only be specified if " + + FORMAT.getPreferredName() + " is " + FileStructure.Format.DELIMITED, validationException); + } + } + if (format != FileStructure.Format.SEMI_STRUCTURED_TEXT) { + if (grokPattern != null) { + validationException = addValidationError(GROK_PATTERN.getPreferredName() + " may only be specified if " + + FORMAT.getPreferredName() + " is " + FileStructure.Format.SEMI_STRUCTURED_TEXT, validationException); + } + } if (sample == null || sample.length() == 0) { validationException = addValidationError("sample must be specified", validationException); } @@ -149,6 +308,14 @@ public 
ActionRequestValidationException validate() { public void readFrom(StreamInput in) throws IOException { super.readFrom(in); linesToSample = in.readOptionalVInt(); + charset = in.readOptionalString(); + format = in.readBoolean() ? in.readEnum(FileStructure.Format.class) : null; + columnNames = in.readBoolean() ? in.readList(StreamInput::readString) : null; + hasHeaderRow = in.readOptionalBoolean(); + delimiter = in.readBoolean() ? (char) in.readVInt() : null; + grokPattern = in.readOptionalString(); + timestampFormat = in.readOptionalString(); + timestampField = in.readOptionalString(); sample = in.readBytesReference(); } @@ -156,12 +323,36 @@ public void readFrom(StreamInput in) throws IOException { public void writeTo(StreamOutput out) throws IOException { super.writeTo(out); out.writeOptionalVInt(linesToSample); + out.writeOptionalString(charset); + if (format == null) { + out.writeBoolean(false); + } else { + out.writeBoolean(true); + out.writeEnum(format); + } + if (columnNames == null) { + out.writeBoolean(false); + } else { + out.writeBoolean(true); + out.writeCollection(columnNames, StreamOutput::writeString); + } + out.writeOptionalBoolean(hasHeaderRow); + if (delimiter == null) { + out.writeBoolean(false); + } else { + out.writeBoolean(true); + out.writeVInt(delimiter); + } + out.writeOptionalString(grokPattern); + out.writeOptionalString(timestampFormat); + out.writeOptionalString(timestampField); out.writeBytesReference(sample); } @Override public int hashCode() { - return Objects.hash(linesToSample, sample); + return Objects.hash(linesToSample, charset, format, columnNames, hasHeaderRow, delimiter, grokPattern, timestampFormat, + timestampField, sample); } @Override @@ -177,6 +368,14 @@ public boolean equals(Object other) { Request that = (Request) other; return Objects.equals(this.linesToSample, that.linesToSample) && + Objects.equals(this.charset, that.charset) && + Objects.equals(this.format, that.format) && + Objects.equals(this.columnNames, 
that.columnNames) && + Objects.equals(this.hasHeaderRow, that.hasHeaderRow) && + Objects.equals(this.delimiter, that.delimiter) && + Objects.equals(this.grokPattern, that.grokPattern) && + Objects.equals(this.timestampFormat, that.timestampFormat) && + Objects.equals(this.timestampField, that.timestampField) && Objects.equals(this.sample, that.sample); } } diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java index dd508dfb36b74..db5f29f3b1b63 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructure.java @@ -84,25 +84,26 @@ public String toString() { public static final String EXPLAIN = "explain"; - static final ParseField NUM_LINES_ANALYZED = new ParseField("num_lines_analyzed"); - static final ParseField NUM_MESSAGES_ANALYZED = new ParseField("num_messages_analyzed"); - static final ParseField SAMPLE_START = new ParseField("sample_start"); - static final ParseField CHARSET = new ParseField("charset"); - static final ParseField HAS_BYTE_ORDER_MARKER = new ParseField("has_byte_order_marker"); - static final ParseField STRUCTURE = new ParseField("format"); - static final ParseField MULTILINE_START_PATTERN = new ParseField("multiline_start_pattern"); - static final ParseField EXCLUDE_LINES_PATTERN = new ParseField("exclude_lines_pattern"); - static final ParseField COLUMN_NAMES = new ParseField("column_names"); - static final ParseField HAS_HEADER_ROW = new ParseField("has_header_row"); - static final ParseField DELIMITER = new ParseField("delimiter"); - static final ParseField SHOULD_TRIM_FIELDS = new ParseField("should_trim_fields"); - static final ParseField GROK_PATTERN = new ParseField("grok_pattern"); - static final ParseField 
TIMESTAMP_FIELD = new ParseField("timestamp_field"); - static final ParseField TIMESTAMP_FORMATS = new ParseField("timestamp_formats"); - static final ParseField NEED_CLIENT_TIMEZONE = new ParseField("need_client_timezone"); - static final ParseField MAPPINGS = new ParseField("mappings"); - static final ParseField FIELD_STATS = new ParseField("field_stats"); - static final ParseField EXPLANATION = new ParseField("explanation"); + public static final ParseField NUM_LINES_ANALYZED = new ParseField("num_lines_analyzed"); + public static final ParseField NUM_MESSAGES_ANALYZED = new ParseField("num_messages_analyzed"); + public static final ParseField SAMPLE_START = new ParseField("sample_start"); + public static final ParseField CHARSET = new ParseField("charset"); + public static final ParseField HAS_BYTE_ORDER_MARKER = new ParseField("has_byte_order_marker"); + public static final ParseField FORMAT = new ParseField("format"); + public static final ParseField MULTILINE_START_PATTERN = new ParseField("multiline_start_pattern"); + public static final ParseField EXCLUDE_LINES_PATTERN = new ParseField("exclude_lines_pattern"); + public static final ParseField COLUMN_NAMES = new ParseField("column_names"); + public static final ParseField HAS_HEADER_ROW = new ParseField("has_header_row"); + public static final ParseField DELIMITER = new ParseField("delimiter"); + public static final ParseField QUOTE = new ParseField("quote"); + public static final ParseField SHOULD_TRIM_FIELDS = new ParseField("should_trim_fields"); + public static final ParseField GROK_PATTERN = new ParseField("grok_pattern"); + public static final ParseField TIMESTAMP_FIELD = new ParseField("timestamp_field"); + public static final ParseField TIMESTAMP_FORMATS = new ParseField("timestamp_formats"); + public static final ParseField NEED_CLIENT_TIMEZONE = new ParseField("need_client_timezone"); + public static final ParseField MAPPINGS = new ParseField("mappings"); + public static final ParseField 
FIELD_STATS = new ParseField("field_stats"); + public static final ParseField EXPLANATION = new ParseField("explanation"); public static final ObjectParser PARSER = new ObjectParser<>("file_structure", false, Builder::new); @@ -112,12 +113,13 @@ public String toString() { PARSER.declareString(Builder::setSampleStart, SAMPLE_START); PARSER.declareString(Builder::setCharset, CHARSET); PARSER.declareBoolean(Builder::setHasByteOrderMarker, HAS_BYTE_ORDER_MARKER); - PARSER.declareString((p, c) -> p.setFormat(Format.fromString(c)), STRUCTURE); + PARSER.declareString((p, c) -> p.setFormat(Format.fromString(c)), FORMAT); PARSER.declareString(Builder::setMultilineStartPattern, MULTILINE_START_PATTERN); PARSER.declareString(Builder::setExcludeLinesPattern, EXCLUDE_LINES_PATTERN); PARSER.declareStringArray(Builder::setColumnNames, COLUMN_NAMES); PARSER.declareBoolean(Builder::setHasHeaderRow, HAS_HEADER_ROW); PARSER.declareString((p, c) -> p.setDelimiter(c.charAt(0)), DELIMITER); + PARSER.declareString((p, c) -> p.setQuote(c.charAt(0)), QUOTE); PARSER.declareBoolean(Builder::setShouldTrimFields, SHOULD_TRIM_FIELDS); PARSER.declareString(Builder::setGrokPattern, GROK_PATTERN); PARSER.declareString(Builder::setTimestampField, TIMESTAMP_FIELD); @@ -145,6 +147,7 @@ public String toString() { private final List columnNames; private final Boolean hasHeaderRow; private final Character delimiter; + private final Character quote; private final Boolean shouldTrimFields; private final String grokPattern; private final List timestampFormats; @@ -156,8 +159,8 @@ public String toString() { public FileStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampleStart, String charset, Boolean hasByteOrderMarker, Format format, String multilineStartPattern, String excludeLinesPattern, List columnNames, - Boolean hasHeaderRow, Character delimiter, Boolean shouldTrimFields, String grokPattern, String timestampField, - List timestampFormats, boolean needClientTimezone, Map mappings, + 
Boolean hasHeaderRow, Character delimiter, Character quote, Boolean shouldTrimFields, String grokPattern, + String timestampField, List timestampFormats, boolean needClientTimezone, Map mappings, Map fieldStats, List explanation) { this.numLinesAnalyzed = numLinesAnalyzed; @@ -171,6 +174,7 @@ public FileStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampl this.columnNames = (columnNames == null) ? null : Collections.unmodifiableList(new ArrayList<>(columnNames)); this.hasHeaderRow = hasHeaderRow; this.delimiter = delimiter; + this.quote = quote; this.shouldTrimFields = shouldTrimFields; this.grokPattern = grokPattern; this.timestampField = timestampField; @@ -193,6 +197,7 @@ public FileStructure(StreamInput in) throws IOException { columnNames = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null; hasHeaderRow = in.readOptionalBoolean(); delimiter = in.readBoolean() ? (char) in.readVInt() : null; + quote = in.readBoolean() ? (char) in.readVInt() : null; shouldTrimFields = in.readOptionalBoolean(); grokPattern = in.readOptionalString(); timestampFormats = in.readBoolean() ? 
Collections.unmodifiableList(in.readList(StreamInput::readString)) : null; @@ -226,6 +231,12 @@ public void writeTo(StreamOutput out) throws IOException { out.writeBoolean(true); out.writeVInt(delimiter); } + if (quote == null) { + out.writeBoolean(false); + } else { + out.writeBoolean(true); + out.writeVInt(quote); + } out.writeOptionalBoolean(shouldTrimFields); out.writeOptionalString(grokPattern); if (timestampFormats == null) { @@ -285,6 +296,10 @@ public Character getDelimiter() { return delimiter; } + public Character getQuote() { + return quote; + } + public Boolean getShouldTrimFields() { return shouldTrimFields; } @@ -328,7 +343,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws if (hasByteOrderMarker != null) { builder.field(HAS_BYTE_ORDER_MARKER.getPreferredName(), hasByteOrderMarker.booleanValue()); } - builder.field(STRUCTURE.getPreferredName(), format); + builder.field(FORMAT.getPreferredName(), format); if (multilineStartPattern != null && multilineStartPattern.isEmpty() == false) { builder.field(MULTILINE_START_PATTERN.getPreferredName(), multilineStartPattern); } @@ -344,6 +359,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws if (delimiter != null) { builder.field(DELIMITER.getPreferredName(), String.valueOf(delimiter)); } + if (quote != null) { + builder.field(QUOTE.getPreferredName(), String.valueOf(quote)); + } if (shouldTrimFields != null) { builder.field(SHOULD_TRIM_FIELDS.getPreferredName(), shouldTrimFields.booleanValue()); } @@ -377,8 +395,8 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws public int hashCode() { return Objects.hash(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format, - multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, shouldTrimFields, grokPattern, timestampField, - timestampFormats, needClientTimezone, mappings, fieldStats, explanation); + 
multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern, + timestampField, timestampFormats, needClientTimezone, mappings, fieldStats, explanation); } @Override @@ -405,6 +423,7 @@ public boolean equals(Object other) { Objects.equals(this.columnNames, that.columnNames) && Objects.equals(this.hasHeaderRow, that.hasHeaderRow) && Objects.equals(this.delimiter, that.delimiter) && + Objects.equals(this.quote, that.quote) && Objects.equals(this.shouldTrimFields, that.shouldTrimFields) && Objects.equals(this.grokPattern, that.grokPattern) && Objects.equals(this.timestampField, that.timestampField) && @@ -427,6 +446,7 @@ public static class Builder { private List columnNames; private Boolean hasHeaderRow; private Character delimiter; + private Character quote; private Boolean shouldTrimFields; private String grokPattern; private String timestampField; @@ -499,6 +519,11 @@ public Builder setDelimiter(Character delimiter) { return this; } + public Builder setQuote(Character quote) { + this.quote = quote; + return this; + } + public Builder setShouldTrimFields(Boolean shouldTrimFields) { this.shouldTrimFields = shouldTrimFields; return this; @@ -582,6 +607,9 @@ public FileStructure build() { if (delimiter != null) { throw new IllegalArgumentException("Delimiter may not be specified for [" + format + "] structures."); } + if (quote != null) { + throw new IllegalArgumentException("Quote may not be specified for [" + format + "] structures."); + } if (grokPattern != null) { throw new IllegalArgumentException("Grok pattern may not be specified for [" + format + "] structures."); } @@ -610,6 +638,9 @@ public FileStructure build() { if (delimiter != null) { throw new IllegalArgumentException("Delimiter may not be specified for [" + format + "] structures."); } + if (quote != null) { + throw new IllegalArgumentException("Quote may not be specified for [" + format + "] structures."); + } if (shouldTrimFields != null) { 
throw new IllegalArgumentException("Should trim fields may not be specified for [" + format + "] structures."); } @@ -638,7 +669,7 @@ public FileStructure build() { } return new FileStructure(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format, - multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, shouldTrimFields, grokPattern, + multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern, timestampField, timestampFormats, needClientTimezone, mappings, fieldStats, explanation); } } diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java index 05ba0e7f306f4..90a4b656a7486 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java @@ -8,6 +8,9 @@ import org.elasticsearch.action.ActionRequestValidationException; import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.test.AbstractStreamableTestCase; +import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; + +import java.util.Arrays; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.startsWith; @@ -22,6 +25,44 @@ protected FindFileStructureAction.Request createTestInstance() { if (randomBoolean()) { request.setLinesToSample(randomIntBetween(10, 2000)); } + + if (randomBoolean()) { + request.setCharset(randomAlphaOfLength(10)); + } + + if (randomBoolean()) { + FileStructure.Format format = randomFrom(FileStructure.Format.values()); + request.setFormat(format); + if (format == FileStructure.Format.DELIMITED) { + if (randomBoolean()) { + 
request.setColumnNames(generateRandomStringArray(10, 15, false, false)); + } + if (randomBoolean()) { + request.setHasHeaderRow(randomBoolean()); + } + if (randomBoolean()) { + request.setDelimiter(randomFrom(',', '\t', ';', '|')); + } + if (randomBoolean()) { + request.setQuote(randomFrom('"', '\'')); + } + if (randomBoolean()) { + request.setShouldTrimFields(randomBoolean()); + } + } else if (format == FileStructure.Format.SEMI_STRUCTURED_TEXT) { + if (randomBoolean()) { + request.setGrokPattern(randomAlphaOfLength(80)); + } + } + } + + if (randomBoolean()) { + request.setTimestampFormat(randomAlphaOfLength(20)); + } + if (randomBoolean()) { + request.setTimestampField(randomAlphaOfLength(15)); + } + request.setSample(new BytesArray(randomByteArrayOfLength(randomIntBetween(1000, 20000)))); return request; @@ -35,7 +76,7 @@ protected FindFileStructureAction.Request createBlankInstance() { public void testValidateLinesToSample() { FindFileStructureAction.Request request = new FindFileStructureAction.Request(); - request.setLinesToSample(randomFrom(-1, 0)); + request.setLinesToSample(randomIntBetween(-1, 0)); request.setSample(new BytesArray("foo\n")); ActionRequestValidationException e = request.validate(); @@ -44,6 +85,55 @@ public void testValidateLinesToSample() { assertThat(e.getMessage(), containsString(" lines_to_sample must be positive if specified")); } + public void testValidateNonDelimited() { + + FindFileStructureAction.Request request = new FindFileStructureAction.Request(); + String errorField; + switch (randomIntBetween(0, 4)) { + case 0: + errorField = "column_names"; + request.setColumnNames(Arrays.asList("col1", "col2")); + break; + case 1: + errorField = "has_header_row"; + request.setHasHeaderRow(randomBoolean()); + break; + case 2: + errorField = "delimiter"; + request.setDelimiter(randomFrom(',', '\t', ';', '|')); + break; + case 3: + errorField = "quote"; + request.setQuote(randomFrom('"', '\'')); + break; + case 4: + errorField = 
"should_trim_fields"; + request.setShouldTrimFields(randomBoolean()); + break; + default: + throw new IllegalStateException("unexpected switch value"); + } + request.setSample(new BytesArray("foo\n")); + + ActionRequestValidationException e = request.validate(); + assertNotNull(e); + assertThat(e.getMessage(), startsWith("Validation Failed: ")); + assertThat(e.getMessage(), containsString(" " + errorField + " may only be specified if format is delimited")); + } + + public void testValidateNonSemiStructuredText() { + + FindFileStructureAction.Request request = new FindFileStructureAction.Request(); + request.setFormat(randomFrom(FileStructure.Format.JSON, FileStructure.Format.XML, FileStructure.Format.DELIMITED)); + request.setGrokPattern(randomAlphaOfLength(80)); + request.setSample(new BytesArray("foo\n")); + + ActionRequestValidationException e = request.validate(); + assertNotNull(e); + assertThat(e.getMessage(), startsWith("Validation Failed: ")); + assertThat(e.getMessage(), containsString(" grok_pattern may only be specified if format is semi_structured_text")); + } + public void testValidateSample() { FindFileStructureAction.Request request = new FindFileStructureAction.Request(); diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java index e09b9e3f91e7a..ac6c647136bd7 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/filestructurefinder/FileStructureTests.java @@ -54,6 +54,7 @@ public static FileStructure createTestFileStructure() { builder.setColumnNames(Arrays.asList(generateRandomStringArray(10, 10, false, false))); builder.setHasHeaderRow(randomBoolean()); builder.setDelimiter(randomFrom(',', '\t', ';', '|')); + 
builder.setQuote(randomFrom('"', '\'')); } if (format == FileStructure.Format.SEMI_STRUCTURED_TEXT) { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java index 66d07f5111c52..ec37a2b7481f6 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java @@ -17,6 +17,7 @@ import org.elasticsearch.xpack.ml.MachineLearning; import org.elasticsearch.xpack.ml.filestructurefinder.FileStructureFinder; import org.elasticsearch.xpack.ml.filestructurefinder.FileStructureFinderManager; +import org.elasticsearch.xpack.ml.filestructurefinder.FileStructureOverrides; public class TransportFindFileStructureAction extends HandledTransportAction { @@ -49,8 +50,8 @@ private FindFileStructureAction.Response buildFileStructureResponse(FindFileStru FileStructureFinderManager structureFinderManager = new FileStructureFinderManager(); - FileStructureFinder fileStructureFinder = - structureFinderManager.findFileStructure(request.getLinesToSample(), request.getSample().streamInput()); + FileStructureFinder fileStructureFinder = structureFinderManager.findFileStructure(request.getLinesToSample(), + request.getSample().streamInput(), new FileStructureOverrides(request)); return new FindFileStructureAction.Response(fileStructureFinder.getStructure()); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java index ba6b590dfc8cd..f7cff3c2cba85 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java +++ 
b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java @@ -40,21 +40,35 @@ public class DelimitedFileStructureFinder implements FileStructureFinder { static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, CsvPreference csvPreference, - boolean trimFields) throws IOException { + boolean trimFields, FileStructureOverrides overrides) + throws IOException { Tuple>, List> parsed = readRows(sample, csvPreference); List> rows = parsed.v1(); List lineNumbers = parsed.v2(); - Tuple headerInfo = findHeaderFromSample(explanation, rows); + // Even if the column names are overridden we need to know if there's a + // header in the file, as it affects which rows are considered records + Tuple headerInfo = findHeaderFromSample(explanation, rows, overrides); boolean isHeaderInFile = headerInfo.v1(); String[] header = headerInfo.v2(); - // The column names are the header names but with blanks named column1, column2, etc. - String[] columnNames = new String[header.length]; - for (int i = 0; i < header.length; ++i) { - assert header[i] != null; - String rawHeader = trimFields ? header[i].trim() : header[i]; - columnNames[i] = rawHeader.isEmpty() ? "column" + (i + 1) : rawHeader; + + String[] columnNames; + List overriddenColumnNames = overrides.getColumnNames(); + if (overriddenColumnNames != null) { + if (overriddenColumnNames.size() != header.length) { + throw new IllegalArgumentException("[" + overriddenColumnNames.size() + "] column names were specified [" + + String.join(",", overriddenColumnNames) + "] but there are [" + header.length + "] columns in the sample"); + } + columnNames = overriddenColumnNames.toArray(new String[overriddenColumnNames.size()]); + } else { + // The column names are the header names but with blanks named column1, column2, etc. 
+ columnNames = new String[header.length]; + for (int i = 0; i < header.length; ++i) { + assert header[i] != null; + String rawHeader = trimFields ? header[i].trim() : header[i]; + columnNames[i] = rawHeader.isEmpty() ? "column" + (i + 1) : rawHeader; + } } List sampleLines = Arrays.asList(sample.split("\n")); @@ -84,13 +98,14 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords); + Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides); if (timeField != null) { String timeLineRegex = null; StringBuilder builder = new StringBuilder("^"); @@ -98,7 +113,7 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List "\"?" + column.replace("\"", "\"\"").replaceAll("([\\\\|()\\[\\]{}^$*?])", "\\\\$1") + "\"?") + .map(column -> optQuote + column.replace(quote, twoQuotes).replaceAll("([\\\\|()\\[\\]{}^$*?])", "\\\\$1") + optQuote) .collect(Collectors.joining(","))); } @@ -131,7 +149,10 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List mappings = mappingsAndFieldStats.v1(); - mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + if (timeField != null) { + mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, + Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + } if (mappingsAndFieldStats.v2() != null) { structureBuilder.setFieldStats(mappingsAndFieldStats.v2()); @@ -205,45 +226,61 @@ static Tuple>, List> readRows(String sample, CsvPrefe return new Tuple<>(rows, lineNumbers); } - static Tuple findHeaderFromSample(List explanation, List> rows) { + static Tuple findHeaderFromSample(List explanation, List> rows, + FileStructureOverrides overrides) { assert rows.isEmpty() == false; + List overriddenColumnNames = overrides.getColumnNames(); List firstRow = rows.get(0); boolean 
isHeaderInFile = true; - if (rowContainsDuplicateNonEmptyValues(firstRow)) { - isHeaderInFile = false; - explanation.add("First row contains duplicate values, so assuming it's not a header"); + if (overrides.getHasHeaderRow() != null) { + isHeaderInFile = overrides.getHasHeaderRow(); + if (isHeaderInFile && overriddenColumnNames == null) { + String duplicateValue = findDuplicateNonEmptyValues(firstRow); + if (duplicateValue != null) { + throw new IllegalArgumentException("Sample specified to contain a header row, " + + "but the first row contains duplicate values: [" + duplicateValue + "]"); + } + } + explanation.add("Sample specified to " + (isHeaderInFile ? "contain" : "not contain") + " a header row"); } else { - if (rows.size() < 3) { - explanation.add("Too little data to accurately assess whether header is in sample - guessing it is"); + if (findDuplicateNonEmptyValues(firstRow) != null) { + isHeaderInFile = false; + explanation.add("First row contains duplicate values, so assuming it's not a header"); } else { - isHeaderInFile = isFirstRowUnusual(explanation, rows); + if (rows.size() < 3) { + explanation.add("Too little data to accurately assess whether header is in sample - guessing it is"); + } else { + isHeaderInFile = isFirstRowUnusual(explanation, rows); + } } } + String[] header; if (isHeaderInFile) { // SuperCSV will put nulls in the header if any columns don't have names, but empty strings are better for us - return new Tuple<>(true, firstRow.stream().map(field -> (field == null) ? "" : field).toArray(String[]::new)); + header = firstRow.stream().map(field -> (field == null) ? 
"" : field).toArray(String[]::new); } else { - String[] dummyHeader = new String[firstRow.size()]; - Arrays.fill(dummyHeader, ""); - return new Tuple<>(false, dummyHeader); + header = new String[firstRow.size()]; + Arrays.fill(header, ""); } + + return new Tuple<>(isHeaderInFile, header); } - static boolean rowContainsDuplicateNonEmptyValues(List row) { + static String findDuplicateNonEmptyValues(List row) { HashSet values = new HashSet<>(); for (String value : row) { if (value != null && value.isEmpty() == false && values.add(value) == false) { - return true; + return value; } } - return false; + return null; } private static boolean isFirstRowUnusual(List explanation, List> rows) { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java index 0bbe13e3b05c3..62e5eff517e90 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java @@ -5,6 +5,7 @@ */ package org.elasticsearch.xpack.ml.filestructurefinder; +import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; import org.supercsv.prefs.CsvPreference; import java.io.IOException; @@ -17,12 +18,23 @@ public class DelimitedFileStructureFinderFactory implements FileStructureFinderF private final int minFieldsPerRow; private final boolean trimFields; - DelimitedFileStructureFinderFactory(char delimiter, int minFieldsPerRow, boolean trimFields) { - csvPreference = new CsvPreference.Builder('"', delimiter, "\n").build(); + DelimitedFileStructureFinderFactory(char delimiter, char quote, int minFieldsPerRow, boolean trimFields) { + csvPreference = new CsvPreference.Builder(quote, delimiter, "\n").build(); 
this.minFieldsPerRow = minFieldsPerRow; this.trimFields = trimFields; } + DelimitedFileStructureFinderFactory makeSimilar(Character quote, Boolean trimFields) { + + return new DelimitedFileStructureFinderFactory((char) csvPreference.getDelimiterChar(), + (quote == null) ? csvPreference.getQuoteChar() : quote, minFieldsPerRow, (trimFields == null) ? this.trimFields : trimFields); + } + + @Override + public boolean canFindFormat(FileStructure.Format format) { + return format == null || format == FileStructure.Format.DELIMITED; + } + /** * Rules are: * - It must contain at least two complete records @@ -49,9 +61,9 @@ public boolean canCreateFromSample(List explanation, String sample) { } @Override - public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker) - throws IOException { + public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, + FileStructureOverrides overrides) throws IOException { return DelimitedFileStructureFinder.makeDelimitedFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, - csvPreference, trimFields); + csvPreference, trimFields, overrides); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java index 4b6fce322ee1d..bff4b2115b0fd 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java @@ -5,10 +5,20 @@ */ package org.elasticsearch.xpack.ml.filestructurefinder; +import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; + import java.util.List; public interface FileStructureFinderFactory { + /** + * Can this factory 
create a {@link FileStructureFinder} that can find the supplied format? + * @param format The format to query, or null. + * @return true if {@code format} is null or the factory + * can produce a {@link FileStructureFinder} that can find {@code format}. + */ + boolean canFindFormat(FileStructure.Format format); + /** * Given a sample of a file, decide whether this factory will be able * to create an appropriate object to represent its ingestion configs. @@ -27,9 +37,11 @@ public interface FileStructureFinderFactory { * @param sample A sample from the file to be ingested. * @param charsetName The name of the character set in which the sample was provided. * @param hasByteOrderMarker Did the sample have a byte order marker? null means "not relevant". - * @return A file structure object suitable for ingesting the supplied sample. + * @param overrides Stores structure decisions that have been made by the end user, and should + * take precedence over anything the {@link FileStructureFinder} may decide. + * @return A {@link FileStructureFinder} object suitable for determining the structure of the supplied sample. * @throws Exception if something goes wrong during creation. 
*/ - FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker) - throws Exception; + FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, + FileStructureOverrides overrides) throws Exception; } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java index d0ce68aff25c0..7949998d16e01 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java @@ -13,6 +13,7 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; @@ -24,6 +25,7 @@ import java.util.Locale; import java.util.Optional; import java.util.Set; +import java.util.stream.Collectors; /** * Runs the high-level steps needed to create ingest configs for the specified file. 
In order: @@ -70,15 +72,19 @@ public final class FileStructureFinderManager { new JsonFileStructureFinderFactory(), new XmlFileStructureFinderFactory(), // ND-JSON will often also be valid (although utterly weird) CSV, so JSON must come before CSV - new DelimitedFileStructureFinderFactory(',', 2, false), - new DelimitedFileStructureFinderFactory('\t', 2, false), - new DelimitedFileStructureFinderFactory(';', 4, false), - new DelimitedFileStructureFinderFactory('|', 5, true), + new DelimitedFileStructureFinderFactory(',', '"', 2, false), + new DelimitedFileStructureFinderFactory('\t', '"', 2, false), + new DelimitedFileStructureFinderFactory(';', '"', 4, false), + new DelimitedFileStructureFinderFactory('|', '"', 5, true), new TextLogFileStructureFinderFactory() )); private static final int BUFFER_SIZE = 8192; + public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile) throws Exception { + return findFileStructure(idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES); + } + /** * Given a stream of data from some file, determine its structure. * @param idealSampleLineCount Ideally, how many lines from the stream will be read to determine the structure? @@ -86,24 +92,42 @@ public final class FileStructureFinderManager { * least {@link #MIN_SAMPLE_LINE_COUNT} lines can be read. If null * the value of {@link #DEFAULT_IDEAL_SAMPLE_LINE_COUNT} will be used. * @param fromFile A stream from which the sample will be read. + * @param overrides Aspects of the file structure that are known in advance. These take precedence over + * values determined by structure analysis. An exception will be thrown if the file structure + * is incompatible with an overridden value. * @return A {@link FileStructureFinder} object from which the structure and messages can be queried. * @throws Exception A variety of problems could occur at various stages of the structure finding process. 
*/ - public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile) throws Exception { + public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile, FileStructureOverrides overrides) + throws Exception { return findFileStructure(new ArrayList<>(), (idealSampleLineCount == null) ? DEFAULT_IDEAL_SAMPLE_LINE_COUNT : idealSampleLineCount, - fromFile); + fromFile, overrides); } public FileStructureFinder findFileStructure(List explanation, int idealSampleLineCount, InputStream fromFile) throws Exception { + return findFileStructure(explanation, idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES); + } + + public FileStructureFinder findFileStructure(List explanation, int idealSampleLineCount, InputStream fromFile, + FileStructureOverrides overrides) throws Exception { - CharsetMatch charsetMatch = findCharset(explanation, fromFile); - String charsetName = charsetMatch.getName(); + String charsetName = overrides.getCharset(); + Reader sampleReader; + if (charsetName != null) { + // Creating the reader will throw if the specified character set does not exist + sampleReader = new InputStreamReader(fromFile, charsetName); + explanation.add("Using specified character encoding [" + charsetName + "]"); + } else { + CharsetMatch charsetMatch = findCharset(explanation, fromFile); + charsetName = charsetMatch.getName(); + sampleReader = charsetMatch.getReader(); + } - Tuple sampleInfo = sampleFile(charsetMatch.getReader(), charsetName, MIN_SAMPLE_LINE_COUNT, + Tuple sampleInfo = sampleFile(sampleReader, charsetName, MIN_SAMPLE_LINE_COUNT, Math.max(MIN_SAMPLE_LINE_COUNT, idealSampleLineCount)); - return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2()); + return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2(), overrides); } CharsetMatch findCharset(List explanation, InputStream inputStream) throws Exception { @@
-195,15 +219,44 @@ CharsetMatch findCharset(List explanation, InputStream inputStream) thro (containsZeroBytes ? " - could it be binary data?" : "")); } - FileStructureFinder makeBestStructureFinder(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker) - throws Exception { + FileStructureFinder makeBestStructureFinder(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, + FileStructureOverrides overrides) throws Exception { - for (FileStructureFinderFactory factory : ORDERED_STRUCTURE_FACTORIES) { + Character delimiter = overrides.getDelimiter(); + Character quote = overrides.getQuote(); + Boolean shouldTrimFields = overrides.getShouldTrimFields(); + List factories; + if (delimiter != null) { + + // If a precise delimiter is specified, we only need one structure finder + // factory, and we'll tolerate as little as one column in the input + factories = Collections.singletonList(new DelimitedFileStructureFinderFactory(delimiter, (quote == null) ? '"' : quote, 1, + (shouldTrimFields == null) ? 
(delimiter == '|') : shouldTrimFields)); + + } else if (quote != null || shouldTrimFields != null) { + + // The delimiter is not specified, but some other aspect of delimited files is, + // so clone our default delimited factories altering the overridden values + factories = ORDERED_STRUCTURE_FACTORIES.stream().filter(factory -> factory instanceof DelimitedFileStructureFinderFactory) + .map(factory -> ((DelimitedFileStructureFinderFactory) factory).makeSimilar(quote, shouldTrimFields)) + .collect(Collectors.toList()); + + } else { + + // We can use the default factories, but possibly filtered down to a specific format + factories = ORDERED_STRUCTURE_FACTORIES.stream() + .filter(factory -> factory.canFindFormat(overrides.getFormat())).collect(Collectors.toList()); + + } + + for (FileStructureFinderFactory factory : factories) { if (factory.canCreateFromSample(explanation, sample)) { - return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker); + return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, overrides); } } - throw new IllegalArgumentException("Input did not match any known formats"); + + throw new IllegalArgumentException("Input did not match " + + ((overrides.getFormat() == null) ? 
"any known formats" : "the specified format [" + overrides.getFormat() + "]")); } private Tuple sampleFile(Reader reader, String charsetName, int minLines, int maxLines) throws IOException { @@ -233,7 +286,7 @@ private Tuple sampleFile(Reader reader, String charsetName, int } if (lineCount < minLines) { - throw new IllegalArgumentException("Input contained too few lines to sample"); + throw new IllegalArgumentException("Input contained too few lines [" + lineCount + "] to obtain a meaningful sample"); } return new Tuple<>(sample.toString(), hasByteOrderMarker); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureOverrides.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureOverrides.java new file mode 100644 index 0000000000000..138dd4c49a4b2 --- /dev/null +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureOverrides.java @@ -0,0 +1,197 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ +package org.elasticsearch.xpack.ml.filestructurefinder; + +import org.elasticsearch.xpack.core.ml.action.FindFileStructureAction; +import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +public class FileStructureOverrides { + + public static final FileStructureOverrides EMPTY_OVERRIDES = new Builder().build(); + + private final String charset; + private final FileStructure.Format format; + private final List columnNames; + private final Boolean hasHeaderRow; + private final Character delimiter; + private final Character quote; + private final Boolean shouldTrimFields; + private final String grokPattern; + private final String timestampFormat; + private final String timestampField; + + public FileStructureOverrides(FindFileStructureAction.Request request) { + + this(request.getCharset(), request.getFormat(), request.getColumnNames(), request.getHasHeaderRow(), request.getDelimiter(), + request.getQuote(), request.getShouldTrimFields(), request.getGrokPattern(), request.getTimestampFormat(), + request.getTimestampField()); + } + + private FileStructureOverrides(String charset, FileStructure.Format format, List columnNames, Boolean hasHeaderRow, + Character delimiter, Character quote, Boolean shouldTrimFields, String grokPattern, + String timestampFormat, String timestampField) { + this.charset = charset; + this.format = format; + this.columnNames = (columnNames == null) ? 
null : Collections.unmodifiableList(new ArrayList<>(columnNames)); + this.hasHeaderRow = hasHeaderRow; + this.delimiter = delimiter; + this.quote = quote; + this.shouldTrimFields = shouldTrimFields; + this.grokPattern = grokPattern; + this.timestampFormat = timestampFormat; + this.timestampField = timestampField; + } + + public static Builder builder() { + return new Builder(); + } + + public String getCharset() { + return charset; + } + + public FileStructure.Format getFormat() { + return format; + } + + public List getColumnNames() { + return columnNames; + } + + public Boolean getHasHeaderRow() { + return hasHeaderRow; + } + + public Character getDelimiter() { + return delimiter; + } + + public Character getQuote() { + return quote; + } + + public Boolean getShouldTrimFields() { + return shouldTrimFields; + } + + public String getGrokPattern() { + return grokPattern; + } + + public String getTimestampFormat() { + return timestampFormat; + } + + public String getTimestampField() { + return timestampField; + } + + @Override + public int hashCode() { + + return Objects.hash(charset, format, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern, timestampFormat, + timestampField); + } + + @Override + public boolean equals(Object other) { + + if (this == other) { + return true; + } + + if (other == null || getClass() != other.getClass()) { + return false; + } + + FileStructureOverrides that = (FileStructureOverrides) other; + return Objects.equals(this.charset, that.charset) && + Objects.equals(this.format, that.format) && + Objects.equals(this.columnNames, that.columnNames) && + Objects.equals(this.hasHeaderRow, that.hasHeaderRow) && + Objects.equals(this.delimiter, that.delimiter) && + Objects.equals(this.quote, that.quote) && + Objects.equals(this.shouldTrimFields, that.shouldTrimFields) && + Objects.equals(this.grokPattern, that.grokPattern) && + Objects.equals(this.timestampFormat, that.timestampFormat) && + 
Objects.equals(this.timestampField, that.timestampField); + } + + public static class Builder { + + private String charset; + private FileStructure.Format format; + private List columnNames; + private Boolean hasHeaderRow; + private Character delimiter; + private Character quote; + private Boolean shouldTrimFields; + private String grokPattern; + private String timestampFormat; + private String timestampField; + + public Builder setCharset(String charset) { + this.charset = charset; + return this; + } + + public Builder setFormat(FileStructure.Format format) { + this.format = format; + return this; + } + + public Builder setColumnNames(List columnNames) { + this.columnNames = columnNames; + return this; + } + + public Builder setHasHeaderRow(Boolean hasHeaderRow) { + this.hasHeaderRow = hasHeaderRow; + return this; + } + + public Builder setDelimiter(Character delimiter) { + this.delimiter = delimiter; + return this; + } + + public Builder setQuote(Character quote) { + this.quote = quote; + return this; + } + + public Builder setShouldTrimFields(Boolean shouldTrimFields) { + this.shouldTrimFields = shouldTrimFields; + return this; + } + + public Builder setGrokPattern(String grokPattern) { + this.grokPattern = grokPattern; + return this; + } + + public Builder setTimestampFormat(String timestampFormat) { + this.timestampFormat = timestampFormat; + return this; + } + + public Builder setTimestampField(String timestampField) { + this.timestampField = timestampField; + return this; + } + + public FileStructureOverrides build() { + + return new FileStructureOverrides(charset, format, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern, + timestampFormat, timestampField); + } + } +} diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java index 0341e03a20bc6..6f9b4cbbd6427 100644 --- 
a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java @@ -54,26 +54,35 @@ private FileStructureUtils() { * @return A tuple of (field name, timestamp format) if one can be found, or null if * there is no consistent timestamp. */ - static Tuple guessTimestampField(List explanation, List> sampleRecords) { + static Tuple guessTimestampField(List explanation, List> sampleRecords, + FileStructureOverrides overrides) { if (sampleRecords.isEmpty()) { return null; } // Accept the first match from the first sample that is compatible with all the other samples - for (Tuple candidate : findCandidates(explanation, sampleRecords)) { + for (Tuple candidate : findCandidates(explanation, sampleRecords, overrides)) { boolean allGood = true; for (Map sampleRecord : sampleRecords.subList(1, sampleRecords.size())) { Object fieldValue = sampleRecord.get(candidate.v1()); if (fieldValue == null) { + if (overrides.getTimestampField() != null) { + throw new IllegalArgumentException("Specified timestamp field [" + overrides.getTimestampField() + + "] is not present in record [" + sampleRecord + "]"); + } explanation.add("First sample match [" + candidate.v1() + "] ruled out because record [" + sampleRecord + "] doesn't have field"); allGood = false; break; } - TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(fieldValue.toString()); + TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(fieldValue.toString(), overrides.getTimestampFormat()); if (match == null || match.candidateIndex != candidate.v2().candidateIndex) { + if (overrides.getTimestampFormat() != null) { + throw new IllegalArgumentException("Specified timestamp format [" + overrides.getTimestampFormat() + + "] does not match for record [" + sampleRecord + "]"); + } explanation.add("First sample match [" + candidate.v1() + "] ruled out because record 
[" + sampleRecord + "] matches differently: [" + match + "]"); allGood = false; @@ -82,7 +91,8 @@ static Tuple guessTimestampField(List explanatio } if (allGood) { - explanation.add("Guessing timestamp field is [" + candidate.v1() + "] with format [" + candidate.v2() + "]"); + explanation.add(((overrides.getTimestampField() == null) ? "Guessing timestamp" : "Timestamp") + + " field is [" + candidate.v1() + "] with format [" + candidate.v2() + "]"); return candidate; } } @@ -90,23 +100,41 @@ static Tuple guessTimestampField(List explanatio return null; } - private static List> findCandidates(List explanation, List> sampleRecords) { + private static List> findCandidates(List explanation, List> sampleRecords, + FileStructureOverrides overrides) { + + assert sampleRecords.isEmpty() == false; + Map firstRecord = sampleRecords.get(0); + + String onlyConsiderField = overrides.getTimestampField(); + if (onlyConsiderField != null && firstRecord.get(onlyConsiderField) == null) { + throw new IllegalArgumentException("Specified timestamp field [" + overrides.getTimestampField() + + "] is not present in record [" + firstRecord + "]"); + } List> candidates = new ArrayList<>(); - // Get candidate timestamps from the first sample record - for (Map.Entry entry : sampleRecords.get(0).entrySet()) { - Object value = entry.getValue(); - if (value != null) { - TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(value.toString()); - if (match != null) { - Tuple candidate = new Tuple<>(entry.getKey(), match); - candidates.add(candidate); - explanation.add("First sample timestamp match [" + candidate + "]"); + // Get candidate timestamps from the possible field(s) of the first sample record + for (Map.Entry field : firstRecord.entrySet()) { + String fieldName = field.getKey(); + if (onlyConsiderField == null || onlyConsiderField.equals(fieldName)) { + Object value = field.getValue(); + if (value != null) { + TimestampMatch match = 
TimestampFormatFinder.findFirstFullMatch(value.toString(), overrides.getTimestampFormat()); + if (match != null) { + Tuple candidate = new Tuple<>(fieldName, match); + candidates.add(candidate); + explanation.add("First sample timestamp match [" + candidate + "]"); + } } } } + if (candidates.isEmpty() && overrides.getTimestampFormat() != null) { + throw new IllegalArgumentException("Specified timestamp format [" + overrides.getTimestampFormat() + + "] does not match for record [" + firstRecord + "]"); + } + return candidates; } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java index 292d0b8e8b305..54be5079c9d2c 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java @@ -48,21 +48,21 @@ public final class GrokPatternCreator { * Grok patterns that are designed to match the whole message, not just a part of it. 
*/ private static final List FULL_MATCH_GROK_PATTERNS = Arrays.asList( - new FullMatchGrokPatternCandidate("BACULA_LOGLINE", "bts"), - new FullMatchGrokPatternCandidate("CATALINALOG", "timestamp"), - new FullMatchGrokPatternCandidate("COMBINEDAPACHELOG", "timestamp"), - new FullMatchGrokPatternCandidate("COMMONAPACHELOG", "timestamp"), - new FullMatchGrokPatternCandidate("ELB_ACCESS_LOG", "timestamp"), - new FullMatchGrokPatternCandidate("HAPROXYHTTP", "syslog_timestamp"), - new FullMatchGrokPatternCandidate("HAPROXYTCP", "syslog_timestamp"), - new FullMatchGrokPatternCandidate("HTTPD20_ERRORLOG", "timestamp"), - new FullMatchGrokPatternCandidate("HTTPD24_ERRORLOG", "timestamp"), - new FullMatchGrokPatternCandidate("NAGIOSLOGLINE", "nagios_epoch"), - new FullMatchGrokPatternCandidate("NETSCREENSESSIONLOG", "date"), - new FullMatchGrokPatternCandidate("RAILS3", "timestamp"), - new FullMatchGrokPatternCandidate("RUBY_LOGGER", "timestamp"), - new FullMatchGrokPatternCandidate("SHOREWALL", "timestamp"), - new FullMatchGrokPatternCandidate("TOMCATLOG", "timestamp") + FullMatchGrokPatternCandidate.fromGrokPatternName("BACULA_LOGLINE", "bts"), + FullMatchGrokPatternCandidate.fromGrokPatternName("CATALINALOG", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("COMBINEDAPACHELOG", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("COMMONAPACHELOG", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("ELB_ACCESS_LOG", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("HAPROXYHTTP", "syslog_timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("HAPROXYTCP", "syslog_timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("HTTPD20_ERRORLOG", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("HTTPD24_ERRORLOG", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("NAGIOSLOGLINE", "nagios_epoch"), + FullMatchGrokPatternCandidate.fromGrokPatternName("NETSCREENSESSIONLOG", 
"date"), + FullMatchGrokPatternCandidate.fromGrokPatternName("RAILS3", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("RUBY_LOGGER", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("SHOREWALL", "timestamp"), + FullMatchGrokPatternCandidate.fromGrokPatternName("TOMCATLOG", "timestamp") ); /** @@ -87,7 +87,7 @@ public final class GrokPatternCreator { // Can't use \b as the breaks, because slashes are not "word" characters new ValueOnlyGrokPatternCandidate("PATH", "keyword", "path", "(? explanation, Collection sampleMes /** * This method attempts to find a Grok pattern that will match all of the sample messages in their entirety. * It will also update mappings and field stats if they are non-null. + * @param timestampField If not null then the chosen Grok pattern must use this timestamp field. * @return A tuple of (time field name, Grok string), or null if no suitable Grok pattern was found. */ - public Tuple findFullLineGrokPattern() { + public Tuple findFullLineGrokPattern(String timestampField) { for (FullMatchGrokPatternCandidate candidate : FULL_MATCH_GROK_PATTERNS) { - if (candidate.matchesAll(sampleMessages)) { - return candidate.processMatch(explanation, sampleMessages, mappings, fieldStats); + if (timestampField == null || timestampField.equals(candidate.getTimeField())) { + if (candidate.matchesAll(sampleMessages)) { + return candidate.processMatch(explanation, sampleMessages, mappings, fieldStats); + } } } return null; } + /** + * This method processes a user-supplied Grok pattern that will match all of the sample messages in their entirety. + * It will also update mappings and field stats if they are non-null. + * @param grokPattern The user supplied Grok pattern. + * @param timestampField The name of the timestamp field within the Grok pattern. + * @throws IllegalArgumentException If the supplied Grok pattern does not match the sample messages. 
+ */ + public void validateFullLineGrokPattern(String grokPattern, String timestampField) { + + FullMatchGrokPatternCandidate candidate = FullMatchGrokPatternCandidate.fromGrokPattern(grokPattern, timestampField); + if (candidate.matchesAll(sampleMessages)) { + candidate.processMatch(explanation, sampleMessages, mappings, fieldStats); + } else { + throw new IllegalArgumentException("Supplied Grok pattern [" + grokPattern + "] does not match sample messages"); + } + } + /** * Build a Grok pattern that will match all of the sample messages in their entirety. * @param seedPatternName A pattern that has already been determined to match some portion of every sample message. @@ -564,14 +584,26 @@ public String processCaptures(Map fieldNameCountStore, Collecti */ static class FullMatchGrokPatternCandidate { - private final String grokString; + private final String grokPattern; private final String timeField; private final Grok grok; - FullMatchGrokPatternCandidate(String grokPatternName, String timeField) { - grokString = "%{" + grokPatternName + "}"; + static FullMatchGrokPatternCandidate fromGrokPatternName(String grokPatternName, String timeField) { + return new FullMatchGrokPatternCandidate("%{" + grokPatternName + "}", timeField); + } + + static FullMatchGrokPatternCandidate fromGrokPattern(String grokPattern, String timeField) { + return new FullMatchGrokPatternCandidate(grokPattern, timeField); + } + + private FullMatchGrokPatternCandidate(String grokPattern, String timeField) { + this.grokPattern = grokPattern; this.timeField = timeField; - grok = new Grok(Grok.getBuiltinPatterns(), grokString); + grok = new Grok(Grok.getBuiltinPatterns(), grokPattern); + } + + public String getTimeField() { + return timeField; } public boolean matchesAll(Collection sampleMessages) { @@ -585,7 +617,7 @@ public boolean matchesAll(Collection sampleMessages) { public Tuple processMatch(List explanation, Collection sampleMessages, Map mappings, Map fieldStats) { - explanation.add("A 
full message Grok pattern [" + grokString.substring(2, grokString.length() - 1) + "] looks appropriate"); + explanation.add("A full message Grok pattern [" + grokPattern.substring(2, grokPattern.length() - 1) + "] looks appropriate"); if (mappings != null || fieldStats != null) { Map> valuesPerField = new HashMap<>(); @@ -594,41 +626,39 @@ public Tuple processMatch(List explanation, Collection captures = grok.captures(sampleMessage); // If the pattern doesn't match then captures will be null if (captures == null) { - throw new IllegalStateException("[" + grokString + "] does not match snippet [" + sampleMessage + "]"); + throw new IllegalStateException("[" + grokPattern + "] does not match snippet [" + sampleMessage + "]"); } for (Map.Entry capture : captures.entrySet()) { String fieldName = capture.getKey(); String fieldValue = capture.getValue().toString(); - - // Exclude the time field because that will be dropped and replaced with @timestamp - if (fieldName.equals(timeField) == false) { - valuesPerField.compute(fieldName, (k, v) -> { - if (v == null) { - return new ArrayList<>(Collections.singletonList(fieldValue)); - } else { - v.add(fieldValue); - return v; - } - }); - } + valuesPerField.compute(fieldName, (k, v) -> { + if (v == null) { + return new ArrayList<>(Collections.singletonList(fieldValue)); + } else { + v.add(fieldValue); + return v; + } + }); } } for (Map.Entry> valuesForField : valuesPerField.entrySet()) { String fieldName = valuesForField.getKey(); if (mappings != null) { - mappings.put(fieldName, - FileStructureUtils.guessScalarMapping(explanation, fieldName, valuesForField.getValue())); + // Exclude the time field because that will be dropped and replaced with @timestamp + if (fieldName.equals(timeField) == false) { + mappings.put(fieldName, + FileStructureUtils.guessScalarMapping(explanation, fieldName, valuesForField.getValue())); + } } if (fieldStats != null) { - fieldStats.put(fieldName, - 
FileStructureUtils.calculateFieldStats(valuesForField.getValue())); + fieldStats.put(fieldName, FileStructureUtils.calculateFieldStats(valuesForField.getValue())); } } } - return new Tuple<>(timeField, grokString); + return new Tuple<>(timeField, grokPattern); } } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinder.java index a488549bc524b..b20658f872b65 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinder.java @@ -33,7 +33,8 @@ public class JsonFileStructureFinder implements FileStructureFinder { private final FileStructure structure; static JsonFileStructureFinder makeJsonFileStructureFinder(List explanation, String sample, String charsetName, - Boolean hasByteOrderMarker) throws IOException { + Boolean hasByteOrderMarker, FileStructureOverrides overrides) + throws IOException { List> sampleRecords = new ArrayList<>(); @@ -51,7 +52,7 @@ static JsonFileStructureFinder makeJsonFileStructureFinder(List explanat .setNumLinesAnalyzed(sampleMessages.size()) .setNumMessagesAnalyzed(sampleRecords.size()); - Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords); + Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides); if (timeField != null) { structureBuilder.setTimestampField(timeField.v1()) .setTimestampFormats(timeField.v2().dateFormats) @@ -62,7 +63,10 @@ static JsonFileStructureFinder makeJsonFileStructureFinder(List explanat FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords); SortedMap mappings = mappingsAndFieldStats.v1(); - mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, 
Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + if (timeField != null) { + mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, + Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + } if (mappingsAndFieldStats.v2() != null) { structureBuilder.setFieldStats(mappingsAndFieldStats.v2()); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderFactory.java index 02be3c1cf19d4..cfeaa222679c0 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderFactory.java @@ -8,6 +8,7 @@ import org.elasticsearch.common.xcontent.DeprecationHandler; import org.elasticsearch.common.xcontent.NamedXContentRegistry; import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; import java.io.IOException; import java.io.StringReader; @@ -18,6 +19,11 @@ public class JsonFileStructureFinderFactory implements FileStructureFinderFactory { + @Override + public boolean canFindFormat(FileStructure.Format format) { + return format == null || format == FileStructure.Format.JSON; + } + /** * This format matches if the sample consists of one or more JSON documents. * If there is more than one, they must be newline-delimited. 
The @@ -61,9 +67,9 @@ DeprecationHandler.THROW_UNSUPPORTED_OPERATION, new ContextPrintingStringReader( } @Override - public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker) - throws IOException { - return JsonFileStructureFinder.makeJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker); + public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, + FileStructureOverrides overrides) throws IOException { + return JsonFileStructureFinder.makeJsonFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides); } private static class ContextPrintingStringReader extends StringReader { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java index 95e0a5dc69d6a..e6e445a3ff6b1 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java @@ -28,17 +28,19 @@ public class TextLogFileStructureFinder implements FileStructureFinder { private final FileStructure structure; static TextLogFileStructureFinder makeTextLogFileStructureFinder(List explanation, String sample, String charsetName, - Boolean hasByteOrderMarker) { + Boolean hasByteOrderMarker, FileStructureOverrides overrides) { String[] sampleLines = sample.split("\n"); - Tuple> bestTimestamp = mostLikelyTimestamp(sampleLines); + Tuple> bestTimestamp = mostLikelyTimestamp(sampleLines, overrides); if (bestTimestamp == null) { // Is it appropriate to treat a file that is neither structured nor has // a regular pattern of timestamps as a log file? Probably not... 
- throw new IllegalArgumentException("Could not find a timestamp in the sample provided"); + throw new IllegalArgumentException("Could not find " + + ((overrides.getTimestampFormat() == null) ? "a timestamp" : "the specified timestamp format") + " in the sample provided"); } - explanation.add("Most likely timestamp format is [" + bestTimestamp.v1() + "]"); + explanation.add(((overrides.getTimestampFormat() == null) ? "Most likely timestamp" : "Timestamp") + " format is [" + + bestTimestamp.v1() + "]"); List sampleMessages = new ArrayList<>(); StringBuilder preamble = new StringBuilder(); @@ -86,17 +88,26 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex SortedMap fieldStats = new TreeMap<>(); - // We can't parse directly into @timestamp using Grok, so parse to some other time field, which the date filter will then remove - String interimTimestampField; - String grokPattern; GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, fieldStats); - Tuple timestampFieldAndFullMatchGrokPattern = grokPatternCreator.findFullLineGrokPattern(); - if (timestampFieldAndFullMatchGrokPattern != null) { - interimTimestampField = timestampFieldAndFullMatchGrokPattern.v1(); - grokPattern = timestampFieldAndFullMatchGrokPattern.v2(); + // We can't parse directly into @timestamp using Grok, so parse to some other time field, which the date filter will then remove + String interimTimestampField = overrides.getTimestampField(); + String grokPattern = overrides.getGrokPattern(); + if (grokPattern != null) { + if (interimTimestampField == null) { + interimTimestampField = "timestamp"; + } + grokPatternCreator.validateFullLineGrokPattern(grokPattern, interimTimestampField); } else { - interimTimestampField = "timestamp"; - grokPattern = grokPatternCreator.createGrokPatternFromExamples(bestTimestamp.v1().grokPatternName, interimTimestampField); + Tuple timestampFieldAndFullMatchGrokPattern = 
grokPatternCreator.findFullLineGrokPattern(interimTimestampField); + if (timestampFieldAndFullMatchGrokPattern != null) { + interimTimestampField = timestampFieldAndFullMatchGrokPattern.v1(); + grokPattern = timestampFieldAndFullMatchGrokPattern.v2(); + } else { + if (interimTimestampField == null) { + interimTimestampField = "timestamp"; + } + grokPattern = grokPatternCreator.createGrokPatternFromExamples(bestTimestamp.v1().grokPatternName, interimTimestampField); + } } FileStructure structure = structureBuilder @@ -127,14 +138,14 @@ public FileStructure getStructure() { return structure; } - static Tuple> mostLikelyTimestamp(String[] sampleLines) { + static Tuple> mostLikelyTimestamp(String[] sampleLines, FileStructureOverrides overrides) { Map>> timestampMatches = new LinkedHashMap<>(); int remainingLines = sampleLines.length; double differenceBetweenTwoHighestWeights = 0.0; for (String sampleLine : sampleLines) { - TimestampMatch match = TimestampFormatFinder.findFirstMatch(sampleLine); + TimestampMatch match = TimestampFormatFinder.findFirstMatch(sampleLine, overrides.getTimestampFormat()); if (match != null) { TimestampMatch pureMatch = new TimestampMatch(match.candidateIndex, "", match.dateFormats, match.simplePattern, match.grokPatternName, ""); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java index 5f737eeb9b823..b92b705aaffdf 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderFactory.java @@ -5,6 +5,8 @@ */ package org.elasticsearch.xpack.ml.filestructurefinder; +import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; + import java.util.List; import 
java.util.regex.Pattern; @@ -13,6 +15,11 @@ public class TextLogFileStructureFinderFactory implements FileStructureFinderFac // This works because, by default, dot doesn't match newlines private static final Pattern TWO_NON_BLANK_LINES_PATTERN = Pattern.compile(".\n+."); + @Override + public boolean canFindFormat(FileStructure.Format format) { + return format == null || format == FileStructure.Format.SEMI_STRUCTURED_TEXT; + } + /** * This format matches if the sample contains at least one newline and at least two * non-blank lines. @@ -33,7 +40,9 @@ public boolean canCreateFromSample(List explanation, String sample) { } @Override - public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker) { - return TextLogFileStructureFinder.makeTextLogFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker); + public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, + FileStructureOverrides overrides) { + return TextLogFileStructureFinder.makeTextLogFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, + overrides); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java index 81e490878a007..4239748f7df0b 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java @@ -141,6 +141,7 @@ private TimestampFormatFinder() { /** * Find the first timestamp format that matches part of the supplied value. + * * @param text The value that the returned timestamp format must exist within. * @return The timestamp format, or null if none matches. 
*/ @@ -148,34 +149,61 @@ public static TimestampMatch findFirstMatch(String text) { return findFirstMatch(text, 0); } + /** + * Find the first timestamp format that matches part of the supplied value. + * + * @param text The value that the returned timestamp format must exist within. + * @param requiredFormat A date format that any returned match must support. + * @return The timestamp format, or null if none matches. + */ + public static TimestampMatch findFirstMatch(String text, String requiredFormat) { + return findFirstMatch(text, 0, requiredFormat); + } + /** * Find the first timestamp format that matches part of the supplied value, * excluding a specified number of candidate formats. - * @param text The value that the returned timestamp format must exist within. + * + * @param text The value that the returned timestamp format must exist within. * @param ignoreCandidates The number of candidate formats to exclude from the search. * @return The timestamp format, or null if none matches. */ public static TimestampMatch findFirstMatch(String text, int ignoreCandidates) { + return findFirstMatch(text, ignoreCandidates, null); + } + + /** + * Find the first timestamp format that matches part of the supplied value, + * excluding a specified number of candidate formats. + * + * @param text The value that the returned timestamp format must exist within. + * @param ignoreCandidates The number of candidate formats to exclude from the search. + * @param requiredFormat A date format that any returned match must support. + * @return The timestamp format, or null if none matches. 
+ */ + public static TimestampMatch findFirstMatch(String text, int ignoreCandidates, String requiredFormat) { Boolean[] quickRuleoutMatches = new Boolean[QUICK_RULE_OUT_PATTERNS.size()]; int index = ignoreCandidates; for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) { - boolean quicklyRuledOut = false; - for (Integer quickRuleOutIndex : candidate.quickRuleOutIndices) { - if (quickRuleoutMatches[quickRuleOutIndex] == null) { - quickRuleoutMatches[quickRuleOutIndex] = QUICK_RULE_OUT_PATTERNS.get(quickRuleOutIndex).matcher(text).find(); - } - if (quickRuleoutMatches[quickRuleOutIndex] == false) { - quicklyRuledOut = true; - break; + if (requiredFormat == null || candidate.dateFormats.contains(requiredFormat)) { + boolean quicklyRuledOut = false; + for (Integer quickRuleOutIndex : candidate.quickRuleOutIndices) { + if (quickRuleoutMatches[quickRuleOutIndex] == null) { + quickRuleoutMatches[quickRuleOutIndex] = QUICK_RULE_OUT_PATTERNS.get(quickRuleOutIndex).matcher(text).find(); + } + if (quickRuleoutMatches[quickRuleOutIndex] == false) { + quicklyRuledOut = true; + break; + } } - } - if (quicklyRuledOut == false) { - Map captures = candidate.strictSearchGrok.captures(text); - if (captures != null) { - String preface = captures.getOrDefault(PREFACE, "").toString(); - String epilogue = captures.getOrDefault(EPILOGUE, "").toString(); - return makeTimestampMatch(candidate, index, preface, text.substring(preface.length(), - text.length() - epilogue.length()), epilogue); + if (quicklyRuledOut == false) { + Map captures = candidate.strictSearchGrok.captures(text); + if (captures != null) { + String preface = captures.getOrDefault(PREFACE, "").toString(); + String epilogue = captures.getOrDefault(EPILOGUE, "").toString(); + return makeTimestampMatch(candidate, index, preface, text.substring(preface.length(), + text.length() - epilogue.length()), epilogue); + } } } ++index; @@ -185,6 +213,7 @@ 
public static TimestampMatch findFirstMatch(String text, int ignoreCandidates) { /** * Find the best timestamp format for matching an entire field value. + * * @param text The value that the returned timestamp format must match in its entirety. * @return The timestamp format, or null if none matches. */ @@ -192,6 +221,17 @@ public static TimestampMatch findFirstFullMatch(String text) { return findFirstFullMatch(text, 0); } + /** + * Find the best timestamp format for matching an entire field value. + * + * @param text The value that the returned timestamp format must match in its entirety. + * @param requiredFormat A date format that any returned match must support. + * @return The timestamp format, or null if none matches. + */ + public static TimestampMatch findFirstFullMatch(String text, String requiredFormat) { + return findFirstFullMatch(text, 0, requiredFormat); + } + /** * Find the best timestamp format for matching an entire field value, * excluding a specified number of candidate formats. @@ -200,11 +240,25 @@ public static TimestampMatch findFirstFullMatch(String text) { * @return The timestamp format, or null if none matches. */ public static TimestampMatch findFirstFullMatch(String text, int ignoreCandidates) { + return findFirstFullMatch(text, ignoreCandidates, null); + } + + /** + * Find the best timestamp format for matching an entire field value, + * excluding a specified number of candidate formats. + * @param text The value that the returned timestamp format must match in its entirety. + * @param ignoreCandidates The number of candidate formats to exclude from the search. + * @param requiredFormat A date format that any returned match must support. + * @return The timestamp format, or null if none matches. 
+ */ + public static TimestampMatch findFirstFullMatch(String text, int ignoreCandidates, String requiredFormat) { int index = ignoreCandidates; for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) { - Map captures = candidate.strictFullMatchGrok.captures(text); - if (captures != null) { - return makeTimestampMatch(candidate, index, "", text, ""); + if (requiredFormat == null || candidate.dateFormats.contains(requiredFormat)) { + Map captures = candidate.strictFullMatchGrok.captures(text); + if (captures != null) { + return makeTimestampMatch(candidate, index, "", text, ""); + } } ++index; } @@ -417,7 +471,7 @@ static final class CandidateTimestampFormat { // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java this.strictSearchGrok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}" + strictGrokPattern + "%{GREEDYDATA:" + EPILOGUE + "}"); - this.strictFullMatchGrok = new Grok(Grok.getBuiltinPatterns(), strictGrokPattern); + this.strictFullMatchGrok = new Grok(Grok.getBuiltinPatterns(), strictGrokPattern + "$"); this.standardGrokPatternName = standardGrokPatternName; assert quickRuleOutIndices.stream() .noneMatch(quickRuleOutIndex -> quickRuleOutIndex < 0 || quickRuleOutIndex >= QUICK_RULE_OUT_PATTERNS.size()); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java index 570f36f59c06e..d5e3fba34c972 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java @@ -38,7 +38,7 @@ public class XmlFileStructureFinder implements FileStructureFinder { private final FileStructure structure; static XmlFileStructureFinder 
makeXmlFileStructureFinder(List explanation, String sample, String charsetName, - Boolean hasByteOrderMarker) + Boolean hasByteOrderMarker, FileStructureOverrides overrides) throws IOException, ParserConfigurationException, SAXException { String messagePrefix; @@ -90,7 +90,7 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List explanatio .setNumMessagesAnalyzed(sampleRecords.size()) .setMultilineStartPattern("^\\s*<" + topLevelTag); - Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords); + Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides); if (timeField != null) { structureBuilder.setTimestampField(timeField.v1()) .setTimestampFormats(timeField.v2().dateFormats) @@ -110,8 +110,10 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List explanatio secondLevelProperties.put(FileStructureUtils.MAPPING_PROPERTIES_SETTING, innerMappings); SortedMap outerMappings = new TreeMap<>(); outerMappings.put(topLevelTag, secondLevelProperties); - outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, - Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + if (timeField != null) { + outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, + Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date")); + } FileStructure structure = structureBuilder .setMappings(outerMappings) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java index f8536d1437594..3079f53931db6 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderFactory.java @@ -5,6 +5,7 @@ */ package 
org.elasticsearch.xpack.ml.filestructurefinder; +import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; import org.xml.sax.SAXException; import javax.xml.parsers.ParserConfigurationException; @@ -27,6 +28,11 @@ public XmlFileStructureFinderFactory() { xmlFactory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE); } + @Override + public boolean canFindFormat(FileStructure.Format format) { + return format == null || format == FileStructure.Format.XML; + } + /** * This format matches if the sample consists of one or more XML documents, * all with the same root element name. If there is more than one document, @@ -115,8 +121,9 @@ public boolean canCreateFromSample(List explanation, String sample) { } @Override - public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker) + public FileStructureFinder createFromSample(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, + FileStructureOverrides overrides) throws IOException, ParserConfigurationException, SAXException { - return XmlFileStructureFinder.makeXmlFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker); + return XmlFileStructureFinder.makeXmlFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker, overrides); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java index 83293c7d60efa..316a4b56e4a07 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/RestFindFileStructureAction.java @@ -39,6 +39,17 @@ protected RestChannelConsumer prepareRequest(RestRequest restRequest, NodeClient FindFileStructureAction.Request request = new FindFileStructureAction.Request(); 
request.setLinesToSample(restRequest.paramAsInt(FindFileStructureAction.Request.LINES_TO_SAMPLE.getPreferredName(), FileStructureFinderManager.DEFAULT_IDEAL_SAMPLE_LINE_COUNT)); + request.setCharset(restRequest.param(FindFileStructureAction.Request.CHARSET.getPreferredName())); + request.setFormat(restRequest.param(FindFileStructureAction.Request.FORMAT.getPreferredName())); + request.setColumnNames(restRequest.paramAsStringArray(FindFileStructureAction.Request.COLUMN_NAMES.getPreferredName(), null)); + request.setHasHeaderRow(restRequest.paramAsBoolean(FindFileStructureAction.Request.HAS_HEADER_ROW.getPreferredName(), null)); + request.setDelimiter(restRequest.param(FindFileStructureAction.Request.DELIMITER.getPreferredName())); + request.setQuote(restRequest.param(FindFileStructureAction.Request.QUOTE.getPreferredName())); + request.setShouldTrimFields(restRequest.paramAsBoolean(FindFileStructureAction.Request.SHOULD_TRIM_FIELDS.getPreferredName(), + null)); + request.setGrokPattern(restRequest.param(FindFileStructureAction.Request.GROK_PATTERN.getPreferredName())); + request.setTimestampFormat(restRequest.param(FindFileStructureAction.Request.TIMESTAMP_FORMAT.getPreferredName())); + request.setTimestampField(restRequest.param(FindFileStructureAction.Request.TIMESTAMP_FIELD.getPreferredName())); if (restRequest.hasContent()) { request.setSample(restRequest.content()); } else { diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java index 6bcb827be94d8..53f3a2a4d4ca6 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactoryTests.java @@ -7,10 +7,10 @@ public class 
DelimitedFileStructureFinderFactoryTests extends FileStructureTestCase { - private FileStructureFinderFactory csvFactory = new DelimitedFileStructureFinderFactory(',', 2, false); - private FileStructureFinderFactory tsvFactory = new DelimitedFileStructureFinderFactory('\t', 2, false); - private FileStructureFinderFactory semiColonDelimitedfactory = new DelimitedFileStructureFinderFactory(';', 4, false); - private FileStructureFinderFactory pipeDelimitedFactory = new DelimitedFileStructureFinderFactory('|', 5, true); + private FileStructureFinderFactory csvFactory = new DelimitedFileStructureFinderFactory(',', '"', 2, false); + private FileStructureFinderFactory tsvFactory = new DelimitedFileStructureFinderFactory('\t', '"', 2, false); + private FileStructureFinderFactory semiColonDelimitedfactory = new DelimitedFileStructureFinderFactory(';', '"', 4, false); + private FileStructureFinderFactory pipeDelimitedFactory = new DelimitedFileStructureFinderFactory('|', '"', 5, true); // CSV - no need to check JSON or XML because they come earlier in the order we check formats diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java index 4e692d583918e..decc61a5397a5 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java @@ -19,7 +19,7 @@ public class DelimitedFileStructureFinderTests extends FileStructureTestCase { - private FileStructureFinderFactory csvFactory = new DelimitedFileStructureFinderFactory(',', 2, false); + private FileStructureFinderFactory csvFactory = new DelimitedFileStructureFinderFactory(',', '"', 2, false); public void testCreateConfigsGivenCompleteCsv() throws 
Exception { String sample = "time,message\n" + @@ -29,7 +29,8 @@ public void testCreateConfigsGivenCompleteCsv() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureOverrides.EMPTY_OVERRIDES); FileStructure structure = structureFinder.getStructure(); @@ -43,6 +44,7 @@ public void testCreateConfigsGivenCompleteCsv() throws Exception { assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern()); assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); assertEquals(Arrays.asList("time", "message"), structure.getColumnNames()); @@ -51,6 +53,76 @@ public void testCreateConfigsGivenCompleteCsv() throws Exception { assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); } + public void testCreateConfigsGivenCompleteCsvAndColumnNamesOverride() throws Exception { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setColumnNames(Arrays.asList("my_time", "my_message")).build(); + + String sample = "time,message\n" + + "2018-05-17T13:41:23,hello\n" + + "2018-05-17T13:41:32,hello again\n"; + assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides); + + FileStructure structure = 
structureFinder.getStructure(); + + assertEquals(FileStructure.Format.DELIMITED, structure.getFormat()); + assertEquals(charset, structure.getCharset()); + if (hasByteOrderMarker == null) { + assertNull(structure.getHasByteOrderMarker()); + } else { + assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); + } + assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern()); + assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); + assertTrue(structure.getHasHeaderRow()); + assertNull(structure.getShouldTrimFields()); + assertEquals(Arrays.asList("my_time", "my_message"), structure.getColumnNames()); + assertNull(structure.getGrokPattern()); + assertEquals("my_time", structure.getTimestampField()); + assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); + } + + public void testCreateConfigsGivenCompleteCsvAndHasHeaderRowOverride() throws Exception { + + // It's obvious the first row really should be a header row, so by overriding + // detection with the wrong choice the results will be completely changed + FileStructureOverrides overrides = FileStructureOverrides.builder().setHasHeaderRow(false).build(); + + String sample = "time,message\n" + + "2018-05-17T13:41:23,hello\n" + + "2018-05-17T13:41:32,hello again\n"; + assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides); + + FileStructure structure = structureFinder.getStructure(); + + assertEquals(FileStructure.Format.DELIMITED, structure.getFormat()); + assertEquals(charset, structure.getCharset()); + if (hasByteOrderMarker == 
null) { + assertNull(structure.getHasByteOrderMarker()); + } else { + assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); + } + assertNull(structure.getExcludeLinesPattern()); + assertNull(structure.getMultilineStartPattern()); + assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); + assertFalse(structure.getHasHeaderRow()); + assertNull(structure.getShouldTrimFields()); + assertEquals(Arrays.asList("column1", "column2"), structure.getColumnNames()); + assertNull(structure.getGrokPattern()); + assertNull(structure.getTimestampField()); + assertNull(structure.getTimestampFormats()); + } + public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception { String sample = "message,time,count\n" + "\"hello\n" + @@ -60,7 +132,8 @@ public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureOverrides.EMPTY_OVERRIDES); FileStructure structure = structureFinder.getStructure(); @@ -74,6 +147,7 @@ public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception assertEquals("^\"?message\"?,\"?time\"?,\"?count\"?", structure.getExcludeLinesPattern()); assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); assertEquals(Arrays.asList("message", "time", "count"), structure.getColumnNames()); @@ -93,7 +167,8 @@ public void 
testCreateConfigsGivenCsvWithTrailingNulls() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureOverrides.EMPTY_OVERRIDES); FileStructure structure = structureFinder.getStructure(); @@ -110,6 +185,7 @@ public void testCreateConfigsGivenCsvWithTrailingNulls() throws Exception { structure.getExcludeLinesPattern()); assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance", @@ -120,6 +196,50 @@ public void testCreateConfigsGivenCsvWithTrailingNulls() throws Exception { assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats()); } + public void testCreateConfigsGivenCsvWithTrailingNullsAndOverriddenTimeField() throws Exception { + + // Default timestamp field is the first field from the start of each row that contains a + // consistent timestamp format, so if we want the second we need an override + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("tpep_dropoff_datetime").build(); + + String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," + + "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + + "improvement_surcharge,total_amount,,\n" + + "2,2016-12-31 15:15:01,2016-12-31 
15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" + + "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" + + "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n"; + assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides); + + FileStructure structure = structureFinder.getStructure(); + + assertEquals(FileStructure.Format.DELIMITED, structure.getFormat()); + assertEquals(charset, structure.getCharset()); + if (hasByteOrderMarker == null) { + assertNull(structure.getHasByteOrderMarker()); + } else { + assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); + } + assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," + + "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," + + "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?,\"?\"?,\"?\"?", + structure.getExcludeLinesPattern()); + assertEquals("^.*?,.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); + assertTrue(structure.getHasHeaderRow()); + assertNull(structure.getShouldTrimFields()); + assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance", + "RatecodeID", "store_and_fwd_flag", "PULocationID", "DOLocationID", "payment_type", "fare_amount", "extra", "mta_tax", + "tip_amount", "tolls_amount", "improvement_surcharge", "total_amount", 
"column18", "column19"), structure.getColumnNames()); + assertNull(structure.getGrokPattern()); + assertEquals("tpep_dropoff_datetime", structure.getTimestampField()); + assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats()); + } + public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exception { String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," + "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + @@ -131,7 +251,8 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exce String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureOverrides.EMPTY_OVERRIDES); FileStructure structure = structureFinder.getStructure(); @@ -148,6 +269,7 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exce structure.getExcludeLinesPattern()); assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); assertEquals(Arrays.asList("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime", "passenger_count", "trip_distance", @@ -158,6 +280,53 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exce assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats()); } + public void 
testCreateConfigsGivenCsvWithTrailingNullsExceptHeaderAndColumnNamesOverride() throws Exception { + + FileStructureOverrides overrides = FileStructureOverrides.builder() + .setColumnNames(Arrays.asList("my_VendorID", "my_tpep_pickup_datetime", "my_tpep_dropoff_datetime", "my_passenger_count", + "my_trip_distance", "my_RatecodeID", "my_store_and_fwd_flag", "my_PULocationID", "my_DOLocationID", "my_payment_type", + "my_fare_amount", "my_extra", "my_mta_tax", "my_tip_amount", "my_tolls_amount", "my_improvement_surcharge", + "my_total_amount")).build(); + + String sample = "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID," + + "store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount," + + "improvement_surcharge,total_amount\n" + + "2,2016-12-31 15:15:01,2016-12-31 15:15:09,1,.00,1,N,264,264,2,1,0,0.5,0,0,0.3,1.8,,\n" + + "1,2016-12-01 00:00:01,2016-12-01 00:10:22,1,1.60,1,N,163,143,2,9,0.5,0.5,0,0,0.3,10.3,,\n" + + "1,2016-12-01 00:00:01,2016-12-01 00:11:01,1,1.40,1,N,164,229,1,9,0.5,0.5,2.05,0,0.3,12.35,,\n"; + assertTrue(csvFactory.canCreateFromSample(explanation, sample)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides); + + FileStructure structure = structureFinder.getStructure(); + + assertEquals(FileStructure.Format.DELIMITED, structure.getFormat()); + assertEquals(charset, structure.getCharset()); + if (hasByteOrderMarker == null) { + assertNull(structure.getHasByteOrderMarker()); + } else { + assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); + } + assertEquals("^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?," + + 
"\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," + + "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?", + structure.getExcludeLinesPattern()); + assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); + assertTrue(structure.getHasHeaderRow()); + assertNull(structure.getShouldTrimFields()); + assertEquals(Arrays.asList("my_VendorID", "my_tpep_pickup_datetime", "my_tpep_dropoff_datetime", "my_passenger_count", + "my_trip_distance", "my_RatecodeID", "my_store_and_fwd_flag", "my_PULocationID", "my_DOLocationID", "my_payment_type", + "my_fare_amount", "my_extra", "my_mta_tax", "my_tip_amount", "my_tolls_amount", "my_improvement_surcharge", "my_total_amount"), + structure.getColumnNames()); + assertNull(structure.getGrokPattern()); + assertEquals("my_tpep_pickup_datetime", structure.getTimestampField()); + assertEquals(Collections.singletonList("YYYY-MM-dd HH:mm:ss"), structure.getTimestampFormats()); + } + public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception { String sample = "\"pos_id\",\"trip_id\",\"latitude\",\"longitude\",\"altitude\",\"timestamp\"\n" + "\"1\",\"3\",\"4703.7815\",\"1527.4713\",\"359.9\",\"2017-01-19 16:19:04.742113\"\n" + @@ -166,7 +335,8 @@ public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker); + FileStructureFinder structureFinder = csvFactory.createFromSample(explanation, sample, charset, hasByteOrderMarker, + FileStructureOverrides.EMPTY_OVERRIDES); FileStructure structure 
= structureFinder.getStructure(); @@ -181,6 +351,7 @@ public void testCreateConfigsGivenCsvWithTimeLastColumn() throws Exception { structure.getExcludeLinesPattern()); assertNull(structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); + assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); assertEquals(Arrays.asList("pos_id", "trip_id", "latitude", "longitude", "altitude", "timestamp"), structure.getColumnNames()); @@ -197,7 +368,7 @@ public void testFindHeaderFromSampleGivenHeaderInSample() throws IOException { "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n"; Tuple header = DelimitedFileStructureFinder.findHeaderFromSample(explanation, - DelimitedFileStructureFinder.readRows(withHeader, CsvPreference.EXCEL_PREFERENCE).v1()); + DelimitedFileStructureFinder.readRows(withHeader, CsvPreference.EXCEL_PREFERENCE).v1(), FileStructureOverrides.EMPTY_OVERRIDES); assertTrue(header.v1()); assertThat(header.v2(), arrayContaining("time", "airline", "responsetime", "sourcetype")); @@ -210,7 +381,8 @@ public void testFindHeaderFromSampleGivenHeaderNotInSample() throws IOException "2014-06-23 00:00:01Z,KLM,1355.4812,farequote\n"; Tuple header = DelimitedFileStructureFinder.findHeaderFromSample(explanation, - DelimitedFileStructureFinder.readRows(withoutHeader, CsvPreference.EXCEL_PREFERENCE).v1()); + DelimitedFileStructureFinder.readRows(withoutHeader, CsvPreference.EXCEL_PREFERENCE).v1(), + FileStructureOverrides.EMPTY_OVERRIDES); assertFalse(header.v1()); assertThat(header.v2(), arrayContaining("", "", "", "")); @@ -283,12 +455,12 @@ public void testLineHasUnescapedQuote() { public void testRowContainsDuplicateNonEmptyValues() { - assertFalse(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Collections.singletonList("a"))); - 
assertFalse(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Collections.singletonList(""))); - assertFalse(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "c"))); - assertTrue(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "a"))); - assertTrue(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "b"))); - assertFalse(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "", ""))); - assertFalse(DelimitedFileStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("", "a", ""))); + assertNull(DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Collections.singletonList("a"))); + assertNull(DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Collections.singletonList(""))); + assertNull(DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Arrays.asList("a", "b", "c"))); + assertEquals("a", DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Arrays.asList("a", "b", "a"))); + assertEquals("b", DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Arrays.asList("a", "b", "b"))); + assertNull(DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Arrays.asList("a", "", ""))); + assertNull(DelimitedFileStructureFinder.findDuplicateNonEmptyValues(Arrays.asList("", "a", ""))); } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java index 10e780f1d34c1..00929ff474cce 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManagerTests.java @@ -6,12 +6,14 @@ package org.elasticsearch.xpack.ml.filestructurefinder; import 
com.ibm.icu.text.CharsetMatch; +import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; import java.io.ByteArrayInputStream; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.Arrays; +import static org.elasticsearch.xpack.ml.filestructurefinder.FileStructureOverrides.EMPTY_OVERRIDES; import static org.hamcrest.Matchers.startsWith; import static org.hamcrest.core.IsInstanceOf.instanceOf; @@ -47,26 +49,62 @@ public void testFindCharsetGivenBinary() throws Exception { } public void testMakeBestStructureGivenJson() throws Exception { - assertThat(structureFinderManager.makeBestStructureFinder(explanation, - "{ \"time\": \"2018-05-17T13:41:23\", \"message\": \"hello\" }", StandardCharsets.UTF_8.name(), randomBoolean()), - instanceOf(JsonFileStructureFinder.class)); + assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), + EMPTY_OVERRIDES), instanceOf(JsonFileStructureFinder.class)); + } + + public void testMakeBestStructureGivenJsonAndDelimitedOverride() throws Exception { + + // Need to change the quote character from the default of double quotes + // otherwise the quotes in the JSON will stop it parsing as CSV + FileStructureOverrides overrides = FileStructureOverrides.builder() + .setFormat(FileStructure.Format.DELIMITED).setQuote('\'').build(); + + assertThat(structureFinderManager.makeBestStructureFinder(explanation, JSON_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), + overrides), instanceOf(DelimitedFileStructureFinder.class)); } public void testMakeBestStructureGivenXml() throws Exception { - assertThat(structureFinderManager.makeBestStructureFinder(explanation, - "hello", StandardCharsets.UTF_8.name(), randomBoolean()), - instanceOf(XmlFileStructureFinder.class)); + assertThat(structureFinderManager.makeBestStructureFinder(explanation, XML_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), + EMPTY_OVERRIDES), 
instanceOf(XmlFileStructureFinder.class)); + } + + public void testMakeBestStructureGivenXmlAndTextOverride() throws Exception { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.SEMI_STRUCTURED_TEXT).build(); + + assertThat(structureFinderManager.makeBestStructureFinder(explanation, XML_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), + overrides), instanceOf(TextLogFileStructureFinder.class)); } public void testMakeBestStructureGivenCsv() throws Exception { - assertThat(structureFinderManager.makeBestStructureFinder(explanation, "time,message\n" + - "2018-05-17T13:41:23,hello\n", StandardCharsets.UTF_8.name(), randomBoolean()), - instanceOf(DelimitedFileStructureFinder.class)); + assertThat(structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), + EMPTY_OVERRIDES), instanceOf(DelimitedFileStructureFinder.class)); + } + + public void testMakeBestStructureGivenCsvAndJsonOverride() { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setFormat(FileStructure.Format.JSON).build(); + + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> structureFinderManager.makeBestStructureFinder(explanation, CSV_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), + overrides)); + + assertEquals("Input did not match the specified format [json]", e.getMessage()); } public void testMakeBestStructureGivenText() throws Exception { - assertThat(structureFinderManager.makeBestStructureFinder(explanation, "[2018-05-17T13:41:23] hello\n" + - "[2018-05-17T13:41:24] hello again\n", StandardCharsets.UTF_8.name(), randomBoolean()), - instanceOf(TextLogFileStructureFinder.class)); + assertThat(structureFinderManager.makeBestStructureFinder(explanation, TEXT_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), + EMPTY_OVERRIDES), instanceOf(TextLogFileStructureFinder.class)); + } + + public void 
testMakeBestStructureGivenTextAndDelimitedOverride() throws Exception { + + // Every line of the text sample has two colons, so colon delimited is possible, just very weird + FileStructureOverrides overrides = FileStructureOverrides.builder() + .setFormat(FileStructure.Format.DELIMITED).setDelimiter(':').build(); + + assertThat(structureFinderManager.makeBestStructureFinder(explanation, TEXT_SAMPLE, StandardCharsets.UTF_8.name(), randomBoolean(), + overrides), instanceOf(DelimitedFileStructureFinder.class)); } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java index ac8f95670aba8..8dbfb6a8047de 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java @@ -17,6 +17,7 @@ import java.util.Map; import java.util.SortedMap; +import static org.elasticsearch.xpack.ml.filestructurefinder.FileStructureOverrides.EMPTY_OVERRIDES; import static org.hamcrest.Matchers.contains; public class FileStructureUtilsTests extends FileStructureTestCase { @@ -32,57 +33,106 @@ public void testMoreLikelyGivenKeyword() { assertFalse(FileStructureUtils.isMoreLikelyTextThanKeyword(randomAlphaOfLengthBetween(1, 256))); } - public void testSingleSampleSingleField() { + public void testGuessTimestampGivenSingleSampleSingleField() { Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample)); + FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("field1", match.v1()); assertThat(match.v2().dateFormats, contains("ISO8601")); 
assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); } - public void testSamplesWithSameSingleTimeField() { + public void testGuessTimestampGivenSingleSampleSingleFieldAndConsistentTimeFieldOverride() { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("field1").build(); + + Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); + Tuple match = + FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides); + assertNotNull(match); + assertEquals("field1", match.v1()); + assertThat(match.v2().dateFormats, contains("ISO8601")); + assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); + } + + public void testGuessTimestampGivenSingleSampleSingleFieldAndImpossibleTimeFieldOverride() { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("field2").build(); + + Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides)); + + assertEquals("Specified timestamp field [field2] is not present in record [{field1=2018-05-24T17:28:31,735}]", e.getMessage()); + } + + public void testGuessTimestampGivenSingleSampleSingleFieldAndConsistentTimeFormatOverride() { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("ISO8601").build(); + + Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); + Tuple match = + FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides); + assertNotNull(match); + assertEquals("field1", match.v1()); + assertThat(match.v2().dateFormats, contains("ISO8601")); + assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); + } + + public void 
testGuessTimestampGivenSingleSampleSingleFieldAndImpossibleTimeFormatOverride() { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("EEE MMM dd HH:mm:ss YYYY").build(); + + Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides)); + + assertEquals("Specified timestamp format [EEE MMM dd HH:mm:ss YYYY] does not match for record [{field1=2018-05-24T17:28:31,735}]", + e.getMessage()); + } + + public void testGuessTimestampGivenSamplesWithSameSingleTimeField() { Map sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); Map sample2 = Collections.singletonMap("field1", "2018-05-24T17:33:39,406"); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("field1", match.v1()); assertThat(match.v2().dateFormats, contains("ISO8601")); assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); } - public void testSamplesWithOneSingleTimeFieldDifferentFormat() { + public void testGuessTimestampGivenSamplesWithOneSingleTimeFieldDifferentFormat() { Map sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); Map sample2 = Collections.singletonMap("field1", "2018-05-24 17:33:39,406"); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNull(match); } - public void testSamplesWithDifferentSingleTimeField() { + public void testGuessTimestampGivenSamplesWithDifferentSingleTimeField() { Map sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); Map 
sample2 = Collections.singletonMap("another_field", "2018-05-24T17:33:39,406"); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNull(match); } - public void testSingleSampleManyFieldsOneTimeFormat() { + public void testGuessTimestampGivenSingleSampleManyFieldsOneTimeFormat() { Map sample = new LinkedHashMap<>(); sample.put("foo", "not a time"); sample.put("time", "2018-05-24 17:28:31,735"); sample.put("bar", 42); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample)); + FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("time", match.v1()); assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS")); assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); } - public void testSamplesWithManyFieldsSameSingleTimeFormat() { + public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormat() { Map sample1 = new LinkedHashMap<>(); sample1.put("foo", "not a time"); sample1.put("time", "2018-05-24 17:28:31,735"); @@ -92,14 +142,14 @@ public void testSamplesWithManyFieldsSameSingleTimeFormat() { sample2.put("time", "2018-05-29 11:53:02,837"); sample2.put("bar", 17); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("time", match.v1()); assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS")); assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); } - public void testSamplesWithManyFieldsSameTimeFieldDifferentTimeFormat() { + public void testGuessTimestampGivenSamplesWithManyFieldsSameTimeFieldDifferentTimeFormat() { Map sample1 = new 
LinkedHashMap<>(); sample1.put("foo", "not a time"); sample1.put("time", "2018-05-24 17:28:31,735"); @@ -109,11 +159,11 @@ public void testSamplesWithManyFieldsSameTimeFieldDifferentTimeFormat() { sample2.put("time", "May 29 2018 11:53:02"); sample2.put("bar", 17); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNull(match); } - public void testSamplesWithManyFieldsSameSingleTimeFormatDistractionBefore() { + public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormatDistractionBefore() { Map sample1 = new LinkedHashMap<>(); sample1.put("red_herring", "May 29 2007 11:53:02"); sample1.put("time", "2018-05-24 17:28:31,735"); @@ -123,14 +173,14 @@ public void testSamplesWithManyFieldsSameSingleTimeFormatDistractionBefore() { sample2.put("time", "2018-05-29 11:53:02,837"); sample2.put("bar", 17); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("time", match.v1()); assertThat(match.v2().dateFormats, contains("YYYY-MM-dd HH:mm:ss,SSS")); assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); } - public void testSamplesWithManyFieldsSameSingleTimeFormatDistractionAfter() { + public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormatDistractionAfter() { Map sample1 = new LinkedHashMap<>(); sample1.put("foo", "not a time"); sample1.put("time", "May 24 2018 17:28:31"); @@ -140,14 +190,14 @@ public void testSamplesWithManyFieldsSameSingleTimeFormatDistractionAfter() { sample2.put("time", "May 29 2018 11:53:02"); sample2.put("red_herring", "17"); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + 
FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("time", match.v1()); assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss")); assertEquals("CISCOTIMESTAMP", match.v2().grokPatternName); } - public void testSamplesWithManyFieldsInconsistentTimeFields() { + public void testGuessTimestampGivenSamplesWithManyFieldsInconsistentTimeFields() { Map sample1 = new LinkedHashMap<>(); sample1.put("foo", "not a time"); sample1.put("time1", "May 24 2018 17:28:31"); @@ -157,11 +207,11 @@ public void testSamplesWithManyFieldsInconsistentTimeFields() { sample2.put("time2", "May 29 2018 11:53:02"); sample2.put("bar", 42); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNull(match); } - public void testSamplesWithManyFieldsInconsistentAndConsistentTimeFields() { + public void testGuessTimestampGivenSamplesWithManyFieldsInconsistentAndConsistentTimeFields() { Map sample1 = new LinkedHashMap<>(); sample1.put("foo", "not a time"); sample1.put("time1", "2018-05-09 17:28:31,735"); @@ -173,7 +223,7 @@ public void testSamplesWithManyFieldsInconsistentAndConsistentTimeFields() { sample2.put("time3", "Thu, May 10 2018 11:53:02"); sample2.put("bar", 42); Tuple match = - FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2)); + FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES); assertNotNull(match); assertEquals("time2", match.v1()); assertThat(match.v2().dateFormats, contains("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss")); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java 
b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java index 858709e2764bb..e6a0aee6ee9f4 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java @@ -244,8 +244,7 @@ public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogs() { grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp")); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); - assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"), - mappings.get("extra_timestamp")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"), mappings.get("extra_timestamp")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field2")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("ipaddress")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel")); @@ -273,7 +272,7 @@ public void testFindFullLineGrokPatternGivenApacheCombinedLogs() { Map mappings = new HashMap<>(); GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null); - assertEquals(new Tuple<>("timestamp", "%{COMBINEDAPACHELOG}"), grokPatternCreator.findFullLineGrokPattern()); + assertEquals(new Tuple<>("timestamp", "%{COMBINEDAPACHELOG}"), grokPatternCreator.findFullLineGrokPattern(null)); assertEquals(10, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text"), mappings.get("agent")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), 
mappings.get("auth")); @@ -323,4 +322,59 @@ public void testAdjustForPunctuationGivenNoCommonPrefix() { assertEquals("", grokPatternCreator.getOverallGrokPatternBuilder().toString()); assertSame(snippets, adjustedSnippets); } + + public void testValidateFullLineGrokPatternGivenValid() { + + String timestampField = "utc_timestamp"; + String grokPattern = "%{INT:serial_no}\\t%{TIMESTAMP_ISO8601:local_timestamp}\\t%{TIMESTAMP_ISO8601:utc_timestamp}\\t" + + "%{INT:user_id}\\t%{HOSTNAME:host}\\t%{IP:client_ip}\\t%{WORD:method}\\t%{LOGLEVEL:severity}\\t%{PROG:program}\\t" + + "%{GREEDYDATA:message}"; + + // Two timestamps: one local, one UTC + Collection sampleMessages = Arrays.asList( + "559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986880\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986887\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912603512850\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp"); + + Map mappings = new HashMap<>(); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null); + + grokPatternCreator.validateFullLineGrokPattern(grokPattern, timestampField); + assertEquals(9, mappings.size()); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("serial_no")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"), mappings.get("local_timestamp")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("user_id")); + 
assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("host")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("client_ip")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("method")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("program")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("message")); + } + + public void testValidateFullLineGrokPatternGivenInvalid() { + + String timestampField = "utc_timestamp"; + String grokPattern = "%{INT:serial_no}\\t%{TIMESTAMP_ISO8601:local_timestamp}\\t%{TIMESTAMP_ISO8601:utc_timestamp}\\t" + + "%{INT:user_id}\\t%{HOSTNAME:host}\\t%{IP:client_ip}\\t%{WORD:method}\\t%{LOGLEVEL:severity}\\t%{PROG:program}\\t" + + "%{GREEDYDATA:message}"; + + Collection sampleMessages = Arrays.asList( + "Sep 8 11:55:06 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'elastic.slack.com/A/IN': 95.110.64.205#53", + "Sep 8 11:55:08 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'slack-imgs.com/A/IN': 95.110.64.205#53", + "Sep 8 11:55:35 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53", + "Sep 8 11:55:42 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'b.akamaiedge.net/A/IN': 95.110.64.205#53"); + + Map mappings = new HashMap<>(); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null); + + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> grokPatternCreator.validateFullLineGrokPattern(grokPattern, timestampField)); + + assertEquals("Supplied Grok pattern [" + grokPattern + "] does not match sample messages", e.getMessage()); + } } diff --git 
a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java index f41868be86286..6856e9a60214b 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/JsonFileStructureFinderTests.java @@ -18,7 +18,8 @@ public void testCreateConfigsGivenGoodJson() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, JSON_SAMPLE, charset, hasByteOrderMarker); + FileStructureFinder structureFinder = factory.createFromSample(explanation, JSON_SAMPLE, charset, hasByteOrderMarker, + FileStructureOverrides.EMPTY_OVERRIDES); FileStructure structure = structureFinder.getStructure(); @@ -32,6 +33,7 @@ public void testCreateConfigsGivenGoodJson() throws Exception { assertNull(structure.getExcludeLinesPattern()); assertNull(structure.getMultilineStartPattern()); assertNull(structure.getDelimiter()); + assertNull(structure.getQuote()); assertNull(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); assertNull(structure.getGrokPattern()); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java index a23080a827277..5bc40a165117e 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java @@ -15,6 +15,90 @@ public class 
TextLogFileStructureFinderTests extends FileStructureTestCase { + private static final String EXCEPTION_TRACE_SAMPLE = + "[2018-02-28T14:49:40,517][DEBUG][o.e.a.b.TransportShardBulkAction] [an_index][2] failed to execute bulk item " + + "(index) BulkShardRequest [[an_index][2]] containing [33] requests\n" + + "java.lang.IllegalArgumentException: Document contains at least one immense term in field=\"message.keyword\" (whose UTF8 " + + "encoding is longer than the max length 32766), all of which were skipped. Please correct the analyzer to not produce " + + "such terms. The prefix of the first immense term is: '[60, 83, 79, 65, 80, 45, 69, 78, 86, 58, 69, 110, 118, 101, 108, " + + "111, 112, 101, 32, 120, 109, 108, 110, 115, 58, 83, 79, 65, 80, 45]...', original message: bytes can be at most 32766 " + + "in length; got 49023\n" + + "\tat org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:796) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:430) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:392) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:240) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:496) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1729) " + + 
"~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1464) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:1070) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.engine.InternalEngine.indexIntoLucene(InternalEngine.java:1012) " + + "~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:878) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:738) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperation(IndexShard.java:707) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperationOnPrimary(IndexShard.java:673) " + + "~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequestOnPrimary(TransportShardBulkAction.java:548) " + + "~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequest(TransportShardBulkAction.java:140) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeBulkItemRequest(TransportShardBulkAction.java:236) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.performOnPrimary(TransportShardBulkAction.java:123) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:110) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat 
org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:72) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" + + "(TransportReplicationAction.java:1034) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" + + "(TransportReplicationAction.java:1012) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:103) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" + + "(TransportReplicationAction.java:359) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" + + "(TransportReplicationAction.java:299) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" + + "(TransportReplicationAction.java:975) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" + + "(TransportReplicationAction.java:972) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShardOperationPermits.acquire(IndexShardOperationPermits.java:238) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShard.acquirePrimaryOperationPermit(IndexShard.java:2220) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.acquirePrimaryShardReference" + + "(TransportReplicationAction.java:984) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat 
org.elasticsearch.action.support.replication.TransportReplicationAction.access$500(TransportReplicationAction.java:98) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.doRun" + + "(TransportReplicationAction.java:320) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" + + ".messageReceived(TransportReplicationAction.java:295) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" + + ".messageReceived(TransportReplicationAction.java:282) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:66) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:656) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:635) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_144]\n" + + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_144]\n" + + "\tat java.lang.Thread.run(Thread.java:748) [?:1.8.0_144]\n"; + private FileStructureFinderFactory factory = new TextLogFileStructureFinderFactory(); public void testCreateConfigsGivenElasticsearchLog() throws Exception { @@ -22,7 +106,8 @@ public void 
testCreateConfigsGivenElasticsearchLog() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker); + FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, + FileStructureOverrides.EMPTY_OVERRIDES); FileStructure structure = structureFinder.getStructure(); @@ -36,6 +121,7 @@ public void testCreateConfigsGivenElasticsearchLog() throws Exception { assertNull(structure.getExcludeLinesPattern()); assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertNull(structure.getDelimiter()); + assertNull(structure.getQuote()); assertNull(structure.getHasHeaderRow()); assertNull(structure.getShouldTrimFields()); assertEquals("\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} \\]\\[.*", structure.getGrokPattern()); @@ -43,6 +129,85 @@ public void testCreateConfigsGivenElasticsearchLog() throws Exception { assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); } + public void testCreateConfigsGivenElasticsearchLogAndTimestampFieldOverride() throws Exception { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("my_time").build(); + + assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides); + + FileStructure structure = structureFinder.getStructure(); + + assertEquals(FileStructure.Format.SEMI_STRUCTURED_TEXT, structure.getFormat()); + assertEquals(charset, structure.getCharset()); + if (hasByteOrderMarker == null) { + 
assertNull(structure.getHasByteOrderMarker()); + } else { + assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); + } + assertNull(structure.getExcludeLinesPattern()); + assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertNull(structure.getDelimiter()); + assertNull(structure.getQuote()); + assertNull(structure.getHasHeaderRow()); + assertNull(structure.getShouldTrimFields()); + assertEquals("\\[%{TIMESTAMP_ISO8601:my_time}\\]\\[%{LOGLEVEL:loglevel} \\]\\[.*", structure.getGrokPattern()); + assertEquals("my_time", structure.getTimestampField()); + assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); + } + + public void testCreateConfigsGivenElasticsearchLogAndGrokPatternOverride() throws Exception { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setGrokPattern("\\[%{TIMESTAMP_ISO8601:timestamp}\\]" + + "\\[%{LOGLEVEL:loglevel} *\\]\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}").build(); + + assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + FileStructureFinder structureFinder = factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides); + + FileStructure structure = structureFinder.getStructure(); + + assertEquals(FileStructure.Format.SEMI_STRUCTURED_TEXT, structure.getFormat()); + assertEquals(charset, structure.getCharset()); + if (hasByteOrderMarker == null) { + assertNull(structure.getHasByteOrderMarker()); + } else { + assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); + } + assertNull(structure.getExcludeLinesPattern()); + assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertNull(structure.getDelimiter()); + assertNull(structure.getQuote()); + 
assertNull(structure.getHasHeaderRow()); + assertNull(structure.getShouldTrimFields()); + assertEquals("\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} *\\]" + + "\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}", structure.getGrokPattern()); + assertEquals("timestamp", structure.getTimestampField()); + assertEquals(Collections.singletonList("ISO8601"), structure.getTimestampFormats()); + } + + public void testCreateConfigsGivenElasticsearchLogAndImpossibleGrokPatternOverride() { + + // This Grok pattern cannot be matched against the messages in the sample because the fields are in the wrong order + FileStructureOverrides overrides = FileStructureOverrides.builder().setGrokPattern("\\[%{LOGLEVEL:loglevel} *\\]" + + "\\[%{HOSTNAME:node}\\]\\[%{TIMESTAMP_ISO8601:timestamp}\\] \\[%{JAVACLASS:class} *\\] %{JAVALOGMESSAGE:message}").build(); + + assertTrue(factory.canCreateFromSample(explanation, TEXT_SAMPLE)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, + () -> factory.createFromSample(explanation, TEXT_SAMPLE, charset, hasByteOrderMarker, overrides)); + + assertEquals("Supplied Grok pattern [\\[%{LOGLEVEL:loglevel} *\\]\\[%{HOSTNAME:node}\\]\\[%{TIMESTAMP_ISO8601:timestamp}\\] " + + "\\[%{JAVACLASS:class} *\\] %{JAVALOGMESSAGE:message}] does not match sample messages", e.getMessage()); + } + public void testCreateMultiLineMessageStartRegexGivenNoPrefaces() { for (TimestampFormatFinder.CandidateTimestampFormat candidateTimestampFormat : TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS) { String simpleDateRegex = candidateTimestampFormat.simplePattern.pattern(); @@ -144,97 +309,17 @@ public void testMostLikelyTimestampGivenAllSame() { "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-watcher]\n" + "[2018-06-27T11:59:23,588][INFO 
][o.e.p.PluginsService ] [node-0] no plugins loaded\n"; - Tuple> mostLikelyMatch = TextLogFileStructureFinder.mostLikelyTimestamp(sample.split("\n")); + Tuple> mostLikelyMatch = + TextLogFileStructureFinder.mostLikelyTimestamp(sample.split("\n"), FileStructureOverrides.EMPTY_OVERRIDES); assertNotNull(mostLikelyMatch); assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), mostLikelyMatch.v1()); } public void testMostLikelyTimestampGivenExceptionTrace() { - String sample = "[2018-02-28T14:49:40,517][DEBUG][o.e.a.b.TransportShardBulkAction] [an_index][2] failed to execute bulk item " + - "(index) BulkShardRequest [[an_index][2]] containing [33] requests\n" + - "java.lang.IllegalArgumentException: Document contains at least one immense term in field=\"message.keyword\" (whose UTF8 " + - "encoding is longer than the max length 32766), all of which were skipped. Please correct the analyzer to not produce " + - "such terms. The prefix of the first immense term is: '[60, 83, 79, 65, 80, 45, 69, 78, 86, 58, 69, 110, 118, 101, 108, " + - "111, 112, 101, 32, 120, 109, 108, 110, 115, 58, 83, 79, 65, 80, 45]...', original message: bytes can be at most 32766 " + - "in length; got 49023\n" + - "\tat org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:796) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:430) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:392) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat 
org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:240) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:496) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1729) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1464) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:1070) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.engine.InternalEngine.indexIntoLucene(InternalEngine.java:1012) " + - "~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:878) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:738) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperation(IndexShard.java:707) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperationOnPrimary(IndexShard.java:673) " + - "~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequestOnPrimary(TransportShardBulkAction.java:548) " + - "~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequest(TransportShardBulkAction.java:140) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat 
org.elasticsearch.action.bulk.TransportShardBulkAction.executeBulkItemRequest(TransportShardBulkAction.java:236) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.performOnPrimary(TransportShardBulkAction.java:123) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:110) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:72) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" + - "(TransportReplicationAction.java:1034) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" + - "(TransportReplicationAction.java:1012) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:103) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" + - "(TransportReplicationAction.java:359) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" + - "(TransportReplicationAction.java:299) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" + - "(TransportReplicationAction.java:975) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" + - "(TransportReplicationAction.java:972) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat 
org.elasticsearch.index.shard.IndexShardOperationPermits.acquire(IndexShardOperationPermits.java:238) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.shard.IndexShard.acquirePrimaryOperationPermit(IndexShard.java:2220) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.acquirePrimaryShardReference" + - "(TransportReplicationAction.java:984) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.access$500(TransportReplicationAction.java:98) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.doRun" + - "(TransportReplicationAction.java:320) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" + - ".messageReceived(TransportReplicationAction.java:295) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" + - ".messageReceived(TransportReplicationAction.java:282) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:66) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:656) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:635) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + 
- "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_144]\n" + - "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_144]\n" + - "\tat java.lang.Thread.run(Thread.java:748) [?:1.8.0_144]\n"; - - Tuple> mostLikelyMatch = TextLogFileStructureFinder.mostLikelyTimestamp(sample.split("\n")); + + Tuple> mostLikelyMatch = + TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), FileStructureOverrides.EMPTY_OVERRIDES); assertNotNull(mostLikelyMatch); // Even though many lines have a timestamp near the end (in the Lucene version information), @@ -243,4 +328,26 @@ public void testMostLikelyTimestampGivenExceptionTrace() { assertEquals(new TimestampMatch(7, "", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), mostLikelyMatch.v1()); } + + public void testMostLikelyTimestampGivenExceptionTraceAndTimestampFormatOverride() { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("YYYY-MM-dd HH:mm:ss").build(); + + Tuple> mostLikelyMatch = + TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), overrides); + assertNotNull(mostLikelyMatch); + + // The override should force the seemingly inferior choice of timestamp + assertEquals(new TimestampMatch(6, "", "YYYY-MM-dd HH:mm:ss", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", + ""), mostLikelyMatch.v1()); + } + + public void testMostLikelyTimestampGivenExceptionTraceAndImpossibleTimestampFormatOverride() { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("MMM dd HH:mm:ss").build(); + + Tuple> mostLikelyMatch = + TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), overrides); + assertNull(mostLikelyMatch); + } } diff --git 
a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java index 4bf65ba783572..01c44147b0430 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinderTests.java @@ -18,7 +18,8 @@ public void testCreateConfigsGivenGoodXml() throws Exception { String charset = randomFrom(POSSIBLE_CHARSETS); Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); - FileStructureFinder structureFinder = factory.createFromSample(explanation, XML_SAMPLE, charset, hasByteOrderMarker); + FileStructureFinder structureFinder = factory.createFromSample(explanation, XML_SAMPLE, charset, hasByteOrderMarker, + FileStructureOverrides.EMPTY_OVERRIDES); FileStructure structure = structureFinder.getStructure(); @@ -32,6 +33,7 @@ public void testCreateConfigsGivenGoodXml() throws Exception { assertNull(structure.getExcludeLinesPattern()); assertEquals("^\\s* Date: Wed, 12 Sep 2018 14:48:38 +0100 Subject: [PATCH 2/3] Fixed problems in first PR commit --- .../xpack/core/ml/action/FindFileStructureAction.java | 9 +++++++++ .../DelimitedFileStructureFinder.java | 2 +- .../ml/filestructurefinder/FileStructureOverrides.java | 8 ++++++++ .../ml/filestructurefinder/FileStructureUtils.java | 3 +++ .../ml/filestructurefinder/TimestampFormatFinder.java | 10 ++-------- .../filestructurefinder/GrokPatternCreatorTests.java | 3 ++- 6 files changed, 25 insertions(+), 10 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java index d2b7e0cb6536c..c58c24564b254 100644 --- 
a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java @@ -313,6 +313,8 @@ public void readFrom(StreamInput in) throws IOException { columnNames = in.readBoolean() ? in.readList(StreamInput::readString) : null; hasHeaderRow = in.readOptionalBoolean(); delimiter = in.readBoolean() ? (char) in.readVInt() : null; + quote = in.readBoolean() ? (char) in.readVInt() : null; + shouldTrimFields = in.readOptionalBoolean(); grokPattern = in.readOptionalString(); timestampFormat = in.readOptionalString(); timestampField = in.readOptionalString(); @@ -343,6 +345,13 @@ public void writeTo(StreamOutput out) throws IOException { out.writeBoolean(true); out.writeVInt(delimiter); } + if (quote == null) { + out.writeBoolean(false); + } else { + out.writeBoolean(true); + out.writeVInt(quote); + } + out.writeOptionalBoolean(shouldTrimFields); out.writeOptionalString(grokPattern); out.writeOptionalString(timestampFormat); out.writeOptionalString(timestampField); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java index f7cff3c2cba85..39c2d406fd947 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java @@ -133,7 +133,7 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List optQuote + column.replace(quote, twoQuotes).replaceAll("([\\\\|()\\[\\]{}^$*?])", "\\\\$1") + optQuote) .collect(Collectors.joining(","))); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureOverrides.java 
b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureOverrides.java index 138dd4c49a4b2..e30699c69b7f8 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureOverrides.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureOverrides.java @@ -13,6 +13,14 @@ import java.util.List; import java.util.Objects; +/** + * An immutable holder for the aspects of file structure detection that can be overridden + * by the end user. Every field can be null, and this means that that + * aspect of the file structure detection is not overridden. + * + * There is no consistency checking in this class. Consistency checking of the different + * fields is done in {@link FindFileStructureAction.Request}. + */ public class FileStructureOverrides { public static final FileStructureOverrides EMPTY_OVERRIDES = new Builder().build(); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java index 6f9b4cbbd6427..66ecee5b311bb 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java @@ -51,6 +51,9 @@ private FileStructureUtils() { * may be non-empty when the method is called, and this method may * append to it. * @param sampleRecords List of records derived from the provided sample. + * @param overrides Aspects of the file structure that are known in advance. These take precedence over + * values determined by structure analysis. An exception will be thrown if the file structure + * is incompatible with an overridden value. 
* @return A tuple of (field name, timestamp format) if one can be found, or null if * there is no consistent timestamp. */ diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java index 4239748f7df0b..363b1352a54cb 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java @@ -141,7 +141,6 @@ private TimestampFormatFinder() { /** * Find the first timestamp format that matches part of the supplied value. - * * @param text The value that the returned timestamp format must exist within. * @return The timestamp format, or null if none matches. */ @@ -151,7 +150,6 @@ public static TimestampMatch findFirstMatch(String text) { /** * Find the first timestamp format that matches part of the supplied value. - * * @param text The value that the returned timestamp format must exist within. * @param requiredFormat A date format that any returned match must support. * @return The timestamp format, or null if none matches. @@ -163,8 +161,7 @@ public static TimestampMatch findFirstMatch(String text, String requiredFormat) /** * Find the first timestamp format that matches part of the supplied value, * excluding a specified number of candidate formats. - * - * @param text The value that the returned timestamp format must exist within. + * @param text The value that the returned timestamp format must exist within. * @param ignoreCandidates The number of candidate formats to exclude from the search. * @return The timestamp format, or null if none matches. 
*/ @@ -175,7 +172,6 @@ public static TimestampMatch findFirstMatch(String text, int ignoreCandidates) { /** * Find the first timestamp format that matches part of the supplied value, * excluding a specified number of candidate formats. - * * @param text The value that the returned timestamp format must exist within. * @param ignoreCandidates The number of candidate formats to exclude from the search. * @param requiredFormat A date format that any returned match must support. @@ -213,7 +209,6 @@ public static TimestampMatch findFirstMatch(String text, int ignoreCandidates, S /** * Find the best timestamp format for matching an entire field value. - * * @param text The value that the returned timestamp format must match in its entirety. * @return The timestamp format, or null if none matches. */ @@ -223,7 +218,6 @@ public static TimestampMatch findFirstFullMatch(String text) { /** * Find the best timestamp format for matching an entire field value. - * * @param text The value that the returned timestamp format must match in its entirety. * @param requiredFormat A date format that any returned match must support. * @return The timestamp format, or null if none matches. 
@@ -471,7 +465,7 @@ static final class CandidateTimestampFormat { // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java this.strictSearchGrok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}" + strictGrokPattern + "%{GREEDYDATA:" + EPILOGUE + "}"); - this.strictFullMatchGrok = new Grok(Grok.getBuiltinPatterns(), strictGrokPattern + "$"); + this.strictFullMatchGrok = new Grok(Grok.getBuiltinPatterns(), "^" + strictGrokPattern + "$"); this.standardGrokPatternName = standardGrokPatternName; assert quickRuleOutIndices.stream() .noneMatch(quickRuleOutIndex -> quickRuleOutIndex < 0 || quickRuleOutIndex >= QUICK_RULE_OUT_PATTERNS.size()); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java index e6a0aee6ee9f4..271e071fc2717 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java @@ -272,7 +272,8 @@ public void testFindFullLineGrokPatternGivenApacheCombinedLogs() { Map mappings = new HashMap<>(); GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null); - assertEquals(new Tuple<>("timestamp", "%{COMBINEDAPACHELOG}"), grokPatternCreator.findFullLineGrokPattern(null)); + assertEquals(new Tuple<>("timestamp", "%{COMBINEDAPACHELOG}"), + grokPatternCreator.findFullLineGrokPattern(randomBoolean() ? 
"timestamp" : null)); assertEquals(10, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text"), mappings.get("agent")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("auth")); From 7ed63e3015ac303ba3ff5b3465ceae60d9988fe3 Mon Sep 17 00:00:00 2001 From: David Roberts Date: Thu, 13 Sep 2018 15:12:50 +0100 Subject: [PATCH 3/3] Address review comments --- .../ml/action/FindFileStructureAction.java | 31 ++++++----- .../FindFileStructureActionRequestTests.java | 6 +- .../DelimitedFileStructureFinder.java | 5 +- .../api/xpack.ml.find_file_structure.json | 43 ++++++++++++++- .../test/ml/find_file_structure.yml | 55 ++++++++++++++++++- 5 files changed, 120 insertions(+), 20 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java index c58c24564b254..d10fedfb58975 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java @@ -24,6 +24,7 @@ import java.io.IOException; import java.util.Arrays; import java.util.List; +import java.util.Locale; import java.util.Objects; import static org.elasticsearch.action.ValidateActions.addValidationError; @@ -123,6 +124,9 @@ public static class Request extends ActionRequest { public static final ParseField TIMESTAMP_FORMAT = new ParseField("timestamp_format"); public static final ParseField TIMESTAMP_FIELD = FileStructure.TIMESTAMP_FIELD; + private static final String ARG_INCOMPATIBLE_WITH_FORMAT_TEMPLATE = + "[%s] may only be specified if [" + FORMAT.getPreferredName() + "] is [%s]"; + private Integer linesToSample; private String charset; private FileStructure.Format format; @@ -263,39 +267,40 @@ public 
void setSample(BytesReference sample) { this.sample = sample; } + private static ActionRequestValidationException addIncompatibleArgError(ParseField arg, FileStructure.Format format, + ActionRequestValidationException validationException) { + return addValidationError(String.format(Locale.ROOT, ARG_INCOMPATIBLE_WITH_FORMAT_TEMPLATE, arg.getPreferredName(), format), + validationException); + } + @Override public ActionRequestValidationException validate() { ActionRequestValidationException validationException = null; if (linesToSample != null && linesToSample <= 0) { validationException = - addValidationError(LINES_TO_SAMPLE.getPreferredName() + " must be positive if specified", validationException); + addValidationError("[" + LINES_TO_SAMPLE.getPreferredName() + "] must be positive if specified", validationException); } if (format != FileStructure.Format.DELIMITED) { if (columnNames != null) { - validationException = addValidationError(COLUMN_NAMES.getPreferredName() + " may only be specified if " + - FORMAT.getPreferredName() + " is " + FileStructure.Format.DELIMITED, validationException); + validationException = addIncompatibleArgError(COLUMN_NAMES, FileStructure.Format.DELIMITED, validationException); } if (hasHeaderRow != null) { - validationException = addValidationError(HAS_HEADER_ROW.getPreferredName() + " may only be specified if " + - FORMAT.getPreferredName() + " is " + FileStructure.Format.DELIMITED, validationException); + validationException = addIncompatibleArgError(HAS_HEADER_ROW, FileStructure.Format.DELIMITED, validationException); } if (delimiter != null) { - validationException = addValidationError(DELIMITER.getPreferredName() + " may only be specified if " + - FORMAT.getPreferredName() + " is " + FileStructure.Format.DELIMITED, validationException); + validationException = addIncompatibleArgError(DELIMITER, FileStructure.Format.DELIMITED, validationException); } if (quote != null) { - validationException = 
addValidationError(QUOTE.getPreferredName() + " may only be specified if " + - FORMAT.getPreferredName() + " is " + FileStructure.Format.DELIMITED, validationException); + validationException = addIncompatibleArgError(QUOTE, FileStructure.Format.DELIMITED, validationException); } if (shouldTrimFields != null) { - validationException = addValidationError(SHOULD_TRIM_FIELDS.getPreferredName() + " may only be specified if " + - FORMAT.getPreferredName() + " is " + FileStructure.Format.DELIMITED, validationException); + validationException = addIncompatibleArgError(SHOULD_TRIM_FIELDS, FileStructure.Format.DELIMITED, validationException); } } if (format != FileStructure.Format.SEMI_STRUCTURED_TEXT) { if (grokPattern != null) { - validationException = addValidationError(GROK_PATTERN.getPreferredName() + " may only be specified if " + - FORMAT.getPreferredName() + " is " + FileStructure.Format.SEMI_STRUCTURED_TEXT, validationException); + validationException = + addIncompatibleArgError(GROK_PATTERN, FileStructure.Format.SEMI_STRUCTURED_TEXT, validationException); } } if (sample == null || sample.length() == 0) { diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java index 90a4b656a7486..21f11fa5f73c7 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureActionRequestTests.java @@ -82,7 +82,7 @@ public void testValidateLinesToSample() { ActionRequestValidationException e = request.validate(); assertNotNull(e); assertThat(e.getMessage(), startsWith("Validation Failed: ")); - assertThat(e.getMessage(), containsString(" lines_to_sample must be positive if specified")); + assertThat(e.getMessage(), containsString(" 
[lines_to_sample] must be positive if specified")); } public void testValidateNonDelimited() { @@ -118,7 +118,7 @@ public void testValidateNonDelimited() { ActionRequestValidationException e = request.validate(); assertNotNull(e); assertThat(e.getMessage(), startsWith("Validation Failed: ")); - assertThat(e.getMessage(), containsString(" " + errorField + " may only be specified if format is delimited")); + assertThat(e.getMessage(), containsString(" [" + errorField + "] may only be specified if [format] is [delimited]")); } public void testValidateNonSemiStructuredText() { @@ -131,7 +131,7 @@ public void testValidateNonSemiStructuredText() { ActionRequestValidationException e = request.validate(); assertNotNull(e); assertThat(e.getMessage(), startsWith("Validation Failed: ")); - assertThat(e.getMessage(), containsString(" grok_pattern may only be specified if format is semi_structured_text")); + assertThat(e.getMessage(), containsString(" [grok_pattern] may only be specified if [format] is [semi_structured_text]")); } public void testValidateSample() { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java index 39c2d406fd947..a103560480d06 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java @@ -33,6 +33,7 @@ public class DelimitedFileStructureFinder implements FileStructureFinder { + private static final String REGEX_NEEDS_ESCAPE_PATTERN = "([\\\\|()\\[\\]{}^$.+*?])"; private static final int MAX_LEVENSHTEIN_COMPARISONS = 100; private final List sampleMessages; @@ -133,9 +134,9 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List optQuote + column.replace(quote, 
twoQuotes).replaceAll("([\\\\|()\\[\\]{}^$*?])", "\\\\$1") + optQuote) + .map(column -> optQuote + column.replace(quote, twoQuotes).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1") + optQuote) .collect(Collectors.joining(","))); } diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/api/xpack.ml.find_file_structure.json b/x-pack/plugin/src/test/resources/rest-api-spec/api/xpack.ml.find_file_structure.json index bd41e0c00bca8..20a5c8e0c2a01 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/api/xpack.ml.find_file_structure.json +++ b/x-pack/plugin/src/test/resources/rest-api-spec/api/xpack.ml.find_file_structure.json @@ -10,9 +10,50 @@ "type": "int", "description": "Optional parameter to specify how many lines of the file to include in the analysis" }, + "charset": { + "type": "string", + "description": "Optional parameter to specify the character set of the file" + }, + "format": { + "type": "enum", + "options": [ "json", "xml", "delimited", "semi_structured_text" ], + "description": "Optional parameter to specify the high level file format" + }, + "has_header_row": { + "type": "boolean", + "description": "Optional parameter to specify whether a delimited file includes the column names in its first row" + }, + "column_names": { + "type": "list", + "description": "Optional parameter containing a comma separated list of the column names for a delimited file" + }, + "delimiter": { + "type": "string", + "description": "Optional parameter to specify the delimiter character for a delimited file - must be a single character" + }, + "quote": { + "type": "string", + "description": "Optional parameter to specify the quote character for a delimited file - must be a single character" + }, + "should_trim_fields": { + "type": "boolean", + "description": "Optional parameter to specify whether the values between delimiters in a delimited file should have whitespace trimmed from them" + }, + "grok_pattern": { + "type": "string", + "description": "Optional parameter 
to specify the Grok pattern that should be used to extract fields from messages in a semi-structured text file" + }, + "timestamp_field": { + "type": "string", + "description": "Optional parameter to specify the timestamp field in the file" + }, + "timestamp_format": { + "type": "string", + "description": "Optional parameter to specify the timestamp format in the file" + }, "explain": { "type": "boolean", - "description": "Optional parameter to include an commentary on how the structure was derived" + "description": "Optional parameter to include a commentary on how the structure was derived" } } }, diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml index 1d164cc0c5afc..1f6964b919357 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/ml/find_file_structure.yml @@ -1,11 +1,12 @@ --- -"Test JSON file structure analysis": +"Test JSON file structure analysis without overrides": - do: headers: # This is to stop the usual content type randomization, which # would obviously ruin the results for this particular test Content-Type: "application/json" xpack.ml.find_file_structure: + lines_to_sample: 3 body: - airline: AAL responsetime: 132.2046 @@ -42,3 +43,55 @@ - match: { field_stats.time.count: 3 } - match: { field_stats.time.cardinality: 3 } - match: { field_stats.time.cardinality: 3 } + - is_false: explanation + +--- +"Test JSON file structure analysis with overrides": + - do: + headers: + # This is to stop the usual content type randomization, which + # would obviously ruin the results for this particular test + Content-Type: "application/json" + xpack.ml.find_file_structure: + charset: UTF-8 + format: json + timestamp_field: time + timestamp_format: UNIX + explain: true + body: + - airline: AAL + responsetime: 132.2046 + sourcetype: file-structure-test 
+ time: 1403481600 + - airline: JZA + responsetime: 990.4628 + sourcetype: file-structure-test + time: 1403481700 + - airline: AAL + responsetime: 134.2046 + sourcetype: file-structure-test + time: 1403481800 + + - match: { num_lines_analyzed: 3 } + - match: { num_messages_analyzed: 3 } + - match: { charset: "UTF-8" } + - match: { has_byte_order_marker: false } + - match: { format: json } + - match: { timestamp_field: time } + - match: { timestamp_formats.0: UNIX } + - match: { need_client_timezone: false } + - match: { mappings.airline.type: keyword } + - match: { mappings.responsetime.type: double } + - match: { mappings.sourcetype.type: keyword } + - match: { mappings.time.type: date } + - match: { mappings.time.format: epoch_second } + - match: { field_stats.airline.count: 3 } + - match: { field_stats.airline.cardinality: 2 } + - match: { field_stats.responsetime.count: 3 } + - match: { field_stats.responsetime.cardinality: 3 } + - match: { field_stats.sourcetype.count: 3 } + - match: { field_stats.sourcetype.cardinality: 1 } + - match: { field_stats.time.count: 3 } + - match: { field_stats.time.cardinality: 3 } + - match: { field_stats.time.cardinality: 3 } + - match: { explanation.0: "Using specified character encoding [UTF-8]" }