Skip to content

Commit 4d86598

Browse files
author
David Roberts
committed
[ML] Allow overrides for some file structure detection decisions (#33630)
This change modifies the file structure detection functionality such that some of the decisions can be overridden with user-supplied values. The fields that can be overridden are: charset, format, has_header_row, column_names, delimiter, quote, should_trim_fields, grok_pattern, timestamp_field and timestamp_format. If an override makes finding the file structure impossible then the endpoint will return an exception.
1 parent 7172636 commit 4d86598

30 files changed

+1668
-333
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java

Lines changed: 215 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@
2222
import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure;
2323

2424
import java.io.IOException;
25+
import java.util.Arrays;
26+
import java.util.List;
27+
import java.util.Locale;
2528
import java.util.Objects;
2629

2730
import static org.elasticsearch.action.ValidateActions.addValidationError;
@@ -115,8 +118,32 @@ public boolean equals(Object other) {
115118
public static class Request extends ActionRequest {
116119

117120
// Names of the request parameters. Apart from LINES_TO_SAMPLE and TIMESTAMP_FORMAT,
// which are declared here, each one reuses the ParseField constant of the same name on FileStructure.
public static final ParseField LINES_TO_SAMPLE = new ParseField("lines_to_sample");
121+
public static final ParseField CHARSET = FileStructure.CHARSET;
122+
public static final ParseField FORMAT = FileStructure.FORMAT;
123+
public static final ParseField COLUMN_NAMES = FileStructure.COLUMN_NAMES;
124+
public static final ParseField HAS_HEADER_ROW = FileStructure.HAS_HEADER_ROW;
125+
public static final ParseField DELIMITER = FileStructure.DELIMITER;
126+
public static final ParseField QUOTE = FileStructure.QUOTE;
127+
public static final ParseField SHOULD_TRIM_FIELDS = FileStructure.SHOULD_TRIM_FIELDS;
128+
public static final ParseField GROK_PATTERN = FileStructure.GROK_PATTERN;
129+
// This one is plural in FileStructure, but singular in FileStructureOverrides
130+
public static final ParseField TIMESTAMP_FORMAT = new ParseField("timestamp_format");
131+
public static final ParseField TIMESTAMP_FIELD = FileStructure.TIMESTAMP_FIELD;
132+
133+
// Validation message template with two %s placeholders: the offending argument name,
// then the format value that argument requires. The FORMAT parameter name is baked in.
private static final String ARG_INCOMPATIBLE_WITH_FORMAT_TEMPLATE =
134+
"[%s] may only be specified if [" + FORMAT.getPreferredName() + "] is [%s]";
118135

119136
// Request state. Boxed/reference types are used so that null can represent "not supplied";
// validate() treats any non-null override as having been specified by the caller.
private Integer linesToSample;
137+
private String charset;
138+
private FileStructure.Format format;
139+
private List<String> columnNames;
140+
private Boolean hasHeaderRow;
141+
private Character delimiter;
142+
private Character quote;
143+
private Boolean shouldTrimFields;
144+
private String grokPattern;
145+
private String timestampFormat;
146+
private String timestampField;
120147
// The sample of the file contents to analyse; validate() rejects a null or zero-length sample.
private BytesReference sample;
121148

122149
public Request() {
@@ -130,6 +157,114 @@ public void setLinesToSample(Integer linesToSample) {
130157
this.linesToSample = linesToSample;
131158
}
132159

160+
public String getCharset() {
161+
return charset;
162+
}
163+
164+
public void setCharset(String charset) {
165+
this.charset = (charset == null || charset.isEmpty()) ? null : charset;
166+
}
167+
168+
public FileStructure.Format getFormat() {
169+
return format;
170+
}
171+
172+
public void setFormat(FileStructure.Format format) {
173+
this.format = format;
174+
}
175+
176+
public void setFormat(String format) {
177+
this.format = (format == null || format.isEmpty()) ? null : FileStructure.Format.fromString(format);
178+
}
179+
180+
public List<String> getColumnNames() {
181+
return columnNames;
182+
}
183+
184+
public void setColumnNames(List<String> columnNames) {
185+
this.columnNames = (columnNames == null || columnNames.isEmpty()) ? null : columnNames;
186+
}
187+
188+
public void setColumnNames(String[] columnNames) {
189+
this.columnNames = (columnNames == null || columnNames.length == 0) ? null : Arrays.asList(columnNames);
190+
}
191+
192+
public Boolean getHasHeaderRow() {
193+
return hasHeaderRow;
194+
}
195+
196+
public void setHasHeaderRow(Boolean hasHeaderRow) {
197+
this.hasHeaderRow = hasHeaderRow;
198+
}
199+
200+
public Character getDelimiter() {
201+
return delimiter;
202+
}
203+
204+
public void setDelimiter(Character delimiter) {
205+
this.delimiter = delimiter;
206+
}
207+
208+
public void setDelimiter(String delimiter) {
209+
if (delimiter == null || delimiter.isEmpty()) {
210+
this.delimiter = null;
211+
} else if (delimiter.length() == 1) {
212+
this.delimiter = delimiter.charAt(0);
213+
} else {
214+
throw new IllegalArgumentException(DELIMITER.getPreferredName() + " must be a single character");
215+
}
216+
}
217+
218+
public Character getQuote() {
219+
return quote;
220+
}
221+
222+
public void setQuote(Character quote) {
223+
this.quote = quote;
224+
}
225+
226+
public void setQuote(String quote) {
227+
if (quote == null || quote.isEmpty()) {
228+
this.quote = null;
229+
} else if (quote.length() == 1) {
230+
this.quote = quote.charAt(0);
231+
} else {
232+
throw new IllegalArgumentException(QUOTE.getPreferredName() + " must be a single character");
233+
}
234+
}
235+
236+
public Boolean getShouldTrimFields() {
237+
return shouldTrimFields;
238+
}
239+
240+
public void setShouldTrimFields(Boolean shouldTrimFields) {
241+
this.shouldTrimFields = shouldTrimFields;
242+
}
243+
244+
public String getGrokPattern() {
245+
return grokPattern;
246+
}
247+
248+
public void setGrokPattern(String grokPattern) {
249+
this.grokPattern = (grokPattern == null || grokPattern.isEmpty()) ? null : grokPattern;
250+
}
251+
252+
public String getTimestampFormat() {
253+
return timestampFormat;
254+
}
255+
256+
public void setTimestampFormat(String timestampFormat) {
257+
this.timestampFormat = (timestampFormat == null || timestampFormat.isEmpty()) ? null : timestampFormat;
258+
}
259+
260+
public String getTimestampField() {
261+
return timestampField;
262+
}
263+
264+
public void setTimestampField(String timestampField) {
265+
this.timestampField = (timestampField == null || timestampField.isEmpty()) ? null : timestampField;
266+
}
267+
133268
public BytesReference getSample() {
134269
return sample;
135270
}
@@ -138,12 +273,41 @@ public void setSample(BytesReference sample) {
138273
this.sample = sample;
139274
}
140275

276+
private static ActionRequestValidationException addIncompatibleArgError(ParseField arg, FileStructure.Format format,
277+
ActionRequestValidationException validationException) {
278+
return addValidationError(String.format(Locale.ROOT, ARG_INCOMPATIBLE_WITH_FORMAT_TEMPLATE, arg.getPreferredName(), format),
279+
validationException);
280+
}
281+
141282
@Override
142283
public ActionRequestValidationException validate() {
143284
ActionRequestValidationException validationException = null;
144285
if (linesToSample != null && linesToSample <= 0) {
145286
validationException =
146-
addValidationError(LINES_TO_SAMPLE.getPreferredName() + " must be positive if specified", validationException);
287+
addValidationError("[" + LINES_TO_SAMPLE.getPreferredName() + "] must be positive if specified", validationException);
288+
}
289+
if (format != FileStructure.Format.DELIMITED) {
290+
if (columnNames != null) {
291+
validationException = addIncompatibleArgError(COLUMN_NAMES, FileStructure.Format.DELIMITED, validationException);
292+
}
293+
if (hasHeaderRow != null) {
294+
validationException = addIncompatibleArgError(HAS_HEADER_ROW, FileStructure.Format.DELIMITED, validationException);
295+
}
296+
if (delimiter != null) {
297+
validationException = addIncompatibleArgError(DELIMITER, FileStructure.Format.DELIMITED, validationException);
298+
}
299+
if (quote != null) {
300+
validationException = addIncompatibleArgError(QUOTE, FileStructure.Format.DELIMITED, validationException);
301+
}
302+
if (shouldTrimFields != null) {
303+
validationException = addIncompatibleArgError(SHOULD_TRIM_FIELDS, FileStructure.Format.DELIMITED, validationException);
304+
}
305+
}
306+
if (format != FileStructure.Format.SEMI_STRUCTURED_TEXT) {
307+
if (grokPattern != null) {
308+
validationException =
309+
addIncompatibleArgError(GROK_PATTERN, FileStructure.Format.SEMI_STRUCTURED_TEXT, validationException);
310+
}
147311
}
148312
if (sample == null || sample.length() == 0) {
149313
validationException = addValidationError("sample must be specified", validationException);
@@ -155,19 +319,60 @@ public ActionRequestValidationException validate() {
155319
public void readFrom(StreamInput in) throws IOException {
156320
super.readFrom(in);
157321
linesToSample = in.readOptionalVInt();
322+
charset = in.readOptionalString();
323+
format = in.readBoolean() ? in.readEnum(FileStructure.Format.class) : null;
324+
columnNames = in.readBoolean() ? in.readList(StreamInput::readString) : null;
325+
hasHeaderRow = in.readOptionalBoolean();
326+
delimiter = in.readBoolean() ? (char) in.readVInt() : null;
327+
quote = in.readBoolean() ? (char) in.readVInt() : null;
328+
shouldTrimFields = in.readOptionalBoolean();
329+
grokPattern = in.readOptionalString();
330+
timestampFormat = in.readOptionalString();
331+
timestampField = in.readOptionalString();
158332
sample = in.readBytesReference();
159333
}
160334

161335
@Override
162336
public void writeTo(StreamOutput out) throws IOException {
163337
super.writeTo(out);
164338
out.writeOptionalVInt(linesToSample);
339+
out.writeOptionalString(charset);
340+
if (format == null) {
341+
out.writeBoolean(false);
342+
} else {
343+
out.writeBoolean(true);
344+
out.writeEnum(format);
345+
}
346+
if (columnNames == null) {
347+
out.writeBoolean(false);
348+
} else {
349+
out.writeBoolean(true);
350+
out.writeCollection(columnNames, StreamOutput::writeString);
351+
}
352+
out.writeOptionalBoolean(hasHeaderRow);
353+
if (delimiter == null) {
354+
out.writeBoolean(false);
355+
} else {
356+
out.writeBoolean(true);
357+
out.writeVInt(delimiter);
358+
}
359+
if (quote == null) {
360+
out.writeBoolean(false);
361+
} else {
362+
out.writeBoolean(true);
363+
out.writeVInt(quote);
364+
}
365+
out.writeOptionalBoolean(shouldTrimFields);
366+
out.writeOptionalString(grokPattern);
367+
out.writeOptionalString(timestampFormat);
368+
out.writeOptionalString(timestampField);
165369
out.writeBytesReference(sample);
166370
}
167371

168372
@Override
169373
public int hashCode() {
170-
return Objects.hash(linesToSample, sample);
374+
return Objects.hash(linesToSample, charset, format, columnNames, hasHeaderRow, delimiter, grokPattern, timestampFormat,
375+
timestampField, sample);
171376
}
172377

173378
@Override
@@ -183,6 +388,14 @@ public boolean equals(Object other) {
183388

184389
Request that = (Request) other;
185390
return Objects.equals(this.linesToSample, that.linesToSample) &&
391+
Objects.equals(this.charset, that.charset) &&
392+
Objects.equals(this.format, that.format) &&
393+
Objects.equals(this.columnNames, that.columnNames) &&
394+
Objects.equals(this.hasHeaderRow, that.hasHeaderRow) &&
395+
Objects.equals(this.delimiter, that.delimiter) &&
396+
Objects.equals(this.grokPattern, that.grokPattern) &&
397+
Objects.equals(this.timestampFormat, that.timestampFormat) &&
398+
Objects.equals(this.timestampField, that.timestampField) &&
186399
Objects.equals(this.sample, that.sample);
187400
}
188401
}

0 commit comments

Comments
 (0)