Skip to content

Commit 7345878

Browse files
author
David Roberts
authored
[ML] Refactor delimited file structure detection (#33233)
1. Use the term "delimited" rather than "separated values" 2. Use a single factory class with arguments to specify the delimiter and identification constraints This change makes it easier to add support for other delimiter characters.
1 parent 73eb4cb commit 7345878

24 files changed

+278
-430
lines changed

x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/CsvLogStructureFinderFactory.java

Lines changed: 0 additions & 35 deletions
This file was deleted.
Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,17 +29,16 @@
2929
import java.util.stream.Collectors;
3030
import java.util.stream.IntStream;
3131

32-
public class SeparatedValuesLogStructureFinder implements LogStructureFinder {
32+
public class DelimitedLogStructureFinder implements LogStructureFinder {
3333

3434
private static final int MAX_LEVENSHTEIN_COMPARISONS = 100;
3535

3636
private final List<String> sampleMessages;
3737
private final LogStructure structure;
3838

39-
static SeparatedValuesLogStructureFinder makeSeparatedValuesLogStructureFinder(List<String> explanation, String sample,
40-
String charsetName, Boolean hasByteOrderMarker,
41-
CsvPreference csvPreference, boolean trimFields)
42-
throws IOException {
39+
static DelimitedLogStructureFinder makeDelimitedLogStructureFinder(List<String> explanation, String sample, String charsetName,
40+
Boolean hasByteOrderMarker, CsvPreference csvPreference,
41+
boolean trimFields) throws IOException {
4342

4443
Tuple<List<List<String>>, List<Integer>> parsed = readRows(sample, csvPreference);
4544
List<List<String>> rows = parsed.v1();
@@ -73,13 +72,14 @@ static SeparatedValuesLogStructureFinder makeSeparatedValuesLogStructureFinder(L
7372
String preamble = Pattern.compile("\n").splitAsStream(sample).limit(lineNumbers.get(1)).collect(Collectors.joining("\n", "", "\n"));
7473

7574
char delimiter = (char) csvPreference.getDelimiterChar();
76-
LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.fromSeparator(delimiter))
75+
LogStructure.Builder structureBuilder = new LogStructure.Builder(LogStructure.Format.DELIMITED)
7776
.setCharset(charsetName)
7877
.setHasByteOrderMarker(hasByteOrderMarker)
7978
.setSampleStart(preamble)
8079
.setNumLinesAnalyzed(lineNumbers.get(lineNumbers.size() - 1))
8180
.setNumMessagesAnalyzed(sampleRecords.size())
8281
.setHasHeaderRow(isHeaderInFile)
82+
.setDelimiter(delimiter)
8383
.setInputFields(Arrays.stream(headerWithNamedBlanks).collect(Collectors.toList()));
8484

8585
if (trimFields) {
@@ -131,10 +131,10 @@ static SeparatedValuesLogStructureFinder makeSeparatedValuesLogStructureFinder(L
131131
.setExplanation(explanation)
132132
.build();
133133

134-
return new SeparatedValuesLogStructureFinder(sampleMessages, structure);
134+
return new DelimitedLogStructureFinder(sampleMessages, structure);
135135
}
136136

137-
private SeparatedValuesLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
137+
private DelimitedLogStructureFinder(List<String> sampleMessages, LogStructure structure) {
138138
this.sampleMessages = Collections.unmodifiableList(sampleMessages);
139139
this.structure = structure;
140140
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
package org.elasticsearch.xpack.ml.logstructurefinder;
7+
8+
import org.supercsv.prefs.CsvPreference;
9+
10+
import java.io.IOException;
11+
import java.util.List;
12+
import java.util.Locale;
13+
14+
public class DelimitedLogStructureFinderFactory implements LogStructureFinderFactory {
15+
16+
private final CsvPreference csvPreference;
17+
private final int minFieldsPerRow;
18+
private final boolean trimFields;
19+
20+
DelimitedLogStructureFinderFactory(char delimiter, int minFieldsPerRow, boolean trimFields) {
21+
csvPreference = new CsvPreference.Builder('"', delimiter, "\n").build();
22+
this.minFieldsPerRow = minFieldsPerRow;
23+
this.trimFields = trimFields;
24+
}
25+
26+
/**
27+
* Rules are:
28+
* - It must contain at least two complete records
29+
* - There must be a minimum number of fields per record (otherwise files with no commas could be treated as CSV!)
30+
* - Every record except the last must have the same number of fields
31+
* The reason the last record is allowed to have fewer fields than the others is that
32+
* it could have been truncated when the file was sampled.
33+
*/
34+
@Override
35+
public boolean canCreateFromSample(List<String> explanation, String sample) {
36+
String formatName;
37+
switch ((char) csvPreference.getDelimiterChar()) {
38+
case ',':
39+
formatName = "CSV";
40+
break;
41+
case '\t':
42+
formatName = "TSV";
43+
break;
44+
default:
45+
formatName = Character.getName(csvPreference.getDelimiterChar()).toLowerCase(Locale.ROOT) + " delimited values";
46+
break;
47+
}
48+
return DelimitedLogStructureFinder.canCreateFromSample(explanation, sample, minFieldsPerRow, csvPreference, formatName);
49+
}
50+
51+
@Override
52+
public LogStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker)
53+
throws IOException {
54+
return DelimitedLogStructureFinder.makeDelimitedLogStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
55+
csvPreference, trimFields);
56+
}
57+
}

0 commit comments

Comments
 (0)