Skip to content

Commit cf2d675

Browse files
author
David Roberts
committed
Assume rows with duplicate values are not headers
1 parent 37a92f7 commit cf2d675

File tree

2 files changed

+34
-4
lines changed

2 files changed

+34
-4
lines changed

x-pack/plugin/ml/log-structure-finder/src/main/java/org/elasticsearch/xpack/ml/logstructurefinder/SeparatedValuesLogStructureFinder.java

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import java.util.Arrays;
1919
import java.util.Collections;
2020
import java.util.DoubleSummaryStatistics;
21+
import java.util.HashSet;
2122
import java.util.LinkedHashMap;
2223
import java.util.List;
2324
import java.util.Locale;
@@ -189,12 +190,17 @@ static Tuple<Boolean, String[]> findHeaderFromSample(List<String> explanation, L
189190
assert rows.isEmpty() == false;
190191

191192
List<String> firstRow = rows.get(0);
192-
boolean isHeaderInFile = true;
193193

194-
if (rows.size() < 3) {
195-
explanation.add("Too little data to accurately assess whether header is in sample - guessing it is");
194+
boolean isHeaderInFile = true;
195+
if (rowContainsDuplicateNonEmptyValues(firstRow)) {
196+
isHeaderInFile = false;
197+
explanation.add("First row contains duplicate values, so assuming it's not a header");
196198
} else {
197-
isHeaderInFile = isFirstRowUnusual(explanation, rows);
199+
if (rows.size() < 3) {
200+
explanation.add("Too little data to accurately assess whether header is in sample - guessing it is");
201+
} else {
202+
isHeaderInFile = isFirstRowUnusual(explanation, rows);
203+
}
198204
}
199205

200206
if (isHeaderInFile) {
@@ -205,6 +211,19 @@ static Tuple<Boolean, String[]> findHeaderFromSample(List<String> explanation, L
205211
}
206212
}
207213

214+
static boolean rowContainsDuplicateNonEmptyValues(List<String> row) {
215+
216+
HashSet<String> values = new HashSet<>();
217+
218+
for (String value : row) {
219+
if (value != null && value.isEmpty() == false && values.add(value) == false) {
220+
return true;
221+
}
222+
}
223+
224+
return false;
225+
}
226+
208227
private static boolean isFirstRowUnusual(List<String> explanation, List<List<String>> rows) {
209228

210229
assert rows.size() >= 3;

x-pack/plugin/ml/log-structure-finder/src/test/java/org/elasticsearch/xpack/ml/logstructurefinder/SeparatedValuesLogStructureFinderTests.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,4 +279,15 @@ public void testLineHasUnescapedQuote() {
279279
assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("between\"words\tb\tc", CsvPreference.TAB_PREFERENCE));
280280
assertTrue(SeparatedValuesLogStructureFinder.lineHasUnescapedQuote("x and \"y\"\tb\tc", CsvPreference.TAB_PREFERENCE));
281281
}
282+
283+
public void testRowContainsDuplicateNonEmptyValues() {
284+
285+
assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Collections.singletonList("a")));
286+
assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Collections.singletonList("")));
287+
assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "c")));
288+
assertTrue(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "a")));
289+
assertTrue(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "b", "b")));
290+
assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("a", "", "")));
291+
assertFalse(SeparatedValuesLogStructureFinder.rowContainsDuplicateNonEmptyValues(Arrays.asList("", "a", "")));
292+
}
282293
}

0 commit comments

Comments
 (0)