Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,16 @@ static DelimitedLogStructureFinder makeDelimitedLogStructureFinder(List<String>
.setMultilineStartPattern(timeLineRegex);
}

SortedMap<String, Object> mappings = LogStructureUtils.guessMappings(explanation, sampleRecords);
Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
LogStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords);

SortedMap<String, Object> mappings = mappingsAndFieldStats.v1();
mappings.put(LogStructureUtils.DEFAULT_TIMESTAMP_FIELD, Collections.singletonMap(LogStructureUtils.MAPPING_TYPE_SETTING, "date"));

if (mappingsAndFieldStats.v2() != null) {
structureBuilder.setFieldStats(mappingsAndFieldStats.v2());
}

LogStructure structure = structureBuilder
.setMappings(mappings)
.setExplanation(explanation)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;

import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;

import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;

/**
 * Immutable holder for the statistics calculated for a single field.
 * <p>
 * <code>count</code> and <code>cardinality</code> are always present.  The four numeric
 * statistics (min, max, mean and median) are optional and are <code>null</code> when absent.
 * The top hits list is never <code>null</code>: a <code>null</code> constructor argument
 * is stored as an empty list.
 */
public class FieldStats implements ToXContentObject {

    static final ParseField COUNT = new ParseField("count");
    static final ParseField CARDINALITY = new ParseField("cardinality");
    static final ParseField MIN_VALUE = new ParseField("min_value");
    static final ParseField MAX_VALUE = new ParseField("max_value");
    static final ParseField MEAN_VALUE = new ParseField("mean_value");
    static final ParseField MEDIAN_VALUE = new ParseField("median_value");
    static final ParseField TOP_HITS = new ParseField("top_hits");

    /**
     * Parser that builds a {@link FieldStats} from the fields declared above.
     * The positional arguments consumed by the lambda correspond, in order, to the
     * declarations made in the static initializer below, so the two must be kept in step.
     */
    @SuppressWarnings("unchecked")
    public static final ConstructingObjectParser<FieldStats, Void> PARSER = new ConstructingObjectParser<>("field_stats", false,
        args -> new FieldStats(
            (long) args[0],
            (int) args[1],
            (Double) args[2],
            (Double) args[3],
            (Double) args[4],
            (Double) args[5],
            (List<Map<String, Object>>) args[6]));

    static {
        // Required arguments
        PARSER.declareLong(ConstructingObjectParser.constructorArg(), COUNT);
        PARSER.declareInt(ConstructingObjectParser.constructorArg(), CARDINALITY);
        // Optional numeric statistics
        PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MIN_VALUE);
        PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MAX_VALUE);
        PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MEAN_VALUE);
        PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MEDIAN_VALUE);
        // Optional list of top hits, each parsed as an ordered map
        PARSER.declareObjectArray(ConstructingObjectParser.optionalConstructorArg(), (p, c) -> p.mapOrdered(), TOP_HITS);
    }

    private final long count;
    private final int cardinality;
    private final Double minValue;
    private final Double maxValue;
    private final Double meanValue;
    private final Double medianValue;
    private final List<Map<String, Object>> topHits;

    /**
     * Create statistics that have no numeric component: min, max, mean and median are all <code>null</code>.
     * @param count       The number of values.
     * @param cardinality The number of distinct values.
     * @param topHits     The top hits.  May be <code>null</code>, which is treated as empty.
     */
    FieldStats(long count, int cardinality, List<Map<String, Object>> topHits) {
        this(count, cardinality, null, null, null, null, topHits);
    }

    /**
     * Create statistics with every component specified.
     * @param count       The number of values.
     * @param cardinality The number of distinct values.
     * @param minValue    The minimum value, or <code>null</code>.
     * @param maxValue    The maximum value, or <code>null</code>.
     * @param meanValue   The mean value, or <code>null</code>.
     * @param medianValue The median value, or <code>null</code>.
     * @param topHits     The top hits.  May be <code>null</code>, which is treated as empty.
     *                    A non-<code>null</code> list is wrapped in an unmodifiable view rather than copied.
     */
    FieldStats(long count, int cardinality, Double minValue, Double maxValue, Double meanValue, Double medianValue,
               List<Map<String, Object>> topHits) {
        this.count = count;
        this.cardinality = cardinality;
        this.minValue = minValue;
        this.maxValue = maxValue;
        this.meanValue = meanValue;
        this.medianValue = medianValue;
        if (topHits == null) {
            this.topHits = Collections.emptyList();
        } else {
            this.topHits = Collections.unmodifiableList(topHits);
        }
    }

    /** @return The number of values. */
    public long getCount() {
        return count;
    }

    /** @return The number of distinct values. */
    public int getCardinality() {
        return cardinality;
    }

    /** @return The minimum value, or <code>null</code> if not set. */
    public Double getMinValue() {
        return minValue;
    }

    /** @return The maximum value, or <code>null</code> if not set. */
    public Double getMaxValue() {
        return maxValue;
    }

    /** @return The mean value, or <code>null</code> if not set. */
    public Double getMeanValue() {
        return meanValue;
    }

    /** @return The median value, or <code>null</code> if not set. */
    public Double getMedianValue() {
        return medianValue;
    }

    /** @return The top hits as an unmodifiable list.  Never <code>null</code>, but possibly empty. */
    public List<Map<String, Object>> getTopHits() {
        return topHits;
    }

    /**
     * Write this object to the supplied builder.  The count and cardinality are always written;
     * each numeric statistic is written only when non-<code>null</code>, and the top hits are
     * written only when the list is not empty.
     */
    @Override
    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {

        builder.startObject();

        builder.field(COUNT.getPreferredName(), count);
        builder.field(CARDINALITY.getPreferredName(), cardinality);

        writeIfPresent(builder, MIN_VALUE, minValue);
        writeIfPresent(builder, MAX_VALUE, maxValue);
        writeIfPresent(builder, MEAN_VALUE, meanValue);
        writeIfPresent(builder, MEDIAN_VALUE, medianValue);

        if (topHits.isEmpty() == false) {
            builder.field(TOP_HITS.getPreferredName(), topHits);
        }

        builder.endObject();

        return builder;
    }

    /**
     * Write an optional numeric statistic, skipping it entirely when the value is <code>null</code>.
     */
    private static void writeIfPresent(XContentBuilder builder, ParseField field, Double value) throws IOException {
        if (value != null) {
            builder.field(field.getPreferredName(), value);
        }
    }

    @Override
    public int hashCode() {
        return Objects.hash(count, cardinality, minValue, maxValue, meanValue, medianValue, topHits);
    }

    /**
     * Two instances are equal when they are of exactly the same class and every statistic matches.
     */
    @Override
    public boolean equals(Object other) {

        if (other == this) {
            return true;
        }

        if (other == null) {
            return false;
        }

        if (other.getClass() != getClass()) {
            return false;
        }

        FieldStats rhs = (FieldStats) other;
        return count == rhs.count
            && cardinality == rhs.cardinality
            && Objects.equals(minValue, rhs.minValue)
            && Objects.equals(maxValue, rhs.maxValue)
            && Objects.equals(meanValue, rhs.meanValue)
            && Objects.equals(medianValue, rhs.medianValue)
            && Objects.equals(topHits, rhs.topHits);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.ml.logstructurefinder;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.stream.Collectors;

/**
 * Calculate statistics for a set of scalar field values.
 * Count, cardinality (distinct count) and top hits (most common values) are always calculated.
 * Extra statistics are calculated if the field is numeric: min, max, mean and median.
 * A field is treated as numeric only while every value accepted so far can be parsed
 * by {@link Double#valueOf(String)}.
 */
public class FieldStatsCalculator {

    /** Total number of values accepted so far. */
    private long count;

    /** Number of occurrences of each distinct value, keyed by the original string. */
    private final SortedMap<String, Integer> countsByStringValue = new TreeMap<>();

    /**
     * Number of occurrences of each distinct value, keyed by its numeric interpretation.
     * Set to <code>null</code> permanently as soon as a value that cannot be parsed as a number is accepted.
     */
    private SortedMap<Double, Integer> countsByNumericValue = new TreeMap<>();

    /**
     * Add a collection of values to the calculator.
     * The values to be added can be combined by the caller and added in a
     * single call to this method or added in multiple calls to this method.
     * @param fieldValues Zero or more values to add. May not be <code>null</code>.
     */
    public void accept(Collection<String> fieldValues) {

        count += fieldValues.size();

        for (String fieldValue : fieldValues) {

            countsByStringValue.merge(fieldValue, 1, Integer::sum);

            if (countsByNumericValue != null) {

                try {
                    countsByNumericValue.merge(Double.valueOf(fieldValue), 1, Integer::sum);
                } catch (NumberFormatException e) {
                    // One non-numeric value means the field as a whole is not numeric,
                    // so stop tracking numeric values from now on
                    countsByNumericValue = null;
                }
            }
        }
    }

    /**
     * Calculate field statistics based on the previously accepted values.
     * Numeric statistics are included only if at least one value has been accepted
     * and every accepted value was numeric.
     * @param numTopHits The maximum number of entries to include in the top hits.
     * @return The calculated field statistics.
     */
    public FieldStats calculate(int numTopHits) {

        if (countsByNumericValue != null && countsByNumericValue.isEmpty() == false) {
            return new FieldStats(count, countsByNumericValue.size(), countsByNumericValue.firstKey(), countsByNumericValue.lastKey(),
                calculateMean(), calculateMedian(), findNumericTopHits(numTopHits));
        } else {
            return new FieldStats(count, countsByStringValue.size(), findStringTopHits(numTopHits));
        }
    }

    /**
     * Calculate the mean of the accepted numeric values.
     * Must only be called while the field is still considered numeric.
     * @return The mean, or <code>null</code> if no values have been accepted.
     */
    Double calculateMean() {

        assert countsByNumericValue != null;

        if (countsByNumericValue.isEmpty()) {
            return null;
        }

        double runningCount = 0.0;
        double runningMean = Double.NaN;

        for (Map.Entry<Double, Integer> entry : countsByNumericValue.entrySet()) {

            double entryCount = (double) entry.getValue();
            double newRunningCount = runningCount + entryCount;

            // Updating a running mean like this is more numerically stable than using (sum / count)
            if (runningCount > 0.0) {
                runningMean = runningMean * (runningCount / newRunningCount) + entry.getKey() * (entryCount / newRunningCount);
            } else if (entryCount > 0.0) {
                runningMean = entry.getKey();
            }

            runningCount = newRunningCount;
        }

        return runningMean;
    }

    /**
     * Calculate the median of the accepted numeric values by walking the distinct values
     * in ascending order and accumulating their counts until the middle position(s) are reached.
     * Must only be called while the field is still considered numeric.
     * @return The median, or <code>null</code> if no values have been accepted.
     */
    Double calculateMedian() {

        assert countsByNumericValue != null;

        if (count % 2 == 1) {

            // Simple case - median is middle value
            long targetCount = count / 2 + 1;
            long currentUpperBound = 0;

            for (Map.Entry<Double, Integer> entry : countsByNumericValue.entrySet()) {

                currentUpperBound += entry.getValue();

                if (currentUpperBound >= targetCount) {
                    return entry.getKey();
                }
            }

        } else {

            // More complicated case - median is average of two middle values
            long target1Count = count / 2;
            long target2Count = target1Count + 1;
            double target1Value = Double.NaN;
            long prevUpperBound = -1;
            long currentUpperBound = 0;

            for (Map.Entry<Double, Integer> entry : countsByNumericValue.entrySet()) {

                currentUpperBound += entry.getValue();

                if (currentUpperBound >= target2Count) {

                    if (prevUpperBound < target1Count) {
                        // Both target values are the same
                        return entry.getKey();
                    } else {
                        return midpoint(target1Value, entry.getKey());
                    }
                }

                if (currentUpperBound >= target1Count) {
                    target1Value = entry.getKey();
                }

                prevUpperBound = currentUpperBound;
            }
        }

        return null;
    }

    /**
     * Average two values without letting the intermediate sum overflow.
     * The plain <code>(lower + upper) / 2</code> is used whenever it is safe, so results are
     * unchanged except where the sum of two finite values would otherwise become infinite.
     */
    private static double midpoint(double lower, double upper) {

        double sum = lower + upper;

        if (Double.isInfinite(sum) && Double.isFinite(lower) && Double.isFinite(upper)) {
            // Both values are huge with the same sign - halve each one before adding
            return lower / 2.0 + upper / 2.0;
        }

        return sum / 2.0;
    }

    /**
     * Find the most common values using their numeric interpretation.
     * Must only be called while the field is still considered numeric.
     * @param numTopHits The maximum number of entries to return.
     * @return The top hits, most common first, with ties broken by ascending numeric value.
     */
    List<Map<String, Object>> findNumericTopHits(int numTopHits) {
        assert countsByNumericValue != null;
        return findTopHits(numTopHits, countsByNumericValue, Comparator.comparing(Map.Entry<Double, Integer>::getKey));
    }

    /**
     * Find the most common values using their original string form.
     * @param numTopHits The maximum number of entries to return.
     * @return The top hits, most common first, with ties broken by ascending string value.
     */
    List<Map<String, Object>> findStringTopHits(int numTopHits) {
        return findTopHits(numTopHits, countsByStringValue, Comparator.comparing(Map.Entry<String, Integer>::getKey));
    }

    /**
     * Order by descending count, with a secondary sort to ensure reproducibility of results.
     * Each returned map has two entries, in this order: <code>value</code> then <code>count</code>.
     */
    private static <T> List<Map<String, Object>> findTopHits(int numTopHits, Map<T, Integer> countsByValue,
                                                             Comparator<Map.Entry<T, Integer>> secondarySort) {

        List<Map.Entry<T, Integer>> sortedByCount = countsByValue.entrySet().stream()
            .sorted(Comparator.comparing(Map.Entry<T, Integer>::getValue, Comparator.reverseOrder()).thenComparing(secondarySort))
            .limit(numTopHits).collect(Collectors.toList());

        List<Map<String, Object>> topHits = new ArrayList<>(sortedByCount.size());

        for (Map.Entry<T, Integer> entry : sortedByCount) {

            // LinkedHashMap keeps "value" ahead of "count" when the hit is iterated
            Map<String, Object> topHit = new LinkedHashMap<>(3);
            topHit.put("value", entry.getKey());
            topHit.put("count", entry.getValue());
            topHits.add(topHit);
        }

        return topHits;
    }
}
Loading