Skip to content

Commit 35453e2

Browse files
author
David Roberts
committed
[ML] Improve uniqueness of result document IDs (#50644)
Switch from a 32 bit Java hash to a 128 bit Murmur hash for creating document IDs from by/over/partition field values. The 32 bit Java hash was not sufficiently unique, and could produce identical numbers for relatively common combinations of by/partition field values such as L018/128 and L017/228. Fixes #50613
1 parent 46d600c commit 35453e2

File tree

9 files changed

+103
-44
lines changed

9 files changed

+103
-44
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/MachineLearningField.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,18 @@
55
*/
66
package org.elasticsearch.xpack.core.ml;
77

8+
import org.elasticsearch.common.Numbers;
9+
import org.elasticsearch.common.hash.MurmurHash3;
810
import org.elasticsearch.common.settings.Setting;
911
import org.elasticsearch.common.unit.ByteSizeValue;
1012
import org.elasticsearch.common.unit.TimeValue;
1113

14+
import java.math.BigInteger;
15+
import java.nio.charset.StandardCharsets;
16+
import java.util.Arrays;
17+
import java.util.Objects;
18+
import java.util.stream.Collectors;
19+
1220
public final class MachineLearningField {
1321
public static final Setting<Boolean> AUTODETECT_PROCESS =
1422
Setting.boolSetting("xpack.ml.autodetect_process", true, Setting.Property.NodeScope);
@@ -19,4 +27,13 @@ public final class MachineLearningField {
1927

2028
private MachineLearningField() {}
2129

30+
public static String valuesToId(String... values) {
31+
String combined = Arrays.stream(values).filter(Objects::nonNull).collect(Collectors.joining());
32+
byte[] bytes = combined.getBytes(StandardCharsets.UTF_8);
33+
MurmurHash3.Hash128 hash = MurmurHash3.hash128(bytes, 0, bytes.length, 0, new MurmurHash3.Hash128());
34+
byte[] hashedBytes = new byte[16];
35+
System.arraycopy(Numbers.longToBytes(hash.h1), 0, hashedBytes, 0, 8);
36+
System.arraycopy(Numbers.longToBytes(hash.h2), 0, hashedBytes, 8, 8);
37+
return new BigInteger(hashedBytes) + "_" + combined.length();
38+
}
2239
}

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/results/AnomalyRecord.java

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import org.elasticsearch.common.xcontent.ObjectParser.ValueType;
1616
import org.elasticsearch.common.xcontent.ToXContentObject;
1717
import org.elasticsearch.common.xcontent.XContentBuilder;
18+
import org.elasticsearch.xpack.core.ml.MachineLearningField;
1819
import org.elasticsearch.xpack.core.ml.job.config.Detector;
1920
import org.elasticsearch.xpack.core.ml.job.config.Job;
2021
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
@@ -358,12 +359,13 @@ public String getJobId() {
358359
* Data store ID of this record.
359360
*/
360361
public String getId() {
361-
int valuesHash = Objects.hash(byFieldValue, overFieldValue, partitionFieldValue);
362-
int length = (byFieldValue == null ? 0 : byFieldValue.length()) +
363-
(overFieldValue == null ? 0 : overFieldValue.length()) +
364-
(partitionFieldValue == null ? 0 : partitionFieldValue.length());
362+
return buildId(jobId, timestamp, bucketSpan, detectorIndex, byFieldValue, overFieldValue, partitionFieldValue);
363+
}
365364

366-
return jobId + "_record_" + timestamp.getTime() + "_" + bucketSpan + "_" + detectorIndex + "_" + valuesHash + "_" + length;
365+
static String buildId(String jobId, Date timestamp, long bucketSpan, int detectorIndex,
366+
String byFieldValue, String overFieldValue, String partitionFieldValue) {
367+
return jobId + "_record_" + timestamp.getTime() + "_" + bucketSpan + "_" + detectorIndex + "_"
368+
+ MachineLearningField.valuesToId(byFieldValue, overFieldValue, partitionFieldValue);
367369
}
368370

369371
public int getDetectorIndex() {

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/results/Forecast.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import org.elasticsearch.common.xcontent.ObjectParser.ValueType;
1414
import org.elasticsearch.common.xcontent.ToXContentObject;
1515
import org.elasticsearch.common.xcontent.XContentBuilder;
16+
import org.elasticsearch.xpack.core.ml.MachineLearningField;
1617
import org.elasticsearch.xpack.core.ml.job.config.Job;
1718
import org.elasticsearch.xpack.core.common.time.TimeUtils;
1819

@@ -165,12 +166,9 @@ public String getForecastId() {
165166
}
166167

167168
public String getId() {
168-
int valuesHash = Objects.hash(byFieldValue, partitionFieldValue);
169-
int length = (byFieldValue == null ? 0 : byFieldValue.length()) +
170-
(partitionFieldValue == null ? 0 : partitionFieldValue.length());
171169
return jobId + "_model_forecast_" + forecastId + "_" + timestamp.getTime()
172170
+ "_" + bucketSpan + "_" + detectorIndex + "_"
173-
+ valuesHash + "_" + length;
171+
+ MachineLearningField.valuesToId(byFieldValue, partitionFieldValue);
174172
}
175173

176174
public Date getTimestamp() {

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/results/Influencer.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import org.elasticsearch.common.xcontent.ObjectParser.ValueType;
1414
import org.elasticsearch.common.xcontent.ToXContentObject;
1515
import org.elasticsearch.common.xcontent.XContentBuilder;
16+
import org.elasticsearch.xpack.core.ml.MachineLearningField;
1617
import org.elasticsearch.xpack.core.ml.job.config.Job;
1718
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
1819
import org.elasticsearch.xpack.core.common.time.TimeUtils;
@@ -134,7 +135,7 @@ public String getJobId() {
134135

135136
public String getId() {
136137
return jobId + "_influencer_" + timestamp.getTime() + "_" + bucketSpan + "_" +
137-
influenceField + "_" + influenceValue.hashCode() + "_" + influenceValue.length();
138+
influenceField + "_" + MachineLearningField.valuesToId(influenceValue);
138139
}
139140

140141
public double getProbability() {

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/results/ModelPlot.java

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import org.elasticsearch.common.xcontent.ObjectParser.ValueType;
1515
import org.elasticsearch.common.xcontent.ToXContentObject;
1616
import org.elasticsearch.common.xcontent.XContentBuilder;
17+
import org.elasticsearch.xpack.core.ml.MachineLearningField;
1718
import org.elasticsearch.xpack.core.ml.job.config.Job;
1819
import org.elasticsearch.xpack.core.common.time.TimeUtils;
1920

@@ -205,12 +206,8 @@ public String getJobId() {
205206
}
206207

207208
public String getId() {
208-
int valuesHash = Objects.hash(byFieldValue, overFieldValue, partitionFieldValue);
209-
int length = (byFieldValue == null ? 0 : byFieldValue.length()) +
210-
(overFieldValue == null ? 0 : overFieldValue.length()) +
211-
(partitionFieldValue == null ? 0 : partitionFieldValue.length());
212209
return jobId + "_model_plot_" + timestamp.getTime() + "_" + bucketSpan
213-
+ "_" + detectorIndex + "_" + valuesHash + "_" + length;
210+
+ "_" + detectorIndex + "_" + MachineLearningField.valuesToId(byFieldValue, overFieldValue, partitionFieldValue);
214211
}
215212

216213
public Date getTimestamp() {

x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/job/results/AnomalyRecordTests.java

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,20 @@
1414
import org.elasticsearch.common.xcontent.XContentType;
1515
import org.elasticsearch.common.xcontent.json.JsonXContent;
1616
import org.elasticsearch.test.AbstractSerializingTestCase;
17+
import org.elasticsearch.xpack.core.ml.MachineLearningField;
18+
import org.elasticsearch.xpack.core.ml.utils.MlStrings;
1719

1820
import java.io.IOException;
21+
import java.nio.charset.StandardCharsets;
1922
import java.util.ArrayList;
2023
import java.util.Arrays;
2124
import java.util.Collections;
2225
import java.util.Date;
2326
import java.util.List;
2427
import java.util.Map;
25-
import java.util.Objects;
2628

2729
import static org.hamcrest.Matchers.containsString;
30+
import static org.hamcrest.Matchers.lessThanOrEqualTo;
2831

2932
public class AnomalyRecordTests extends AbstractSerializingTestCase<AnomalyRecord> {
3033

@@ -174,28 +177,23 @@ public void testId() {
174177
String overFieldValue = null;
175178
String partitionFieldValue = null;
176179

177-
int valuesHash = Objects.hash(byFieldValue, overFieldValue, partitionFieldValue);
178-
assertEquals("test-job_record_1000_60_0_" + valuesHash + "_0", record.getId());
180+
assertEquals("test-job_record_1000_60_0_0_0", record.getId());
179181

180-
int length = 0;
181182
if (randomBoolean()) {
182183
byFieldValue = randomAlphaOfLength(10);
183-
length += byFieldValue.length();
184184
record.setByFieldValue(byFieldValue);
185185
}
186186
if (randomBoolean()) {
187187
overFieldValue = randomAlphaOfLength(10);
188-
length += overFieldValue.length();
189188
record.setOverFieldValue(overFieldValue);
190189
}
191190
if (randomBoolean()) {
192191
partitionFieldValue = randomAlphaOfLength(10);
193-
length += partitionFieldValue.length();
194192
record.setPartitionFieldValue(partitionFieldValue);
195193
}
196194

197-
valuesHash = Objects.hash(byFieldValue, overFieldValue, partitionFieldValue);
198-
assertEquals("test-job_record_1000_60_0_" + valuesHash + "_" + length, record.getId());
195+
String valuesPart = MachineLearningField.valuesToId(byFieldValue, overFieldValue, partitionFieldValue);
196+
assertEquals("test-job_record_1000_60_0_" + valuesPart, record.getId());
199197
}
200198

201199
public void testStrictParser_IsLenientOnTopLevelFields() throws IOException {
@@ -222,4 +220,18 @@ public void testLenientParser() throws IOException {
222220
AnomalyRecord.LENIENT_PARSER.apply(parser, null);
223221
}
224222
}
223+
224+
public void testIdLength() {
225+
String jobId = randomAlphaOfLength(MlStrings.ID_LENGTH_LIMIT);
226+
Date timestamp = new Date(Long.MAX_VALUE);
227+
long bucketSpan = Long.MAX_VALUE;
228+
int detectorIndex = Integer.MAX_VALUE;
229+
String byFieldValue = randomAlphaOfLength(randomIntBetween(100, 1000));
230+
String overFieldValue = randomAlphaOfLength(randomIntBetween(100, 1000));
231+
String partitionFieldValue = randomAlphaOfLength(randomIntBetween(100, 1000));
232+
233+
String id = AnomalyRecord.buildId(jobId, timestamp, bucketSpan, detectorIndex, byFieldValue, overFieldValue, partitionFieldValue);
234+
// 512 comes from IndexRequest.validate()
235+
assertThat(id.getBytes(StandardCharsets.UTF_8).length, lessThanOrEqualTo(512));
236+
}
225237
}

x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/job/results/InfluencerTests.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@
1212
import org.elasticsearch.common.xcontent.XContentType;
1313
import org.elasticsearch.common.xcontent.json.JsonXContent;
1414
import org.elasticsearch.test.AbstractSerializingTestCase;
15+
import org.elasticsearch.xpack.core.ml.MachineLearningField;
1516

1617
import java.io.IOException;
1718
import java.util.Date;
18-
import java.util.Objects;
1919

2020
public class InfluencerTests extends AbstractSerializingTestCase<Influencer> {
2121

@@ -64,8 +64,8 @@ public void testToXContentDoesNotIncludeNameValueFieldWhenReservedWord() throws
6464
public void testId() {
6565
String influencerFieldValue = "wopr";
6666
Influencer influencer = new Influencer("job-foo", "host", influencerFieldValue, new Date(1000), 300L);
67-
int valueHash = Objects.hashCode(influencerFieldValue);
68-
assertEquals("job-foo_influencer_1000_300_host_" + valueHash + "_" + influencerFieldValue.length(), influencer.getId());
67+
String valuePart = MachineLearningField.valuesToId(influencerFieldValue);
68+
assertEquals("job-foo_influencer_1000_300_host_" + valuePart, influencer.getId());
6969
}
7070

7171
public void testLenientParser() throws IOException {

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/results/ForecastTests.java

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@
99
import org.elasticsearch.common.xcontent.XContentParser;
1010
import org.elasticsearch.common.xcontent.json.JsonXContent;
1111
import org.elasticsearch.test.AbstractSerializingTestCase;
12+
import org.elasticsearch.xpack.core.ml.MachineLearningField;
1213
import org.elasticsearch.xpack.core.ml.job.results.Forecast;
1314

1415
import java.io.IOException;
1516
import java.util.Date;
16-
import java.util.Objects;
1717

1818
import static org.hamcrest.Matchers.containsString;
1919

@@ -72,23 +72,19 @@ public void testId() {
7272
String byFieldValue = null;
7373
String partitionFieldValue = null;
7474

75-
int valuesHash = Objects.hash(byFieldValue, partitionFieldValue);
76-
assertEquals("job-foo_model_forecast_222_100_60_2_" + valuesHash + "_0", forecast.getId());
75+
assertEquals("job-foo_model_forecast_222_100_60_2_0_0", forecast.getId());
7776

78-
int length = 0;
7977
if (randomBoolean()) {
8078
byFieldValue = randomAlphaOfLength(10);
81-
length += byFieldValue.length();
8279
forecast.setByFieldValue(byFieldValue);
8380
}
8481
if (randomBoolean()) {
8582
partitionFieldValue = randomAlphaOfLength(10);
86-
length += partitionFieldValue.length();
8783
forecast.setPartitionFieldValue(partitionFieldValue);
8884
}
8985

90-
valuesHash = Objects.hash(byFieldValue, partitionFieldValue);
91-
assertEquals("job-foo_model_forecast_222_100_60_2_" + valuesHash + "_" + length, forecast.getId());
86+
String valuesPart = MachineLearningField.valuesToId(byFieldValue, partitionFieldValue);
87+
assertEquals("job-foo_model_forecast_222_100_60_2_" + valuesPart, forecast.getId());
9288
}
9389

9490
public void testStrictParser() throws IOException {

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/results/ModelPlotTests.java

Lines changed: 45 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,15 @@
1212
import org.elasticsearch.common.xcontent.XContentParser;
1313
import org.elasticsearch.common.xcontent.json.JsonXContent;
1414
import org.elasticsearch.test.AbstractSerializingTestCase;
15+
import org.elasticsearch.xpack.core.ml.MachineLearningField;
1516
import org.elasticsearch.xpack.core.ml.job.results.ModelPlot;
1617

1718
import java.io.IOException;
19+
import java.util.ArrayList;
1820
import java.util.Date;
19-
import java.util.Objects;
21+
import java.util.HashMap;
22+
import java.util.List;
23+
import java.util.Map;
2024

2125
import static org.hamcrest.Matchers.containsString;
2226
import static org.hamcrest.Matchers.not;
@@ -221,28 +225,23 @@ public void testId() {
221225
String overFieldValue = null;
222226
String partitionFieldValue = null;
223227

224-
int valuesHash = Objects.hash(byFieldValue, overFieldValue, partitionFieldValue);
225-
assertEquals("job-foo_model_plot_100_60_33_" + valuesHash + "_0", plot.getId());
228+
assertEquals("job-foo_model_plot_100_60_33_0_0", plot.getId());
226229

227-
int length = 0;
228230
if (randomBoolean()) {
229231
byFieldValue = randomAlphaOfLength(10);
230-
length += byFieldValue.length();
231232
plot.setByFieldValue(byFieldValue);
232233
}
233234
if (randomBoolean()) {
234235
overFieldValue = randomAlphaOfLength(10);
235-
length += overFieldValue.length();
236236
plot.setOverFieldValue(overFieldValue);
237237
}
238238
if (randomBoolean()) {
239239
partitionFieldValue = randomAlphaOfLength(10);
240-
length += partitionFieldValue.length();
241240
plot.setPartitionFieldValue(partitionFieldValue);
242241
}
243242

244-
valuesHash = Objects.hash(byFieldValue, overFieldValue, partitionFieldValue);
245-
assertEquals("job-foo_model_plot_100_60_33_" + valuesHash + "_" + length, plot.getId());
243+
String valuesPart = MachineLearningField.valuesToId(byFieldValue, overFieldValue, partitionFieldValue);
244+
assertEquals("job-foo_model_plot_100_60_33_" + valuesPart, plot.getId());
246245
}
247246

248247
public void testStrictParser() throws IOException {
@@ -262,6 +261,43 @@ public void testLenientParser() throws IOException {
262261
}
263262
}
264263

264+
public void testIdUniqueness() {
265+
ModelPlot modelPlot = new ModelPlot("foo", new Date(), 3600, 0);
266+
267+
String[] partitionFieldValues = { "730", "132", "358", "552", "888", "236", "224", "674",
268+
"438", "128", "722", "560", "228", "628", "226", "656" };
269+
String[] byFieldValues = { "S000", "S001", "S002", "S003", "S004", "S005", "S006", "S007", "S008", "S009",
270+
"S010", "S011", "S012", "S013", "S014", "S015", "S016", "S017", "S018", "S019",
271+
"S020", "S021", "S022", "S023", "S024", "S025", "S026", "S027", "S028", "S029",
272+
"S057", "S058", "S059", "M020", "M021", "M026", "M027", "M028", "M029", "M030",
273+
"M031", "M032", "M033", "M056", "M057", "M058", "M059", "M060", "M061", "M062",
274+
"M063", "M086", "M087", "M088", "M089", "M090", "M091", "M092", "M093", "M116",
275+
"M117", "M118", "M119", "L012", "L013", "L014", "L017", "L018", "L019", "L023",
276+
"L024", "L025", "L029", "L030", "L031" };
277+
278+
Map<String, List<String>> uniqueIds = new HashMap<>();
279+
280+
for (String partitionFieldValue : partitionFieldValues) {
281+
modelPlot.setPartitionFieldValue(partitionFieldValue);
282+
for (String byFieldValue : byFieldValues) {
283+
modelPlot.setByFieldValue(byFieldValue);
284+
String id = modelPlot.getId();
285+
uniqueIds.compute(id, (k, v) -> {
286+
if (v == null) {
287+
v = new ArrayList<>();
288+
}
289+
v.add(partitionFieldValue + "/" + byFieldValue);
290+
if (v.size() > 1) {
291+
logger.error("Duplicates for ID [" + id + "]: " + v);
292+
}
293+
return v;
294+
});
295+
}
296+
}
297+
298+
assertEquals(partitionFieldValues.length * byFieldValues.length, uniqueIds.size());
299+
}
300+
265301
private ModelPlot createFullyPopulated() {
266302
ModelPlot modelPlot = new ModelPlot("foo", new Date(12345678L), 360L, 22);
267303
modelPlot.setByFieldName("by");

0 commit comments

Comments
 (0)