Skip to content

Commit cedd78c

Browse files
authored
[ML-DataFrame] Add support for (date) histogram pivots (#38725)
* [FEATURE][DATA_FRAME] Adding (date) histogram group_by support for pivot * adjusting format for merge * Update DataFramePivotRestIT.java
1 parent cd7292c commit cedd78c

File tree

8 files changed

+469
-7
lines changed

8 files changed

+469
-7
lines changed

x-pack/plugin/data-frame/qa/single-node-tests/src/test/java/org/elasticsearch/xpack/dataframe/integration/DataFramePivotRestIT.java

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,44 @@ public void testSimplePivotWithQuery() throws Exception {
7575
assertOnePivotValue(dataFrameIndex + "/_search?q=reviewer:user_26", 3.918918918);
7676
}
7777

78+
public void testHistogramPivot() throws Exception {
79+
String transformId = "simpleHistogramPivot";
80+
String dataFrameIndex = "pivot_reviews_via_histogram";
81+
82+
final Request createDataframeTransformRequest = new Request("PUT", DATAFRAME_ENDPOINT + transformId);
83+
84+
String config = "{"
85+
+ " \"source\": \"reviews\","
86+
+ " \"dest\": \"" + dataFrameIndex + "\",";
87+
88+
89+
config += " \"pivot\": {"
90+
+ " \"group_by\": [ {"
91+
+ " \"every_2\": {"
92+
+ " \"histogram\": {"
93+
+ " \"interval\": 2,\"field\":\"stars\""
94+
+ " } } } ],"
95+
+ " \"aggregations\": {"
96+
+ " \"avg_rating\": {"
97+
+ " \"avg\": {"
98+
+ " \"field\": \"stars\""
99+
+ " } } } }"
100+
+ "}";
101+
102+
103+
createDataframeTransformRequest.setJsonEntity(config);
104+
Map<String, Object> createDataframeTransformResponse = entityAsMap(client().performRequest(createDataframeTransformRequest));
105+
assertThat(createDataframeTransformResponse.get("acknowledged"), equalTo(Boolean.TRUE));
106+
assertTrue(indexExists(dataFrameIndex));
107+
108+
startAndWaitForTransform(transformId, dataFrameIndex);
109+
110+
// we expect 3 documents as there shall be 5 unique star values and we are bucketing every 2 starting at 0
111+
Map<String, Object> indexStats = getAsMap(dataFrameIndex + "/_stats");
112+
assertEquals(3, XContentMapValues.extractValue("_all.total.docs.count", indexStats));
113+
assertOnePivotValue(dataFrameIndex + "/_search?q=every_2:0.0", 1.0);
114+
}
115+
78116
public void testBiggerPivot() throws Exception {
79117
String transformId = "biggerPivot";
80118
String dataFrameIndex = "bigger_pivot_reviews";
@@ -149,6 +187,43 @@ public void testBiggerPivot() throws Exception {
149187
assertEquals(41, actual.longValue());
150188
}
151189

190+
public void testDateHistogramPivot() throws Exception {
191+
String transformId = "simpleDateHistogramPivot";
192+
String dataFrameIndex = "pivot_reviews_via_date_histogram";
193+
194+
final Request createDataframeTransformRequest = new Request("PUT", DATAFRAME_ENDPOINT + transformId);
195+
196+
String config = "{"
197+
+ " \"source\": \"reviews\","
198+
+ " \"dest\": \"" + dataFrameIndex + "\",";
199+
200+
201+
config += " \"pivot\": {"
202+
+ " \"group_by\": [ {"
203+
+ " \"by_day\": {"
204+
+ " \"date_histogram\": {"
205+
+ " \"interval\": \"1d\",\"field\":\"timestamp\",\"format\":\"yyyy-MM-DD\""
206+
+ " } } } ],"
207+
+ " \"aggregations\": {"
208+
+ " \"avg_rating\": {"
209+
+ " \"avg\": {"
210+
+ " \"field\": \"stars\""
211+
+ " } } } }"
212+
+ "}";
213+
214+
createDataframeTransformRequest.setJsonEntity(config);
215+
Map<String, Object> createDataframeTransformResponse = entityAsMap(client().performRequest(createDataframeTransformRequest));
216+
assertThat(createDataframeTransformResponse.get("acknowledged"), equalTo(Boolean.TRUE));
217+
assertTrue(indexExists(dataFrameIndex));
218+
219+
startAndWaitForTransform(transformId, dataFrameIndex);
220+
221+
// we expect 21 documents as there shall be 21 days worth of docs
222+
Map<String, Object> indexStats = getAsMap(dataFrameIndex + "/_stats");
223+
assertEquals(21, XContentMapValues.extractValue("_all.total.docs.count", indexStats));
224+
assertOnePivotValue(dataFrameIndex + "/_search?q=by_day:2017-01-15", 3.82);
225+
}
226+
152227
private void startAndWaitForTransform(String transformId, String dataFrameIndex) throws IOException, Exception {
153228
// start the transform
154229
final Request startTransformRequest = new Request("POST", DATAFRAME_ENDPOINT + transformId + "/_start");
@@ -160,8 +235,6 @@ private void startAndWaitForTransform(String transformId, String dataFrameIndex)
160235
refreshIndex(dataFrameIndex);
161236
}
162237

163-
164-
165238
private void waitForDataFrameGeneration(String transformId) throws Exception {
166239
assertBusy(() -> {
167240
long generation = getDataFrameGeneration(transformId);

x-pack/plugin/data-frame/qa/single-node-tests/src/test/java/org/elasticsearch/xpack/dataframe/integration/DataFrameRestTestCase.java

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ protected void createReviewsIndex() throws IOException {
4545
{
4646
builder.startObject("mappings")
4747
.startObject("properties")
48+
.startObject("timestamp")
49+
.field("type", "date")
50+
.endObject()
4851
.startObject("user_id")
4952
.field("type", "keyword")
5053
.endObject()
@@ -66,11 +69,17 @@ protected void createReviewsIndex() throws IOException {
6669

6770
// create index
6871
final StringBuilder bulk = new StringBuilder();
72+
int day = 10;
6973
for (int i = 0; i < numDocs; i++) {
7074
bulk.append("{\"index\":{\"_index\":\"reviews\"}}\n");
7175
long user = Math.round(Math.pow(i * 31 % 1000, distributionTable[i % distributionTable.length]) % 27);
7276
int stars = distributionTable[(i * 33) % distributionTable.length];
7377
long business = Math.round(Math.pow(user * stars, distributionTable[i % distributionTable.length]) % 13);
78+
int hour = randomIntBetween(10, 20);
79+
int min = randomIntBetween(30, 59);
80+
int sec = randomIntBetween(30, 59);
81+
82+
String date_string = "2017-01-" + day + "T" + hour + ":" + min + ":" + sec + "Z";
7483
bulk.append("{\"user_id\":\"")
7584
.append("user_")
7685
.append(user)
@@ -79,7 +88,9 @@ protected void createReviewsIndex() throws IOException {
7988
.append(business)
8089
.append("\",\"stars\":")
8190
.append(stars)
82-
.append("}\n");
91+
.append(",\"timestamp\":\"")
92+
.append(date_string)
93+
.append("\"}\n");
8394

8495
if (i % 50 == 0) {
8596
bulk.append("\r\n");
@@ -89,6 +100,7 @@ protected void createReviewsIndex() throws IOException {
89100
client().performRequest(bulkRequest);
90101
// clear the builder
91102
bulk.setLength(0);
103+
day += 1;
92104
}
93105
}
94106
bulk.append("\r\n");
@@ -209,4 +221,4 @@ protected static void wipeIndices() throws IOException {
209221
}
210222
}
211223
}
212-
}
224+
}
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License;
 * you may not use this file except in compliance with the Elastic License.
 */
package org.elasticsearch.xpack.dataframe.transforms.pivot;

import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.ObjectParser;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval;

import java.io.IOException;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.util.Objects;

/**
 * A data frame pivot {@code group_by} source that buckets a date field with a
 * date_histogram. Mirrors the date_histogram aggregation's options: the
 * {@code interval} is either a plain numeric value or a calendar expression
 * ({@link DateHistogramInterval}), plus optional {@code time_zone} and
 * {@code format}. Parsed leniently or strictly depending on the caller.
 */
public class DateHistogramGroupSource extends SingleGroupSource<DateHistogramGroupSource> {

    private static final String NAME = "data_frame_date_histogram_group";
    private static final ParseField TIME_ZONE = new ParseField("time_zone");
    private static final ParseField FORMAT = new ParseField("format");

    private static final ConstructingObjectParser<DateHistogramGroupSource, Void> STRICT_PARSER = createParser(false);
    private static final ConstructingObjectParser<DateHistogramGroupSource, Void> LENIENT_PARSER = createParser(true);
    // Numeric interval; 0 means "not set" (a calendar dateHistogramInterval is used instead).
    private long interval = 0;
    // Calendar-style interval (e.g. "1d"); null when a numeric interval is used.
    private DateHistogramInterval dateHistogramInterval;
    // Optional key format for the buckets; null means the default format.
    private String format;
    // Optional time zone for bucketing; null means the default.
    private ZoneId timeZone;

    public DateHistogramGroupSource(String field) {
        super(field);
    }

    /**
     * Wire deserialization. Read order must stay in sync with {@link #writeTo}.
     */
    public DateHistogramGroupSource(StreamInput in) throws IOException {
        super(in);
        this.interval = in.readLong();
        this.dateHistogramInterval = in.readOptionalWriteable(DateHistogramInterval::new);
        this.timeZone = in.readOptionalZoneId();
        this.format = in.readOptionalString();
    }

    private static ConstructingObjectParser<DateHistogramGroupSource, Void> createParser(boolean lenient) {
        ConstructingObjectParser<DateHistogramGroupSource, Void> parser = new ConstructingObjectParser<>(NAME, lenient, (args) -> {
            String field = (String) args[0];
            return new DateHistogramGroupSource(field);
        });

        SingleGroupSource.declareValuesSourceFields(parser, null);

        // "interval" is polymorphic: a JSON number sets the numeric interval,
        // a string (e.g. "1d") sets the calendar interval.
        parser.declareField((histogram, interval) -> {
            if (interval instanceof Long) {
                histogram.setInterval((long) interval);
            } else {
                histogram.setDateHistogramInterval((DateHistogramInterval) interval);
            }
        }, p -> {
            if (p.currentToken() == XContentParser.Token.VALUE_NUMBER) {
                return p.longValue();
            } else {
                return new DateHistogramInterval(p.text());
            }
        }, HistogramGroupSource.INTERVAL, ObjectParser.ValueType.LONG);

        // "time_zone" accepts either a zone id string or a numeric hour offset.
        // ValueType.LONG permits both number and string tokens in ObjectParser.
        parser.declareField(DateHistogramGroupSource::setTimeZone, p -> {
            if (p.currentToken() == XContentParser.Token.VALUE_STRING) {
                return ZoneId.of(p.text());
            } else {
                return ZoneOffset.ofHours(p.intValue());
            }
        }, TIME_ZONE, ObjectParser.ValueType.LONG);

        parser.declareString(DateHistogramGroupSource::setFormat, FORMAT);
        return parser;
    }

    public static DateHistogramGroupSource fromXContent(final XContentParser parser, boolean lenient) throws IOException {
        return lenient ? LENIENT_PARSER.apply(parser, null) : STRICT_PARSER.apply(parser, null);
    }

    public long getInterval() {
        return interval;
    }

    /**
     * @throws IllegalArgumentException if {@code interval} is less than 1
     */
    public void setInterval(long interval) {
        if (interval < 1) {
            throw new IllegalArgumentException("[interval] must be greater than or equal to 1.");
        }
        this.interval = interval;
    }

    public DateHistogramInterval getDateHistogramInterval() {
        return dateHistogramInterval;
    }

    /**
     * @throws IllegalArgumentException if {@code dateHistogramInterval} is null
     */
    public void setDateHistogramInterval(DateHistogramInterval dateHistogramInterval) {
        if (dateHistogramInterval == null) {
            throw new IllegalArgumentException("[dateHistogramInterval] must not be null");
        }
        this.dateHistogramInterval = dateHistogramInterval;
    }

    public String getFormat() {
        return format;
    }

    public void setFormat(String format) {
        this.format = format;
    }

    public ZoneId getTimeZone() {
        return timeZone;
    }

    public void setTimeZone(ZoneId timeZone) {
        this.timeZone = timeZone;
    }

    @Override
    public void writeTo(StreamOutput out) throws IOException {
        // NOTE(review): writes `field` directly instead of delegating to
        // super.writeTo — assumes SingleGroupSource(StreamInput) reads exactly
        // one optional string; TODO confirm against the superclass.
        // Write order must stay in sync with the StreamInput constructor.
        out.writeOptionalString(field);
        out.writeLong(interval);
        out.writeOptionalWriteable(dateHistogramInterval);
        out.writeOptionalZoneId(timeZone);
        out.writeOptionalString(format);
    }

    @Override
    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
        builder.startObject();
        if (field != null) {
            builder.field(FIELD.getPreferredName(), field);
        }
        // Emit whichever interval flavor is set under the single "interval" key.
        if (dateHistogramInterval == null) {
            builder.field(HistogramGroupSource.INTERVAL.getPreferredName(), interval);
        } else {
            builder.field(HistogramGroupSource.INTERVAL.getPreferredName(), dateHistogramInterval.toString());
        }
        if (timeZone != null) {
            builder.field(TIME_ZONE.getPreferredName(), timeZone.toString());
        }
        if (format != null) {
            builder.field(FORMAT.getPreferredName(), format);
        }
        builder.endObject();
        return builder;
    }

    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }

        if (other == null || getClass() != other.getClass()) {
            return false;
        }

        final DateHistogramGroupSource that = (DateHistogramGroupSource) other;

        return Objects.equals(this.field, that.field) &&
            Objects.equals(interval, that.interval) &&
            Objects.equals(dateHistogramInterval, that.dateHistogramInterval) &&
            Objects.equals(timeZone, that.timeZone) &&
            Objects.equals(format, that.format);
    }

    @Override
    public int hashCode() {
        return Objects.hash(field, interval, dateHistogramInterval, timeZone, format);
    }
}

x-pack/plugin/data-frame/src/main/java/org/elasticsearch/xpack/dataframe/transforms/pivot/GroupConfig.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,12 @@ public GroupConfig(StreamInput in) throws IOException {
4343
case TERMS:
4444
groupSource = in.readOptionalWriteable(TermsGroupSource::new);
4545
break;
46+
case HISTOGRAM:
47+
groupSource = in.readOptionalWriteable(HistogramGroupSource::new);
48+
break;
49+
case DATE_HISTOGRAM:
50+
groupSource = in.readOptionalWriteable(DateHistogramGroupSource::new);
51+
break;
4652
default:
4753
throw new IOException("Unknown group type");
4854
}
@@ -126,6 +132,12 @@ public static GroupConfig fromXContent(final XContentParser parser, boolean leni
126132
case TERMS:
127133
groupSource = TermsGroupSource.fromXContent(parser, lenient);
128134
break;
135+
case HISTOGRAM:
136+
groupSource = HistogramGroupSource.fromXContent(parser, lenient);
137+
break;
138+
case DATE_HISTOGRAM:
139+
groupSource = DateHistogramGroupSource.fromXContent(parser, lenient);
140+
break;
129141
default:
130142
throw new ParsingException(parser.getTokenLocation(), "invalid grouping type: " + groupType);
131143
}

0 commit comments

Comments
 (0)