Skip to content

Commit 14b2d2d

Browse files
author
Hendrik Muhs
authored
[ML] frequent items filter (#91137)
Add a filter to the frequent items agg that filters documents from the analysis while still calculating support on the full set. A filter is specified top-level in frequent_items: "frequent_items": { "filter": { "term": { "host.name.keyword": "i-12345" } }, ... The above filters out documents that don't match, but still counts those docs when calculating support. That's in contrast to specifying a query at the top, in which case you find the same item sets, but don't know their importance given the full document set.
1 parent 01f77da commit 14b2d2d

File tree

13 files changed

+249
-34
lines changed

13 files changed

+249
-34
lines changed

docs/changelog/91137.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 91137
2+
summary: Add a filter parameter to frequent items
3+
area: Machine Learning
4+
type: enhancement
5+
issues: []

docs/reference/aggregations/bucket/frequent-items-aggregation.asciidoc

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ A `frequent_items` aggregation looks like this in isolation:
5151
|`minimum_set_size` | (integer) The <<frequent-items-minimum-set-size,minimum size>> of one item set. | Optional | `1`
5252
|`minimum_support` | (double) The <<frequent-items-minimum-support,minimum support>> of one item set. | Optional | `0.1`
5353
|`size` | (integer) The number of top item sets to return. | Optional | `10`
54+
|`filter` | (object) Query that filters documents from the analysis. | Optional | `match_all`
5455
|===
5556

5657

@@ -102,6 +103,18 @@ parameter has a significant effect on the required memory and the runtime of the
102103
aggregation.
103104

104105

106+
[discrete]
107+
[[frequent-items-filter]]
108+
==== Filter
109+
110+
A query to filter documents to use as part of the analysis. Documents that
111+
don't match the filter are ignored when generating the item sets, but they still
112+
count when calculating the support of an item set.
113+
114+
Use the filter if you want to narrow the item set analysis to fields of interest.
115+
Use a top-level query to filter the data set.
116+
117+
105118
[discrete]
106119
[[frequent-items-example]]
107120
==== Examples
@@ -123,7 +136,7 @@ example.
123136

124137
[source,console]
125138
-------------------------------------------------
126-
POST /kibana_sample_data_ecommerce /_async_search
139+
POST /kibana_sample_data_ecommerce/_async_search
127140
{
128141
"size": 0,
129142
"aggs": {
@@ -224,7 +237,45 @@ from New York. Finally, the item set with the third highest support is
224237

225238

226239
[discrete]
227-
==== Analizing numeric values by using a runtime field
240+
==== Aggregation with two analyzed fields and a filter
241+
242+
We take the first example, but want to narrow the item sets to places in Europe.
243+
For that we add a filter:
244+
245+
[source,console]
246+
-------------------------------------------------
247+
POST /kibana_sample_data_ecommerce/_async_search
248+
{
249+
"size": 0,
250+
"aggs": {
251+
"my_agg": {
252+
"frequent_items": {
253+
"minimum_set_size": 3,
254+
"fields": [
255+
{ "field": "category.keyword" },
256+
{ "field": "geoip.city_name" }
257+
],
258+
"size": 3,
259+
"filter": {
260+
"term": {
261+
"geoip.continent_name": "Europe"
262+
}
263+
}
264+
}
265+
}
266+
}
267+
}
268+
-------------------------------------------------
269+
// TEST[skip:setup kibana sample data]
270+
271+
The result will only show item sets that were created from documents matching the
272+
filter, namely purchases in Europe. Using `filter`, the calculated `support` still
273+
takes all purchases into account. That's different from specifying a query at the
274+
top-level, in which case `support` gets calculated only from purchases in Europe.
275+
276+
277+
[discrete]
278+
==== Analyzing numeric values by using a runtime field
228279

229280
The frequent items aggregation enables you to bucket numeric values by using
230281
<<runtime,runtime fields>>. The next example demonstrates how to use a script to

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/frequentitemsets/EclatMapReducer.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,12 @@ public HashBasedTransactionStore map(Stream<Tuple<Field, List<Object>>> keyValue
188188
return transactionStore;
189189
}
190190

191+
@Override
192+
public HashBasedTransactionStore mapFiltered(HashBasedTransactionStore transactionStore) {
193+
transactionStore.addFilteredTransaction();
194+
return transactionStore;
195+
}
196+
191197
@Override
192198
protected ImmutableTransactionStore mapFinalize(HashBasedTransactionStore transactionStore) {
193199

@@ -197,6 +203,7 @@ protected ImmutableTransactionStore mapFinalize(HashBasedTransactionStore transa
197203
profilingInfoMap.put("ram_bytes_transactionstore_after_map", transactionStore.ramBytesUsed());
198204
profilingInfoMap.put("total_items_after_map", transactionStore.getTotalItemCount());
199205
profilingInfoMap.put("total_transactions_after_map", transactionStore.getTotalTransactionCount());
206+
profilingInfoMap.put("filtered_transactions_after_map", transactionStore.getFilteredTransactionCount());
200207
profilingInfoMap.put("unique_items_after_map", transactionStore.getUniqueItemsCount());
201208
profilingInfoMap.put("unique_transactions_after_map", transactionStore.getUniqueTransactionCount());
202209
}
@@ -283,6 +290,7 @@ public EclatResult reduceFinalize(HashBasedTransactionStore transactionStore, Li
283290
profilingInfoReduce.put("ram_bytes_transactionstore_after_reduce", transactionStore.ramBytesUsed());
284291
profilingInfoReduce.put("total_items_after_reduce", transactionStore.getTotalItemCount());
285292
profilingInfoReduce.put("total_transactions_after_reduce", transactionStore.getTotalTransactionCount());
293+
profilingInfoReduce.put("filtered_transactions_after_reduce", transactionStore.getFilteredTransactionCount());
286294
profilingInfoReduce.put("unique_items_after_reduce", transactionStore.getUniqueItemsCount());
287295
profilingInfoReduce.put("unique_transactions_after_reduce", transactionStore.getUniqueTransactionCount());
288296
}
@@ -293,6 +301,7 @@ public EclatResult reduceFinalize(HashBasedTransactionStore transactionStore, Li
293301
profilingInfoReduce.put("ram_bytes_transactionstore_after_prune", transactionStore.ramBytesUsed());
294302
profilingInfoReduce.put("total_items_after_prune", transactionStore.getTotalItemCount());
295303
profilingInfoReduce.put("total_transactions_after_prune", transactionStore.getTotalTransactionCount());
304+
profilingInfoReduce.put("filtered_transactions_after_prune", transactionStore.getFilteredTransactionCount());
296305
profilingInfoReduce.put("unique_items_after_prune", transactionStore.getUniqueItemsCount());
297306
profilingInfoReduce.put("unique_transactions_after_prune", transactionStore.getUniqueTransactionCount());
298307
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/frequentitemsets/FrequentItemSetsAggregationBuilder.java

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import org.elasticsearch.Version;
1111
import org.elasticsearch.common.io.stream.StreamInput;
1212
import org.elasticsearch.common.io.stream.StreamOutput;
13+
import org.elasticsearch.index.query.AbstractQueryBuilder;
14+
import org.elasticsearch.index.query.QueryBuilder;
1315
import org.elasticsearch.search.aggregations.AbstractAggregationBuilder;
1416
import org.elasticsearch.search.aggregations.Aggregation;
1517
import org.elasticsearch.search.aggregations.AggregationBuilder;
@@ -21,6 +23,7 @@
2123
import org.elasticsearch.search.aggregations.support.ValuesSourceRegistry;
2224
import org.elasticsearch.xcontent.ConstructingObjectParser;
2325
import org.elasticsearch.xcontent.ContextParser;
26+
import org.elasticsearch.xcontent.ObjectParser;
2427
import org.elasticsearch.xcontent.ParseField;
2528
import org.elasticsearch.xcontent.XContentBuilder;
2629
import org.elasticsearch.xpack.ml.aggs.frequentitemsets.mr.ItemSetMapReduceValueSource;
@@ -50,22 +53,29 @@ public final class FrequentItemSetsAggregationBuilder extends AbstractAggregatio
5053
double minimumSupport = args[1] == null ? DEFAULT_MINIMUM_SUPPORT : (double) args[1];
5154
int minimumSetSize = args[2] == null ? DEFAULT_MINIMUM_SET_SIZE : (int) args[2];
5255
int size = args[3] == null ? DEFAULT_SIZE : (int) args[3];
56+
QueryBuilder filter = (QueryBuilder) args[4];
5357

54-
return new FrequentItemSetsAggregationBuilder(context, fields, minimumSupport, minimumSetSize, size);
58+
return new FrequentItemSetsAggregationBuilder(context, fields, minimumSupport, minimumSetSize, size, filter);
5559
}
5660
);
5761

5862
static {
59-
ContextParser<Void, MultiValuesSourceFieldConfig.Builder> metricParser = MultiValuesSourceFieldConfig.parserBuilder(
63+
ContextParser<Void, MultiValuesSourceFieldConfig.Builder> fieldsParser = MultiValuesSourceFieldConfig.parserBuilder(
6064
false, // scriptable
6165
false, // timezone aware
62-
false, // filtered
66+
false, // filtered (not defined per field, but for all fields below)
6367
false // format
6468
);
65-
PARSER.declareObjectArray(ConstructingObjectParser.constructorArg(), (p, n) -> metricParser.parse(p, null).build(), FIELDS);
69+
PARSER.declareObjectArray(ConstructingObjectParser.constructorArg(), (p, n) -> fieldsParser.parse(p, null).build(), FIELDS);
6670
PARSER.declareDouble(ConstructingObjectParser.optionalConstructorArg(), MINIMUM_SUPPORT);
6771
PARSER.declareInt(ConstructingObjectParser.optionalConstructorArg(), MINIMUM_SET_SIZE);
6872
PARSER.declareInt(ConstructingObjectParser.optionalConstructorArg(), Aggregation.CommonFields.SIZE);
73+
PARSER.declareField(
74+
ConstructingObjectParser.optionalConstructorArg(),
75+
(p, context) -> AbstractQueryBuilder.parseTopLevelQuery(p),
76+
MultiValuesSourceFieldConfig.FILTER,
77+
ObjectParser.ValueType.OBJECT
78+
);
6979
}
7080

7181
static final ValuesSourceRegistry.RegistryKey<ItemSetMapReduceValueSource.ValueSourceSupplier> REGISTRY_KEY =
@@ -92,13 +102,15 @@ public static void registerAggregators(ValuesSourceRegistry.Builder registry) {
92102
private final double minimumSupport;
93103
private final int minimumSetSize;
94104
private final int size;
105+
private final QueryBuilder filter;
95106

96107
public FrequentItemSetsAggregationBuilder(
97108
String name,
98109
List<MultiValuesSourceFieldConfig> fields,
99110
double minimumSupport,
100111
int minimumSetSize,
101-
int size
112+
int size,
113+
QueryBuilder filter
102114
) {
103115
super(name);
104116
this.fields = fields;
@@ -118,6 +130,7 @@ public FrequentItemSetsAggregationBuilder(
118130
throw new IllegalArgumentException("[size] must be greater than 0. Found [" + size + "] in [" + name + "]");
119131
}
120132
this.size = size;
133+
this.filter = filter;
121134
}
122135

123136
public FrequentItemSetsAggregationBuilder(StreamInput in) throws IOException {
@@ -126,6 +139,11 @@ public FrequentItemSetsAggregationBuilder(StreamInput in) throws IOException {
126139
this.minimumSupport = in.readDouble();
127140
this.minimumSetSize = in.readVInt();
128141
this.size = in.readVInt();
142+
if (in.getVersion().onOrAfter(Version.V_8_6_0)) {
143+
this.filter = in.readOptionalNamedWriteable(QueryBuilder.class);
144+
} else {
145+
this.filter = null;
146+
}
129147
}
130148

131149
@Override
@@ -135,7 +153,7 @@ public boolean supportsSampling() {
135153

136154
@Override
137155
protected AggregationBuilder shallowCopy(Builder factoriesBuilder, Map<String, Object> metadata) {
138-
return new FrequentItemSetsAggregationBuilder(name, fields, minimumSupport, minimumSetSize, size);
156+
return new FrequentItemSetsAggregationBuilder(name, fields, minimumSupport, minimumSetSize, size, filter);
139157
}
140158

141159
@Override
@@ -149,6 +167,9 @@ protected void doWriteTo(StreamOutput out) throws IOException {
149167
out.writeDouble(minimumSupport);
150168
out.writeVInt(minimumSetSize);
151169
out.writeVInt(size);
170+
if (out.getVersion().onOrAfter(Version.V_8_6_0)) {
171+
out.writeOptionalNamedWriteable(filter);
172+
}
152173
}
153174

154175
@Override
@@ -164,7 +185,8 @@ protected AggregatorFactory doBuild(AggregationContext context, AggregatorFactor
164185
fields,
165186
minimumSupport,
166187
minimumSetSize,
167-
size
188+
size,
189+
filter
168190
);
169191
}
170192

@@ -179,6 +201,9 @@ protected XContentBuilder internalXContent(XContentBuilder builder, Params param
179201
builder.field(MINIMUM_SUPPORT.getPreferredName(), minimumSupport);
180202
builder.field(MINIMUM_SET_SIZE.getPreferredName(), minimumSetSize);
181203
builder.field(Aggregation.CommonFields.SIZE.getPreferredName(), size);
204+
if (filter != null) {
205+
builder.field(MultiValuesSourceFieldConfig.FILTER.getPreferredName(), filter);
206+
}
182207
builder.endObject();
183208
return builder;
184209
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/frequentitemsets/FrequentItemSetsAggregatorFactory.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
package org.elasticsearch.xpack.ml.aggs.frequentitemsets;
99

1010
import org.elasticsearch.common.io.stream.Writeable;
11+
import org.elasticsearch.index.query.QueryBuilder;
1112
import org.elasticsearch.search.SearchService;
1213
import org.elasticsearch.search.aggregations.AggregationExecutionException;
1314
import org.elasticsearch.search.aggregations.Aggregator;
@@ -59,6 +60,7 @@ public class FrequentItemSetsAggregatorFactory extends AggregatorFactory {
5960
private final double minimumSupport;
6061
private final int minimumSetSize;
6162
private final int size;
63+
private final QueryBuilder filter;
6264

6365
public FrequentItemSetsAggregatorFactory(
6466
String name,
@@ -69,13 +71,15 @@ public FrequentItemSetsAggregatorFactory(
6971
List<MultiValuesSourceFieldConfig> fields,
7072
double minimumSupport,
7173
int minimumSetSize,
72-
int size
74+
int size,
75+
QueryBuilder filter
7376
) throws IOException {
7477
super(name, context, parent, subFactoriesBuilder, metadata);
7578
this.fields = fields;
7679
this.minimumSupport = minimumSupport;
7780
this.minimumSetSize = minimumSetSize;
7881
this.size = size;
82+
this.filter = filter;
7983
}
8084

8185
@Override
@@ -109,7 +113,8 @@ protected Aggregator createInternal(Aggregator parent, CardinalityUpperBound car
109113
parent,
110114
metadata,
111115
new EclatMapReducer(FrequentItemSetsAggregationBuilder.NAME, minimumSupport, minimumSetSize, size, context.profiling()),
112-
configs
116+
configs,
117+
filter
113118
) {
114119
};
115120
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/frequentitemsets/HashBasedTransactionStore.java

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ public final class HashBasedTransactionStore extends TransactionStore {
9696
private BytesRefHash transactions;
9797
private LongArray transactionCounts;
9898
private long totalTransactionCount;
99+
private long filteredTransactionCount;
99100

100101
public HashBasedTransactionStore(BigArrays bigArrays) {
101102
super(bigArrays);
@@ -209,6 +210,14 @@ public void add(Stream<Tuple<Field, List<Object>>> keyValues) {
209210
transactionCounts.increment(id, 1);
210211
}
211212

213+
/**
214+
* Report a filtered transaction to the store.
215+
*/
216+
public void addFilteredTransaction() {
217+
++filteredTransactionCount;
218+
++totalTransactionCount;
219+
}
220+
212221
@Override
213222
public long getTotalItemCount() {
214223
return totalItemCount;
@@ -219,6 +228,11 @@ public long getTotalTransactionCount() {
219228
return totalTransactionCount;
220229
}
221230

231+
@Override
232+
public long getFilteredTransactionCount() {
233+
return filteredTransactionCount;
234+
}
235+
222236
@Override
223237
public BytesRefArray getItems() {
224238
return items.getBytesRefs();
@@ -292,6 +306,7 @@ public void merge(TransactionStore other) throws IOException {
292306

293307
totalItemCount += other.getTotalItemCount();
294308
totalTransactionCount += other.getTotalTransactionCount();
309+
filteredTransactionCount += other.getFilteredTransactionCount();
295310
}
296311

297312
/**
@@ -445,7 +460,8 @@ public ImmutableTransactionStore createImmutableTransactionStore() {
445460
totalItemCount,
446461
transactions.takeBytesRefsOwnership(),
447462
transactionCounts,
448-
totalTransactionCount
463+
totalTransactionCount,
464+
filteredTransactionCount
449465
);
450466

451467
items = null;

0 commit comments

Comments
 (0)