Skip to content

Commit 3d8c2f9

Browse files
committed
Fix query analyzer logic for mixed conjunctions of terms and ranges (#49803)
When the query analyzer examines a conjunction containing both terms and ranges, it should only include ranges in the minimum_should_match calculation if there are no other range queries on that same field within the conjunction. This is because we cannot build a selection query over disjoint ranges on the same field, and it is not easy to check if two range queries have an overlap. The current logic to calculate this just sets minimum_should_match to 1 or 0, dependent on whether or not the current range is over a field that has already been seen. However, this can be incorrect in the case that there are terms in the same match group which adjust the minimum_should_match downwards. Instead, the logic should be changed to match the terms extraction, whereby we adjust minimum_should_match downwards if we have already seen a range field. Fixes #49684
1 parent a16abf9 commit 3d8c2f9

File tree

2 files changed

+172
-31
lines changed

2 files changed

+172
-31
lines changed

modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java

Lines changed: 40 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ private static Result handleConjunction(List<Result> conjunctionsWithUnknowns, V
247247
List<Result> conjunctions = conjunctionsWithUnknowns.stream().filter(r -> r.isUnknown() == false).collect(Collectors.toList());
248248
if (conjunctions.isEmpty()) {
249249
if (conjunctionsWithUnknowns.isEmpty()) {
250-
throw new IllegalArgumentException("Must have at least on conjunction sub result");
250+
throw new IllegalArgumentException("Must have at least one conjunction sub result");
251251
}
252252
return conjunctionsWithUnknowns.get(0); // all conjunctions are unknown, so just return the first one
253253
}
@@ -260,64 +260,73 @@ private static Result handleConjunction(List<Result> conjunctionsWithUnknowns, V
260260
return subResult;
261261
}
262262
}
263+
263264
int msm = 0;
264265
boolean verified = conjunctionsWithUnknowns.size() == conjunctions.size();
265266
boolean matchAllDocs = true;
266-
boolean hasDuplicateTerms = false;
267267
Set<QueryExtraction> extractions = new HashSet<>();
268268
Set<String> seenRangeFields = new HashSet<>();
269269
for (Result result : conjunctions) {
270-
// In case that there are duplicate query extractions we need to be careful with
271-
// incrementing msm,
272-
// because that could lead to valid matches not becoming candidate matches:
273-
// query: (field:val1 AND field:val2) AND (field:val2 AND field:val3)
274-
// doc: field: val1 val2 val3
275-
// So lets be protective and decrease the msm:
270+
276271
int resultMsm = result.minimumShouldMatch;
277272
for (QueryExtraction queryExtraction : result.extractions) {
278273
if (queryExtraction.range != null) {
279274
// In case of range queries each extraction does not simply increment the
280-
// minimum_should_match
281-
// for that percolator query like for a term based extraction, so that can lead
282-
// to more false
283-
// positives for percolator queries with range queries than term based queries.
284-
// The is because the way number fields are extracted from the document to be
285-
// percolated.
286-
// Per field a single range is extracted and if a percolator query has two or
287-
// more range queries
288-
// on the same field, then the minimum should match can be higher than clauses
289-
// in the CoveringQuery.
290-
// Therefore right now the minimum should match is incremented once per number
291-
// field when processing
292-
// the percolator query at index time.
293-
if (seenRangeFields.add(queryExtraction.range.fieldName)) {
294-
resultMsm = 1;
295-
} else {
296-
resultMsm = 0;
275+
// minimum_should_match for that percolator query like for a term based extraction,
276+
// so that can lead to more false positives for percolator queries with range queries
277+
// than term based queries.
278+
// This is because the way number fields are extracted from the document to be
279+
// percolated. Per field a single range is extracted and if a percolator query has two or
280+
// more range queries on the same field, then the minimum should match can be higher than clauses
281+
// in the CoveringQuery. Therefore right now the minimum should match is only incremented once per
282+
// number field when processing the percolator query at index time.
283+
// For multiple ranges within a single extraction (ie from an existing conjunction or disjunction)
284+
// then this will already have been taken care of, so we only check against fieldnames from
285+
// previously processed extractions, and don't add to the seenRangeFields list until all
286+
// extractions from this result are processed
287+
if (seenRangeFields.contains(queryExtraction.range.fieldName)) {
288+
resultMsm = Math.max(0, resultMsm - 1);
289+
verified = false;
290+
}
291+
} else {
292+
// In case that there are duplicate term query extractions we need to be careful with
293+
// incrementing msm, because that could lead to valid matches not becoming candidate matches:
294+
// query: (field:val1 AND field:val2) AND (field:val2 AND field:val3)
295+
// doc: field: val1 val2 val3
296+
// So lets be protective and decrease the msm:
297+
if (extractions.contains(queryExtraction)) {
298+
resultMsm = Math.max(0, resultMsm - 1);
297299
verified = false;
298-
break;
299300
}
300-
}
301-
if (extractions.contains(queryExtraction)) {
302-
resultMsm = Math.max(0, resultMsm - 1);
303-
verified = false;
304301
}
305302
}
306303
msm += resultMsm;
304+
305+
// add range fields from this Result to the seenRangeFields set so that minimumShouldMatch is correctly
306+
// calculated for subsequent Results
307+
result.extractions.stream()
308+
.map(e -> e.range)
309+
.filter(Objects::nonNull)
310+
.map(e -> e.fieldName)
311+
.forEach(seenRangeFields::add);
312+
307313
if (result.verified == false
308314
// If some inner extractions are optional, the result can't be verified
309315
|| result.minimumShouldMatch < result.extractions.size()) {
310316
verified = false;
317+
311318
}
312319
matchAllDocs &= result.matchAllDocs;
313320
extractions.addAll(result.extractions);
314321
}
322+
315323
if (matchAllDocs) {
316324
return new Result(matchAllDocs, verified);
317325
} else {
318-
return new Result(verified, extractions, hasDuplicateTerms ? 1 : msm);
326+
return new Result(verified, extractions, msm);
319327
}
320328

329+
321330
} else {
322331
Result bestClause = null;
323332
for (Result result : conjunctions) {

modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@
8282
import static org.hamcrest.Matchers.equalTo;
8383
import static org.hamcrest.Matchers.is;
8484
import static org.hamcrest.Matchers.sameInstance;
85+
import static org.hamcrest.collection.IsCollectionWithSize.hasSize;
8586

8687
public class QueryAnalyzerTests extends ESTestCase {
8788

@@ -1506,4 +1507,135 @@ public void testIntervalQueries() {
15061507
assertTermsEqual(result.extractions, new Term("field", "a"));
15071508
}
15081509

1510+
public void testCombinedRangeAndTermWithMinimumShouldMatch() {
1511+
1512+
Query disj = new BooleanQuery.Builder()
1513+
.add(IntPoint.newRangeQuery("i", 0, 10), Occur.SHOULD)
1514+
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
1515+
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
1516+
.setMinimumNumberShouldMatch(2)
1517+
.build();
1518+
1519+
Result r = analyze(disj, Version.CURRENT);
1520+
assertThat(r.minimumShouldMatch, equalTo(1));
1521+
assertThat(r.extractions, hasSize(2));
1522+
assertFalse(r.matchAllDocs);
1523+
assertFalse(r.verified);
1524+
1525+
Query q = new BooleanQuery.Builder()
1526+
.add(IntPoint.newRangeQuery("i", 0, 10), Occur.SHOULD)
1527+
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
1528+
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
1529+
.add(new TermQuery(new Term("f", "v1")), Occur.FILTER)
1530+
.setMinimumNumberShouldMatch(2)
1531+
.build();
1532+
1533+
Result result = analyze(q, Version.CURRENT);
1534+
assertThat(result.minimumShouldMatch, equalTo(1));
1535+
assertThat(result.extractions.size(), equalTo(2));
1536+
assertFalse(result.verified);
1537+
assertFalse(result.matchAllDocs);
1538+
1539+
q = new BooleanQuery.Builder()
1540+
.add(q, Occur.MUST)
1541+
.add(q, Occur.MUST)
1542+
.build();
1543+
1544+
result = analyze(q, Version.CURRENT);
1545+
assertThat(result.minimumShouldMatch, equalTo(1));
1546+
assertThat(result.extractions.size(), equalTo(2));
1547+
assertFalse(result.verified);
1548+
assertFalse(result.matchAllDocs);
1549+
1550+
Query q2 = new BooleanQuery.Builder()
1551+
.add(new TermQuery(new Term("f", "v1")), Occur.FILTER)
1552+
.add(IntPoint.newRangeQuery("i", 15, 20), Occur.SHOULD)
1553+
.add(new TermQuery(new Term("f", "v2")), Occur.SHOULD)
1554+
.add(new TermQuery(new Term("f", "v2")), Occur.MUST)
1555+
.setMinimumNumberShouldMatch(1)
1556+
.build();
1557+
1558+
result = analyze(q2, Version.CURRENT);
1559+
assertThat(result.minimumShouldMatch, equalTo(2));
1560+
assertThat(result.extractions, hasSize(3));
1561+
assertFalse(result.verified);
1562+
assertFalse(result.matchAllDocs);
1563+
1564+
// multiple range queries on different fields
1565+
Query q3 = new BooleanQuery.Builder()
1566+
.add(IntPoint.newRangeQuery("i", 15, 20), Occur.SHOULD)
1567+
.add(IntPoint.newRangeQuery("i2", 15, 20), Occur.SHOULD)
1568+
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
1569+
.add(new TermQuery(new Term("f", "v2")), Occur.MUST)
1570+
.setMinimumNumberShouldMatch(1)
1571+
.build();
1572+
result = analyze(q3, Version.CURRENT);
1573+
assertThat(result.minimumShouldMatch, equalTo(2));
1574+
assertThat(result.extractions, hasSize(4));
1575+
assertFalse(result.verified);
1576+
assertFalse(result.matchAllDocs);
1577+
1578+
// multiple disjoint range queries on the same field
1579+
Query q4 = new BooleanQuery.Builder()
1580+
.add(IntPoint.newRangeQuery("i", 15, 20), Occur.SHOULD)
1581+
.add(IntPoint.newRangeQuery("i", 25, 30), Occur.SHOULD)
1582+
.add(IntPoint.newRangeQuery("i", 35, 40), Occur.SHOULD)
1583+
.add(new TermQuery(new Term("f", "v1")), Occur.SHOULD)
1584+
.add(new TermQuery(new Term("f", "v2")), Occur.MUST)
1585+
.setMinimumNumberShouldMatch(1)
1586+
.build();
1587+
result = analyze(q4, Version.CURRENT);
1588+
assertThat(result.minimumShouldMatch, equalTo(2));
1589+
assertThat(result.extractions, hasSize(5));
1590+
assertFalse(result.verified);
1591+
assertFalse(result.matchAllDocs);
1592+
1593+
// multiple conjunction range queries on the same field
1594+
Query q5 = new BooleanQuery.Builder()
1595+
.add(new BooleanQuery.Builder()
1596+
.add(IntPoint.newRangeQuery("i", 15, 20), Occur.MUST)
1597+
.add(IntPoint.newRangeQuery("i", 25, 30), Occur.MUST)
1598+
.build(), Occur.MUST)
1599+
.add(IntPoint.newRangeQuery("i", 35, 40), Occur.MUST)
1600+
.add(new TermQuery(new Term("f", "v2")), Occur.MUST)
1601+
.build();
1602+
result = analyze(q5, Version.CURRENT);
1603+
assertThat(result.minimumShouldMatch, equalTo(2));
1604+
assertThat(result.extractions, hasSize(4));
1605+
assertFalse(result.verified);
1606+
assertFalse(result.matchAllDocs);
1607+
1608+
// multiple conjunction range queries on different fields
1609+
Query q6 = new BooleanQuery.Builder()
1610+
.add(new BooleanQuery.Builder()
1611+
.add(IntPoint.newRangeQuery("i", 15, 20), Occur.MUST)
1612+
.add(IntPoint.newRangeQuery("i2", 25, 30), Occur.MUST)
1613+
.build(), Occur.MUST)
1614+
.add(IntPoint.newRangeQuery("i", 35, 40), Occur.MUST)
1615+
.add(new TermQuery(new Term("f", "v2")), Occur.MUST)
1616+
.build();
1617+
result = analyze(q6, Version.CURRENT);
1618+
assertThat(result.minimumShouldMatch, equalTo(3));
1619+
assertThat(result.extractions, hasSize(4));
1620+
assertFalse(result.verified);
1621+
assertFalse(result.matchAllDocs);
1622+
1623+
// mixed term and range conjunctions
1624+
Query q7 = new BooleanQuery.Builder()
1625+
.add(new BooleanQuery.Builder()
1626+
.add(IntPoint.newRangeQuery("i", 1, 2), Occur.MUST)
1627+
.add(new TermQuery(new Term("f", "1")), Occur.MUST)
1628+
.build(), Occur.MUST)
1629+
.add(new BooleanQuery.Builder()
1630+
.add(IntPoint.newRangeQuery("i", 1, 2), Occur.MUST)
1631+
.add(new TermQuery(new Term("f", "2")), Occur.MUST)
1632+
.build(), Occur.MUST)
1633+
.build();
1634+
result = analyze(q7, Version.CURRENT);
1635+
assertThat(result.minimumShouldMatch, equalTo(3));
1636+
assertThat(result.extractions, hasSize(3));
1637+
assertFalse(result.verified);
1638+
assertFalse(result.matchAllDocs);
1639+
}
1640+
15091641
}

0 commit comments

Comments
 (0)