From 1ac3e7f4ba0720e4ebfb83760c185834c3a6c2eb Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Wed, 28 Mar 2018 16:25:37 +0200 Subject: [PATCH] Fix some query extraction bugs. While playing with the percolator I found two bugs: - Sometimes we set a min_should_match that is greater than the number of extractions. While this doesn't cause direct trouble, it does when the query is nested into a boolean query and the boolean query tries to compute the min_should_match for the entire query based on its own min_should_match and those of the sub queries. So I changed the code to throw an exception when min_should_match is greater than the number of extractions. - Boolean queries claim matches are verified when in fact they shouldn't. This is due to the fact that boolean queries assume that they are verified if all sub clauses are verified but things are more complex than that, eg. conjunctions that are nested in a disjunction or disjunctions that are nested in a conjunction can generally not be verified without running the query. --- .../percolator/QueryAnalyzer.java | 48 ++++-- .../percolator/CandidateQueryTests.java | 9 +- .../percolator/QueryAnalyzerTests.java | 146 ++++++++++++++++-- 3 files changed, 170 insertions(+), 33 deletions(-) diff --git a/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java b/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java index 24b210c29d584..8f1bb2a9310d3 100644 --- a/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java +++ b/modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java @@ -143,7 +143,7 @@ static Result analyze(Query query, Version indexVersion) { } private static BiFunction matchNoDocsQuery() { - return (query, version) -> new Result(true, Collections.emptySet(), 1); + return (query, version) -> new Result(true, Collections.emptySet(), 0); } private static BiFunction matchAllDocsQuery() { @@ -179,28 +179,28 @@ private static BiFunction termInSetQuery() { for (BytesRef term = iterator.next(); term != null; term = iterator.next()) { terms.add(new QueryExtraction(new Term(iterator.field(), term))); } - return new Result(true, terms, 1); + return new Result(true, terms, Math.min(1, terms.size())); }; } private static BiFunction synonymQuery() { return (query, version) -> { Set terms = ((SynonymQuery) query).getTerms().stream().map(QueryExtraction::new).collect(toSet()); - return new Result(true, terms, 1); + return new Result(true, terms, Math.min(1, terms.size())); }; } private static BiFunction commonTermsQuery() { return (query, version) -> { Set terms = ((CommonTermsQuery) query).getTerms().stream().map(QueryExtraction::new).collect(toSet()); - return new Result(false, terms, 1); + return new Result(false, terms, Math.min(1, terms.size())); }; } private static BiFunction blendedTermQuery() { return (query, version) -> { Set terms = ((BlendedTermQuery) query).getTerms().stream().map(QueryExtraction::new).collect(toSet()); - return new Result(true, terms, 1); + return new Result(true, terms, Math.min(1, terms.size())); }; } @@ -208,7 +208,7 @@ private static BiFunction phraseQuery() { return (query, version) -> { Term[] terms = ((PhraseQuery) query).getTerms(); if (terms.length == 0) { - return new Result(true, Collections.emptySet(), 1); + return new Result(true, Collections.emptySet(), 0); } if (version.onOrAfter(Version.V_6_1_0)) { @@ -232,7 +232,7 @@ private static BiFunction multiPhraseQuery() { return (query, version) -> { Term[][] terms = ((MultiPhraseQuery) query).getTermArrays(); if (terms.length == 0) { - return new Result(true, Collections.emptySet(), 1); + return new Result(true, Collections.emptySet(), 0); } if (version.onOrAfter(Version.V_6_1_0)) { @@ -297,7 +297,7 @@ private static BiFunction spanOrQuery() { for (SpanQuery clause : spanOrQuery.getClauses()) { terms.addAll(analyze(clause, version).extractions); } - return new Result(false, terms, 1); + return new Result(false, terms, Math.min(1, terms.size())); }; } @@ -334,6 +334,9 @@ private static BiFunction booleanQuery() { numOptionalClauses++; } } + if (minimumShouldMatch > numOptionalClauses) { + return new Result(false, Collections.emptySet(), 0); + } if (numRequiredClauses > 0) { if (version.onOrAfter(Version.V_6_1_0)) { UnsupportedQueryException uqe = null; @@ -345,7 +348,12 @@ private static BiFunction booleanQuery() { // since they are completely optional. try { - results.add(analyze(clause.getQuery(), version)); + Result subResult = analyze(clause.getQuery(), version); + if (subResult.matchAllDocs == false && subResult.extractions.isEmpty()) { + // doesn't match anything + return subResult; + } + results.add(subResult); } catch (UnsupportedQueryException e) { uqe = e; } @@ -400,7 +408,11 @@ private static BiFunction booleanQuery() { } msm += resultMsm; - verified &= result.verified; + if (result.verified == false + // If some inner extractions are optional, the result can't be verified + || result.minimumShouldMatch < result.extractions.size()) { + verified = false; + } matchAllDocs &= result.matchAllDocs; extractions.addAll(result.extractions); } @@ -492,7 +504,7 @@ private static BiFunction pointRangeQuery() { // Need to check whether upper is not smaller than lower, otherwise NumericUtils.subtract(...) fails IAE // If upper is really smaller than lower then we deal with like MatchNoDocsQuery. (verified and no extractions) if (new BytesRef(lowerPoint).compareTo(new BytesRef(upperPoint)) > 0) { - return new Result(true, Collections.emptySet(), 1); + return new Result(true, Collections.emptySet(), 0); } byte[] interval = new byte[16]; @@ -537,7 +549,15 @@ private static Result handleDisjunction(List disjunctions, int requiredSh for (int i = 0; i < disjunctions.size(); i++) { Query disjunct = disjunctions.get(i); Result subResult = analyze(disjunct, version); - verified &= subResult.verified; + if (subResult.verified == false + // one of the sub queries requires more than one term to match, we can't + // verify it with a single top-level min_should_match + || subResult.minimumShouldMatch > 1 + // One of the inner clauses has multiple extractions, we won't be able to + // verify it with a single top-level min_should_match + || (subResult.extractions.size() > 1 && requiredShouldClauses > 1)) { + verified = false; + } if (subResult.matchAllDocs) { numMatchAllClauses++; } @@ -683,6 +703,10 @@ static class Result { final boolean matchAllDocs; Result(boolean verified, Set extractions, int minimumShouldMatch) { + if (minimumShouldMatch > extractions.size()) { + throw new IllegalArgumentException("minimumShouldMatch can't be greater than the number of extractions: " + + minimumShouldMatch + " > " + extractions.size()); + } this.extractions = extractions; this.verified = verified; this.minimumShouldMatch = minimumShouldMatch; diff --git a/modules/percolator/src/test/java/org/elasticsearch/percolator/CandidateQueryTests.java b/modules/percolator/src/test/java/org/elasticsearch/percolator/CandidateQueryTests.java index 59f4e091140ea..27d72b2926749 100644 --- a/modules/percolator/src/test/java/org/elasticsearch/percolator/CandidateQueryTests.java +++ b/modules/percolator/src/test/java/org/elasticsearch/percolator/CandidateQueryTests.java @@ -210,12 +210,13 @@ public void testDuel() throws Exception { new BytesRef(randomFrom(stringContent.get(field1))))); queryFunctions.add(() -> new TermInSetQuery(field2, new BytesRef(randomFrom(stringContent.get(field1))), new BytesRef(randomFrom(stringContent.get(field1))))); - int numRandomBoolQueries = randomIntBetween(16, 32); + // many iterations with boolean queries, which are the most complex queries to deal with when nested + int numRandomBoolQueries = 1000; for (int i = 0; i < numRandomBoolQueries; i++) { queryFunctions.add(() -> createRandomBooleanQuery(1, stringFields, stringContent, intFieldType, intValues)); } queryFunctions.add(() -> { - int numClauses = randomIntBetween(1, 16); + int numClauses = randomIntBetween(1, 1 << randomIntBetween(2, 4)); List clauses = new ArrayList<>(); for (int i = 0; i < numClauses; i++) { String field = randomFrom(stringFields); @@ -266,7 +267,7 @@ public void testDuel() throws Exception { private BooleanQuery createRandomBooleanQuery(int depth, List fields, Map> content, MappedFieldType intFieldType, List intValues) { BooleanQuery.Builder builder = new BooleanQuery.Builder(); - int numClauses = randomIntBetween(1, 16); + int numClauses = randomIntBetween(1, 1 << randomIntBetween(2, 4)); // use low numbers of clauses more often int numShouldClauses = 0; boolean onlyShouldClauses = rarely(); for (int i = 0; i < numClauses; i++) { @@ -313,7 +314,7 @@ private BooleanQuery createRandomBooleanQuery(int depth, List fields, Ma numShouldClauses++; } } - builder.setMinimumNumberShouldMatch(numShouldClauses); + builder.setMinimumNumberShouldMatch(randomIntBetween(0, numShouldClauses)); return builder.build(); } diff --git a/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java b/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java index 5968f8c3f8327..7bcdcd2e1f695 100644 --- a/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java +++ b/modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java @@ -44,6 +44,7 @@ import org.apache.lucene.search.TermInSetQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; +import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.join.QueryBitSetProducer; import org.apache.lucene.search.join.ScoreMode; import org.apache.lucene.search.spans.SpanFirstQuery; @@ -227,23 +228,87 @@ public void testExtractQueryMetadata_booleanQuery_pre6dot1() { public void testExtractQueryMetadata_booleanQuery_msm() { BooleanQuery.Builder builder = new BooleanQuery.Builder(); builder.setMinimumNumberShouldMatch(2); - TermQuery termQuery1 = new TermQuery(new Term("_field", "_term1")); + Term term1 = new Term("_field", "_term1"); + TermQuery termQuery1 = new TermQuery(term1); builder.add(termQuery1, BooleanClause.Occur.SHOULD); - TermQuery termQuery2 = new TermQuery(new Term("_field", "_term2")); + Term term2 = new Term("_field", "_term2"); + TermQuery termQuery2 = new TermQuery(term2); builder.add(termQuery2, BooleanClause.Occur.SHOULD); - TermQuery termQuery3 = new TermQuery(new Term("_field", "_term3")); + Term term3 = new Term("_field", "_term3"); + TermQuery termQuery3 = new TermQuery(term3); builder.add(termQuery3, BooleanClause.Occur.SHOULD); BooleanQuery booleanQuery = builder.build(); Result result = analyze(booleanQuery, Version.CURRENT); assertThat(result.verified, is(true)); assertThat(result.minimumShouldMatch, equalTo(2)); - List extractions = new ArrayList<>(result.extractions); - extractions.sort(Comparator.comparing(extraction -> extraction.term)); - assertThat(extractions.size(), equalTo(3)); - assertThat(extractions.get(0).term, equalTo(new Term("_field", "_term1"))); - assertThat(extractions.get(1).term, equalTo(new Term("_field", "_term2"))); - assertThat(extractions.get(2).term, equalTo(new Term("_field", "_term3"))); + assertTermsEqual(result.extractions, term1, term2, term3); + + builder = new BooleanQuery.Builder() + .add(new BooleanQuery.Builder() + .add(termQuery1, Occur.SHOULD) + .add(termQuery2, Occur.SHOULD) + .build(), Occur.SHOULD) + .add(termQuery3, Occur.SHOULD) + .setMinimumNumberShouldMatch(2); + booleanQuery = builder.build(); + result = analyze(booleanQuery, Version.CURRENT); + assertThat(result.verified, is(false)); + assertThat(result.minimumShouldMatch, equalTo(2)); + assertTermsEqual(result.extractions, term1, term2, term3); + + Term term4 = new Term("_field", "_term4"); + TermQuery termQuery4 = new TermQuery(term4); + builder = new BooleanQuery.Builder() + .add(new BooleanQuery.Builder() + .add(termQuery1, Occur.MUST) + .add(termQuery2, Occur.FILTER) + .build(), Occur.SHOULD) + .add(new BooleanQuery.Builder() + .add(termQuery3, Occur.MUST) + .add(termQuery4, Occur.FILTER) + .build(), Occur.SHOULD); + booleanQuery = builder.build(); + result = analyze(booleanQuery, Version.CURRENT); + assertThat(result.verified, is(false)); + assertThat(result.minimumShouldMatch, equalTo(2)); + assertTermsEqual(result.extractions, term1, term2, term3, term4); + + Term term5 = new Term("_field", "_term5"); + TermQuery termQuery5 = new TermQuery(term5); + builder.add(termQuery5, Occur.SHOULD); + booleanQuery = builder.build(); + result = analyze(booleanQuery, Version.CURRENT); + assertThat(result.verified, is(false)); + assertThat(result.minimumShouldMatch, equalTo(1)); + assertTermsEqual(result.extractions, term1, term2, term3, term4, term5); + + builder.setMinimumNumberShouldMatch(2); + booleanQuery = builder.build(); + result = analyze(booleanQuery, Version.CURRENT); + assertThat(result.verified, is(false)); + assertThat(result.minimumShouldMatch, equalTo(3)); + assertTermsEqual(result.extractions, term1, term2, term3, term4, term5); + + builder.setMinimumNumberShouldMatch(3); + booleanQuery = builder.build(); + result = analyze(booleanQuery, Version.CURRENT); + assertThat(result.verified, is(false)); + assertThat(result.minimumShouldMatch, equalTo(5)); + assertTermsEqual(result.extractions, term1, term2, term3, term4, term5); + + builder = new BooleanQuery.Builder() + .add(new BooleanQuery.Builder() + .add(termQuery1, Occur.SHOULD) + .add(termQuery2, Occur.SHOULD) + .build(), Occur.SHOULD) + .add(new BooleanQuery.Builder().setMinimumNumberShouldMatch(1).build(), Occur.SHOULD) + .setMinimumNumberShouldMatch(2); + booleanQuery = builder.build(); + result = analyze(booleanQuery, Version.CURRENT); + // ideally it would return no extractions, but the fact + // that it doesn't consider them verified is probably good enough + assertFalse(result.verified); } public void testExtractQueryMetadata_booleanQuery_msm_pre6dot1() { @@ -353,7 +418,7 @@ public void testExactMatch_booleanQuery() { assertThat(result.minimumShouldMatch, equalTo(1)); builder = new BooleanQuery.Builder(); - builder.setMinimumNumberShouldMatch(randomIntBetween(2, 32)); + builder.setMinimumNumberShouldMatch(randomIntBetween(1, 2)); builder.add(termQuery1, BooleanClause.Occur.SHOULD); builder.add(termQuery2, BooleanClause.Occur.SHOULD); result = analyze(builder.build(), Version.CURRENT); @@ -379,6 +444,54 @@ public void testExactMatch_booleanQuery() { result = analyze(builder.build(), Version.CURRENT); assertThat("Prohibited clause, so candidate matches are not verified", result.verified, is(false)); assertThat(result.minimumShouldMatch, equalTo(1)); + + builder = new BooleanQuery.Builder(); + builder.add(termQuery1, randomBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.FILTER); + builder.add(termQuery2, BooleanClause.Occur.MUST_NOT); + result = analyze(builder.build(), Version.CURRENT); + assertThat("Prohibited clause, so candidate matches are not verified", result.verified, is(false)); + assertThat(result.minimumShouldMatch, equalTo(1)); + + TermQuery termQuery3 = new TermQuery(new Term("_field", "_term3")); + builder = new BooleanQuery.Builder() + .add(new BooleanQuery.Builder() + .add(termQuery1, Occur.FILTER) + .add(termQuery2, Occur.FILTER) + .build(), Occur.SHOULD) + .add(termQuery3, Occur.SHOULD); + result = analyze(builder.build(), Version.CURRENT); + assertThat("Inner clause that is not a pure disjunction, so candidate matches are not verified", result.verified, is(false)); + assertThat(result.minimumShouldMatch, equalTo(1)); + + builder = new BooleanQuery.Builder() + .add(new BooleanQuery.Builder() + .add(termQuery1, Occur.SHOULD) + .add(termQuery2, Occur.SHOULD) + .build(), Occur.SHOULD) + .add(termQuery3, Occur.SHOULD); + result = analyze(builder.build(), Version.CURRENT); + assertThat("Inner clause that is a pure disjunction, so candidate matches are verified", result.verified, is(true)); + assertThat(result.minimumShouldMatch, equalTo(1)); + + builder = new BooleanQuery.Builder() + .add(new BooleanQuery.Builder() + .add(termQuery1, Occur.SHOULD) + .add(termQuery2, Occur.SHOULD) + .build(), Occur.MUST) + .add(termQuery3, Occur.FILTER); + result = analyze(builder.build(), Version.CURRENT); + assertThat("Disjunctions of conjunctions can't be verified", result.verified, is(false)); + assertThat(result.minimumShouldMatch, equalTo(2)); + + builder = new BooleanQuery.Builder() + .add(new BooleanQuery.Builder() + .add(termQuery1, Occur.MUST) + .add(termQuery2, Occur.FILTER) + .build(), Occur.SHOULD) + .add(termQuery3, Occur.SHOULD); + result = analyze(builder.build(), Version.CURRENT); + assertThat("Conjunctions of disjunctions can't be verified", result.verified, is(false)); + assertThat(result.minimumShouldMatch, equalTo(1)); } public void testBooleanQueryWithMustAndShouldClauses() { @@ -564,16 +677,15 @@ public void testExtractQueryMetadata_matchNoDocsQuery() { Result result = analyze(new MatchNoDocsQuery("sometimes there is no reason at all"), Version.CURRENT); assertThat(result.verified, is(true)); assertEquals(0, result.extractions.size()); - assertThat(result.minimumShouldMatch, equalTo(1)); + assertThat(result.minimumShouldMatch, equalTo(0)); BooleanQuery.Builder bq = new BooleanQuery.Builder(); bq.add(new TermQuery(new Term("field", "value")), BooleanClause.Occur.MUST); bq.add(new MatchNoDocsQuery("sometimes there is no reason at all"), BooleanClause.Occur.MUST); result = analyze(bq.build(), Version.CURRENT); assertThat(result.verified, is(true)); - assertEquals(1, result.extractions.size()); - assertThat(result.minimumShouldMatch, equalTo(2)); - assertTermsEqual(result.extractions, new Term("field", "value")); + assertEquals(0, result.extractions.size()); + assertThat(result.minimumShouldMatch, equalTo(0)); bq = new BooleanQuery.Builder(); bq.add(new TermQuery(new Term("field", "value")), BooleanClause.Occur.SHOULD); @@ -785,7 +897,7 @@ public void testSynonymQuery() { SynonymQuery query = new SynonymQuery(); Result result = analyze(query, Version.CURRENT); assertThat(result.verified, is(true)); - assertThat(result.minimumShouldMatch, equalTo(1)); + assertThat(result.minimumShouldMatch, equalTo(0)); assertThat(result.extractions.isEmpty(), is(true)); query = new SynonymQuery(new Term("_field", "_value1"), new Term("_field", "_value2")); @@ -997,7 +1109,7 @@ public void testPointRangeQuery_lowerUpperReversed() { Query query = IntPoint.newRangeQuery("_field", 20, 10); Result result = analyze(query, Version.CURRENT); assertTrue(result.verified); - assertThat(result.minimumShouldMatch, equalTo(1)); + assertThat(result.minimumShouldMatch, equalTo(0)); assertThat(result.extractions.size(), equalTo(0)); } @@ -1179,7 +1291,7 @@ public void testExtractQueryMetadata_duplicatedClauses() { BooleanClause.Occur.SHOULD ); result = analyze(builder.build(), Version.CURRENT); - assertThat(result.verified, is(true)); + assertThat(result.verified, is(false)); assertThat(result.matchAllDocs, is(false)); assertThat(result.minimumShouldMatch, equalTo(2)); assertTermsEqual(result.extractions, new Term("field", "value1"), new Term("field", "value2"),