Skip to content

Commit 01123d0

Browse files
committed
Add a limit for graph phrase query expansion (#34031)
Today, query parsers throw a TooManyClauses exception when a query creates too many clauses. However, graph phrase queries do not respect this limit. This change adds protection against the runaway expansions that can happen when building a graph phrase query. It is a temporary copy of the fix proposed in https://issues.apache.org/jira/browse/LUCENE-8479, which has not been merged yet. This logic will be removed when the Lucene patch is integrated in a future release.
1 parent 0e73001 commit 01123d0

File tree

2 files changed

+159
-1
lines changed

2 files changed

+159
-1
lines changed

server/src/main/java/org/elasticsearch/index/search/MatchQuery.java

Lines changed: 81 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
import org.apache.lucene.search.spans.SpanTermQuery;
4545
import org.apache.lucene.util.BytesRef;
4646
import org.apache.lucene.util.QueryBuilder;
47+
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;
4748
import org.elasticsearch.ElasticsearchException;
4849
import org.elasticsearch.common.io.stream.StreamInput;
4950
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -60,11 +61,14 @@
6061
import org.elasticsearch.index.query.support.QueryParsers;
6162

6263
import java.io.IOException;
64+
import java.util.ArrayList;
65+
import java.util.Iterator;
66+
import java.util.List;
6367

6468
import static org.elasticsearch.common.lucene.search.Queries.newLenientFieldQuery;
6569
import static org.elasticsearch.common.lucene.search.Queries.newUnmappedFieldQuery;
6670

67-
public class MatchQuery {
71+
public class MatchQuery {
6872

6973
private static final DeprecationLogger DEPRECATION_LOGGER = new DeprecationLogger(Loggers.getLogger(MappedFieldType.class));
7074

@@ -525,6 +529,82 @@ private Query boolToExtendedCommonTermsQuery(BooleanQuery bq, Occur highFreqOccu
525529
}
526530
return query;
527531
}
532+
533+
/**
534+
* Overrides {@link QueryBuilder#analyzeGraphPhrase(TokenStream, String, int)} to add
535+
* a limit (see {@link BooleanQuery#getMaxClauseCount()}) to the number of {@link SpanQuery}
536+
* that this method can create.
537+
*
538+
* TODO Remove when https://issues.apache.org/jira/browse/LUCENE-8479 is fixed.
*
* The limit is checked in three places: the number of span queries built from the side
* paths of one segment, the number of terms returned for a single state, and the number
* of clauses collected for the final query.
*
* @param source the token stream to analyze; it is reset before being consumed
* @param field the field handed to {@code createSpanQuery} and {@code graph.getTerms}
* @param phraseSlop the slop handed to the resulting {@link SpanNearQuery}
* @return {@code null} when no clause was produced, the clause itself when there is exactly one,
* otherwise a {@link SpanNearQuery} over all collected clauses
* @throws IOException declared by the overridden method
* @throws BooleanQuery.TooManyClauses when one of the checked counts reaches the limit
539+
*/
540+
@Override
541+
protected SpanQuery analyzeGraphPhrase(TokenStream source, String field, int phraseSlop) throws IOException {
542+
source.reset();
543+
GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);
544+
// Collects one span query per segment; combined into the returned query at the end.
List<SpanQuery> clauses = new ArrayList<>();
545+
int[] articulationPoints = graph.articulationPoints();
546+
int lastState = 0;
547+
// Read the limit once so every check below compares against the same value.
int maxBooleanClause = BooleanQuery.getMaxClauseCount();
548+
// Runs articulationPoints.length + 1 times: the extra pass covers the segment after the last point.
for (int i = 0; i <= articulationPoints.length; i++) {
549+
int start = lastState;
550+
// Stays -1 on the final pass, when no articulation point is left to end the segment.
int end = -1;
551+
if (i < articulationPoints.length) {
552+
end = articulationPoints[i];
553+
}
554+
lastState = end;
555+
final SpanQuery queryPos;
556+
// Side path at this state: each finite string between start and end becomes its own span query.
if (graph.hasSidePath(start)) {
557+
List<SpanQuery> queries = new ArrayList<>();
558+
Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
559+
while (it.hasNext()) {
560+
TokenStream ts = it.next();
561+
SpanQuery q = createSpanQuery(ts, field);
562+
// Null results are skipped and do not count towards the limit.
if (q != null) {
563+
// Refuse to add once the list already holds maxBooleanClause entries.
if (queries.size() >= maxBooleanClause) {
564+
throw new BooleanQuery.TooManyClauses();
565+
}
566+
queries.add(q);
567+
}
568+
}
569+
if (queries.size() > 0) {
570+
queryPos = new SpanOrQuery(queries.toArray(new SpanQuery[0]));
571+
} else {
572+
queryPos = null;
573+
}
574+
} else {
575+
// No side path: the terms at this state are used directly.
Term[] terms = graph.getTerms(field, start);
576+
assert terms.length > 0;
577+
// NOTE(review): this rejects exactly maxBooleanClause terms, whereas the two list checks
// in this method accept exactly maxBooleanClause entries. Confirm the difference is intended.
if (terms.length >= maxBooleanClause) {
578+
throw new BooleanQuery.TooManyClauses();
579+
}
580+
// A single term needs no SpanOrQuery wrapper.
if (terms.length == 1) {
581+
queryPos = new SpanTermQuery(terms[0]);
582+
} else {
583+
SpanTermQuery[] orClauses = new SpanTermQuery[terms.length];
584+
for (int idx = 0; idx < terms.length; idx++) {
585+
orClauses[idx] = new SpanTermQuery(terms[idx]);
586+
}
587+
588+
queryPos = new SpanOrQuery(orClauses);
589+
}
590+
}
591+
592+
if (queryPos != null) {
593+
// Same guard as for side paths, applied to the top-level clause list.
if (clauses.size() >= maxBooleanClause) {
594+
throw new BooleanQuery.TooManyClauses();
595+
}
596+
clauses.add(queryPos);
597+
}
598+
}
599+
600+
if (clauses.isEmpty()) {
601+
return null;
602+
} else if (clauses.size() == 1) {
603+
return clauses.get(0);
604+
} else {
605+
// The trailing true is presumably SpanNearQuery's inOrder flag; confirm against the Lucene API.
return new SpanNearQuery(clauses.toArray(new SpanQuery[0]), phraseSlop, true);
606+
}
607+
}
528608
}
529609

530610
/**

server/src/test/java/org/elasticsearch/index/query/MatchQueryBuilderTests.java

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@
1919

2020
package org.elasticsearch.index.query;
2121

22+
import org.apache.lucene.analysis.Analyzer;
23+
import org.apache.lucene.analysis.CannedBinaryTokenStream;
24+
import org.apache.lucene.analysis.MockTokenizer;
25+
import org.apache.lucene.analysis.TokenStream;
26+
import org.apache.lucene.analysis.Tokenizer;
2227
import org.apache.lucene.index.Term;
2328
import org.apache.lucene.queries.ExtendedCommonTermsQuery;
2429
import org.apache.lucene.search.BooleanClause;
@@ -31,6 +36,7 @@
3136
import org.apache.lucene.search.Query;
3237
import org.apache.lucene.search.TermQuery;
3338
import org.elasticsearch.Version;
39+
import org.apache.lucene.util.BytesRef;
3440
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
3541
import org.elasticsearch.common.ParsingException;
3642
import org.elasticsearch.common.Strings;
@@ -40,12 +46,15 @@
4046
import org.elasticsearch.index.mapper.MappedFieldType;
4147
import org.elasticsearch.index.mapper.MapperService;
4248
import org.elasticsearch.index.search.MatchQuery;
49+
import org.elasticsearch.index.search.MatchQuery.Type;
4350
import org.elasticsearch.index.search.MatchQuery.ZeroTermsQuery;
4451
import org.elasticsearch.search.internal.SearchContext;
4552
import org.elasticsearch.test.AbstractQueryTestCase;
4653
import org.hamcrest.Matcher;
4754

4855
import java.io.IOException;
56+
import java.io.Reader;
57+
import java.util.ArrayList;
4958
import java.util.HashMap;
5059
import java.util.List;
5160
import java.util.Locale;
@@ -403,4 +412,73 @@ public void testLenientPhraseQuery() throws Exception {
403412
assertThat(query.toString(),
404413
containsString("field:[string_no_pos] was indexed without position data; cannot run PhraseQuery"));
405414
}
415+
416+
/**
* Checks that parsing a phrase query throws {@link BooleanQuery.TooManyClauses} for two
* canned token graphs: the one from {@code createGiantGraph(40)} and the one from
* {@code createGiantGraphMultiTerms()}.
*/
public void testMaxBooleanClause() {
417+
assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);
418+
MatchQuery query = new MatchQuery(createShardContext());
419+
query.setAnalyzer(new MockGraphAnalyzer(createGiantGraph(40)));
420+
// The query text is empty: MockGraphAnalyzer returns its canned tokens whatever the input.
expectThrows(BooleanQuery.TooManyClauses.class, () -> query.parse(Type.PHRASE, STRING_FIELD_NAME, ""));
421+
// Second case: a graph with many tokens added at its last position.
query.setAnalyzer(new MockGraphAnalyzer(createGiantGraphMultiTerms()));
422+
expectThrows(BooleanQuery.TooManyClauses.class, () -> query.parse(Type.PHRASE, STRING_FIELD_NAME, ""));
423+
}
424+
425+
/**
* Test analyzer that always returns a {@link CannedBinaryTokenStream} over a fixed
* array of tokens, independent of the text it is asked to analyze.
*/
private static class MockGraphAnalyzer extends Analyzer {
426+
// Tokens replayed by every token stream this analyzer hands out.
final CannedBinaryTokenStream.BinaryToken[] tokens;
427+
428+
private MockGraphAnalyzer(CannedBinaryTokenStream.BinaryToken[] tokens ) {
429+
this.tokens = tokens;
430+
}
431+
@Override
432+
protected TokenStreamComponents createComponents(String fieldName) {
433+
// The tokenizer is only passed to the TokenStreamComponents constructor; getTokenStream below does not use it.
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
434+
return new TokenStreamComponents(tokenizer) {
435+
@Override
436+
public TokenStream getTokenStream() {
437+
// A new canned stream over the same tokens on each call.
return new CannedBinaryTokenStream(tokens);
438+
}
439+
440+
@Override
441+
protected void setReader(final Reader reader) {
// Intentionally empty: the reader is ignored because the tokens are canned.
442+
}
443+
};
444+
}
445+
}
446+
447+
/**
448+
* Creates a graph token stream with 2 side paths at each position.
*
* Each loop pass adds a pair of tokens, "bar" with arguments (1, 1) and "foo" with
* arguments (0, 2), and advances the counter by two.
*
* @param numPos exclusive upper bound for the loop counter
* @return the generated tokens as an array
449+
**/
450+
private static CannedBinaryTokenStream.BinaryToken[] createGiantGraph(int numPos) {
451+
List<CannedBinaryTokenStream.BinaryToken> tokens = new ArrayList<>();
452+
BytesRef term1 = new BytesRef("foo");
453+
BytesRef term2 = new BytesRef("bar");
454+
// The counter is advanced inside the loop body, not in the for header.
for (int i = 0; i < numPos;) {
455+
if (i % 2 == 0) {
456+
// The two numeric arguments are presumably position increment and position length; confirm against BinaryToken.
tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
457+
tokens.add(new CannedBinaryTokenStream.BinaryToken(term1, 0, 2));
458+
i += 2;
459+
} else {
// NOTE(review): unreachable as written. i starts at 0 and only ever grows by 2, so it is always even.
460+
tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
461+
i++;
462+
}
463+
}
464+
return tokens.toArray(new CannedBinaryTokenStream.BinaryToken[0]);
465+
}
466+
467+
/**
468+
* Creates a graph token stream with {@link BooleanQuery#getMaxClauseCount()}
469+
* expansions at the last position.
*
* The stream starts with four fixed tokens, followed by
* {@link BooleanQuery#getMaxClauseCount()} additional "foo" tokens with arguments (0, 1).
*
* @return the generated tokens as an array
470+
**/
471+
private static CannedBinaryTokenStream.BinaryToken[] createGiantGraphMultiTerms() {
472+
List<CannedBinaryTokenStream.BinaryToken> tokens = new ArrayList<>();
473+
BytesRef term1 = new BytesRef("foo");
474+
BytesRef term2 = new BytesRef("bar");
475+
tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
476+
tokens.add(new CannedBinaryTokenStream.BinaryToken(term1, 0, 2));
477+
tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
478+
tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
479+
// Each added token has 0 as its second argument (presumably the position increment; confirm),
// which would stack them all on the position of the preceding "bar" token.
for (int i = 0; i < BooleanQuery.getMaxClauseCount(); i++) {
480+
tokens.add(new CannedBinaryTokenStream.BinaryToken(term1, 0, 1));
481+
}
482+
return tokens.toArray(new CannedBinaryTokenStream.BinaryToken[0]);
483+
}
406484
}

0 commit comments

Comments
 (0)