Add a limit for graph phrase query expansion (#34031)

jimczi · jimczi · commit 01123d085b82 · 2018-09-25T22:05:54.000+02:00
Today query parsers throw TooManyClauses exception when a query creates too many clauses. However graph phrase queries do not respect this limit. This change adds a protection against crazy expansions that can happen when building a graph phrase query. This is a temporary copy of the fix available in https://issues.apache.org/jira/browse/LUCENE-8479 but not merged yet. This logic will be removed when we integrate the Lucene patch in a future release.
diff --git a/server/src/main/java/org/elasticsearch/index/search/MatchQuery.java b/server/src/main/java/org/elasticsearch/index/search/MatchQuery.java
@@ -44,6 +44,7 @@
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.QueryBuilder;
+import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
@@ -60,11 +61,14 @@
 import org.elasticsearch.index.query.support.QueryParsers;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
 
 import static org.elasticsearch.common.lucene.search.Queries.newLenientFieldQuery;
 import static org.elasticsearch.common.lucene.search.Queries.newUnmappedFieldQuery;
 
-public class    MatchQuery {
+public class MatchQuery {
 
     private static final DeprecationLogger DEPRECATION_LOGGER = new DeprecationLogger(Loggers.getLogger(MappedFieldType.class));
 
@@ -525,6 +529,82 @@ private Query boolToExtendedCommonTermsQuery(BooleanQuery bq, Occur highFreqOccu
             }
             return query;
         }
+
+        /**
+         * Overrides {@link QueryBuilder#analyzeGraphPhrase(TokenStream, String, int)} to add
+         * a limit (see {@link BooleanQuery#getMaxClauseCount()}) to the number of {@link SpanQuery}
+         * that this method can create.
+         *
+         * TODO Remove when https://issues.apache.org/jira/browse/LUCENE-8479 is fixed.
+         */
+        @Override
+        protected SpanQuery analyzeGraphPhrase(TokenStream source, String field, int phraseSlop) throws IOException {
+            source.reset();
+            GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);
+            List<SpanQuery> clauses = new ArrayList<>();
+            int[] articulationPoints = graph.articulationPoints();
+            int lastState = 0;
+            int maxBooleanClause = BooleanQuery.getMaxClauseCount();
+            for (int i = 0; i <= articulationPoints.length; i++) {
+                int start = lastState;
+                int end = -1;
+                if (i < articulationPoints.length) {
+                    end = articulationPoints[i];
+                }
+                lastState = end;
+                final SpanQuery queryPos;
+                if (graph.hasSidePath(start)) {
+                    List<SpanQuery> queries = new ArrayList<>();
+                    Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
+                    while (it.hasNext()) {
+                        TokenStream ts = it.next();
+                        SpanQuery q = createSpanQuery(ts, field);
+                        if (q != null) {
+                            if (queries.size() >= maxBooleanClause) {
+                                throw new BooleanQuery.TooManyClauses();
+                            }
+                            queries.add(q);
+                        }
+                    }
+                    if (queries.size() > 0) {
+                        queryPos = new SpanOrQuery(queries.toArray(new SpanQuery[0]));
+                    } else {
+                        queryPos = null;
+                    }
+                } else {
+                    Term[] terms = graph.getTerms(field, start);
+                    assert terms.length > 0;
+                    if (terms.length >= maxBooleanClause) {
+                        throw new BooleanQuery.TooManyClauses();
+                    }
+                    if (terms.length == 1) {
+                        queryPos = new SpanTermQuery(terms[0]);
+                    } else {
+                        SpanTermQuery[] orClauses = new SpanTermQuery[terms.length];
+                        for (int idx = 0; idx < terms.length; idx++) {
+                            orClauses[idx] = new SpanTermQuery(terms[idx]);
+                        }
+
+                        queryPos = new SpanOrQuery(orClauses);
+                    }
+                }
+
+                if (queryPos != null) {
+                    if (clauses.size() >= maxBooleanClause) {
+                        throw new BooleanQuery.TooManyClauses();
+                    }
+                    clauses.add(queryPos);
+                }
+            }
+
+            if (clauses.isEmpty()) {
+                return null;
+            } else if (clauses.size() == 1) {
+                return clauses.get(0);
+            } else {
+                return new SpanNearQuery(clauses.toArray(new SpanQuery[0]), phraseSlop, true);
+            }
+        }
     }
 
     /**
diff --git a/server/src/test/java/org/elasticsearch/index/query/MatchQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/MatchQueryBuilderTests.java
@@ -19,6 +19,11 @@
 
 package org.elasticsearch.index.query;
 
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CannedBinaryTokenStream;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queries.ExtendedCommonTermsQuery;
 import org.apache.lucene.search.BooleanClause;
@@ -31,6 +36,7 @@
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
 import org.elasticsearch.Version;
+import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
 import org.elasticsearch.common.ParsingException;
 import org.elasticsearch.common.Strings;
@@ -40,12 +46,15 @@
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.MapperService;
 import org.elasticsearch.index.search.MatchQuery;
+import org.elasticsearch.index.search.MatchQuery.Type;
 import org.elasticsearch.index.search.MatchQuery.ZeroTermsQuery;
 import org.elasticsearch.search.internal.SearchContext;
 import org.elasticsearch.test.AbstractQueryTestCase;
 import org.hamcrest.Matcher;
 
 import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
@@ -403,4 +412,73 @@ public void testLenientPhraseQuery() throws Exception {
         assertThat(query.toString(),
             containsString("field:[string_no_pos] was indexed without position data; cannot run PhraseQuery"));
     }
+
+    public void testMaxBooleanClause() {
+        assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);
+        MatchQuery query = new MatchQuery(createShardContext());
+        query.setAnalyzer(new MockGraphAnalyzer(createGiantGraph(40)));
+        expectThrows(BooleanQuery.TooManyClauses.class, () -> query.parse(Type.PHRASE, STRING_FIELD_NAME, ""));
+        query.setAnalyzer(new MockGraphAnalyzer(createGiantGraphMultiTerms()));
+        expectThrows(BooleanQuery.TooManyClauses.class, () -> query.parse(Type.PHRASE, STRING_FIELD_NAME, ""));
+    }
+
+    private static class MockGraphAnalyzer extends Analyzer {
+        final CannedBinaryTokenStream.BinaryToken[] tokens;
+
+        private MockGraphAnalyzer(CannedBinaryTokenStream.BinaryToken[] tokens ) {
+            this.tokens = tokens;
+        }
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
+            return new TokenStreamComponents(tokenizer) {
+                @Override
+                public TokenStream getTokenStream() {
+                    return new CannedBinaryTokenStream(tokens);
+                }
+
+                @Override
+                protected void setReader(final Reader reader) {
+                }
+            };
+        }
+    }
+
+    /**
+     * Creates a graph token stream with 2 side paths at each position.
+     **/
+    private static CannedBinaryTokenStream.BinaryToken[] createGiantGraph(int numPos) {
+        List<CannedBinaryTokenStream.BinaryToken> tokens = new ArrayList<>();
+        BytesRef term1 = new BytesRef("foo");
+        BytesRef term2 = new BytesRef("bar");
+        for (int i = 0; i < numPos;) {
+            if (i % 2 == 0) {
+                tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
+                tokens.add(new CannedBinaryTokenStream.BinaryToken(term1, 0, 2));
+                i += 2;
+            } else {
+                tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
+                i++;
+            }
+        }
+        return tokens.toArray(new CannedBinaryTokenStream.BinaryToken[0]);
+    }
+
+    /**
+     * Creates a graph token stream with {@link BooleanQuery#getMaxClauseCount()}
+     * expansions at the last position.
+     **/
+    private static CannedBinaryTokenStream.BinaryToken[] createGiantGraphMultiTerms() {
+        List<CannedBinaryTokenStream.BinaryToken> tokens = new ArrayList<>();
+        BytesRef term1 = new BytesRef("foo");
+        BytesRef term2 = new BytesRef("bar");
+        tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
+        tokens.add(new CannedBinaryTokenStream.BinaryToken(term1, 0, 2));
+        tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
+        tokens.add(new CannedBinaryTokenStream.BinaryToken(term2, 1, 1));
+        for (int i = 0; i < BooleanQuery.getMaxClauseCount(); i++) {
+            tokens.add(new CannedBinaryTokenStream.BinaryToken(term1, 0, 1));
+        }
+        return tokens.toArray(new CannedBinaryTokenStream.BinaryToken[0]);
+    }
 }