From 0c0446ba8cdcb8b87912f3d8c4c2a27b3a63e773 Mon Sep 17 00:00:00 2001 From: markharwood Date: Fri, 31 Jul 2020 15:01:19 +0100 Subject: [PATCH 1/2] LUCENE-9445 Add support for case insensitive regex searches in QueryParser using the standard /.../i regex syntax --- .../classic/MultiFieldQueryParser.java | 6 +- .../lucene/queryparser/classic/QueryParser.jj | 2 +- .../queryparser/classic/QueryParserBase.java | 15 +- .../classic/QueryParserTokenManager.java | 197 +++++++++--------- .../builders/RegexpQueryNodeBuilder.java | 5 +- .../standard/nodes/RegexpQueryNode.java | 13 +- .../standard/parser/StandardSyntaxParser.java | 40 ++-- .../standard/parser/StandardSyntaxParser.jj | 8 +- .../StandardSyntaxParserTokenManager.java | 56 ++--- .../queryparser/util/QueryParserTestBase.java | 10 + 10 files changed, 200 insertions(+), 152 deletions(-) diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java index 3ee9c6ced0c4..2866646c9322 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/MultiFieldQueryParser.java @@ -261,16 +261,16 @@ protected Query getRangeQuery(String field, String part1, String part2, boolean @Override - protected Query getRegexpQuery(String field, String termStr) + protected Query getRegexpQuery(String field, String termStr, boolean caseSensitive) throws ParseException { if (field == null) { List clauses = new ArrayList<>(); for (int i = 0; i < fields.length; i++) { - clauses.add(getRegexpQuery(fields[i], termStr)); + clauses.add(getRegexpQuery(fields[i], termStr, caseSensitive)); } return getMultiFieldQuery(clauses); } - return super.getRegexpQuery(field, termStr); + return super.getRegexpQuery(field, termStr, caseSensitive); } /** Creates a multifield query */ diff --git 
a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj index fdc0cd0ed5eb..8a149720df65 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParser.jj @@ -209,7 +209,7 @@ PARSER_END(QueryParser) | )+ (( "." (<_NUM_CHAR>)+ )? (<_TERM_CHAR>)*) | (<_TERM_CHAR>)*) > | (<_TERM_CHAR>)* "*" ) > | | [ "*", "?" ]) (<_TERM_CHAR> | ( [ "*", "?" ] ))* > -| +| | : Range | : Range } diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java index a1f8fa582ada..7b08839ad8db 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java @@ -572,10 +572,12 @@ protected Query newPrefixQuery(Term prefix){ /** * Builds a new RegexpQuery instance * @param regexp Regexp term + * @param caseSensitive if the term matching should be case sensitive * @return new RegexpQuery instance */ - protected Query newRegexpQuery(Term regexp) { - RegexpQuery query = new RegexpQuery(regexp, RegExp.ALL, + protected Query newRegexpQuery(Term regexp, boolean caseSensitive) { + int matchFlags = caseSensitive ? 0 : RegExp.ASCII_CASE_INSENSITIVE; + RegexpQuery query = new RegexpQuery(regexp, RegExp.ALL, matchFlags, maxDeterminizedStates); query.setRewriteMethod(multiTermRewriteMethod); return query; @@ -746,18 +748,19 @@ private BytesRef analyzeWildcard(String field, String termStr) { * * @param field Name of the field query will use. 
* @param termStr Term token that contains a regular expression + * @param caseSensitive if token matching should be case sensitive * * @return Resulting {@link org.apache.lucene.search.Query} built for the term * @exception org.apache.lucene.queryparser.classic.ParseException throw in overridden method to disallow */ - protected Query getRegexpQuery(String field, String termStr) throws ParseException + protected Query getRegexpQuery(String field, String termStr, boolean caseSensitive) throws ParseException { // We need to pass the whole string to #normalize, which will not work with // custom attribute factories for the binary term impl, and may not work // with some analyzers BytesRef term = getAnalyzer().normalize(field, termStr); Term t = new Term(field, term); - return newRegexpQuery(t); + return newRegexpQuery(t, caseSensitive); } /** @@ -823,7 +826,9 @@ Query handleBareTokenQuery(String qfield, Token term, Token fuzzySlop, boolean p discardEscapeChar(term.image.substring (0, term.image.length()-1))); } else if (regexp) { - q = getRegexpQuery(qfield, term.image.substring(1, term.image.length()-1)); + boolean caseSensitive = !term.image.endsWith("i"); + int lastSlash = term.image.lastIndexOf("/"); + q = getRegexpQuery(qfield, term.image.substring(1, lastSlash), caseSensitive); } else if (fuzzy) { q = handleBareFuzzy(qfield, fuzzySlop, termImage); } else { diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserTokenManager.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserTokenManager.java index 39cac0232f28..7300cf7f4202 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserTokenManager.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserTokenManager.java @@ -32,7 +32,7 @@ private int jjMoveStringLiteralDfa0_2() case 41: return jjStopAtPos(0, 15); case 42: - return jjStartNfaWithStates_2(0, 17, 49); + return 
jjStartNfaWithStates_2(0, 17, 51); case 43: return jjStartNfaWithStates_2(0, 11, 15); case 45: @@ -72,7 +72,7 @@ private int jjStartNfaWithStates_2(int pos, int kind, int state) private int jjMoveNfa_2(int startState, int curPos) { int startsAt = 0; - jjnewStateCnt = 49; + jjnewStateCnt = 51; int i = 1; jjstateSet[0] = startState; int kind = 0x7fffffff; @@ -87,14 +87,6 @@ private int jjMoveNfa_2(int startState, int curPos) { switch(jjstateSet[--i]) { - case 49: - case 33: - if ((0xfbff7cf8ffffd9ffL & l) == 0L) - break; - if (kind > 23) - kind = 23; - jjCheckNAddTwoStates(33, 34); - break; case 0: if ((0xfbff54f8ffffd9ffL & l) != 0L) { @@ -110,14 +102,14 @@ else if ((0x100002600L & l) != 0L) else if ((0x280200000000L & l) != 0L) jjstateSet[jjnewStateCnt++] = 15; else if (curChar == 47) - jjCheckNAddStates(0, 2); + jjCheckNAddStates(0, 3); else if (curChar == 34) - jjCheckNAddStates(3, 5); + jjCheckNAddStates(4, 6); if ((0x7bff50f8ffffd9ffL & l) != 0L) { if (kind > 20) kind = 20; - jjCheckNAddStates(6, 10); + jjCheckNAddStates(7, 11); } else if (curChar == 42) { @@ -132,6 +124,14 @@ else if (curChar == 33) if (curChar == 38) jjstateSet[jjnewStateCnt++] = 4; break; + case 51: + case 33: + if ((0xfbff7cf8ffffd9ffL & l) == 0L) + break; + if (kind > 23) + kind = 23; + jjCheckNAddTwoStates(33, 34); + break; case 4: if (curChar == 38 && kind > 8) kind = 8; @@ -154,14 +154,14 @@ else if (curChar == 33) break; case 16: if (curChar == 34) - jjCheckNAddStates(3, 5); + jjCheckNAddStates(4, 6); break; case 17: if ((0xfffffffbffffffffL & l) != 0L) - jjCheckNAddStates(3, 5); + jjCheckNAddStates(4, 6); break; case 19: - jjCheckNAddStates(3, 5); + jjCheckNAddStates(4, 6); break; case 20: if (curChar == 34 && kind > 19) @@ -172,7 +172,7 @@ else if (curChar == 33) break; if (kind > 21) kind = 21; - jjCheckNAddStates(11, 14); + jjCheckNAddStates(12, 15); break; case 23: if (curChar == 46) @@ -183,7 +183,7 @@ else if (curChar == 33) break; if (kind > 21) kind = 21; - 
jjCheckNAddStates(15, 17); + jjCheckNAddStates(16, 18); break; case 25: if ((0x7bff78f8ffffd9ffL & l) == 0L) @@ -228,41 +228,45 @@ else if (curChar == 33) case 36: case 38: if (curChar == 47) - jjCheckNAddStates(0, 2); + jjCheckNAddStates(0, 3); break; case 37: if ((0xffff7fffffffffffL & l) != 0L) - jjCheckNAddStates(0, 2); + jjCheckNAddStates(0, 3); break; case 40: if (curChar == 47 && kind > 24) kind = 24; break; - case 41: + case 42: + if (curChar == 47) + jjstateSet[jjnewStateCnt++] = 41; + break; + case 43: if ((0x7bff50f8ffffd9ffL & l) == 0L) break; if (kind > 20) kind = 20; - jjCheckNAddStates(6, 10); + jjCheckNAddStates(7, 11); break; - case 42: + case 44: if ((0x7bff78f8ffffd9ffL & l) == 0L) break; if (kind > 20) kind = 20; - jjCheckNAddTwoStates(42, 43); + jjCheckNAddTwoStates(44, 45); break; - case 44: + case 46: if (kind > 20) kind = 20; - jjCheckNAddTwoStates(42, 43); + jjCheckNAddTwoStates(44, 45); break; - case 45: + case 47: if ((0x7bff78f8ffffd9ffL & l) != 0L) - jjCheckNAddStates(18, 20); + jjCheckNAddStates(19, 21); break; - case 47: - jjCheckNAddStates(18, 20); + case 49: + jjCheckNAddStates(19, 21); break; default : break; } @@ -275,30 +279,20 @@ else if (curChar < 128) { switch(jjstateSet[--i]) { - case 49: - if ((0x97ffffff87ffffffL & l) != 0L) - { - if (kind > 23) - kind = 23; - jjCheckNAddTwoStates(33, 34); - } - else if (curChar == 92) - jjCheckNAdd(35); - break; case 0: if ((0x97ffffff87ffffffL & l) != 0L) { if (kind > 20) kind = 20; - jjCheckNAddStates(6, 10); + jjCheckNAddStates(7, 11); } else if (curChar == 92) - jjCheckNAddStates(21, 23); + jjCheckNAddStates(22, 24); else if (curChar == 126) { if (kind > 21) kind = 21; - jjCheckNAddStates(24, 26); + jjCheckNAddStates(25, 27); } if ((0x97ffffff87ffffffL & l) != 0L) { @@ -315,6 +309,16 @@ else if (curChar == 79) else if (curChar == 65) jjstateSet[jjnewStateCnt++] = 2; break; + case 51: + if ((0x97ffffff87ffffffL & l) != 0L) + { + if (kind > 23) + kind = 23; + jjCheckNAddTwoStates(33, 
34); + } + else if (curChar == 92) + jjCheckNAdd(35); + break; case 1: if (curChar == 68 && kind > 8) kind = 8; @@ -357,21 +361,21 @@ else if (curChar == 65) break; case 17: if ((0xffffffffefffffffL & l) != 0L) - jjCheckNAddStates(3, 5); + jjCheckNAddStates(4, 6); break; case 18: if (curChar == 92) jjstateSet[jjnewStateCnt++] = 19; break; case 19: - jjCheckNAddStates(3, 5); + jjCheckNAddStates(4, 6); break; case 21: if (curChar != 126) break; if (kind > 21) kind = 21; - jjCheckNAddStates(24, 26); + jjCheckNAddStates(25, 27); break; case 25: if ((0x97ffffff87ffffffL & l) == 0L) @@ -429,49 +433,53 @@ else if (curChar == 65) jjCheckNAddTwoStates(33, 34); break; case 37: - jjAddStates(0, 2); + jjAddStates(0, 3); break; case 39: if (curChar == 92) jjstateSet[jjnewStateCnt++] = 38; break; case 41: + if (curChar == 105 && kind > 24) + kind = 24; + break; + case 43: if ((0x97ffffff87ffffffL & l) == 0L) break; if (kind > 20) kind = 20; - jjCheckNAddStates(6, 10); + jjCheckNAddStates(7, 11); break; - case 42: + case 44: if ((0x97ffffff87ffffffL & l) == 0L) break; if (kind > 20) kind = 20; - jjCheckNAddTwoStates(42, 43); + jjCheckNAddTwoStates(44, 45); break; - case 43: + case 45: if (curChar == 92) - jjCheckNAdd(44); + jjCheckNAdd(46); break; - case 44: + case 46: if (kind > 20) kind = 20; - jjCheckNAddTwoStates(42, 43); + jjCheckNAddTwoStates(44, 45); break; - case 45: + case 47: if ((0x97ffffff87ffffffL & l) != 0L) - jjCheckNAddStates(18, 20); + jjCheckNAddStates(19, 21); break; - case 46: + case 48: if (curChar == 92) - jjCheckNAdd(47); + jjCheckNAdd(49); break; - case 47: - jjCheckNAddStates(18, 20); + case 49: + jjCheckNAddStates(19, 21); break; - case 48: + case 50: if (curChar == 92) - jjCheckNAddStates(21, 23); + jjCheckNAddStates(22, 24); break; default : break; } @@ -488,14 +496,6 @@ else if (curChar == 65) { switch(jjstateSet[--i]) { - case 49: - case 33: - if (!jjCanMove_2(hiByte, i1, i2, l1, l2)) - break; - if (kind > 23) - kind = 23; - jjCheckNAddTwoStates(33, 
34); - break; case 0: if (jjCanMove_0(hiByte, i1, i2, l1, l2)) { @@ -512,9 +512,17 @@ else if (curChar == 65) { if (kind > 20) kind = 20; - jjCheckNAddStates(6, 10); + jjCheckNAddStates(7, 11); } break; + case 51: + case 33: + if (!jjCanMove_2(hiByte, i1, i2, l1, l2)) + break; + if (kind > 23) + kind = 23; + jjCheckNAddTwoStates(33, 34); + break; case 15: if (jjCanMove_0(hiByte, i1, i2, l1, l2) && kind > 13) kind = 13; @@ -522,7 +530,7 @@ else if (curChar == 65) case 17: case 19: if (jjCanMove_1(hiByte, i1, i2, l1, l2)) - jjCheckNAddStates(3, 5); + jjCheckNAddStates(4, 6); break; case 25: if (!jjCanMove_2(hiByte, i1, i2, l1, l2)) @@ -568,36 +576,36 @@ else if (curChar == 65) break; case 37: if (jjCanMove_1(hiByte, i1, i2, l1, l2)) - jjAddStates(0, 2); + jjAddStates(0, 3); break; - case 41: + case 43: if (!jjCanMove_2(hiByte, i1, i2, l1, l2)) break; if (kind > 20) kind = 20; - jjCheckNAddStates(6, 10); + jjCheckNAddStates(7, 11); break; - case 42: + case 44: if (!jjCanMove_2(hiByte, i1, i2, l1, l2)) break; if (kind > 20) kind = 20; - jjCheckNAddTwoStates(42, 43); + jjCheckNAddTwoStates(44, 45); break; - case 44: + case 46: if (!jjCanMove_1(hiByte, i1, i2, l1, l2)) break; if (kind > 20) kind = 20; - jjCheckNAddTwoStates(42, 43); + jjCheckNAddTwoStates(44, 45); break; - case 45: + case 47: if (jjCanMove_2(hiByte, i1, i2, l1, l2)) - jjCheckNAddStates(18, 20); + jjCheckNAddStates(19, 21); break; - case 47: + case 49: if (jjCanMove_1(hiByte, i1, i2, l1, l2)) - jjCheckNAddStates(18, 20); + jjCheckNAddStates(19, 21); break; default : break; } @@ -610,7 +618,7 @@ else if (curChar == 65) kind = 0x7fffffff; } ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 49 - (jjnewStateCnt = startsAt))) + if ((i = jjnewStateCnt) == (startsAt = 51 - (jjnewStateCnt = startsAt))) return curPos; try { curChar = input_stream.readChar(); } catch(java.io.IOException e) { return curPos; } @@ -643,7 +651,7 @@ private int jjMoveNfa_0(int startState, int curPos) break; if (kind > 27) kind = 27; - 
jjAddStates(27, 28); + jjAddStates(28, 29); break; case 1: if (curChar == 46) @@ -797,11 +805,11 @@ else if (curChar == 34) break; case 2: if ((0xfffffffbffffffffL & l) != 0L) - jjCheckNAddStates(29, 31); + jjCheckNAddStates(30, 32); break; case 3: if (curChar == 34) - jjCheckNAddStates(29, 31); + jjCheckNAddStates(30, 32); break; case 5: if (curChar == 34 && kind > 31) @@ -834,7 +842,7 @@ else if (curChar < 128) jjCheckNAdd(6); break; case 2: - jjAddStates(29, 31); + jjAddStates(30, 32); break; case 4: if (curChar == 92) @@ -870,7 +878,7 @@ else if (curChar < 128) break; case 2: if (jjCanMove_1(hiByte, i1, i2, l1, l2)) - jjAddStates(29, 31); + jjAddStates(30, 32); break; case 6: if (!jjCanMove_1(hiByte, i1, i2, l1, l2)) @@ -897,8 +905,9 @@ else if (curChar < 128) } } static final int[] jjnextStates = { - 37, 39, 40, 17, 18, 20, 42, 43, 45, 46, 31, 22, 23, 25, 26, 24, - 25, 26, 45, 46, 31, 44, 47, 35, 22, 28, 29, 0, 1, 2, 4, 5, + 37, 39, 40, 42, 17, 18, 20, 44, 45, 47, 48, 31, 22, 23, 25, 26, + 24, 25, 26, 47, 48, 31, 46, 49, 35, 22, 28, 29, 0, 1, 2, 4, + 5, }; private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2) { @@ -962,8 +971,8 @@ private static final boolean jjCanMove_2(int hiByte, int i1, int i2, long l1, lo 0x80L, }; protected CharStream input_stream; -private final int[] jjrounds = new int[49]; -private final int[] jjstateSet = new int[98]; +private final int[] jjrounds = new int[51]; +private final int[] jjstateSet = new int[102]; protected char curChar; /** Constructor. 
*/ public QueryParserTokenManager(CharStream stream){ @@ -988,7 +997,7 @@ private void ReInitRounds() { int i; jjround = 0x80000001; - for (i = 49; i-- > 0;) + for (i = 51; i-- > 0;) jjrounds[i] = 0x80000000; } diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/builders/RegexpQueryNodeBuilder.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/builders/RegexpQueryNodeBuilder.java index b2198b41fdab..d06759a3cd20 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/builders/RegexpQueryNodeBuilder.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/builders/RegexpQueryNodeBuilder.java @@ -23,6 +23,8 @@ import org.apache.lucene.queryparser.flexible.standard.processors.MultiTermRewriteMethodProcessor; import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.RegexpQuery; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.RegExp; /** * Builds a {@link RegexpQuery} object from a {@link RegexpQueryNode} object. @@ -36,10 +38,11 @@ public RegexpQueryNodeBuilder() { @Override public RegexpQuery build(QueryNode queryNode) throws QueryNodeException { RegexpQueryNode regexpNode = (RegexpQueryNode) queryNode; + int matchFlags = regexpNode.getCaseSensitive() ? 
0 : RegExp.ASCII_CASE_INSENSITIVE; // TODO: make the maxStates configurable w/ a reasonable default (QueryParserBase uses 10000) RegexpQuery q = new RegexpQuery(new Term(regexpNode.getFieldAsString(), - regexpNode.textToBytesRef())); + regexpNode.textToBytesRef()), RegExp.ALL, matchFlags, Operations.DEFAULT_MAX_DETERMINIZED_STATES); MultiTermQuery.RewriteMethod method = (MultiTermQuery.RewriteMethod) queryNode .getTag(MultiTermRewriteMethodProcessor.TAG_ID); diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/nodes/RegexpQueryNode.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/nodes/RegexpQueryNode.java index cba2612cb612..202e7fcb0cae 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/nodes/RegexpQueryNode.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/nodes/RegexpQueryNode.java @@ -30,6 +30,7 @@ public class RegexpQueryNode extends QueryNodeImpl implements TextableQueryNode FieldableNode { private CharSequence text; private CharSequence field; + private boolean caseSensitive; /** * @param field * - field name @@ -39,11 +40,14 @@ public class RegexpQueryNode extends QueryNodeImpl implements TextableQueryNode * - position in the query string * @param end * - position in the query string + * @param caseSensitive + * - true if the text matching should be case sensitive */ public RegexpQueryNode(CharSequence field, CharSequence text, int begin, - int end) { + int end, boolean caseSensitive) { this.field = field; this.text = text.subSequence(begin, end); + this.caseSensitive = caseSensitive; } public BytesRef textToBytesRef() { @@ -52,7 +56,7 @@ public BytesRef textToBytesRef() { @Override public String toString() { - return ""; + return ""; } @Override @@ -60,6 +64,7 @@ public RegexpQueryNode cloneTree() throws CloneNotSupportedException { RegexpQueryNode clone = (RegexpQueryNode) super.cloneTree(); clone.field = 
this.field; clone.text = this.text; + clone.caseSensitive = this.caseSensitive; return clone; } @@ -67,6 +72,10 @@ public RegexpQueryNode cloneTree() throws CloneNotSupportedException { public CharSequence getText() { return text; } + + public boolean getCaseSensitive() { + return caseSensitive; + } @Override public void setText(CharSequence text) { diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/parser/StandardSyntaxParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/parser/StandardSyntaxParser.java index 57d723159702..aefd3c60009e 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/parser/StandardSyntaxParser.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/parser/StandardSyntaxParser.java @@ -551,8 +551,10 @@ final public QueryNode Term(CharSequence field) throws ParseException { } q = new FuzzyQueryNode(field, EscapeQuerySyntaxImpl.discardEscapeChar(term.image), fms, term.beginColumn, term.endColumn); } else if (regexp) { - String re = term.image.substring(1, term.image.length()-1); - q = new RegexpQueryNode(field, re, 0, re.length()); + boolean caseSensitive = !term.image.endsWith("i"); + int lastSlash = term.image.lastIndexOf("/"); + String re = term.image.substring(1, lastSlash); + q = new RegexpQueryNode(field, re, 0, re.length(), caseSensitive); } break; case RANGEIN_START: @@ -707,33 +709,28 @@ private boolean jj_2_2(int xla) { finally { jj_save(1, xla); } } - private boolean jj_3R_12() { - if (jj_scan_token(RANGEIN_START)) return true; - return false; - } - private boolean jj_3R_11() { if (jj_scan_token(REGEXPTERM)) return true; return false; } - private boolean jj_3_1() { - if (jj_scan_token(TERM)) return true; + private boolean jj_3R_8() { Token xsp; xsp = jj_scanpos; - if (jj_scan_token(15)) { + if (jj_3R_12()) { jj_scanpos = xsp; - if (jj_scan_token(16)) return true; + if (jj_scan_token(27)) return 
true; } return false; } - private boolean jj_3R_8() { + private boolean jj_3_1() { + if (jj_scan_token(TERM)) return true; Token xsp; xsp = jj_scanpos; - if (jj_3R_12()) { + if (jj_scan_token(15)) { jj_scanpos = xsp; - if (jj_scan_token(27)) return true; + if (jj_scan_token(16)) return true; } return false; } @@ -743,6 +740,11 @@ private boolean jj_3R_10() { return false; } + private boolean jj_3R_9() { + if (jj_scan_token(QUOTED)) return true; + return false; + } + private boolean jj_3R_7() { Token xsp; xsp = jj_scanpos; @@ -756,11 +758,6 @@ private boolean jj_3R_7() { return false; } - private boolean jj_3R_9() { - if (jj_scan_token(QUOTED)) return true; - return false; - } - private boolean jj_3R_5() { Token xsp; xsp = jj_scanpos; @@ -820,6 +817,11 @@ private boolean jj_3_2() { return false; } + private boolean jj_3R_12() { + if (jj_scan_token(RANGEIN_START)) return true; + return false; + } + /** Generated Token Manager. */ public StandardSyntaxParserTokenManager token_source; /** Current token. */ diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/parser/StandardSyntaxParser.jj b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/parser/StandardSyntaxParser.jj index 0a60490b6958..50a77f244b3c 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/parser/StandardSyntaxParser.jj +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/parser/StandardSyntaxParser.jj @@ -125,7 +125,7 @@ PARSER_END(StandardSyntaxParser) | )* "\""> | (<_TERM_CHAR>)* > | )+ ( "." (<_NUM_CHAR>)+ )? )? 
> -| +| | : Range | : Range } @@ -440,8 +440,10 @@ QueryNode Term(CharSequence field) : { } q = new FuzzyQueryNode(field, EscapeQuerySyntaxImpl.discardEscapeChar(term.image), fms, term.beginColumn, term.endColumn); } else if (regexp) { - String re = term.image.substring(1, term.image.length()-1); - q = new RegexpQueryNode(field, re, 0, re.length()); + boolean caseSensitive = !term.image.endsWith("i"); + int lastSlash = term.image.lastIndexOf("/"); + String re = term.image.substring(1, lastSlash); + q = new RegexpQueryNode(field, re, 0, re.length(), caseSensitive); } } | ( ( {startInc=true;} | ) diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/parser/StandardSyntaxParserTokenManager.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/parser/StandardSyntaxParserTokenManager.java index 1fdaa480af0d..9ce4771b539f 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/parser/StandardSyntaxParserTokenManager.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/parser/StandardSyntaxParserTokenManager.java @@ -107,7 +107,7 @@ else if ((active0 & 0x100000L) != 0L) private int jjMoveNfa_2(int startState, int curPos) { int startsAt = 0; - jjnewStateCnt = 33; + jjnewStateCnt = 35; int i = 1; jjstateSet[0] = startState; int kind = 0x7fffffff; @@ -135,9 +135,9 @@ else if ((0x100002600L & l) != 0L) kind = 7; } else if (curChar == 47) - jjCheckNAddStates(0, 2); + jjCheckNAddStates(0, 3); else if (curChar == 34) - jjCheckNAddStates(3, 5); + jjCheckNAddStates(4, 6); else if (curChar == 33) { if (kind > 10) @@ -160,14 +160,14 @@ else if (curChar == 33) break; case 14: if (curChar == 34) - jjCheckNAddStates(3, 5); + jjCheckNAddStates(4, 6); break; case 15: if ((0xfffffffbffffffffL & l) != 0L) - jjCheckNAddStates(3, 5); + jjCheckNAddStates(4, 6); break; case 17: - jjCheckNAddStates(3, 5); + jjCheckNAddStates(4, 6); break; case 18: if (curChar == 34 
&& kind > 22) @@ -197,7 +197,7 @@ else if (curChar == 33) break; if (kind > 24) kind = 24; - jjAddStates(6, 7); + jjAddStates(7, 8); break; case 26: if (curChar == 46) @@ -213,16 +213,20 @@ else if (curChar == 33) case 28: case 30: if (curChar == 47) - jjCheckNAddStates(0, 2); + jjCheckNAddStates(0, 3); break; case 29: if ((0xffff7fffffffffffL & l) != 0L) - jjCheckNAddStates(0, 2); + jjCheckNAddStates(0, 3); break; case 32: if (curChar == 47 && kind > 25) kind = 25; break; + case 34: + if (curChar == 47) + jjstateSet[jjnewStateCnt++] = 33; + break; default : break; } } while(i != startsAt); @@ -300,14 +304,14 @@ else if (curChar == 65) break; case 15: if ((0xffffffffefffffffL & l) != 0L) - jjCheckNAddStates(3, 5); + jjCheckNAddStates(4, 6); break; case 16: if (curChar == 92) jjstateSet[jjnewStateCnt++] = 17; break; case 17: - jjCheckNAddStates(3, 5); + jjCheckNAddStates(4, 6); break; case 19: case 20: @@ -338,12 +342,16 @@ else if (curChar == 65) jjstateSet[jjnewStateCnt++] = 25; break; case 29: - jjAddStates(0, 2); + jjAddStates(0, 3); break; case 31: if (curChar == 92) jjstateSet[jjnewStateCnt++] = 30; break; + case 33: + if (curChar == 105 && kind > 25) + kind = 25; + break; default : break; } } while(i != startsAt); @@ -375,7 +383,7 @@ else if (curChar == 65) case 15: case 17: if (jjCanMove_1(hiByte, i1, i2, l1, l2)) - jjCheckNAddStates(3, 5); + jjCheckNAddStates(4, 6); break; case 19: case 20: @@ -394,7 +402,7 @@ else if (curChar == 65) break; case 29: if (jjCanMove_1(hiByte, i1, i2, l1, l2)) - jjAddStates(0, 2); + jjAddStates(0, 3); break; default : break; } @@ -407,7 +415,7 @@ else if (curChar == 65) kind = 0x7fffffff; } ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 33 - (jjnewStateCnt = startsAt))) + if ((i = jjnewStateCnt) == (startsAt = 35 - (jjnewStateCnt = startsAt))) return curPos; try { curChar = input_stream.readChar(); } catch(java.io.IOException e) { return curPos; } @@ -440,7 +448,7 @@ private int jjMoveNfa_0(int startState, int curPos) 
break; if (kind > 28) kind = 28; - jjAddStates(8, 9); + jjAddStates(9, 10); break; case 1: if (curChar == 46) @@ -594,11 +602,11 @@ else if (curChar == 34) break; case 2: if ((0xfffffffbffffffffL & l) != 0L) - jjCheckNAddStates(10, 12); + jjCheckNAddStates(11, 13); break; case 3: if (curChar == 34) - jjCheckNAddStates(10, 12); + jjCheckNAddStates(11, 13); break; case 5: if (curChar == 34 && kind > 32) @@ -631,7 +639,7 @@ else if (curChar < 128) jjCheckNAdd(6); break; case 2: - jjAddStates(10, 12); + jjAddStates(11, 13); break; case 4: if (curChar == 92) @@ -667,7 +675,7 @@ else if (curChar < 128) break; case 2: if (jjCanMove_1(hiByte, i1, i2, l1, l2)) - jjAddStates(10, 12); + jjAddStates(11, 13); break; case 6: if (!jjCanMove_1(hiByte, i1, i2, l1, l2)) @@ -694,7 +702,7 @@ else if (curChar < 128) } } static final int[] jjnextStates = { - 29, 31, 32, 15, 16, 18, 25, 26, 0, 1, 2, 4, 5, + 29, 31, 32, 34, 15, 16, 18, 25, 26, 0, 1, 2, 4, 5, }; private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2) { @@ -758,8 +766,8 @@ private static final boolean jjCanMove_2(int hiByte, int i1, int i2, long l1, lo 0x80L, }; protected CharStream input_stream; -private final int[] jjrounds = new int[33]; -private final int[] jjstateSet = new int[66]; +private final int[] jjrounds = new int[35]; +private final int[] jjstateSet = new int[70]; protected char curChar; /** Constructor. 
*/ public StandardSyntaxParserTokenManager(CharStream stream){ @@ -784,7 +792,7 @@ private void ReInitRounds() { int i; jjround = 0x80000001; - for (i = 33; i-- > 0;) + for (i = 35; i-- > 0;) jjrounds[i] = 0x80000000; } diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java index b4451a6cd524..8a4f07c3483e 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java @@ -1051,6 +1051,16 @@ public void testRegexps() throws Exception { assertEquals(re, getQuery("field:/http.*/",qp)); assertEquals(re, getQuery("/http.*/",qp)); + // Confirm the automaton comparison identifies differences in case-matching choices + re = new RegexpQuery(new Term("field", "http.*"), RegExp.NONE); + assertNotEquals(re, getQuery("field:/http.*/i",qp)); + assertNotEquals(re, getQuery("/http.*/i",qp)); + + // Now check the case insensitivity syntax + re = new RegexpQuery(new Term("field", "http.*"), RegExp.NONE, RegExp.ASCII_CASE_INSENSITIVE, 1000); + assertEquals(re, getQuery("field:/http.*/i",qp)); + assertEquals(re, getQuery("/http.*/i",qp)); + re = new RegexpQuery(new Term("field", "http~0.5")); assertEquals(re, getQuery("field:/http~0.5/",qp)); assertEquals(re, getQuery("/http~0.5/",qp)); From b10b65f4fc82b2c4f6692606b5aec005de00db6f Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 3 Aug 2020 10:26:49 +0100 Subject: [PATCH 2/2] Docs changes - package docs and CHANGES.txt --- lucene/CHANGES.txt | 6 ++++++ .../org/apache/lucene/queryparser/classic/package-info.java | 3 +++ 2 files changed, 9 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index e2c646c61215..cba41cfdd5e0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -61,8 +61,14 @@ API Changes * LUCENE-9462: Fields without positions 
should still return MatchIterator. (Alan Woodward, Dawid Weiss) +* LUCENE-9445: QueryParserBase.getRegexpQuery and newRegexpQuery now take a + caseSensitive flag to allow a new case-insensitive matching option. (Mark Harwood) + Improvements +* LUCENE-9445: QueryParser syntax for regular expressions extended to support a + trailing i for case-insensitive matching, e.g. /.*Foo/i (Mark Harwood) + * LUCENE-9463: Query match region retrieval component, passage scoring and formatting for building custom highlighters. (Alan Woodward, Dawid Weiss) diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/package-info.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/package-info.java index 9f77eb9b7e13..39f694116a82 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/package-info.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/package-info.java @@ -192,6 +192,9 @@ * syntax is documented in the {@link org.apache.lucene.util.automaton.RegExp RegExp} class. For example to find documents containing "moat" or "boat": * *
/[mb]oat/
+ *

Searches against indexed terms can be made case-insensitive (for ASCII characters) by appending an `i` after the closing slash of the regex, as in this example: + * + *

/.*MiXeDcAsE/i
* *

Fuzzy Searches

*

Lucene supports fuzzy searches based on Damerau-Levenshtein Distance. To do a fuzzy search use the tilde, "~", symbol at the end of a Single word Term. For example to search for a term similar in spelling to "roam" use the fuzzy search: