From 7d970ae233bffba55c2abe931df514e6e0ebdb34 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Fri, 17 Mar 2017 12:17:19 -0400 Subject: [PATCH] Painless: Fix regex lexer and error messages Without this change, if write a script with multiple regexes *sometimes* the lexer will decide to look at them like one big regex and then some trailing garbage. Like this discuss post: https://discuss.elastic.co/t/error-with-the-split-function-in-painless-script/79021 ``` def val = /\\\\/.split(ctx._source.event_data.param17); if (val[2] =~ /\\./) { def val2 = /\\./.split(val[2]); ctx._source['user_crash'] = val2[0] } else { ctx._source['user_crash'] = val[2] } ``` The error message you get from the lexer is `lexer_no_viable_alt_exception` right after the *second* regex. With this change each regex is just a single regex like it ought to be. As a bonus, while looking into this issue I found that the error reporting for regexes wasn't very nice. If you specify an invalid pattern then you get an error marker on the start of the pattern with the JVM's regex error message which attempts to point you to the location in the regex but is totally unreadable in the JSON response. This change fixes the location to point to the appropriate spot inside the pattern and removes the portion of the JVM's error message that doesn't render well. It is no longer needed now that we point users to the appropriate spot in the pattern. --- .../src/main/antlr/PainlessLexer.g4 | 2 +- .../painless/antlr/PainlessLexer.java | 54 +++++++++---------- .../elasticsearch/painless/node/ERegex.java | 5 +- .../elasticsearch/painless/RegexTests.java | 23 ++++++-- 4 files changed, 48 insertions(+), 36 deletions(-) diff --git a/modules/lang-painless/src/main/antlr/PainlessLexer.g4 b/modules/lang-painless/src/main/antlr/PainlessLexer.g4 index f60d48efcc744..6ab6a86113595 100644 --- a/modules/lang-painless/src/main/antlr/PainlessLexer.g4 +++ b/modules/lang-painless/src/main/antlr/PainlessLexer.g4 @@ -120,7 +120,7 @@ INTEGER: ( '0' | [1-9] [0-9]* ) [lLfFdD]?; DECIMAL: ( '0' | [1-9] [0-9]* ) (DOT [0-9]+)? ( [eE] [+\-]? [0-9]+ )? [fFdD]?; STRING: ( '"' ( '\\"' | '\\\\' | ~[\\"] )*? '"' ) | ( '\'' ( '\\\'' | '\\\\' | ~[\\'] )*? '\'' ); -REGEX: '/' ( ~('/' | '\n') | '\\' ~'\n' )+ '/' [cilmsUux]* { slashIsRegex() }?; +REGEX: '/' ( '\\' ~'\n' | ~('/' | '\n') )+? '/' [cilmsUux]* { slashIsRegex() }?; TRUE: 'true'; FALSE: 'false'; diff --git a/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/PainlessLexer.java b/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/PainlessLexer.java index 44972061b5902..fd32c59b4ff03 100644 --- a/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/PainlessLexer.java +++ b/modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/PainlessLexer.java @@ -1,7 +1,5 @@ // ANTLR GENERATED CODE: DO NOT EDIT package org.elasticsearch.painless.antlr; - - import org.antlr.v4.runtime.Lexer; import org.antlr.v4.runtime.CharStream; import org.antlr.v4.runtime.Token; @@ -211,29 +209,29 @@ private boolean TYPE_sempred(RuleContext _localctx, int predIndex) { "\3N\3N\7N\u021a\nN\fN\16N\u021d\13N\3N\3N\3O\3O\3O\3O\3O\3P\3P\3P\3P\3"+ "P\3P\3Q\3Q\3Q\3Q\3Q\3R\3R\3R\3R\7R\u0235\nR\fR\16R\u0238\13R\3R\3R\3S"+ "\3S\7S\u023e\nS\fS\16S\u0241\13S\3T\3T\3T\7T\u0246\nT\fT\16T\u0249\13"+ - "T\5T\u024b\nT\3T\3T\3U\3U\7U\u0251\nU\fU\16U\u0254\13U\3U\3U\6\u00b9\u00c3"+ - "\u01fd\u0209\2V\4\3\6\4\b\5\n\6\f\7\16\b\20\t\22\n\24\13\26\f\30\r\32"+ - "\16\34\17\36\20 \21\"\22$\23&\24(\25*\26,\27.\30\60\31\62\32\64\33\66"+ - "\348\35:\36<\37> @!B\"D#F$H%J&L\'N(P)R*T+V,X-Z.\\/^\60`\61b\62d\63f\64"+ - "h\65j\66l\67n8p9r:t;v|?~@\u0080A\u0082B\u0084C\u0086D\u0088E\u008a"+ - "F\u008cG\u008eH\u0090I\u0092J\u0094K\u0096L\u0098M\u009aN\u009cO\u009e"+ - "P\u00a0Q\u00a2R\u00a4S\u00a6T\u00a8U\u00aaV\4\2\3\25\5\2\13\f\17\17\""+ - "\"\4\2\f\f\17\17\3\2\629\4\2NNnn\4\2ZZzz\5\2\62;CHch\3\2\63;\3\2\62;\b"+ - "\2FFHHNNffhhnn\4\2GGgg\4\2--//\6\2FFHHffhh\4\2$$^^\4\2))^^\4\2\f\f\61"+ - "\61\3\2\f\f\t\2WWeekknouuwwzz\5\2C\\aac|\6\2\62;C\\aac|\u0277\2\4\3\2"+ - "\2\2\2\6\3\2\2\2\2\b\3\2\2\2\2\n\3\2\2\2\2\f\3\2\2\2\2\16\3\2\2\2\2\20"+ - "\3\2\2\2\2\22\3\2\2\2\2\24\3\2\2\2\2\26\3\2\2\2\2\30\3\2\2\2\2\32\3\2"+ - "\2\2\2\34\3\2\2\2\2\36\3\2\2\2\2 \3\2\2\2\2\"\3\2\2\2\2$\3\2\2\2\2&\3"+ - "\2\2\2\2(\3\2\2\2\2*\3\2\2\2\2,\3\2\2\2\2.\3\2\2\2\2\60\3\2\2\2\2\62\3"+ - "\2\2\2\2\64\3\2\2\2\2\66\3\2\2\2\28\3\2\2\2\2:\3\2\2\2\2<\3\2\2\2\2>\3"+ - "\2\2\2\2@\3\2\2\2\2B\3\2\2\2\2D\3\2\2\2\2F\3\2\2\2\2H\3\2\2\2\2J\3\2\2"+ - "\2\2L\3\2\2\2\2N\3\2\2\2\2P\3\2\2\2\2R\3\2\2\2\2T\3\2\2\2\2V\3\2\2\2\2"+ - "X\3\2\2\2\2Z\3\2\2\2\2\\\3\2\2\2\2^\3\2\2\2\2`\3\2\2\2\2b\3\2\2\2\2d\3"+ - "\2\2\2\2f\3\2\2\2\2h\3\2\2\2\2j\3\2\2\2\2l\3\2\2\2\2n\3\2\2\2\2p\3\2\2"+ - "\2\2r\3\2\2\2\2t\3\2\2\2\2v\3\2\2\2\2x\3\2\2\2\2z\3\2\2\2\2|\3\2\2\2\2"+ - "~\3\2\2\2\2\u0080\3\2\2\2\2\u0082\3\2\2\2\2\u0084\3\2\2\2\2\u0086\3\2"+ - "\2\2\2\u0088\3\2\2\2\2\u008a\3\2\2\2\2\u008c\3\2\2\2\2\u008e\3\2\2\2\2"+ - "\u0090\3\2\2\2\2\u0092\3\2\2\2\2\u0094\3\2\2\2\2\u0096\3\2\2\2\2\u0098"+ + "T\5T\u024b\nT\3T\3T\3U\3U\7U\u0251\nU\fU\16U\u0254\13U\3U\3U\7\u00b9\u00c3"+ + "\u01fd\u0209\u0215\2V\4\3\6\4\b\5\n\6\f\7\16\b\20\t\22\n\24\13\26\f\30"+ + "\r\32\16\34\17\36\20 \21\"\22$\23&\24(\25*\26,\27.\30\60\31\62\32\64\33"+ + "\66\348\35:\36<\37> @!B\"D#F$H%J&L\'N(P)R*T+V,X-Z.\\/^\60`\61b\62d\63"+ + "f\64h\65j\66l\67n8p9r:t;v|?~@\u0080A\u0082B\u0084C\u0086D\u0088E"+ + "\u008aF\u008cG\u008eH\u0090I\u0092J\u0094K\u0096L\u0098M\u009aN\u009c"+ + "O\u009eP\u00a0Q\u00a2R\u00a4S\u00a6T\u00a8U\u00aaV\4\2\3\25\5\2\13\f\17"+ + "\17\"\"\4\2\f\f\17\17\3\2\629\4\2NNnn\4\2ZZzz\5\2\62;CHch\3\2\63;\3\2"+ + "\62;\b\2FFHHNNffhhnn\4\2GGgg\4\2--//\6\2FFHHffhh\4\2$$^^\4\2))^^\3\2\f"+ + "\f\4\2\f\f\61\61\t\2WWeekknouuwwzz\5\2C\\aac|\6\2\62;C\\aac|\u0277\2\4"+ + "\3\2\2\2\2\6\3\2\2\2\2\b\3\2\2\2\2\n\3\2\2\2\2\f\3\2\2\2\2\16\3\2\2\2"+ + "\2\20\3\2\2\2\2\22\3\2\2\2\2\24\3\2\2\2\2\26\3\2\2\2\2\30\3\2\2\2\2\32"+ + "\3\2\2\2\2\34\3\2\2\2\2\36\3\2\2\2\2 \3\2\2\2\2\"\3\2\2\2\2$\3\2\2\2\2"+ + "&\3\2\2\2\2(\3\2\2\2\2*\3\2\2\2\2,\3\2\2\2\2.\3\2\2\2\2\60\3\2\2\2\2\62"+ + "\3\2\2\2\2\64\3\2\2\2\2\66\3\2\2\2\28\3\2\2\2\2:\3\2\2\2\2<\3\2\2\2\2"+ + ">\3\2\2\2\2@\3\2\2\2\2B\3\2\2\2\2D\3\2\2\2\2F\3\2\2\2\2H\3\2\2\2\2J\3"+ + "\2\2\2\2L\3\2\2\2\2N\3\2\2\2\2P\3\2\2\2\2R\3\2\2\2\2T\3\2\2\2\2V\3\2\2"+ + "\2\2X\3\2\2\2\2Z\3\2\2\2\2\\\3\2\2\2\2^\3\2\2\2\2`\3\2\2\2\2b\3\2\2\2"+ + "\2d\3\2\2\2\2f\3\2\2\2\2h\3\2\2\2\2j\3\2\2\2\2l\3\2\2\2\2n\3\2\2\2\2p"+ + "\3\2\2\2\2r\3\2\2\2\2t\3\2\2\2\2v\3\2\2\2\2x\3\2\2\2\2z\3\2\2\2\2|\3\2"+ + "\2\2\2~\3\2\2\2\2\u0080\3\2\2\2\2\u0082\3\2\2\2\2\u0084\3\2\2\2\2\u0086"+ + "\3\2\2\2\2\u0088\3\2\2\2\2\u008a\3\2\2\2\2\u008c\3\2\2\2\2\u008e\3\2\2"+ + "\2\2\u0090\3\2\2\2\2\u0092\3\2\2\2\2\u0094\3\2\2\2\2\u0096\3\2\2\2\2\u0098"+ "\3\2\2\2\2\u009a\3\2\2\2\2\u009c\3\2\2\2\2\u009e\3\2\2\2\2\u00a0\3\2\2"+ "\2\2\u00a2\3\2\2\2\2\u00a4\3\2\2\2\2\u00a6\3\2\2\2\3\u00a8\3\2\2\2\3\u00aa"+ "\3\2\2\2\4\u00ad\3\2\2\2\6\u00c8\3\2\2\2\b\u00cc\3\2\2\2\n\u00ce\3\2\2"+ @@ -358,9 +356,9 @@ private boolean TYPE_sempred(RuleContext _localctx, int predIndex) { "\3\2\2\2\u0207\u0206\3\2\2\2\u0208\u020b\3\2\2\2\u0209\u020a\3\2\2\2\u0209"+ "\u0207\3\2\2\2\u020a\u020c\3\2\2\2\u020b\u0209\3\2\2\2\u020c\u020e\7)"+ "\2\2\u020d\u01f5\3\2\2\2\u020d\u0201\3\2\2\2\u020e\u009b\3\2\2\2\u020f"+ - "\u0213\7\61\2\2\u0210\u0214\n\20\2\2\u0211\u0212\7^\2\2\u0212\u0214\n"+ - "\21\2\2\u0213\u0210\3\2\2\2\u0213\u0211\3\2\2\2\u0214\u0215\3\2\2\2\u0215"+ - "\u0213\3\2\2\2\u0215\u0216\3\2\2\2\u0216\u0217\3\2\2\2\u0217\u021b\7\61"+ + "\u0213\7\61\2\2\u0210\u0211\7^\2\2\u0211\u0214\n\20\2\2\u0212\u0214\n"+ + "\21\2\2\u0213\u0210\3\2\2\2\u0213\u0212\3\2\2\2\u0214\u0215\3\2\2\2\u0215"+ + "\u0216\3\2\2\2\u0215\u0213\3\2\2\2\u0216\u0217\3\2\2\2\u0217\u021b\7\61"+ "\2\2\u0218\u021a\t\22\2\2\u0219\u0218\3\2\2\2\u021a\u021d\3\2\2\2\u021b"+ "\u0219\3\2\2\2\u021b\u021c\3\2\2\2\u021c\u021e\3\2\2\2\u021d\u021b\3\2"+ "\2\2\u021e\u021f\6N\3\2\u021f\u009d\3\2\2\2\u0220\u0221\7v\2\2\u0221\u0222"+ diff --git a/modules/lang-painless/src/main/java/org/elasticsearch/painless/node/ERegex.java b/modules/lang-painless/src/main/java/org/elasticsearch/painless/node/ERegex.java index 1d1f41948c43a..4b38868b1b1fc 100644 --- a/modules/lang-painless/src/main/java/org/elasticsearch/painless/node/ERegex.java +++ b/modules/lang-painless/src/main/java/org/elasticsearch/painless/node/ERegex.java @@ -68,8 +68,9 @@ void analyze(Locals locals) { try { Pattern.compile(pattern, flags); - } catch (PatternSyntaxException exception) { - throw createError(exception); + } catch (PatternSyntaxException e) { + throw new Location(location.getSourceName(), location.getOffset() + 1 + e.getIndex()).createError( + new IllegalArgumentException("Error compiling regex: " + e.getDescription())); } constant = new Constant(location, Definition.PATTERN_TYPE.type, "regexAt$" + location.getOffset(), this::initializeConstant); diff --git a/modules/lang-painless/src/test/java/org/elasticsearch/painless/RegexTests.java b/modules/lang-painless/src/test/java/org/elasticsearch/painless/RegexTests.java index 1c53692ad741a..83a592b3f2632 100644 --- a/modules/lang-painless/src/test/java/org/elasticsearch/painless/RegexTests.java +++ b/modules/lang-painless/src/test/java/org/elasticsearch/painless/RegexTests.java @@ -20,6 +20,7 @@ package org.elasticsearch.painless; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.script.ScriptException; import java.nio.CharBuffer; import java.util.Arrays; @@ -44,8 +45,17 @@ public void testPatternAfterReturn() { assertEquals(false, exec("return 'bar' ==~ /foo/")); } - public void testSlashesEscapePattern() { - assertEquals(true, exec("return '//' ==~ /\\/\\//")); + public void testBackslashEscapesForwardSlash() { + assertEquals(true, exec("'//' ==~ /\\/\\//")); + } + + public void testBackslashEscapeBackslash() { + // Both of these are single backslashes but java escaping + Painless escaping.... + assertEquals(true, exec("'\\\\' ==~ /\\\\/")); + } + + public void testRegexIsNonGreedy() { + assertEquals(true, exec("def s = /\\\\/.split('.\\\\.'); return s[1] ==~ /\\./")); } public void testPatternAfterAssignment() { @@ -248,11 +258,14 @@ public void testCantUsePatternCompile() { } public void testBadRegexPattern() { - PatternSyntaxException e = expectScriptThrows(PatternSyntaxException.class, () -> { + ScriptException e = expectThrows(ScriptException.class, () -> { exec("/\\ujjjj/"); // Invalid unicode }); - assertThat(e.getMessage(), containsString("Illegal Unicode escape sequence near index 2")); - assertThat(e.getMessage(), containsString("\\ujjjj")); + assertEquals("Error compiling regex: Illegal Unicode escape sequence", e.getCause().getMessage()); + + // And make sure the location of the error points to the offset inside the pattern + assertEquals("/\\ujjjj/", e.getScriptStack().get(0)); + assertEquals(" ^---- HERE", e.getScriptStack().get(1)); } public void testRegexAgainstNumber() {