Skip to content

Commit 257a7d7

Browse files
authored
Painless: Fix regex lexer and error messages (#23634)
Without this change, if write a script with multiple regexes *sometimes* the lexer will decide to look at them like one big regex and then some trailing garbage. Like this discuss post: https://discuss.elastic.co/t/error-with-the-split-function-in-painless-script/79021 ``` def val = /\\\\/.split(ctx._source.event_data.param17); if (val[2] =~ /\\./) { def val2 = /\\./.split(val[2]); ctx._source['user_crash'] = val2[0] } else { ctx._source['user_crash'] = val[2] } ``` The error message you get from the lexer is `lexer_no_viable_alt_exception` right after the *second* regex. With this change each regex is just a single regex like it ought to be. As a bonus, while looking into this issue I found that the error reporting for regexes wasn't very nice. If you specify an invalid pattern then you get an error marker on the start of the pattern with the JVM's regex error message which attempts to point you to the location in the regex but is totally unreadable in the JSON response. This change fixes the location to point to the appropriate spot inside the pattern and removes the portion of the JVM's error message that doesn't render well. It is no longer needed now that we point users to the appropriate spot in the pattern.
1 parent 490d29f commit 257a7d7

File tree

4 files changed

+48
-36
lines changed

4 files changed

+48
-36
lines changed

modules/lang-painless/src/main/antlr/PainlessLexer.g4

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ INTEGER: ( '0' | [1-9] [0-9]* ) [lLfFdD]?;
120120
DECIMAL: ( '0' | [1-9] [0-9]* ) (DOT [0-9]+)? ( [eE] [+\-]? [0-9]+ )? [fFdD]?;
121121

122122
STRING: ( '"' ( '\\"' | '\\\\' | ~[\\"] )*? '"' ) | ( '\'' ( '\\\'' | '\\\\' | ~[\\'] )*? '\'' );
123-
REGEX: '/' ( ~('/' | '\n') | '\\' ~'\n' )+ '/' [cilmsUux]* { slashIsRegex() }?;
123+
REGEX: '/' ( '\\' ~'\n' | ~('/' | '\n') )+? '/' [cilmsUux]* { slashIsRegex() }?;
124124

125125
TRUE: 'true';
126126
FALSE: 'false';

modules/lang-painless/src/main/java/org/elasticsearch/painless/antlr/PainlessLexer.java

Lines changed: 26 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
// ANTLR GENERATED CODE: DO NOT EDIT
22
package org.elasticsearch.painless.antlr;
3-
4-
53
import org.antlr.v4.runtime.Lexer;
64
import org.antlr.v4.runtime.CharStream;
75
import org.antlr.v4.runtime.Token;
@@ -211,29 +209,29 @@ private boolean TYPE_sempred(RuleContext _localctx, int predIndex) {
211209
"\3N\3N\7N\u021a\nN\fN\16N\u021d\13N\3N\3N\3O\3O\3O\3O\3O\3P\3P\3P\3P\3"+
212210
"P\3P\3Q\3Q\3Q\3Q\3Q\3R\3R\3R\3R\7R\u0235\nR\fR\16R\u0238\13R\3R\3R\3S"+
213211
"\3S\7S\u023e\nS\fS\16S\u0241\13S\3T\3T\3T\7T\u0246\nT\fT\16T\u0249\13"+
214-
"T\5T\u024b\nT\3T\3T\3U\3U\7U\u0251\nU\fU\16U\u0254\13U\3U\3U\6\u00b9\u00c3"+
215-
"\u01fd\u0209\2V\4\3\6\4\b\5\n\6\f\7\16\b\20\t\22\n\24\13\26\f\30\r\32"+
216-
"\16\34\17\36\20 \21\"\22$\23&\24(\25*\26,\27.\30\60\31\62\32\64\33\66"+
217-
"\348\35:\36<\37> @!B\"D#F$H%J&L\'N(P)R*T+V,X-Z.\\/^\60`\61b\62d\63f\64"+
218-
"h\65j\66l\67n8p9r:t;v<x=z>|?~@\u0080A\u0082B\u0084C\u0086D\u0088E\u008a"+
219-
"F\u008cG\u008eH\u0090I\u0092J\u0094K\u0096L\u0098M\u009aN\u009cO\u009e"+
220-
"P\u00a0Q\u00a2R\u00a4S\u00a6T\u00a8U\u00aaV\4\2\3\25\5\2\13\f\17\17\""+
221-
"\"\4\2\f\f\17\17\3\2\629\4\2NNnn\4\2ZZzz\5\2\62;CHch\3\2\63;\3\2\62;\b"+
222-
"\2FFHHNNffhhnn\4\2GGgg\4\2--//\6\2FFHHffhh\4\2$$^^\4\2))^^\4\2\f\f\61"+
223-
"\61\3\2\f\f\t\2WWeekknouuwwzz\5\2C\\aac|\6\2\62;C\\aac|\u0277\2\4\3\2"+
224-
"\2\2\2\6\3\2\2\2\2\b\3\2\2\2\2\n\3\2\2\2\2\f\3\2\2\2\2\16\3\2\2\2\2\20"+
225-
"\3\2\2\2\2\22\3\2\2\2\2\24\3\2\2\2\2\26\3\2\2\2\2\30\3\2\2\2\2\32\3\2"+
226-
"\2\2\2\34\3\2\2\2\2\36\3\2\2\2\2 \3\2\2\2\2\"\3\2\2\2\2$\3\2\2\2\2&\3"+
227-
"\2\2\2\2(\3\2\2\2\2*\3\2\2\2\2,\3\2\2\2\2.\3\2\2\2\2\60\3\2\2\2\2\62\3"+
228-
"\2\2\2\2\64\3\2\2\2\2\66\3\2\2\2\28\3\2\2\2\2:\3\2\2\2\2<\3\2\2\2\2>\3"+
229-
"\2\2\2\2@\3\2\2\2\2B\3\2\2\2\2D\3\2\2\2\2F\3\2\2\2\2H\3\2\2\2\2J\3\2\2"+
230-
"\2\2L\3\2\2\2\2N\3\2\2\2\2P\3\2\2\2\2R\3\2\2\2\2T\3\2\2\2\2V\3\2\2\2\2"+
231-
"X\3\2\2\2\2Z\3\2\2\2\2\\\3\2\2\2\2^\3\2\2\2\2`\3\2\2\2\2b\3\2\2\2\2d\3"+
232-
"\2\2\2\2f\3\2\2\2\2h\3\2\2\2\2j\3\2\2\2\2l\3\2\2\2\2n\3\2\2\2\2p\3\2\2"+
233-
"\2\2r\3\2\2\2\2t\3\2\2\2\2v\3\2\2\2\2x\3\2\2\2\2z\3\2\2\2\2|\3\2\2\2\2"+
234-
"~\3\2\2\2\2\u0080\3\2\2\2\2\u0082\3\2\2\2\2\u0084\3\2\2\2\2\u0086\3\2"+
235-
"\2\2\2\u0088\3\2\2\2\2\u008a\3\2\2\2\2\u008c\3\2\2\2\2\u008e\3\2\2\2\2"+
236-
"\u0090\3\2\2\2\2\u0092\3\2\2\2\2\u0094\3\2\2\2\2\u0096\3\2\2\2\2\u0098"+
212+
"T\5T\u024b\nT\3T\3T\3U\3U\7U\u0251\nU\fU\16U\u0254\13U\3U\3U\7\u00b9\u00c3"+
213+
"\u01fd\u0209\u0215\2V\4\3\6\4\b\5\n\6\f\7\16\b\20\t\22\n\24\13\26\f\30"+
214+
"\r\32\16\34\17\36\20 \21\"\22$\23&\24(\25*\26,\27.\30\60\31\62\32\64\33"+
215+
"\66\348\35:\36<\37> @!B\"D#F$H%J&L\'N(P)R*T+V,X-Z.\\/^\60`\61b\62d\63"+
216+
"f\64h\65j\66l\67n8p9r:t;v<x=z>|?~@\u0080A\u0082B\u0084C\u0086D\u0088E"+
217+
"\u008aF\u008cG\u008eH\u0090I\u0092J\u0094K\u0096L\u0098M\u009aN\u009c"+
218+
"O\u009eP\u00a0Q\u00a2R\u00a4S\u00a6T\u00a8U\u00aaV\4\2\3\25\5\2\13\f\17"+
219+
"\17\"\"\4\2\f\f\17\17\3\2\629\4\2NNnn\4\2ZZzz\5\2\62;CHch\3\2\63;\3\2"+
220+
"\62;\b\2FFHHNNffhhnn\4\2GGgg\4\2--//\6\2FFHHffhh\4\2$$^^\4\2))^^\3\2\f"+
221+
"\f\4\2\f\f\61\61\t\2WWeekknouuwwzz\5\2C\\aac|\6\2\62;C\\aac|\u0277\2\4"+
222+
"\3\2\2\2\2\6\3\2\2\2\2\b\3\2\2\2\2\n\3\2\2\2\2\f\3\2\2\2\2\16\3\2\2\2"+
223+
"\2\20\3\2\2\2\2\22\3\2\2\2\2\24\3\2\2\2\2\26\3\2\2\2\2\30\3\2\2\2\2\32"+
224+
"\3\2\2\2\2\34\3\2\2\2\2\36\3\2\2\2\2 \3\2\2\2\2\"\3\2\2\2\2$\3\2\2\2\2"+
225+
"&\3\2\2\2\2(\3\2\2\2\2*\3\2\2\2\2,\3\2\2\2\2.\3\2\2\2\2\60\3\2\2\2\2\62"+
226+
"\3\2\2\2\2\64\3\2\2\2\2\66\3\2\2\2\28\3\2\2\2\2:\3\2\2\2\2<\3\2\2\2\2"+
227+
">\3\2\2\2\2@\3\2\2\2\2B\3\2\2\2\2D\3\2\2\2\2F\3\2\2\2\2H\3\2\2\2\2J\3"+
228+
"\2\2\2\2L\3\2\2\2\2N\3\2\2\2\2P\3\2\2\2\2R\3\2\2\2\2T\3\2\2\2\2V\3\2\2"+
229+
"\2\2X\3\2\2\2\2Z\3\2\2\2\2\\\3\2\2\2\2^\3\2\2\2\2`\3\2\2\2\2b\3\2\2\2"+
230+
"\2d\3\2\2\2\2f\3\2\2\2\2h\3\2\2\2\2j\3\2\2\2\2l\3\2\2\2\2n\3\2\2\2\2p"+
231+
"\3\2\2\2\2r\3\2\2\2\2t\3\2\2\2\2v\3\2\2\2\2x\3\2\2\2\2z\3\2\2\2\2|\3\2"+
232+
"\2\2\2~\3\2\2\2\2\u0080\3\2\2\2\2\u0082\3\2\2\2\2\u0084\3\2\2\2\2\u0086"+
233+
"\3\2\2\2\2\u0088\3\2\2\2\2\u008a\3\2\2\2\2\u008c\3\2\2\2\2\u008e\3\2\2"+
234+
"\2\2\u0090\3\2\2\2\2\u0092\3\2\2\2\2\u0094\3\2\2\2\2\u0096\3\2\2\2\2\u0098"+
237235
"\3\2\2\2\2\u009a\3\2\2\2\2\u009c\3\2\2\2\2\u009e\3\2\2\2\2\u00a0\3\2\2"+
238236
"\2\2\u00a2\3\2\2\2\2\u00a4\3\2\2\2\2\u00a6\3\2\2\2\3\u00a8\3\2\2\2\3\u00aa"+
239237
"\3\2\2\2\4\u00ad\3\2\2\2\6\u00c8\3\2\2\2\b\u00cc\3\2\2\2\n\u00ce\3\2\2"+
@@ -358,9 +356,9 @@ private boolean TYPE_sempred(RuleContext _localctx, int predIndex) {
358356
"\3\2\2\2\u0207\u0206\3\2\2\2\u0208\u020b\3\2\2\2\u0209\u020a\3\2\2\2\u0209"+
359357
"\u0207\3\2\2\2\u020a\u020c\3\2\2\2\u020b\u0209\3\2\2\2\u020c\u020e\7)"+
360358
"\2\2\u020d\u01f5\3\2\2\2\u020d\u0201\3\2\2\2\u020e\u009b\3\2\2\2\u020f"+
361-
"\u0213\7\61\2\2\u0210\u0214\n\20\2\2\u0211\u0212\7^\2\2\u0212\u0214\n"+
362-
"\21\2\2\u0213\u0210\3\2\2\2\u0213\u0211\3\2\2\2\u0214\u0215\3\2\2\2\u0215"+
363-
"\u0213\3\2\2\2\u0215\u0216\3\2\2\2\u0216\u0217\3\2\2\2\u0217\u021b\7\61"+
359+
"\u0213\7\61\2\2\u0210\u0211\7^\2\2\u0211\u0214\n\20\2\2\u0212\u0214\n"+
360+
"\21\2\2\u0213\u0210\3\2\2\2\u0213\u0212\3\2\2\2\u0214\u0215\3\2\2\2\u0215"+
361+
"\u0216\3\2\2\2\u0215\u0213\3\2\2\2\u0216\u0217\3\2\2\2\u0217\u021b\7\61"+
364362
"\2\2\u0218\u021a\t\22\2\2\u0219\u0218\3\2\2\2\u021a\u021d\3\2\2\2\u021b"+
365363
"\u0219\3\2\2\2\u021b\u021c\3\2\2\2\u021c\u021e\3\2\2\2\u021d\u021b\3\2"+
366364
"\2\2\u021e\u021f\6N\3\2\u021f\u009d\3\2\2\2\u0220\u0221\7v\2\2\u0221\u0222"+

modules/lang-painless/src/main/java/org/elasticsearch/painless/node/ERegex.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,9 @@ void analyze(Locals locals) {
6868

6969
try {
7070
Pattern.compile(pattern, flags);
71-
} catch (PatternSyntaxException exception) {
72-
throw createError(exception);
71+
} catch (PatternSyntaxException e) {
72+
throw new Location(location.getSourceName(), location.getOffset() + 1 + e.getIndex()).createError(
73+
new IllegalArgumentException("Error compiling regex: " + e.getDescription()));
7374
}
7475

7576
constant = new Constant(location, Definition.PATTERN_TYPE.type, "regexAt$" + location.getOffset(), this::initializeConstant);

modules/lang-painless/src/test/java/org/elasticsearch/painless/RegexTests.java

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
package org.elasticsearch.painless;
2121

2222
import org.elasticsearch.common.settings.Settings;
23+
import org.elasticsearch.script.ScriptException;
2324

2425
import java.nio.CharBuffer;
2526
import java.util.Arrays;
@@ -44,8 +45,17 @@ public void testPatternAfterReturn() {
4445
assertEquals(false, exec("return 'bar' ==~ /foo/"));
4546
}
4647

47-
public void testSlashesEscapePattern() {
48-
assertEquals(true, exec("return '//' ==~ /\\/\\//"));
48+
public void testBackslashEscapesForwardSlash() {
49+
assertEquals(true, exec("'//' ==~ /\\/\\//"));
50+
}
51+
52+
public void testBackslashEscapeBackslash() {
53+
// Both of these are single backslashes but java escaping + Painless escaping....
54+
assertEquals(true, exec("'\\\\' ==~ /\\\\/"));
55+
}
56+
57+
public void testRegexIsNonGreedy() {
58+
assertEquals(true, exec("def s = /\\\\/.split('.\\\\.'); return s[1] ==~ /\\./"));
4959
}
5060

5161
public void testPatternAfterAssignment() {
@@ -248,11 +258,14 @@ public void testCantUsePatternCompile() {
248258
}
249259

250260
public void testBadRegexPattern() {
251-
PatternSyntaxException e = expectScriptThrows(PatternSyntaxException.class, () -> {
261+
ScriptException e = expectThrows(ScriptException.class, () -> {
252262
exec("/\\ujjjj/"); // Invalid unicode
253263
});
254-
assertThat(e.getMessage(), containsString("Illegal Unicode escape sequence near index 2"));
255-
assertThat(e.getMessage(), containsString("\\ujjjj"));
264+
assertEquals("Error compiling regex: Illegal Unicode escape sequence", e.getCause().getMessage());
265+
266+
// And make sure the location of the error points to the offset inside the pattern
267+
assertEquals("/\\ujjjj/", e.getScriptStack().get(0));
268+
assertEquals(" ^---- HERE", e.getScriptStack().get(1));
256269
}
257270

258271
public void testRegexAgainstNumber() {

0 commit comments

Comments
 (0)