Skip to content

Commit 8f76a1f

Browse files
authored
Rework RegExp engine and add support for proper unicode matching (#3746)
This change includes several bugfixes, general improvements, and support for additional features.
- Added full support for web compatibility syntax defined in Annex B
- Implemented parsing and matching patterns in unicode mode
- Fixed capture results when iterating with nested capturing groups
- Significantly reduced regexp bytecode size
- Reduced stack usage during regexp execution
- Improved matching performance

JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai [email protected]
1 parent 908240b commit 8f76a1f

30 files changed

+3373
-2379
lines changed

jerry-core/api/jerry-snapshot.c

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -559,18 +559,15 @@ snapshot_load_compiled_code (const uint8_t *base_addr_p, /**< base address of th
559559
#if ENABLED (JERRY_BUILTIN_REGEXP)
560560
if (!(bytecode_p->status_flags & CBC_CODE_FLAGS_FUNCTION))
561561
{
562-
const re_compiled_code_t *re_bytecode_p = NULL;
563562

564563
const uint8_t *regex_start_p = ((const uint8_t *) bytecode_p) + sizeof (ecma_compiled_code_t);
565564

566565
/* Real size is stored in refs. */
567566
ecma_string_t *pattern_str_p = ecma_new_ecma_string_from_utf8 (regex_start_p,
568567
bytecode_p->refs);
569568

570-
re_compile_bytecode (&re_bytecode_p,
571-
pattern_str_p,
572-
bytecode_p->status_flags);
573-
569+
const re_compiled_code_t *re_bytecode_p = re_compile_bytecode (pattern_str_p,
570+
bytecode_p->status_flags);
574571
ecma_deref_ecma_string (pattern_str_p);
575572

576573
return (ecma_compiled_code_t *) re_bytecode_p;

jerry-core/ecma/base/ecma-gc.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1467,7 +1467,7 @@ ecma_gc_run (void)
14671467

14681468
#if ENABLED (JERRY_BUILTIN_REGEXP)
14691469
/* Free RegExp bytecodes stored in cache */
1470-
re_cache_gc_run ();
1470+
re_cache_gc ();
14711471
#endif /* ENABLED (JERRY_BUILTIN_REGEXP) */
14721472
} /* ecma_gc_run */
14731473

jerry-core/ecma/base/ecma-helpers-string.c

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2362,8 +2362,7 @@ ecma_string_trim_helper (const lit_utf8_byte_t **utf8_str_p, /**< [in, out] curr
23622362
{
23632363
read_size = lit_read_code_unit_from_utf8 (current_p, &ch);
23642364

2365-
if (!lit_char_is_white_space (ch)
2366-
&& !lit_char_is_line_terminator (ch))
2365+
if (!lit_char_is_white_space (ch))
23672366
{
23682367
nonws_start_p = current_p;
23692368
break;
@@ -2378,8 +2377,7 @@ ecma_string_trim_helper (const lit_utf8_byte_t **utf8_str_p, /**< [in, out] curr
23782377
{
23792378
read_size = lit_read_prev_code_unit_from_utf8 (current_p, &ch);
23802379

2381-
if (!lit_char_is_white_space (ch)
2382-
&& !lit_char_is_line_terminator (ch))
2380+
if (!lit_char_is_white_space (ch))
23832381
{
23842382
break;
23852383
}

jerry-core/ecma/builtin-objects/ecma-builtin-global.c

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -223,13 +223,13 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
223223
continue;
224224
}
225225

226-
ecma_char_t decoded_byte;
227-
228-
if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte))
226+
uint32_t hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2);
227+
if (hex_value == UINT32_MAX)
229228
{
230229
return ecma_raise_uri_error (ECMA_ERR_MSG ("Invalid hexadecimal value."));
231230
}
232231

232+
ecma_char_t decoded_byte = (ecma_char_t) hex_value;
233233
input_char_p += URI_ENCODED_BYTE_SIZE;
234234

235235
if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
@@ -272,20 +272,18 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
272272
/* Input decode. */
273273
if (*input_char_p != '%')
274274
{
275-
*output_char_p = *input_char_p;
276-
output_char_p++;
277-
input_char_p++;
275+
*output_char_p++ = *input_char_p++;
278276
continue;
279277
}
280278

281-
ecma_char_t decoded_byte;
282-
283-
if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte))
279+
uint32_t hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2);
280+
if (hex_value == UINT32_MAX)
284281
{
285282
ret_value = ecma_raise_uri_error (ECMA_ERR_MSG ("Invalid hexadecimal value."));
286283
break;
287284
}
288285

286+
ecma_char_t decoded_byte = (ecma_char_t) hex_value;
289287
input_char_p += URI_ENCODED_BYTE_SIZE;
290288

291289
if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
@@ -337,17 +335,16 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
337335
}
338336
else
339337
{
340-
ecma_char_t chr;
338+
hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2);
341339

342-
if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &chr)
343-
|| ((chr & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER))
340+
if (hex_value == UINT32_MAX || (hex_value & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
344341
{
345342
is_valid = false;
346343
break;
347344
}
348345

349-
octets[i] = (lit_utf8_byte_t) chr;
350346
input_char_p += URI_ENCODED_BYTE_SIZE;
347+
octets[i] = (lit_utf8_byte_t) hex_value;
351348
}
352349
}
353350

jerry-core/ecma/builtin-objects/ecma-builtin-json.c

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -174,18 +174,13 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
174174
}
175175
case LIT_CHAR_LOWERCASE_U:
176176
{
177-
if ((end_p - current_p <= ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH))
177+
uint32_t hex_value = lit_char_hex_lookup (current_p + 1, end_p, ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH);
178+
if (hex_value == UINT32_MAX)
178179
{
179180
goto invalid_string;
180181
}
181182

182-
ecma_char_t code_unit;
183-
if (!(lit_read_code_unit_from_hex (current_p + 1, ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH, &code_unit)))
184-
{
185-
goto invalid_string;
186-
}
187-
188-
ecma_stringbuilder_append_char (&result_builder, code_unit);
183+
ecma_stringbuilder_append_char (&result_builder, (ecma_char_t) hex_value);
189184
current_p += ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH + 1;
190185
break;
191186
}

jerry-core/ecma/builtin-objects/ecma-builtins.c

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -505,12 +505,10 @@ ecma_instantiate_builtin (ecma_builtin_id_t obj_builtin_id) /**< built-in id */
505505

506506
ext_object_p->u.class_prop.class_id = LIT_MAGIC_STRING_REGEXP_UL;
507507

508-
const re_compiled_code_t *bc_p = NULL;
509-
ecma_value_t ret_value = re_compile_bytecode (&bc_p,
510-
ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP),
511-
RE_FLAG_EMPTY);
508+
re_compiled_code_t *bc_p = re_compile_bytecode (ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP),
509+
RE_FLAG_EMPTY);
512510

513-
JERRY_ASSERT (ecma_is_value_empty (ret_value));
511+
JERRY_ASSERT (bc_p != NULL);
514512

515513
ECMA_SET_INTERNAL_VALUE_POINTER (ext_object_p->u.class_prop.u.value, bc_p);
516514

0 commit comments

Comments
 (0)