Skip to content

Commit b319eff

Browse files
committed
[GR-35151] Fix RegExp case-folding for non-Unicode regexps.
PullRequest: graal/10339
2 parents ea9d756 + f6a0860 commit b319eff

File tree

5 files changed

+99
-53
lines changed

5 files changed

+99
-53
lines changed

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/CaseFoldTable.java

Lines changed: 1 addition & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -545,21 +545,11 @@ public void appendRangesTo(RangesBuffer buffer, int startIndex, int endIndex) {
545545
0x001f78, 0x001f79, INTEGER_OFFSET, 128,
546546
0x001f7a, 0x001f7b, INTEGER_OFFSET, 112,
547547
0x001f7c, 0x001f7d, INTEGER_OFFSET, 126,
548-
0x001f80, 0x001f87, INTEGER_OFFSET, 8,
549-
0x001f88, 0x001f8f, INTEGER_OFFSET, -8,
550-
0x001f90, 0x001f97, INTEGER_OFFSET, 8,
551-
0x001f98, 0x001f9f, INTEGER_OFFSET, -8,
552-
0x001fa0, 0x001fa7, INTEGER_OFFSET, 8,
553-
0x001fa8, 0x001faf, INTEGER_OFFSET, -8,
554548
0x001fb0, 0x001fb1, INTEGER_OFFSET, 8,
555-
0x001fb3, 0x001fb3, INTEGER_OFFSET, 9,
556549
0x001fb8, 0x001fb9, INTEGER_OFFSET, -8,
557550
0x001fba, 0x001fbb, INTEGER_OFFSET, -74,
558-
0x001fbc, 0x001fbc, INTEGER_OFFSET, -9,
559551
0x001fbe, 0x001fbe, DIRECT_MAPPING, 5,
560-
0x001fc3, 0x001fc3, INTEGER_OFFSET, 9,
561552
0x001fc8, 0x001fcb, INTEGER_OFFSET, -86,
562-
0x001fcc, 0x001fcc, INTEGER_OFFSET, -9,
563553
0x001fd0, 0x001fd1, INTEGER_OFFSET, 8,
564554
0x001fd8, 0x001fd9, INTEGER_OFFSET, -8,
565555
0x001fda, 0x001fdb, INTEGER_OFFSET, -100,
@@ -568,10 +558,8 @@ public void appendRangesTo(RangesBuffer buffer, int startIndex, int endIndex) {
568558
0x001fe8, 0x001fe9, INTEGER_OFFSET, -8,
569559
0x001fea, 0x001feb, INTEGER_OFFSET, -112,
570560
0x001fec, 0x001fec, INTEGER_OFFSET, -7,
571-
0x001ff3, 0x001ff3, INTEGER_OFFSET, 9,
572561
0x001ff8, 0x001ff9, INTEGER_OFFSET, -128,
573562
0x001ffa, 0x001ffb, INTEGER_OFFSET, -126,
574-
0x001ffc, 0x001ffc, INTEGER_OFFSET, -9,
575563
0x002132, 0x002132, INTEGER_OFFSET, 28,
576564
0x00214e, 0x00214e, INTEGER_OFFSET, -28,
577565
0x002160, 0x00216f, INTEGER_OFFSET, 16,
@@ -635,27 +623,7 @@ public void appendRangesTo(RangesBuffer buffer, int startIndex, int endIndex) {
635623
0x00ab53, 0x00ab53, INTEGER_OFFSET, -928,
636624
0x00ab70, 0x00abbf, INTEGER_OFFSET, -38864,
637625
0x00ff21, 0x00ff3a, INTEGER_OFFSET, 32,
638-
0x00ff41, 0x00ff5a, INTEGER_OFFSET, -32,
639-
0x010400, 0x010427, INTEGER_OFFSET, 40,
640-
0x010428, 0x01044f, INTEGER_OFFSET, -40,
641-
0x0104b0, 0x0104d3, INTEGER_OFFSET, 40,
642-
0x0104d8, 0x0104fb, INTEGER_OFFSET, -40,
643-
0x010570, 0x01057a, INTEGER_OFFSET, 39,
644-
0x01057c, 0x01058a, INTEGER_OFFSET, 39,
645-
0x01058c, 0x010592, INTEGER_OFFSET, 39,
646-
0x010594, 0x010595, INTEGER_OFFSET, 39,
647-
0x010597, 0x0105a1, INTEGER_OFFSET, -39,
648-
0x0105a3, 0x0105b1, INTEGER_OFFSET, -39,
649-
0x0105b3, 0x0105b9, INTEGER_OFFSET, -39,
650-
0x0105bb, 0x0105bc, INTEGER_OFFSET, -39,
651-
0x010c80, 0x010cb2, INTEGER_OFFSET, 64,
652-
0x010cc0, 0x010cf2, INTEGER_OFFSET, -64,
653-
0x0118a0, 0x0118bf, INTEGER_OFFSET, 32,
654-
0x0118c0, 0x0118df, INTEGER_OFFSET, -32,
655-
0x016e40, 0x016e5f, INTEGER_OFFSET, 32,
656-
0x016e60, 0x016e7f, INTEGER_OFFSET, -32,
657-
0x01e900, 0x01e921, INTEGER_OFFSET, 34,
658-
0x01e922, 0x01e943, INTEGER_OFFSET, -34
626+
0x00ff41, 0x00ff5a, INTEGER_OFFSET, -32
659627
});
660628

661629
private static final CaseFoldTableImpl UNICODE_TABLE_ENTRIES = new CaseFoldTableImpl(new int[]{

regex/src/com.oracle.truffle.regex/tools/generate_case_fold_table.clj

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
1-
#!/usr/bin/env boot
2-
31
; ------------------------------------------------------------------------------
4-
; Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
2+
; Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
53
; DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
64
;
75
; The Universal Permissive License (UPL), Version 1.0
@@ -48,8 +46,9 @@
4846
;; This script assumes that the current working directory contains a folder "dat"
4947
;; with the files NonUnicodeFoldTable.txt and UnicodeFoldTable.txt.
5048

51-
(require '[clojure.set :as set]
52-
'[clojure.string :as str])
49+
(ns generate-case-fold-table
50+
(:require [clojure.set :as set]
51+
[clojure.string :as str]))
5352

5453
(defn pairwise
5554
"Given a sequence `x_1`, `x_2`, `x_3`..., returns the sequence of pairs `[x_1 x_2]`, `[x_2 x_3]`..."
@@ -324,17 +323,17 @@
324323
footer "\n });\n"
325324
method-name-and-args (fn [entry]
326325
(case (:kind entry)
327-
:delta {:lo (:lo entry)
328-
:hi (:hi entry)
326+
:delta {:lo (:lo entry)
327+
:hi (:hi entry)
329328
:method-name "INTEGER_OFFSET"
330329
:arg (:delta entry)}
331-
:alternating {:lo (:lo entry)
330+
:alternating {:lo (:lo entry)
332331
:hi (:hi entry)
333332
:method-name (if (:aligned entry)
334333
"ALTERNATING_AL"
335334
"ALTERNATING_UL")
336335
:arg 0 }
337-
:class {:lo (:lo entry)
336+
:class {:lo (:lo entry)
338337
:hi (:hi entry)
339338
:method-name "DIRECT_MAPPING"
340339
:arg (:class-id entry)}))
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#!/usr/bin/env python3
2+
#
3+
# Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
4+
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5+
#
6+
# The Universal Permissive License (UPL), Version 1.0
7+
#
8+
# Subject to the condition set forth below, permission is hereby granted to any
9+
# person obtaining a copy of this software, associated documentation and/or
10+
# data (collectively the "Software"), free of charge and under any and all
11+
# copyright rights in the Software, and any and all patent rights owned or
12+
# freely licensable by each licensor hereunder covering either (i) the
13+
# unmodified Software as contributed to or provided by such licensor, or (ii)
14+
# the Larger Works (as defined below), to deal in both
15+
#
16+
# (a) the Software, and
17+
#
18+
# (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
19+
# one is included with the Software each a "Larger Work" to which the Software
20+
# is contributed by such licensors),
21+
#
22+
# without restriction, including without limitation the rights to copy, create
23+
# derivative works of, display, perform, and distribute the Software and make,
24+
# use, sell, offer for sale, import, export, have made, and have sold the
25+
# Software and the Larger Work(s), and to sublicense the foregoing rights on
26+
# either these or other terms.
27+
#
28+
# This license is subject to the following condition:
29+
#
30+
# The above copyright notice and either this complete permission notice or at a
31+
# minimum a reference to the UPL must be included in all copies or substantial
32+
# portions of the Software.
33+
#
34+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
35+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
37+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
38+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
39+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
40+
# SOFTWARE.
41+
42+
43+
# This reads dat/UnicodeData.txt and dat/SpecialCasing.txt and produces a file
44+
# that gives all the non-trivial pairs of inputs-outputs of the ECMAScript
45+
# Canonicalize when Unicode is false and IgnoreCase is true.
46+
47+
upper_map = {}
48+
for line in open("dat/UnicodeData.txt"):
49+
tokens = line.split(";")
50+
# Drop entries without toUppercase mapping
51+
if tokens[12].strip() == "":
52+
continue
53+
char = int(tokens[0].strip(), 16)
54+
upper = int(tokens[12].strip(), 16)
55+
upper_map[char] = [upper]
56+
57+
for line in open("dat/SpecialCasing.txt"):
58+
# Drop comments and empty lines
59+
if line.startswith("#") or line.strip() == "":
60+
continue
61+
tokens = line.split(";")
62+
# Drop entries with conditions
63+
if len(tokens) > 5:
64+
continue
65+
char = int(tokens[0].strip(), 16)
66+
upper = [int(c, 16) for c in tokens[3].split()]
67+
upper_map[char] = upper
68+
69+
for (char, upper) in upper_map.items():
70+
# Only follow rules which map to a single UTF-16 code unit
71+
if len(upper) > 1 or upper[0] >= 0x10000:
72+
continue
73+
# Do not allow non-ASCII characters to cross into ASCII.
74+
if char >= 128 and upper[0] < 128:
75+
continue
76+
# Drop trivial mappings
77+
if (char == upper[0]):
78+
continue
79+
print("%X;%X" % (char, upper[0]))

regex/src/com.oracle.truffle.regex/tools/run_scripts.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
# SOFTWARE.
4141
#
4242

43+
set -e
4344

4445
if [[ $(pwd) != *graal/regex/src/com.oracle.truffle.regex/tools ]]
4546
then
@@ -65,7 +66,7 @@ unzip -d dat dat/ucd.nounihan.flat.zip
6566

6667
./unicode-script.sh
6768

68-
clojure --init generate_case_fold_table.clj --eval '(-main)' > dat/case-fold-table.txt
69+
clojure -Sdeps '{:paths ["."]}' -M --main generate-case-fold-table > dat/case-fold-table.txt
6970

7071
./update_case_fold_table.py
7172

regex/src/com.oracle.truffle.regex/tools/unicode-script.sh

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env bash
22
#
3-
# Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
3+
# Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
44
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
55
#
66
# The Universal Permissive License (UPL), Version 1.0
@@ -40,6 +40,8 @@
4040
# SOFTWARE.
4141
#
4242

43+
set -e
44+
4345

4446
# This script takes the CaseFolding.txt and UnicodeData.txt files of the Unicode
4547
# character database and extracts from them the files UnicodeFoldTable.txt and
@@ -66,15 +68,12 @@ cat dat/CaseFolding.txt \
6668
> dat/UnicodeFoldTable.txt
6769

6870
# We produce the table for the Canonicalize abstract function when the Unicode
69-
# flag is not present. We use the UnicodeData.txt file and extract the
70-
# Uppercase_Character field. We remove entries which do not have an
71-
# Uppercase_Character mapping and entries which map from non-ASCII
72-
# code points (>= 128) to ASCII code points (< 128), as per the ECMAScript spec.
73-
cat dat/UnicodeData.txt \
74-
| cut -d\; -f1,13 \
75-
| sed -e '/;$/d' \
76-
-e '/^\(00[8-F][0-9A-F]\|0[^0][0-9A-F]\+\|[^0][0-9A-F]\+\);00[0-7][0-9A-F]$/d' \
77-
> dat/NonUnicodeFoldTable.txt
71+
# flag is not present. We extract the Unicode Case Conversion table from the
72+
# UnicodeData.txt and SpecialCasing.txt files. We remove entries which map from
73+
# non-ASCII code points (>= 128) to ASCII code points (< 128), as per the
74+
# ECMAScript spec. We also drop the special entries which produce strings of more
75+
# than one UTF-16 code unit.
76+
./generate_nonunicode_fold_table.py > dat/NonUnicodeFoldTable.txt
7877

7978
# In Python's case insensitive regular expressions, characters are considered
8079
# equivalent if they have the same Lowercase mapping. However, in some cases

0 commit comments

Comments
 (0)