Skip to content

Commit b1a91f3

Browse files
committed
Implement \p{Name=/.../} wildcards
This commit adds wildcard subpatterns for the Name and Name Aliases properties.
1 parent ffd8e51 commit b1a91f3

File tree

13 files changed

+530
-28
lines changed

13 files changed

+530
-28
lines changed

charclass_invlists.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -419864,7 +419864,7 @@ static const U8 WB_table[23][23] = {
419864419864
* baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt
419865419865
* 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
419866419866
* 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
419867-
* 3e37ae63c1a4f3084bba787a2c6ca020dad9d0d56e115c118fe8c68ac290ea7a lib/unicore/mktables
419867+
* 62a198b1430be086ac577285f5cbc0c2bde043a8ba469d85b256f1e191aa997d lib/unicore/mktables
419868419868
* 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
419869419869
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
419870419870
* 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl

embed.fnc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1946,6 +1946,10 @@ ERS |REGEXP*|compile_wildcard|NN const char * subpattern|const STRLEN len\
19461946
ES |I32 |execute_wildcard|NN REGEXP * const prog|NN char* stringarg \
19471947
|NN char* strend|NN char* strbeg \
19481948
|SSize_t minend |NN SV* screamer|U32 nosave
1949+
ES |bool |handle_names_wildcard \
1950+
|NN const char * wname \
1951+
|const STRLEN wname_len \
1952+
|NN SV ** prop_definition
19491953
ES |void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \
19501954
|NN SV** invlist
19511955
Ei |regnode_offset|handle_named_backref|NN RExC_state_t *pRExC_state \

embed.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1032,6 +1032,7 @@
10321032
#define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b)
10331033
#define grok_bslash_N(a,b,c,d,e,f,g) S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g)
10341034
#define handle_named_backref(a,b,c,d) S_handle_named_backref(aTHX_ a,b,c,d)
1035+
#define handle_names_wildcard(a,b,c) S_handle_names_wildcard(aTHX_ a,b,c)
10351036
#define handle_possible_posix(a,b,c,d,e) S_handle_possible_posix(aTHX_ a,b,c,d,e)
10361037
#define handle_regex_sets(a,b,c,d,e) S_handle_regex_sets(aTHX_ a,b,c,d,e)
10371038
#define handle_user_defined_property(a,b,c,d,e,f,g,h,i,j) S_handle_user_defined_property(aTHX_ a,b,c,d,e,f,g,h,i,j)

lib/_charnames.pm

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,14 @@ sub _loose_regcomp_lookup {
641641
);
642642
}
643643

644+
sub _get_names_info {
645+
# For use only by regcomp.c to compile \p{name=/.../}
646+
populate_txt() unless $txt;
647+
648+
649+
return ( \$txt, \@charnames::code_points_ending_in_code_point );
650+
}
651+
644652
sub import
645653
{
646654
shift; ## ignore class name

lib/charnames.t

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ my $run_slow_tests = $ENV{PERL_RUN_SLOW_TESTS} || 0;
1010
my $RUN_SLOW_TESTS_EVERY_CODE_POINT = 100;
1111

1212
# If $ENV{PERL_RUN_SLOW_TESTS} is at least 1 and less than the number above,
13-
# all code points with names are tested. If it is at least that number, all
14-
# 1,114,112 Unicode code points are tested.
13+
# all code points with names are tested, including wildcard search names. If
14+
# it is at least that number, all 1,114,112 Unicode code points are tested.
1515

1616
# Because \N{} is compile time, any warnings will get generated before
1717
# execution, so have to have an array, and arrange things so no warning
@@ -114,6 +114,7 @@ sub get_loose_name ($) { # Modify name to stress the loose tests.
114114
}
115115

116116
sub test_vianame ($$$) {
117+
CORE::state $wildcard_count = 0;
117118

118119
# Run the vianame tests on a code point, both loose and full
119120

@@ -126,23 +127,54 @@ sub test_vianame ($$$) {
126127
# Get a copy of the name modified to stress the loose tests.
127128
my $loose_name = get_loose_name($name);
128129

130+
my $right_anchor;
131+
129132
# Switch loose and full in vianame vs string_vianame half the time
130133
if (rand() < .5) {
131134
use charnames ":full";
132135
$all_pass &= is(charnames::vianame($name), $i, "Verify vianame(\"$name\") is 0x$hex");
133136
use charnames ":loose";
134137
$all_pass &= is(charnames::string_vianame($loose_name), chr($i), "Verify string_vianame(\"$loose_name\") is chr(0x$hex)");
138+
$right_anchor = '\\Z';
135139
}
136140
else {
137141
use charnames ":loose";
138142
$all_pass &= is(charnames::vianame($loose_name), $i, "Verify vianame(\"$loose_name\") is 0x$hex");
139143
use charnames ":full";
140144
$all_pass &= is(charnames::string_vianame($name), chr($i), "Verify string_vianame(\"$name\") is chr(0x$hex)");
145+
$right_anchor = '\\z';
141146
}
142147

148+
my $left_anchor = (rand() < .5) ? '^' : '\\A';
149+
143150
# \p{name=} is always loose matching
144151
$all_pass &= like(chr($i), qr/^\p{name=$loose_name}$/, "Verify /\p{name=$loose_name}/ matches chr(0x$hex)");
145152

153+
$wildcard_count++;
154+
155+
# Because wildcard name matching is so time-consuming, do it less
156+
# frequently than the others
157+
if ($wildcard_count >= 10) {
158+
$wildcard_count = 0;
159+
160+
# A few control characters have anomalous names containing
161+
# parentheses, which need to be escaped.
162+
my $name_ref = \$name;
163+
my $mod_name;
164+
if ($i <= 0x85) { # NEL in ASCII; affected controls are lower than
165+
# this in EBCDIC
166+
$mod_name = $name =~ s/([()])/\\$1/gr;
167+
$name_ref = \$mod_name;
168+
}
169+
170+
# Anchor the name at both ends, using the randomly chosen anchors.
171+
my $assembled = $left_anchor. $$name_ref . $right_anchor;
172+
173+
# \p{name=/.../} is always full matching
174+
$all_pass &= like(chr($i), qr!^\p{name=/$assembled/}!,
175+
"Verify /\p{name=/$assembled/} matches chr(0x$hex)");
176+
}
177+
146178
return $all_pass;
147179
}
148180

@@ -352,6 +384,10 @@ is(charnames::viacode("U+00000000000FEED"), "ARABIC LETTER WAW ISOLATED FORM", '
352384

353385
test_vianame(0x116C, "116C", "HANGUL JUNGSEONG OE");
354386
test_vianame(0x1180, "1180", "HANGUL JUNGSEONG O-E");
387+
like(chr(0x59C3), qr/\p{name=\/\ACJK UNIFIED IDEOGRAPH-59C3\z\/}/,
388+
'Verify name wildcards delimitters can be escaped');
389+
like(chr(0xD800), qr!\p{name=/\A\z/}!,
390+
'Verify works on matching an empty name');
355391

356392
{
357393
no warnings 'deprecated';

lib/unicore/mktables

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17897,9 +17897,12 @@ $loose_names
1789717897

1789817898
# And the following array gives the inverse mapping from code points to
1789917899
# names. Lowest code points are first
17900-
my \@code_points_ending_in_code_point = (
17900+
\@code_points_ending_in_code_point = (
1790117901
$code_points_ending_in_code_point
1790217902
);
17903+
17904+
# Is exportable, make read-only
17905+
Internals::SvREADONLY(\@code_points_ending_in_code_point, 1);
1790317906
END
1790417907
# Earlier releases didn't have Jamos. No sense outputting
1790517908
# them unless will be used.

lib/unicore/uni_keywords.pl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1295,7 +1295,7 @@
12951295
# baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt
12961296
# 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
12971297
# 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
1298-
# 3e37ae63c1a4f3084bba787a2c6ca020dad9d0d56e115c118fe8c68ac290ea7a lib/unicore/mktables
1298+
# 62a198b1430be086ac577285f5cbc0c2bde043a8ba469d85b256f1e191aa997d lib/unicore/mktables
12991299
# 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
13001300
# 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
13011301
# 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl

pod/perldelta.pod

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,17 @@ here, but most should go in the L</Performance Enhancements> section.
3131

3232
See L<https://www.unicode.org/versions/Unicode13.0.0/> for details.
3333

34+
=head2 It is now possible to write C<qr/\p{Name=...}/>, or
35+
C<qr!\p{na=/(SMILING|GRINNING) FACE/}!>
36+
37+
The Unicode Name property is now accessible in regular expression
38+
patterns, as an alternative to C<\N{...}>.
39+
A comparison of the two methods is given in
40+
L<perlunicode/Comparison of \N{...} and \p{name=...}>.
41+
42+
The second example above shows that wildcard subpatterns are also usable
43+
in this property. See L<perlunicode/Wildcards in Property Values>.
44+
3445
=head1 Security
3546

3647
XXX Any security-related notices go here. In particular, any security

pod/perlunicode.pod

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1066,30 +1066,58 @@ example,
10661066

10671067
would match the same things.
10681068

1069-
A warning is issued if none of the legal values for a property are
1070-
matched by your pattern. It's likely that a future release will raise a
1071-
warning if your pattern ends up causing every possible code point to
1072-
match.
1073-
10741069
Another example that shows that within C<\p{...}>, C</x> isn't needed to
10751070
have spaces:
10761071

10771072
qr!\p{scx= /Hebrew|Greek/ }!
10781073

10791074
To be safe, we should have anchored the above example, to prevent
10801075
matches for something like C<Hebrew_Braille>, but there aren't
1081-
any script names like that.
1076+
any script names like that, so far.
1077+
A warning is issued if none of the legal values for a property are
1078+
matched by your pattern. It's likely that a future release will raise a
1079+
warning if your pattern ends up causing every possible code point to
1080+
match.
1081+
1082+
Starting in 5.32, the Name and Name Aliases properties are allowed to be
1083+
matched. They are considered to be a single combination property, just
1084+
as has long been the case for C<\N{}>. Loose matching doesn't work in
1085+
exactly the same way for these as it does for the values of other
1086+
properties. The rules are given in
1087+
L<https://www.unicode.org/reports/tr44/tr44-24.html#UAX44-LM2>. As a
1088+
result, Perl doesn't try loose matching for you, like it does in other
1089+
properties. All letters in names are uppercase, but you can add C<(?i)>
1090+
to your subpattern to ignore case. If you're uncertain where a blank
1091+
is, you can use C< ?> in your subpattern. No character name contains an
1092+
underscore, so don't bother trying to match one. The use of hyphens is
1093+
particularly problematic; refer to the above link. But note that, as of
1094+
Unicode 13.0, the only script in modern usage whose names have hyphen
1095+
anomalies is Tibetan, plus the two Korean characters U+116C HANGUL JUNGSEONG
1096+
OE and U+1180 HANGUL JUNGSEONG O-E. Unicode makes no promises to not
1097+
add hyphen-problematic names in the future.
1098+
1099+
Using wildcards on these is resource intensive, given the hundreds of
1100+
thousands of legal names that must be checked against.
1101+
1102+
An example of using Name property wildcards is
1103+
1104+
qr!\p{name=/(SMILING|GRINNING) FACE/}!
1105+
1106+
Another is
1107+
1108+
qr/(?[ \p{name=\/CJK\/} - \p{ideographic} ])/
1109+
1110+
which matches the 200-ish (as of Unicode 13.0) CJK characters that aren't
1111+
ideographs.
10821112

1083-
There are certain properties that it doesn't currently work with. These
1084-
are:
1113+
There are certain properties that wildcard subpatterns don't currently
1114+
work with. These are:
10851115

10861116
Bidi Mirroring Glyph
10871117
Bidi Paired Bracket
10881118
Case Folding
10891119
Decomposition Mapping
10901120
Equivalent Unified Ideograph
1091-
Name
1092-
Name Alias
10931121
Lowercase Mapping
10941122
NFKC Case Fold
10951123
Titlecase Mapping

proto.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5721,6 +5721,9 @@ PERL_STATIC_INLINE regnode_offset S_handle_named_backref(pTHX_ RExC_state_t *pRE
57215721
#define PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF \
57225722
assert(pRExC_state); assert(flagp); assert(parse_start)
57235723
#endif
5724+
STATIC bool S_handle_names_wildcard(pTHX_ const char * wname, const STRLEN wname_len, SV ** prop_definition);
5725+
#define PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD \
5726+
assert(wname); assert(prop_definition)
57245727
STATIC int S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, const char* const s, char ** updated_parse_ptr, AV** posix_warnings, const bool check_only);
57255728
#define PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX \
57265729
assert(pRExC_state); assert(s)

0 commit comments

Comments
 (0)