diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 147ab795c621..9aca18c31470 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -373,6 +373,41 @@ consisted of only ASCII characters. The real upper limit was as few as Chinese or Osage. Now an identifier in any language may contain at least 255 characters. +=item * + +The allowed characters for regular expression capture group names have +been corrected to conform to Perl identifier syntax, which in turn is +based on public Unicode rules. The net result of this change is that, +as of Unicode 17.0, about 160 characters that formerly were allowed to +be in an identifier no longer are. Only programs that do +L<C<use utf8>|utf8> can be affected, and then only characters that +appear in the 2nd or later positions of the name. The characters that +an identifier name can begin with are unchanged. + +130 of the now unacceptable characters are 5 sets of 26 Latin letters +that are enclosed by some shape, such as CIRCLED LATIN CAPITAL LETTER N. +Another 8 are generic modifiers that add shapes around other characters; +5 are modifiers to Cyrillic numbers; and 16 are Arabic ligatures and +isolated forms. The other two are GREEK YPOGEGRAMMENI and VERTICAL +TILDE. + +You can get a complete list of them by running the following program: + + perl -le 'use re qw(Debug COMPILE); qr/(?[ \w - \p{XIDC} ])/' + +Look near the final line. The one that begins C contains a +list of 4- and 5-digit hexadecimal numbers. These are the Unicode code +points that were previously allowed, but no longer are. + +(Long after Perl identifier rules were formed, Unicode has added +recommendations to further restrict legal identifier names. These were +added to counter cases where, for example, programmers snuck code past +reviewers using characters that look like other ones. The two +properties are C<Identifier_Status> and C<Identifier_Type>. +L<https://www.unicode.org/reports/tr39/>. Perl currently doesn't do +anything with these, except to furnish you the ability to use them in +regular expressions.) 
+ =back =head1 Known Problems diff --git a/pod/perldiag.pod b/pod/perldiag.pod index e6250bd970ee..40bf32ae0572 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -2867,8 +2867,9 @@ has since been undefined. S<<-- HERE> in m/%s/ (F) Group names must follow the rules for perl identifiers, meaning -they must start with a non-digit word character. A common cause of -this error is using (?&0) instead of (?0). See L<perlre>. +that ASCII-range ones must start with a non-digit word character. A +common cause of this error is using (?&0) instead of (?0). See +L<perlre> and L<perldata/Identifier parsing>. =item ()-group starts with a count diff --git a/pod/perlre.pod b/pod/perlre.pod index f128eb717688..6f898c3bf178 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -1198,7 +1198,9 @@ You can dispense with numbers altogether and create named capture groups. The notation is C<(?E<lt>I<name>E<gt>...)> to declare and C<\g{I<name>}> to reference. (To be compatible with .Net regular expressions, C<\g{I<name>}> may also be written as C<\k{I<name>}>, C<\kE<lt>I<name>E<gt>> or C<\k'I<name>'>.) -I<name> must not begin with a number, nor contain hyphens. +I<name> must follow the rules for perl identifiers +(L<perldata/Identifier parsing>) which means, for example, that it +can't begin with a number, nor contain hyphens. When different groups within the same pattern have the same name, any reference to that name assumes the leftmost defined group. 
Named groups count in absolute and relative numbering, and so can also be referred to by those diff --git a/regcomp.c b/regcomp.c index 134a59fd03c0..0dbea26a4ffd 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2530,7 +2530,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) do { RExC_parse_advance(advance); } while ( RExC_parse < RExC_end - && (advance = isWORDCHAR_utf8_safe( (U8 *) RExC_parse, + && (advance = isIDCONT_utf8_safe( (U8 *) RExC_parse, (U8 *) RExC_end))); } else { RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending diff --git a/t/re/pat.t b/t/re/pat.t index ce826a5730d1..544348e78288 100644 --- a/t/re/pat.t +++ b/t/re/pat.t @@ -28,7 +28,7 @@ skip_all_without_unicode_tables(); my $has_locales = locales_enabled('LC_CTYPE'); my $utf8_locale = find_utf8_ctype_locale(); -plan tests => 1296; # Update this when adding/deleting tests. +plan tests => 1298; # Update this when adding/deleting tests. run_tests() unless caller; @@ -1388,6 +1388,28 @@ EOP fresh_perl_like($prog, qr!Group name must start with a non-digit word character!, {}, sprintf("'U+%04X not legal IDFirst'", ord($char))); } + + foreach my $char (chr(0x2115), chr(0x24B7)) { + my $prog = <<"EOP"; +use utf8;; +no warnings 'utf8'; +print 0 + "abc" =~ qr/(?abc)/; +EOP + utf8::encode($prog); + if ($char =~ /\p{XID_Continue}/) { + fresh_perl_is($prog, 1, + {}, + sprintf("U+%04X is legal IDCont", + ord($char))); + } + else { + fresh_perl_like($prog, + qr/Sequence .* not terminated/, + {}, + sprintf("U+%04X not legal IDCont", + ord($char))); + } + } } { # [perl #101710]