Implement \p{Name=/.../} wildcards

khwilliamson · khwilliamson · commit b1a91f306fb9 · 2020-03-11T09:00:04.000-06:00
This commit adds wildcard subpatterns for the Name and Name Aliases
properties.
diff --git a/charclass_invlists.h b/charclass_invlists.h
@@ -419864,7 +419864,7 @@ static const U8 WB_table[23][23] = {
  * baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt
  * 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
  * 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
- * 3e37ae63c1a4f3084bba787a2c6ca020dad9d0d56e115c118fe8c68ac290ea7a lib/unicore/mktables
+ * 62a198b1430be086ac577285f5cbc0c2bde043a8ba469d85b256f1e191aa997d lib/unicore/mktables
  * 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
  * 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
  * 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl
diff --git a/embed.fnc b/embed.fnc
@@ -1946,6 +1946,10 @@ ERS	|REGEXP*|compile_wildcard|NN const char * subpattern|const STRLEN len\
 ES	|I32	|execute_wildcard|NN REGEXP * const prog|NN char* stringarg \
 				|NN char* strend|NN char* strbeg \
 				|SSize_t minend |NN SV* screamer|U32 nosave
+ES	|bool	|handle_names_wildcard					    \
+				|NN const char * wname			    \
+				|const STRLEN wname_len			    \
+				|NN SV ** prop_definition
 ES	|void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \
 				|NN SV** invlist
 Ei	|regnode_offset|handle_named_backref|NN RExC_state_t *pRExC_state   \
diff --git a/embed.h b/embed.h
@@ -1032,6 +1032,7 @@
 #define get_ANYOF_cp_list_for_ssc(a,b)	S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b)
 #define grok_bslash_N(a,b,c,d,e,f,g)	S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g)
 #define handle_named_backref(a,b,c,d)	S_handle_named_backref(aTHX_ a,b,c,d)
+#define handle_names_wildcard(a,b,c)	S_handle_names_wildcard(aTHX_ a,b,c)
 #define handle_possible_posix(a,b,c,d,e)	S_handle_possible_posix(aTHX_ a,b,c,d,e)
 #define handle_regex_sets(a,b,c,d,e)	S_handle_regex_sets(aTHX_ a,b,c,d,e)
 #define handle_user_defined_property(a,b,c,d,e,f,g,h,i,j)	S_handle_user_defined_property(aTHX_ a,b,c,d,e,f,g,h,i,j)
diff --git a/lib/_charnames.pm b/lib/_charnames.pm
@@ -641,6 +641,14 @@ sub _loose_regcomp_lookup {
                     );
 }
 
+sub _get_names_info {
+  # For use only by regcomp.c to compile \p{name=/.../}
+  populate_txt() unless $txt;   # Fill in $txt on first use
+
+  # Hand back references to $txt and to the code point table, not copies
+  return ( \$txt, \@charnames::code_points_ending_in_code_point );
+}
+
 sub import
 {
   shift; ## ignore class name
diff --git a/lib/charnames.t b/lib/charnames.t
@@ -10,8 +10,8 @@ my $run_slow_tests = $ENV{PERL_RUN_SLOW_TESTS} || 0;
 my $RUN_SLOW_TESTS_EVERY_CODE_POINT = 100;
 
 # If $ENV{PERL_RUN_SLOW_TESTS} is at least 1 and less than the number above,
-# all code points with names are tested.  If it is at least that number, all
-# 1,114,112 Unicode code points are tested.
+# all code points with names are tested, including wildcard search names.  If
+# it is at least that number, all 1,114,112 Unicode code points are tested.
 
 # Because \N{} is compile time, any warnings will get generated before
 # execution, so have to have an array, and arrange things so no warning
@@ -114,6 +114,7 @@ sub get_loose_name ($) { # Modify name to stress the loose tests.
 }
 
 sub test_vianame ($$$) {
+    CORE::state $wildcard_count = 0;
 
     # Run the vianame tests on a code point, both loose and full
 
@@ -126,23 +127,54 @@ sub test_vianame ($$$) {
     # Get a copy of the name modified to stress the loose tests.
     my $loose_name = get_loose_name($name);
 
+    my $right_anchor;
+
     # Switch loose and full in vianame vs string_vianame half the time
     if (rand() < .5) {
         use charnames ":full";
         $all_pass &= is(charnames::vianame($name), $i, "Verify vianame(\"$name\") is 0x$hex");
         use charnames ":loose";
         $all_pass &= is(charnames::string_vianame($loose_name), chr($i), "Verify string_vianame(\"$loose_name\") is chr(0x$hex)");
+        $right_anchor = '\\Z';
     }
     else {
         use charnames ":loose";
         $all_pass &= is(charnames::vianame($loose_name), $i, "Verify vianame(\"$loose_name\") is 0x$hex");
         use charnames ":full";
         $all_pass &= is(charnames::string_vianame($name), chr($i), "Verify string_vianame(\"$name\") is chr(0x$hex)");
+        $right_anchor = '\\z';
     }
 
+    my $left_anchor = (rand() < .5) ? '^' : '\\A';
+
     # \p{name=} is always loose matching
     $all_pass &= like(chr($i), qr/^\p{name=$loose_name}$/, "Verify /\p{name=$loose_name}/ matches chr(0x$hex)");
 
+    $wildcard_count++;
+
+    # Because wildcard name matching is so real-time intensive, do it less
+    # frequently than the others
+    if ($wildcard_count >= 10) {
+        $wildcard_count = 0;
+
+        # A few control characters have anomalous names containing
+        # parentheses, which need to be escaped.
+        my $name_ref = \$name;
+        my $mod_name;
+        if ($i <= 0x85) {   # NEL in ASCII; affected controls are lower than
+                            # this in EBCDIC
+            $mod_name = $name =~ s/([()])/\\$1/gr;
+            $name_ref = \$mod_name;
+        }
+
+        # We anchor the name, randomly with the possible anchors.
+        my $assembled = $left_anchor. $$name_ref . $right_anchor;
+
+        # \p{name=/.../} is always full matching
+        $all_pass &= like(chr($i), qr!^\p{name=/$assembled/}!,
+                          "Verify /\p{name=/$assembled/} matches chr(0x$hex)");
+    }
+
     return $all_pass;
 }
 
@@ -352,6 +384,10 @@ is(charnames::viacode("U+00000000000FEED"), "ARABIC LETTER WAW ISOLATED FORM", '
 
 test_vianame(0x116C, "116C", "HANGUL JUNGSEONG OE");
 test_vianame(0x1180, "1180", "HANGUL JUNGSEONG O-E");
+like(chr(0x59C3), qr/\p{name=\/\ACJK UNIFIED IDEOGRAPH-59C3\z\/}/,
+         'Verify name wildcards delimitters can be escaped');
+like(chr(0xD800), qr!\p{name=/\A\z/}!,
+                                'Verify works on matching an empty name');
 
 {
     no warnings 'deprecated';
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
@@ -17897,9 +17897,12 @@ $loose_names
 
     # And the following array gives the inverse mapping from code points to
     # names.  Lowest code points are first
-    my \@code_points_ending_in_code_point = (
+    \@code_points_ending_in_code_point = (
 $code_points_ending_in_code_point
     );
+
+    # Is exportable, make read-only
+    Internals::SvREADONLY(\@code_points_ending_in_code_point, 1);
 END
     # Earlier releases didn't have Jamos.  No sense outputting
     # them unless will be used.
diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl
@@ -1295,7 +1295,7 @@
 # baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt
 # 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
 # 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
-# 3e37ae63c1a4f3084bba787a2c6ca020dad9d0d56e115c118fe8c68ac290ea7a lib/unicore/mktables
+# 62a198b1430be086ac577285f5cbc0c2bde043a8ba469d85b256f1e191aa997d lib/unicore/mktables
 # 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
 # 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
 # 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
@@ -31,6 +31,17 @@ here, but most should go in the L</Performance Enhancements> section.
 
 See L<https://www.unicode.org/versions/Unicode13.0.0/> for details.
 
+=head2 It is now possible to write C<qr/\p{Name=...}/>, or
+C<qr!\p{na=/(SMILING|GRINNING) FACE/}!>
+
+The Unicode Name property is now accessible in regular expression
+patterns, as an alternative to C<\N{...}>.
+A comparison of the two methods is given in
+L<perlunicode/Comparison of \N{...} and \p{name=...}>.
+
+The second example above shows that wildcard subpatterns are also usable
+in this property.  See L<perlunicode/Wildcards in Property Values>.
+
 =head1 Security
 
 XXX Any security-related notices go here.  In particular, any security
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
@@ -1066,30 +1066,58 @@ example,
 
 would match the same things.
 
-A warning is issued if none of the legal values for a property are
-matched by your pattern.  It's likely that a future release will raise a
-warning if your pattern ends up causing every possible code point to
-match.
-
 Another example that shows that within C<\p{...}>, C</x> isn't needed to
 have spaces:
 
  qr!\p{scx= /Hebrew|Greek/ }!
 
 To be safe, we should have anchored the above example, to prevent
 matches for something like C<Hebrew_Braille>, but there aren't
-any script names like that.
+any script names like that, so far.
+A warning is issued if none of the legal values for a property are
+matched by your pattern.  It's likely that a future release will raise a
+warning if your pattern ends up causing every possible code point to
+match.
+
+Starting in 5.32, the Name and Name Aliases properties are allowed to be
+matched.  They are considered to be a single combination property, just
+as has long been the case for C<\N{}>.  Loose matching doesn't work in
+exactly the same way for these as it does for the values of other
+properties.  The rules are given in
+L<https://www.unicode.org/reports/tr44/tr44-24.html#UAX44-LM2>.  As a
+result, Perl doesn't try loose matching for you, like it does in other
+properties.  All letters in names are uppercase, but you can add C<(?i)>
+to your subpattern to ignore case.  If you're uncertain where a blank
+is, you can use C< ?> in your subpattern.  No character name contains an
+underscore, so don't bother trying to match one.  The use of hyphens is
+particularly problematic; refer to the above link.  But note that, as of
+Unicode 13.0, the only script in modern usage with hyphen anomalies is
+Tibetan; the two Korean characters U+116C HANGUL JUNGSEONG OE and U+1180
+HANGUL JUNGSEONG O-E are also affected.  Unicode makes no promise not to
+add hyphen-problematic names in the future.
+
+Using wildcards on these is resource intensive, given the hundreds of
+thousands of legal names that must be checked against.
+
+An example of using Name property wildcards is
+
+ qr!\p{name=/(SMILING|GRINNING) FACE/}!
+
+Another is
+
+ qr/(?[ \p{name=\/CJK\/} - \p{ideographic} ])/
+
+which matches the 200-ish (as of Unicode 13.0) CJK characters that aren't
+ideographs.
 
-There are certain properties that it doesn't currently work with.  These
-are:
+There are certain properties that wildcard subpatterns don't currently
+work with.  These are:
 
  Bidi Mirroring Glyph
  Bidi Paired Bracket
  Case Folding
  Decomposition Mapping
  Equivalent Unified Ideograph
- Name
- Name Alias
  Lowercase Mapping
  NFKC Case Fold
  Titlecase Mapping
diff --git a/proto.h b/proto.h
@@ -5721,6 +5721,9 @@ PERL_STATIC_INLINE regnode_offset	S_handle_named_backref(pTHX_ RExC_state_t *pRE
 #define PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF	\
 	assert(pRExC_state); assert(flagp); assert(parse_start)
 #endif
+STATIC bool	S_handle_names_wildcard(pTHX_ const char * wname, const STRLEN wname_len, SV ** prop_definition);
+#define PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD	\
+	assert(wname); assert(prop_definition)
 STATIC int	S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, const char* const s, char ** updated_parse_ptr, AV** posix_warnings, const bool check_only);
 #define PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX	\
 	assert(pRExC_state); assert(s)
diff --git a/regcharclass.h b/regcharclass.h
@@ -2247,7 +2247,7 @@
  * baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt
  * 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
  * 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
- * 3e37ae63c1a4f3084bba787a2c6ca020dad9d0d56e115c118fe8c68ac290ea7a lib/unicore/mktables
+ * 62a198b1430be086ac577285f5cbc0c2bde043a8ba469d85b256f1e191aa997d lib/unicore/mktables
  * 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
  * 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
  * f9a393e7add8c7c2728356473ce5b52246d51295b2da0c48fb6f0aa21799e2bb regen/regcharclass.pl
diff --git a/regcomp.c b/regcomp.c
diff --git a/uni_keywords.h b/uni_keywords.h

Original file line number	Diff line number	Diff line change
`@@ -641,6 +641,14 @@ sub _loose_regcomp_lookup {`
`641`	`641`	`);`
`642`	`642`	`}`
`643`	`643`
	`644`	`+sub _get_names_info {`
	`645`	`+ # For use only by regcomp.c to compile \p{name=/.../}`
	`646`	`+ populate_txt() unless $txt;`
	`647`	`+`
	`648`	`+`
	`649`	`+ return ( \$txt, \@charnames::code_points_ending_in_code_point );`
	`650`	`+}`
	`651`	`+`
`644`	`652`	`sub import`
`645`	`653`	`{`
`646`	`654`	`shift; ## ignore class name`