From dd861d2d4038d30eb239bbf2654e149cf553ae73 Mon Sep 17 00:00:00 2001 From: Tim Starling Date: Thu, 3 Nov 2022 21:05:47 +1100 Subject: [PATCH 1/3] Updates for RFC: Locale-independent case conversion * Add a detailed description of what is meant by ASCII case conversion to strtolower() and strtoupper(). * Replace locale-related text on ucfirst(), lcfirst() and ucwords() with text about ASCII case conversion. * Remove entity note.locale-single-byte since it was only used on ucwords(). * Add changelog entries to all the functions mentioned in the RFC. --- language-snippets.ent | 5 --- reference/array/constants.xml | 14 ++++---- reference/strings/functions/lcfirst.xml | 31 +++++++++++++---- reference/strings/functions/setlocale.xml | 2 +- reference/strings/functions/str-ireplace.xml | 24 +++++++++++++ reference/strings/functions/stripos.xml | 8 +++++ reference/strings/functions/stristr.xml | 8 +++++ reference/strings/functions/strripos.xml | 8 +++++ reference/strings/functions/strtolower.xml | 36 +++++++++++++++++--- reference/strings/functions/strtoupper.xml | 36 +++++++++++++++++--- reference/strings/functions/ucfirst.xml | 32 +++++++++++++---- reference/strings/functions/ucwords.xml | 33 ++++++++++++++++-- 12 files changed, 203 insertions(+), 34 deletions(-) diff --git a/language-snippets.ent b/language-snippets.ent index e17b5069097f..8d9758fdad3e 100644 --- a/language-snippets.ent +++ b/language-snippets.ent @@ -28,11 +28,6 @@ cryptographically secure value, consider using random_int, This function is binary-safe.'> -This function is locale-aware -and will handle input according to the currently set locale. However, it only works on single-byte character sets. -If you need to use multibyte characters (most non-western-European languages) look at the -multibyte or intl extensions instead.'> - The results of this function are cached. 
See clearstatcache for more details.'> diff --git a/reference/array/constants.xml b/reference/array/constants.xml index 3e05e33685de..b9d14a897fdf 100644 --- a/reference/array/constants.xml +++ b/reference/array/constants.xml @@ -15,7 +15,8 @@ CASE_LOWER is used with array_change_key_case and is used to convert array keys to lower case. This is also the default case for - array_change_key_case. + array_change_key_case. Since PHP 8.2.0, only ASCII + characters will be converted. @@ -28,7 +29,8 @@ CASE_UPPER is used with array_change_key_case and is used to convert array - keys to upper case. + keys to upper case. Since PHP 8.2.0, only ASCII characters will be + converted. @@ -130,10 +132,10 @@ - SORT_FLAG_CASE can be combined - (bitwise OR) with - SORT_STRING or - SORT_NATURAL to sort strings case-insensitively. + SORT_FLAG_CASE can be combined (bitwise OR) with + SORT_STRING or SORT_NATURAL to + sort strings case-insensitively. Since PHP 8.2.0, only ASCII case folding + will be done. diff --git a/reference/strings/functions/lcfirst.xml b/reference/strings/functions/lcfirst.xml index adc6e721c775..663d161ca77c 100644 --- a/reference/strings/functions/lcfirst.xml +++ b/reference/strings/functions/lcfirst.xml @@ -15,12 +15,8 @@ Returns a string with the first character of string lowercased if that character is - alphabetic. - - - Note that 'alphabetic' is determined by the current locale. For - instance, in the default "C" locale characters such as umlaut-a - (ä) will not be converted. + an ASCII character in the range "A" (0x41) to + "Z" (0x5a). @@ -47,6 +43,29 @@ + + &reftitle.changelog; + + + + + &Version; + &Description; + + + + + 8.2.0 + + Case conversion no longer depends on the locale set with + setlocale. Only ASCII characters will be converted. 
+ + + + + + + &reftitle.examples; diff --git a/reference/strings/functions/setlocale.xml b/reference/strings/functions/setlocale.xml index ca6dea0bd135..23566562e31a 100644 --- a/reference/strings/functions/setlocale.xml +++ b/reference/strings/functions/setlocale.xml @@ -62,7 +62,7 @@ LC_CTYPE for character classification and conversion, for - example strtoupper + example ctype_alpha diff --git a/reference/strings/functions/str-ireplace.xml b/reference/strings/functions/str-ireplace.xml index 1ebd3baae7e3..336863bdd5df 100644 --- a/reference/strings/functions/str-ireplace.xml +++ b/reference/strings/functions/str-ireplace.xml @@ -97,6 +97,30 @@ + + &reftitle.changelog; + + + + + &Version; + &Description; + + + + + 8.2.0 + + Case folding no longer depends on the locale set with + setlocale. Only ASCII case folding will be done. + Non-ASCII bytes will be compared by their byte value. + + + + + + + &reftitle.examples; diff --git a/reference/strings/functions/stripos.xml b/reference/strings/functions/stripos.xml index fb71929df663..33a109b549d7 100644 --- a/reference/strings/functions/stripos.xml +++ b/reference/strings/functions/stripos.xml @@ -84,6 +84,14 @@ + + 8.2.0 + + Case folding no longer depends on the locale set with + setlocale. Only ASCII case folding will be done. + Non-ASCII bytes will be compared by their byte value. + + 8.0.0 diff --git a/reference/strings/functions/stristr.xml b/reference/strings/functions/stristr.xml index 6ebea767a179..ad6543aa6385 100644 --- a/reference/strings/functions/stristr.xml +++ b/reference/strings/functions/stristr.xml @@ -76,6 +76,14 @@ + + 8.2.0 + + Case folding no longer depends on the locale set with + setlocale. Only ASCII case folding will be done. + Non-ASCII bytes will be compared by their byte value. 
+ + 8.0.0 diff --git a/reference/strings/functions/strripos.xml b/reference/strings/functions/strripos.xml index 3ce8d1f2db70..816a107a6d17 100644 --- a/reference/strings/functions/strripos.xml +++ b/reference/strings/functions/strripos.xml @@ -98,6 +98,14 @@ + + 8.2.0 + + Case folding no longer depends on the locale set with + setlocale. Only ASCII case folding will be done. + Non-ASCII bytes will be compared by their byte value. + + 8.0.0 diff --git a/reference/strings/functions/strtolower.xml b/reference/strings/functions/strtolower.xml index 0fe2251a83b5..fab8c9daf5d0 100644 --- a/reference/strings/functions/strtolower.xml +++ b/reference/strings/functions/strtolower.xml @@ -13,13 +13,18 @@ stringstring - Returns string with all alphabetic characters + Returns string with all ASCII alphabetic characters converted to lowercase. - Note that 'alphabetic' is determined by the current locale. This means - that e.g. in the default "C" locale, characters such as umlaut-A - (Ä) will not be converted. + Bytes in the range "A" (0x41) to "Z" + (0x5a) will be converted to the corresponding lowercase letter by adding 32 + to each byte value. + + + This can be used to convert ASCII characters within strings encoded with + UTF-8, since multibyte UTF-8 characters will be ignored. To convert multibyte + non-ASCII characters, use mb_strtolower. @@ -46,6 +51,29 @@ + + &reftitle.changelog; + + + + + &Version; + &Description; + + + + + 8.2.0 + + Case conversion no longer depends on the locale set with + setlocale. Only ASCII characters will be converted. + + + + + + + &reftitle.examples; diff --git a/reference/strings/functions/strtoupper.xml b/reference/strings/functions/strtoupper.xml index 15c11d7d744e..b51cf32f6a46 100644 --- a/reference/strings/functions/strtoupper.xml +++ b/reference/strings/functions/strtoupper.xml @@ -13,13 +13,18 @@ stringstring - Returns string with all alphabetic characters + Returns string with all ASCII alphabetic characters converted to uppercase. 
- Note that 'alphabetic' is determined by the current locale. For instance, - in the default "C" locale characters such as umlaut-a (ä) will not be - converted. + Bytes in the range "a" (0x61) to "z" + (0x7a) will be converted to the corresponding uppercase letter by subtracting + 32 from each byte value. + + + This can be used to convert ASCII characters within strings encoded with + UTF-8, since multibyte UTF-8 characters will be ignored. To convert multibyte + non-ASCII characters, use mb_strtoupper. @@ -46,6 +51,29 @@ + + &reftitle.changelog; + + + + + &Version; + &Description; + + + + + 8.2.0 + + Case conversion no longer depends on the locale set with + setlocale. Only ASCII characters will be converted. + + + + + + + &reftitle.examples; diff --git a/reference/strings/functions/ucfirst.xml b/reference/strings/functions/ucfirst.xml index e77c7bf62559..982ad351d151 100644 --- a/reference/strings/functions/ucfirst.xml +++ b/reference/strings/functions/ucfirst.xml @@ -15,12 +15,8 @@ Returns a string with the first character of string capitalized, if that character is - alphabetic. - - - Note that 'alphabetic' is determined by the current locale. For - instance, in the default "C" locale characters such as umlaut-a - (ä) will not be converted. + an ASCII character in the range from "a" (0x61) to + "z" (0x7a). @@ -47,6 +43,29 @@ + + &reftitle.changelog; + + + + + &Version; + &Description; + + + + + 8.2.0 + + Case conversion no longer depends on the locale set with + setlocale. Only ASCII characters will be converted. + + + + + + + &reftitle.examples; @@ -76,6 +95,7 @@ $bar = ucfirst(strtolower($bar)); // Hello world! 
strtolower strtoupper ucwords + mb_convert_case diff --git a/reference/strings/functions/ucwords.xml b/reference/strings/functions/ucwords.xml index 79d7a9b5af84..7f481d76b6f0 100644 --- a/reference/strings/functions/ucwords.xml +++ b/reference/strings/functions/ucwords.xml @@ -15,13 +15,20 @@ Returns a string with the first character of each word in - string capitalized, if that character is alphabetic. + string capitalized, if that character is an ASCII + character between "a" (0x61) and "z" + (0x7a). For this function, a word is a string of characters that are not listed in the separators parameter. By default, these are: space, horizontal tab, carriage return, newline, form-feed and vertical tab. + + To do a similar conversion on multibyte strings, use + mb_convert_case with the MB_CASE_TITLE + mode. + @@ -55,6 +62,29 @@ + + &reftitle.changelog; + + + + + &Version; + &Description; + + + + + 8.2.0 + + Case conversion no longer depends on the locale set with + setlocale. Only ASCII characters will be converted. + + + + + + + &reftitle.examples; @@ -110,7 +140,6 @@ $baz = ucwords($foo, " \t\r\n\f\v'"); // Mike O'Hara &reftitle.notes; - &note.locale-single-byte; &note.bin-safe; From a8216fb643703232ab4f62fe0dc6dbf213fb58a0 Mon Sep 17 00:00:00 2001 From: Tim Starling Date: Fri, 4 Nov 2022 23:19:18 +1100 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: George Peter Banyard --- reference/array/constants.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/reference/array/constants.xml b/reference/array/constants.xml index b9d14a897fdf..84e355bd5c8c 100644 --- a/reference/array/constants.xml +++ b/reference/array/constants.xml @@ -15,7 +15,7 @@ CASE_LOWER is used with array_change_key_case and is used to convert array keys to lower case. This is also the default case for - array_change_key_case. Since PHP 8.2.0, only ASCII + array_change_key_case. As of PHP 8.2.0, only ASCII characters will be converted. 
@@ -29,7 +29,7 @@ CASE_UPPER is used with array_change_key_case and is used to convert array - keys to upper case. Since PHP 8.2.0, only ASCII characters will be + keys to upper case. As of PHP 8.2.0, only ASCII characters will be converted. @@ -134,7 +134,7 @@ SORT_FLAG_CASE can be combined (bitwise OR) with SORT_STRING or SORT_NATURAL to - sort strings case-insensitively. Since PHP 8.2.0, only ASCII case folding + sort strings case-insensitively. As of PHP 8.2.0, only ASCII case folding will be done. From a98fffb08fc25919489e2271a3dce5a0c956f835 Mon Sep 17 00:00:00 2001 From: Tim Starling Date: Mon, 7 Nov 2022 11:43:49 +1100 Subject: [PATCH 3/3] Use entities for changelog entries --- language-snippets.ent | 21 ++++++++++++++++++++ reference/strings/functions/lcfirst.xml | 8 +------- reference/strings/functions/str-ireplace.xml | 9 +-------- reference/strings/functions/stripos.xml | 9 +-------- reference/strings/functions/stristr.xml | 9 +-------- reference/strings/functions/strripos.xml | 9 +-------- reference/strings/functions/strtolower.xml | 8 +------- reference/strings/functions/strtoupper.xml | 8 +------- reference/strings/functions/ucfirst.xml | 8 +------- reference/strings/functions/ucwords.xml | 8 +------- 10 files changed, 30 insertions(+), 67 deletions(-) diff --git a/language-snippets.ent b/language-snippets.ent index 8d9758fdad3e..4c09ef10b434 100644 --- a/language-snippets.ent +++ b/language-snippets.ent @@ -3691,6 +3691,27 @@ local: { '> + + 8.2.0 + + Case conversion no longer depends on the locale set with + setlocale. Only ASCII characters will be converted. + + +'> + + + 8.2.0 + + Case folding no longer depends on the locale set with + setlocale. Only ASCII case folding will be done. + Non-ASCII bytes will be compared by their byte value. 
+ + +'> + diff --git a/reference/strings/functions/lcfirst.xml b/reference/strings/functions/lcfirst.xml index 663d161ca77c..ee75c446d54c 100644 --- a/reference/strings/functions/lcfirst.xml +++ b/reference/strings/functions/lcfirst.xml @@ -54,13 +54,7 @@ - - 8.2.0 - - Case conversion no longer depends on the locale set with - setlocale. Only ASCII characters will be converted. - - + &strings.changelog.ascii-case-conversion; diff --git a/reference/strings/functions/str-ireplace.xml b/reference/strings/functions/str-ireplace.xml index 336863bdd5df..ef6619e0fdec 100644 --- a/reference/strings/functions/str-ireplace.xml +++ b/reference/strings/functions/str-ireplace.xml @@ -108,14 +108,7 @@ - - 8.2.0 - - Case folding no longer depends on the locale set with - setlocale. Only ASCII case folding will be done. - Non-ASCII bytes will be compared by their byte value. - - + &strings.changelog.ascii-case-folding; diff --git a/reference/strings/functions/stripos.xml b/reference/strings/functions/stripos.xml index 33a109b549d7..5be56b120e8d 100644 --- a/reference/strings/functions/stripos.xml +++ b/reference/strings/functions/stripos.xml @@ -84,14 +84,7 @@ - - 8.2.0 - - Case folding no longer depends on the locale set with - setlocale. Only ASCII case folding will be done. - Non-ASCII bytes will be compared by their byte value. - - + &strings.changelog.ascii-case-folding; 8.0.0 diff --git a/reference/strings/functions/stristr.xml b/reference/strings/functions/stristr.xml index ad6543aa6385..2820ae6915ab 100644 --- a/reference/strings/functions/stristr.xml +++ b/reference/strings/functions/stristr.xml @@ -76,14 +76,7 @@ - - 8.2.0 - - Case folding no longer depends on the locale set with - setlocale. Only ASCII case folding will be done. - Non-ASCII bytes will be compared by their byte value. 
- - + &strings.changelog.ascii-case-folding; 8.0.0 diff --git a/reference/strings/functions/strripos.xml b/reference/strings/functions/strripos.xml index 816a107a6d17..ae4ad1032391 100644 --- a/reference/strings/functions/strripos.xml +++ b/reference/strings/functions/strripos.xml @@ -98,14 +98,7 @@ - - 8.2.0 - - Case folding no longer depends on the locale set with - setlocale. Only ASCII case folding will be done. - Non-ASCII bytes will be compared by their byte value. - - + &strings.changelog.ascii-case-folding; 8.0.0 diff --git a/reference/strings/functions/strtolower.xml b/reference/strings/functions/strtolower.xml index fab8c9daf5d0..79f5570332e7 100644 --- a/reference/strings/functions/strtolower.xml +++ b/reference/strings/functions/strtolower.xml @@ -62,13 +62,7 @@ - - 8.2.0 - - Case conversion no longer depends on the locale set with - setlocale. Only ASCII characters will be converted. - - + &strings.changelog.ascii-case-conversion; diff --git a/reference/strings/functions/strtoupper.xml b/reference/strings/functions/strtoupper.xml index b51cf32f6a46..3cae64459595 100644 --- a/reference/strings/functions/strtoupper.xml +++ b/reference/strings/functions/strtoupper.xml @@ -62,13 +62,7 @@ - - 8.2.0 - - Case conversion no longer depends on the locale set with - setlocale. Only ASCII characters will be converted. - - + &strings.changelog.ascii-case-conversion; diff --git a/reference/strings/functions/ucfirst.xml b/reference/strings/functions/ucfirst.xml index 982ad351d151..5ba5efe8e680 100644 --- a/reference/strings/functions/ucfirst.xml +++ b/reference/strings/functions/ucfirst.xml @@ -54,13 +54,7 @@ - - 8.2.0 - - Case conversion no longer depends on the locale set with - setlocale. Only ASCII characters will be converted. 
- - + &strings.changelog.ascii-case-conversion; diff --git a/reference/strings/functions/ucwords.xml b/reference/strings/functions/ucwords.xml index 7f481d76b6f0..fd20dbf2ff73 100644 --- a/reference/strings/functions/ucwords.xml +++ b/reference/strings/functions/ucwords.xml @@ -73,13 +73,7 @@ - - 8.2.0 - - Case conversion no longer depends on the locale set with - setlocale. Only ASCII characters will be converted. - - + &strings.changelog.ascii-case-conversion;