diff --git a/ext/mbstring/tests/data/BIG5.txt b/ext/mbstring/tests/data/BIG5.txt index dd6cd9ec22839..0f497a44dfd98 100644 --- a/ext/mbstring/tests/data/BIG5.txt +++ b/ext/mbstring/tests/data/BIG5.txt @@ -1,35 +1,26 @@ +# BIG5.TXT +# Date: 2015-12-02 23:52:00 GMT [KW] +# © 2015 Unicode®, Inc. +# For terms of use, see http://www.unicode.org/terms_of_use.html # # Name: BIG5 to Unicode table (complete) # Unicode version: 1.1 -# Table version: 0.0d3 +# Table version: 2.0 # Table format: Format A -# Date: 11 February 1994 -# Authors: Glenn Adams -# John H. Jenkins -# -# Copyright (c) 1991-1994 Unicode, Inc. All Rights reserved. -# -# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). -# No claims are made as to fitness for any particular purpose. No -# warranties of any kind are expressed or implied. The recipient -# agrees to determine applicability of information provided. If this -# file has been provided on magnetic media by Unicode, Inc., the sole -# remedy for any claim will be exchange of defective media within 90 -# days of receipt. -# -# Recipient is granted the right to make copies in any form for -# internal distribution and to freely use the information supplied -# in the creation of products supporting Unicode. Unicode, Inc. -# specifically excludes the right to re-distribute this file directly -# to third parties or other organizations whether for profit or not. +# Date: 2011 October 14 (header updated: 2015 December 02) # # General notes: # -# This table contains the data Metis and Taligent currently have on how -# BIG5 characters map into Unicode. +# +# This table contains one set of mappings from BIG5 into Unicode. +# Note that these data are *possible* mappings only and may not be the +# same as those used by actual products, nor may they be the best suited +# for all uses. For more information on the mappings between various code +# pages incorporating the repertoire of BIG5 and Unicode, consult the +# VENDORS mapping data. # # WARNING! It is currently impossible to provide round-trip compatibility -# between BIG5 and Unicode. +# between BIG5 and Unicode. # # A number of characters are not currently mapped because # of conflicts with other mappings. They are as follows: @@ -46,45 +37,58 @@ # # We currently map all of these characters to U+FFFD REPLACEMENT CHARACTER. # It is also possible to map these characters to their duplicates, or to -# the user zone. -# +# the user zone. +# # Notes: # # 1. In addition to the above, there is some uncertainty about the # mappings in the range C6A1 - C8FE, and F9DD - F9FE. The ETEN -# version of BIG5 organizes the former range differently, and adds -# additional characters in the latter range. The correct mappings -# these ranges need to be determined. +# version of BIG5 organizes the former range differently, and adds +# additional characters in the latter range. The correct mappings +# these ranges need to be determined. # # 2. There is an uncertainty in the mapping of the Big Five character -# 0xA3BC. This character occurs within the Big Five block of tone marks -# for bopomofo and is intended to be the tone mark for the first tone in -# Mandarin Chinese. We have selected the mapping U+02C9 MODIFIER LETTER -# MACRON (Mandarin Chinese first tone) to reflect this semantic. -# However, because bopomofo uses the absense of a tone mark to indicate -# the first Mandarin tone, most implementations of Big Five represent -# this character with a blank space, and so a mapping such as U+2003 EM SPACE -# might be preferred. -# -# +# 0xA3BC. This character occurs within the Big Five block of tone marks +# for bopomofo and is intended to be the tone mark for the first tone in +# Mandarin Chinese. We have selected the mapping U+02C9 MODIFIER LETTER +# MACRON (Mandarin Chinese first tone) to reflect this semantic. +# However, because bopomofo uses the absense of a tone mark to indicate +# the first Mandarin tone, most implementations of Big Five represent +# this character with a blank space, and so a mapping such as U+2003 EM +# SPACE might be preferred. # # Format: Three tab-separated columns # Column #1 is the BIG5 code (in hex as 0xXXXX) # Column #2 is the Unicode (in hex as 0xXXXX) # Column #3 is the Unicode name (follows a comment sign, '#') -# The official names for Unicode characters U+4E00 -# to U+9FA5, inclusive, is "CJK UNIFIED IDEOGRAPH-XXXX", -# where XXXX is the code point. Including all these -# names in this file increases its size substantially -# and needlessly. The token "" is used for the -# name of these characters. If necessary, it can be -# expanded algorithmically by a parser or editor. +# The official names for Unicode characters U+4E00 +# to U+9FA5, inclusive, is "CJK UNIFIED IDEOGRAPH-XXXX", +# where XXXX is the code point. Including all these +# names in this file increases its size substantially +# and needlessly. The token "" is used for the +# name of these characters. If necessary, it can be +# expanded algorithmically by a parser or editor. # # The entries are in BIG5 order # -# Any comments or problems, contact +# Revision History: +# +# [v2.0, 2015 December 02] +# updates to copyright notice and terms of use +# no changes to character mappings +# +# [v1.0, 2011 October 14] +# Updated terms of use to current wording. +# Updated contact information. +# No changes to the mapping data. +# +# [v0.0d3, 11 February 1994] +# First release. # +# Use the Unicode reporting form +# for any questions or comments or to report errors in the data. # +# Manually added mapping of lower ASCII characters 0x0 0x0 0x1 0x1 0x2 0x2 @@ -239,6 +243,7 @@ 0xA157 0xFE31 # PRESENTATION FORM FOR VERTICAL EM DASH 0xA158 0x2014 # EM DASH 0xA159 0xFE33 # PRESENTATION FORM FOR VERTICAL LOW LINE +0xA15A 0xFFFD # *** NO MAPPING *** 0xA15B 0xFE34 # PRESENTATION FORM FOR VERTICAL WAVY LOW LINE 0xA15C 0xFE4F # WAVY LOW LINE 0xA15D 0xFF08 # FULLWIDTH LEFT PARENTHESIS @@ -309,7 +314,9 @@ 0xA1C0 0x32A3 # CIRCLED IDEOGRAPH CORRECT 0xA1C1 0x2105 # CARE OF 0xA1C2 0x203E # OVERLINE +0xA1C3 0xFFFD # *** NO MAPPING *** 0xA1C4 0xFF3F # FULLWIDTH LOW LINE +0xA1C5 0xFFFD # *** NO MAPPING *** 0xA1C6 0xFE49 # DASHED OVERLINE 0xA1C7 0xFE4A # CENTRELINE OVERLINE 0xA1C8 0xFE4D # DASHED LOW LINE @@ -366,6 +373,8 @@ 0xA1FB 0x2198 # SOUTH EAST ARROW 0xA1FC 0x2225 # PARALLEL TO 0xA1FD 0x2223 # DIVIDES +0xA1FE 0xFFFD # *** NO MAPPING *** +0xA240 0xFFFD # *** NO MAPPING *** 0xA241 0xFF0F # FULLWIDTH SOLIDUS 0xA242 0xFF3C # FULLWIDTH REVERSE SOLIDUS 0xA243 0xFF04 # FULLWIDTH DOLLAR SIGN @@ -471,7 +480,9 @@ 0xA2C9 0x3027 # HANGZHOU NUMERAL SEVEN 0xA2CA 0x3028 # HANGZHOU NUMERAL EIGHT 0xA2CB 0x3029 # HANGZHOU NUMERAL NINE +0xA2CC 0xFFFD # *** NO MAPPING *** 0xA2CD 0x5344 # +0xA2CE 0xFFFD # *** NO MAPPING *** 0xA2CF 0xFF21 # FULLWIDTH LATIN CAPITAL LETTER A 0xA2D0 0xFF22 # FULLWIDTH LATIN CAPITAL LETTER B 0xA2D1 0xFF23 # FULLWIDTH LATIN CAPITAL LETTER C @@ -13916,7 +13927,7 @@ 0xF9D3 0x9F7E # 0xF9D4 0x9F49 # 0xF9D5 0x9F98 # -# The following ETEN extensions are copied from CP950.txt: +# The following ETEN extensions are copied from CP950.txt (https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT): 0xF9D6 0x7881 #CJK UNIFIED IDEOGRAPH 0xF9D7 0x92B9 #CJK UNIFIED IDEOGRAPH 0xF9D8 0x88CF #CJK UNIFIED IDEOGRAPH diff --git a/ext/mbstring/tests/data/JISX0201.txt b/ext/mbstring/tests/data/JISX0201.txt index 87e9e94192e23..d4293a5e483a2 100644 --- a/ext/mbstring/tests/data/JISX0201.txt +++ b/ext/mbstring/tests/data/JISX0201.txt @@ -1,33 +1,24 @@ +# JIS0201.TXT +# Date: 2015-12-02 23:49:00 GMT [KW] +# © 2015 Unicode®, Inc. +# For terms of use, see http://www.unicode.org/terms_of_use.html # # Name: JIS X 0201 (1976) to Unicode 1.1 Table # Unicode version: 1.1 -# Table version: 0.9 +# Table version: 2.0 # Table format: Format A -# Date: 8 March 1994 -# Authors: Glenn Adams -# John H. Jenkins +# Date: 2011 October 14 (header updated: 2015 December 02) # -# Copyright (c) 1991-1994 Unicode, Inc. All Rights reserved. -# -# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). -# No claims are made as to fitness for any particular purpose. No -# warranties of any kind are expressed or implied. The recipient -# agrees to determine applicability of information provided. If this -# file has been provided on magnetic media by Unicode, Inc., the sole -# remedy for any claim will be exchange of defective media within 90 -# days of receipt. +# General notes: # -# Recipient is granted the right to make copies in any form for -# internal distribution and to freely use the information supplied -# in the creation of products supporting Unicode. Unicode, Inc. -# specifically excludes the right to re-distribute this file directly -# to third parties or other organizations whether for profit or not. # -# General notes: +# This table contains one set of mappings from JIS X 0201 into Unicode. +# Note that these data are *possible* mappings only and may not be the +# same as those used by actual products, nor may they be the best suited +# for all uses. For more information on the mappings between various code +# pages incorporating the repertoire of JIS X 0201 and Unicode, consult the +# VENDORS mapping data. # -# This table contains the data the Unicode Consortium has on how -# single-byte JIS X 0201 characters map into Unicode 1.1 -# (ISO/IEC 10646:1-1993 UCS-2). # # Format: Three tab-separated columns # Column #1 is the shift JIS code (in hex as 0xXX) @@ -36,11 +27,22 @@ # # The entries are in JIS order # -# These mappings are provisional, pending definition of -# official mappings by Japanese standards bodies. +# Revision History: +# +# [v2.0, 2015 December 02] +# updates to copyright notice and terms of use +# no changes to character mappings +# +# [v1.0, 2011 October 14] +# Updated terms of use to current wording. +# Updated contact information. +# No changes to the mapping data. # -# Any comments or problems, contact +# [v0.9, 8 March 1994] +# First release. # +# Use the Unicode reporting form +# for any questions or comments or to report errors in the data. # 0x20 0x0020 # SPACE 0x21 0x0021 # EXCLAMATION MARK diff --git a/ext/mbstring/tests/data/JISX0212.txt b/ext/mbstring/tests/data/JISX0212.txt index 316d28e4d10fd..ae35ecacdc2b0 100644 --- a/ext/mbstring/tests/data/JISX0212.txt +++ b/ext/mbstring/tests/data/JISX0212.txt @@ -1,44 +1,36 @@ +# JIS0212.TXT +# Date: 2015-12-02 23:51:00 GMT [KW] +# © 2015 Unicode®, Inc. +# For terms of use, see http://www.unicode.org/terms_of_use.html # # Name: JIS X 0212 (1990) to Unicode # Unicode version: 1.1 -# Table version: 0.9 +# Table version: 2.0 # Table format: Format A -# Date: 8 March 1994 -# Authors: Glenn Adams -# John H. Jenkins +# Date: 2011 October 14 (header updated: 2015 December 02) # -# Copyright (c) 1991-1994 Unicode, Inc. All Rights reserved. -# -# This file is provided as-is by Unicode, Inc. (The Unicode Consortium). -# No claims are made as to fitness for any particular purpose. No -# warranties of any kind are expressed or implied. The recipient -# agrees to determine applicability of information provided. If this -# file has been provided on magnetic media by Unicode, Inc., the sole -# remedy for any claim will be exchange of defective media within 90 -# days of receipt. +# General notes: # -# Recipient is granted the right to make copies in any form for -# internal distribution and to freely use the information supplied -# in the creation of products supporting Unicode. Unicode, Inc. -# specifically excludes the right to re-distribute this file directly -# to third parties or other organizations whether for profit or not. # -# General notes: +# This table contains one set of mappings from JIS X 0212 into Unicode. +# Note that these data are *possible* mappings only and may not be the +# same as those used by actual products, nor may they be the best suited +# for all uses. For more information on the mappings between various code +# pages incorporating the repertoire of JIS X 0212 and Unicode, consult the +# VENDORS mapping data. # -# This table contains the data the Unicode Consortium has on how -# JIS X 0212 (1983) characters map into Unicode. # # Format: Three tab-separated columns # Column #1 is the JIS X 0212 code (in hex as 0xXXXX) # Column #2 is the Unicode (in hex as 0xXXXX) # Column #3 the Unicode name (follows a comment sign, '#') -# The official names for Unicode characters U+4E00 -# to U+9FA5, inclusive, is "CJK UNIFIED IDEOGRAPH-XXXX", -# where XXXX is the code point. Including all these -# names in this file increases its size substantially -# and needlessly. The token "" is used for the -# name of these characters. If necessary, it can be -# expanded algorithmically by a parser or editor. +# The official names for Unicode characters U+4E00 +# to U+9FA5, inclusive, is "CJK UNIFIED IDEOGRAPH-XXXX", +# where XXXX is the code point. Including all these +# names in this file increases its size substantially +# and needlessly. The token "" is used for the +# name of these characters. If necessary, it can be +# expanded algorithmically by a parser or editor. # # The entries are in JIS X 0212 order # @@ -51,17 +43,11 @@ # the kuten form. For example, 0x2121 -> 0x0101 -> 0101; # 0x6D63 -> 0x4D43 -> 7767 # -# The kanji mappings are a normative part of ISO/IEC 10646. The -# non-kanji mappings are provisional, pending definition of -# official mappings by Japanese standards bodies -# -# Any comments or problems, contact -# # Notes: # # 1. JIS X 0212 apparently unified the following two symbols # into a single character at 0x2922: -# +# # LATIN CAPITAL LETTER D WITH STROKE # LATIN CAPITAL LETTER ETH # @@ -72,6 +58,23 @@ # Consequently, in the Unicode mapping, 0x2922 is treated as # LATIN CAPITAL LETTER D WITH STROKE. # +# Revision History: +# +# [v2.0, 2015 December 02] +# updates to copyright notice and terms of use +# no changes to character mappings +# +# [v1.0, 2011 October 14] +# Updated terms of use to current wording. +# Updated contact information. +# No changes to the mapping data. +# +# [v0.9, 8 March 1994] +# First release. +# +# Use the Unicode reporting form +# for any questions or comments or to report errors in the data. +# 0x222F 0x02D8 # BREVE 0x2230 0x02C7 # CARON (Mandarin Chinese third tone) 0x2231 0x00B8 # CEDILLA diff --git a/ext/mbstring/tests/data/KSX1001.txt b/ext/mbstring/tests/data/KSX1001.txt index 5bff605b4532a..3ebd300b63414 100644 --- a/ext/mbstring/tests/data/KSX1001.txt +++ b/ext/mbstring/tests/data/KSX1001.txt @@ -1,11 +1,12 @@ # # Name: Unified Hangul (KS X 1001) to Unicode table # Unicode version: 2.0 -# Table version: 1.0 +# Table version: 1.1 # Table format: Format A -# Date: 08/16/99 +# Date: 2011 October 14 # Authors: Jungshik Shin at jshin@pantheon.yale.edu -# General notes: none +# +# Copyright (c) 1999-2011 Unicode, Inc. All Rights reserved. # # This file is provided as-is by Unicode, Inc. (The Unicode Consortium). # No claims are made as to fitness for any particular purpose. No @@ -15,11 +16,13 @@ # remedy for any claim will be exchange of defective media within 90 # days of receipt. # -# Recipient is granted the right to make copies in any form for -# internal distribution and to freely use the information supplied -# in the creation of products supporting Unicode. Unicode, Inc. -# specifically excludes the right to re-distribute this file directly -# to third parties or other organizations whether for profit or not. +# Unicode, Inc. hereby grants the right to freely use the information +# supplied in this file in the creation of products supporting the +# Unicode Standard, and to make copies of this file in any form for +# internal or external distribution as long as this notice remains +# attached. +# +# General notes: # # What is enclosed below is the mapping between KS X 1001(KS C 5601-1987 # and Unicode 2.0. It's automatically generated from KSC5601.TXT @@ -63,6 +66,19 @@ # first subtract 0x2020. Then # the high and low bytes correspond to the row(Hang) and the column(Yol), # respectively +# +# Revision History: +# +# [v1.1, 2011 October 14] +# Updated terms of use to current wording. +# Updated contact information. +# No changes to the mapping data. +# +# [v1.0, 08/16/99] +# First release. +# +# Use the Unicode reporting form +# for any questions or comments or to report errors in the data. # 0x2121 0x3000 # IDEOGRAPHIC SPACE 0x2122 0x3001 # IDEOGRAPHIC COMMA diff --git a/ext/mbstring/tests/encoding_tests.inc b/ext/mbstring/tests/encoding_tests.inc index f6fab024ca3d9..dc895eb4c0ad6 100644 --- a/ext/mbstring/tests/encoding_tests.inc +++ b/ext/mbstring/tests/encoding_tests.inc @@ -14,6 +14,10 @@ function readConversionTable($path, &$from, &$to, $utf32 = false) { if ($line[0] == '#') continue; if (sscanf($line, "0x%x\t0x%x", $char, $codepoint) == 2) { + // Skip codepoints that do not have a mapping (e.g. in BIG5.txt) + if ($codepoint === 0xFFFD) { + continue; + } $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint); if ($char == PHP_INT_MAX) { // We may be on a 32-bit machine and testing a text encoding with 4-byte codes