From 2e660900c45eb67657e8cac9a8b4b81558aba515 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Mon, 17 Jun 2024 09:50:14 -0400 Subject: [PATCH] Support `Grapheme_Cluster_Break=Prepend` These characters act like combining marks, except they go before the base character instead of after it. --- scripts/unicode.py | 16 ++++++++++++++++ src/lib.rs | 5 ++++- src/tables.rs | 24 ++++++++++++++++-------- tests/tests.rs | 6 ++++++ 4 files changed, 42 insertions(+), 9 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index 18e71d0..e3be355 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -20,6 +20,7 @@ # - ReadMe.txt # - Scripts.txt # - UnicodeData.txt +# - auxiliary/GraphemeBreakProperty.txt # - emoji/emoji-data.txt # - emoji/emoji-variation-sequences.txt # - extracted/DerivedGeneralCategory.txt @@ -526,6 +527,21 @@ def load_zero_widths() -> list[bool]: zw_map[0x0891] = True zw_map[0x08E2] = True + # `[:Grapheme_Cluster_Break=Prepend:]-[:Prepended_Concatenation_Mark:]` + gcb_prepend = set() + load_property( + "auxiliary/GraphemeBreakProperty.txt", + "Prepend", + lambda cp: gcb_prepend.add(cp), + ) + load_property( + "PropList.txt", + "Prepended_Concatenation_Mark", + lambda cp: gcb_prepend.remove(cp), + ) + for cp in gcb_prepend: + zw_map[cp] = True + # HANGUL CHOSEONG FILLER # U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have # zero width. However, the expected usage is to combine it with vowel or trailing jamo diff --git a/src/lib.rs b/src/lib.rs index d83a6c8..4297e11 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -73,7 +73,7 @@ //! - **[Buginese]**: `"\u{1A15}\u{1A17}\u{200D}\u{1A10}"` ( ya, `ᨕᨗ‍ᨐ`) has total width 1. //! - **[Hebrew]**: `"א\u{200D}ל"` (Alef-Lamed, `א‍ל`) has total width 1. //! - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in -//! 
`'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'` +//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'` //! have width 0. //! - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'` //! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ` @@ -113,6 +113,8 @@ //! - [`'\u{0890}'` POUND MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0890), //! - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and //! - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2). +//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Cluster_Break%3DPrepend%7D-%5Cp%7BPrepended_Concatenation_Mark%7D) +//! with the [`Grapheme_Cluster_Break=Prepend`] property, that are not also [`Prepended_Concatenation_Mark`]s. //! - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA). //! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D) //! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2. @@ -132,6 +134,7 @@ //! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1 //! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation //! [`General_Category`]: https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142 +//! [`Grapheme_Cluster_Break=Prepend`]: https://www.unicode.org/reports/tr29/#Prepend //! 
[`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443 //! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593 //! [`Joining_Group`]: https://www.unicode.org/versions/Unicode14.0.0/ch09.pdf#G36862 diff --git a/src/tables.rs b/src/tables.rs index f7a7a86..c8a4aba 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -1162,7 +1162,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ ], [ 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, - 0x44, 0x01, 0x54, 0x55, 0x51, 0x55, 0x15, 0x55, 0x55, 0x05, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x44, 0x01, 0x54, 0x55, 0x41, 0x55, 0x15, 0x55, 0x55, 0x05, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, ], [ @@ -1532,7 +1532,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ ], [ 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x00, - 0x40, 0x55, 0x55, 0x01, 0x14, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x40, 0x05, 0x55, 0x01, 0x14, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, ], [ @@ -1587,7 +1587,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ ], [ 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x54, 0x55, 0x15, - 0x44, 0x15, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x04, 0x11, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, ], [ @@ -1596,12 +1596,12 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ 0x55, 0x55, ], [ - 0x01, 0x00, 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, 0x00, 0x14, + 0x01, 0x00, 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, 0x00, 0x04, 0x40, 0x55, 0x15, 0x55, 0x55, 0x01, 0x40, 0x01, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, ], [ - 0x55, 0x55, 0x05, 0x00, 0x00, 0x40, 0x50, 0x55, 
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x55, 0x00, 0x00, 0x00, 0x00, 0x40, 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, ], @@ -1617,7 +1617,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ ], [ 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x01, 0x40, 0x45, - 0x10, 0x00, 0x10, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x10, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, ], [ @@ -1631,7 +1631,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ 0x55, 0x55, ], [ - 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x40, + 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x40, 0x55, 0x44, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, ], @@ -1994,7 +1994,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ /// Sorted list of codepoint ranges (inclusive) /// that are zero-width but not `Joining_Type=Transparent` /// FIXME: can we get better compression? 
-static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [ +static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 53] = [ ([0x05, 0x06, 0x00], [0x05, 0x06, 0x00]), ([0x90, 0x08, 0x00], [0x91, 0x08, 0x00]), ([0xE2, 0x08, 0x00], [0xE2, 0x08, 0x00]), @@ -2010,6 +2010,7 @@ static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [ ([0xCA, 0x0C, 0x00], [0xCB, 0x0C, 0x00]), ([0xD5, 0x0C, 0x00], [0xD6, 0x0C, 0x00]), ([0x3E, 0x0D, 0x00], [0x3E, 0x0D, 0x00]), + ([0x4E, 0x0D, 0x00], [0x4E, 0x0D, 0x00]), ([0x57, 0x0D, 0x00], [0x57, 0x0D, 0x00]), ([0xCF, 0x0D, 0x00], [0xCF, 0x0D, 0x00]), ([0xDF, 0x0D, 0x00], [0xDF, 0x0D, 0x00]), @@ -2028,12 +2029,19 @@ static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [ ([0xCB, 0xD7, 0x00], [0xFB, 0xD7, 0x00]), ([0x9E, 0xFF, 0x00], [0xA0, 0xFF, 0x00]), ([0xF0, 0xFF, 0x00], [0xF8, 0xFF, 0x00]), + ([0xC2, 0x11, 0x01], [0xC3, 0x11, 0x01]), ([0x3E, 0x13, 0x01], [0x3E, 0x13, 0x01]), ([0x57, 0x13, 0x01], [0x57, 0x13, 0x01]), ([0xB0, 0x14, 0x01], [0xB0, 0x14, 0x01]), ([0xBD, 0x14, 0x01], [0xBD, 0x14, 0x01]), ([0xAF, 0x15, 0x01], [0xAF, 0x15, 0x01]), ([0x30, 0x19, 0x01], [0x30, 0x19, 0x01]), + ([0x3F, 0x19, 0x01], [0x3F, 0x19, 0x01]), + ([0x41, 0x19, 0x01], [0x41, 0x19, 0x01]), + ([0x3A, 0x1A, 0x01], [0x3A, 0x1A, 0x01]), + ([0x84, 0x1A, 0x01], [0x89, 0x1A, 0x01]), + ([0x46, 0x1D, 0x01], [0x46, 0x1D, 0x01]), + ([0x02, 0x1F, 0x01], [0x02, 0x1F, 0x01]), ([0x65, 0xD1, 0x01], [0x65, 0xD1, 0x01]), ([0x6E, 0xD1, 0x01], [0x72, 0xD1, 0x01]), ([0x00, 0x00, 0x0E], [0x00, 0x00, 0x0E]), diff --git a/tests/tests.rs b/tests/tests.rs index 4f713e7..8ff0c6b 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -110,6 +110,12 @@ fn test_prepended_concatenation_marks() { } } +#[test] +fn test_gcb_prepend() { + assert_width!("ൎഉ", 1, 1); + assert_width!("\u{11A89}", 0, 0); +} + #[test] fn test_interlinear_annotation_chars() { assert_width!('\u{FFF9}', Some(1), Some(1));