From 2e660900c45eb67657e8cac9a8b4b81558aba515 Mon Sep 17 00:00:00 2001
From: Jules Bertholet <julesbertholet@quoi.xyz>
Date: Mon, 17 Jun 2024 09:50:14 -0400
Subject: [PATCH] Support `Grapheme_Cluster_Break=Prepend`

These characters act like combining marks,
except they go before the base character instead of after it.
---
 scripts/unicode.py | 16 ++++++++++++++++
 src/lib.rs         |  5 ++++-
 src/tables.rs      | 24 ++++++++++++++++--------
 tests/tests.rs     |  6 ++++++
 4 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/scripts/unicode.py b/scripts/unicode.py
index 18e71d0..e3be355 100755
--- a/scripts/unicode.py
+++ b/scripts/unicode.py
@@ -20,6 +20,7 @@
 # - ReadMe.txt
 # - Scripts.txt
 # - UnicodeData.txt
+# - auxiliary/GraphemeBreakProperty.txt
 # - emoji/emoji-data.txt
 # - emoji/emoji-variation-sequences.txt
 # - extracted/DerivedGeneralCategory.txt
@@ -526,6 +527,21 @@ def load_zero_widths() -> list[bool]:
     zw_map[0x0891] = True
     zw_map[0x08E2] = True
 
+    # `[:Grapheme_Cluster_Break=Prepend:]-[:Prepended_Concatenation_Mark:]`
+    gcb_prepend = set()
+    load_property(
+        "auxiliary/GraphemeBreakProperty.txt",
+        "Prepend",
+        lambda cp: gcb_prepend.add(cp),
+    )
+    load_property(
+        "PropList.txt",
+        "Prepended_Concatenation_Mark",
+        lambda cp: gcb_prepend.remove(cp),
+    )
+    for cp in gcb_prepend:
+        zw_map[cp] = True
+
     # HANGUL CHOSEONG FILLER
     # U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
     # zero width. However, the expected usage is to combine it with vowel or trailing jamo
diff --git a/src/lib.rs b/src/lib.rs
index d83a6c8..4297e11 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -73,7 +73,7 @@
 //!      - **[Buginese]**: `"\u{1A15}\u{1A17}\u{200D}\u{1A10}"` (<a, -i> ya, `ᨕᨗ‍ᨐ`) has total width 1.
 //!      - **[Hebrew]**: `"א\u{200D}ל"` (Alef-Lamed, `א‍ל`) has total width 1.
 //!      - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in
-//!        `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}'  | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}'  | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
+//!        `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
 //!        have width 0.
 //!      - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
 //!        followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ`
@@ -113,6 +113,8 @@
 //!         - [`'\u{0890}'` POUND MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0890),
 //!         - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and
 //!         - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2).
+//!       - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Cluster_Break%3DPrepend%7D-%5Cp%7BPrepended_Concatenation_Mark%7D)
+//!         with the [`Grapheme_Cluster_Break=Prepend`] property that are not also [`Prepended_Concatenation_Mark`]s.
 //!       - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA).
 //!    5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
 //!       with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
@@ -132,6 +134,7 @@
 //! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
 //! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
 //! [`General_Category`]: https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142
+//! [`Grapheme_Cluster_Break=Prepend`]: https://www.unicode.org/reports/tr29/#Prepend
 //! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
 //! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593
 //! [`Joining_Group`]: https://www.unicode.org/versions/Unicode14.0.0/ch09.pdf#G36862
diff --git a/src/tables.rs b/src/tables.rs
index f7a7a86..c8a4aba 100644
--- a/src/tables.rs
+++ b/src/tables.rs
@@ -1162,7 +1162,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
     ],
     [
         0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15,
-        0x44, 0x01, 0x54, 0x55, 0x51, 0x55, 0x15, 0x55, 0x55, 0x05, 0x55, 0x55, 0x55, 0x55, 0x55,
+        0x44, 0x01, 0x54, 0x55, 0x41, 0x55, 0x15, 0x55, 0x55, 0x05, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55,
     ],
     [
@@ -1532,7 +1532,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
     ],
     [
         0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x00,
-        0x40, 0x55, 0x55, 0x01, 0x14, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+        0x40, 0x05, 0x55, 0x01, 0x14, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55,
     ],
     [
@@ -1587,7 +1587,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
     ],
     [
         0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x54, 0x55, 0x15,
-        0x44, 0x15, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+        0x04, 0x11, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55,
     ],
     [
@@ -1596,12 +1596,12 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
         0x55, 0x55,
     ],
     [
-        0x01, 0x00, 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, 0x00, 0x14,
+        0x01, 0x00, 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, 0x00, 0x04,
         0x40, 0x55, 0x15, 0x55, 0x55, 0x01, 0x40, 0x01, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55,
     ],
     [
-        0x55, 0x55, 0x05, 0x00, 0x00, 0x40, 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+        0x55, 0x00, 0x00, 0x00, 0x00, 0x40, 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55,
     ],
@@ -1617,7 +1617,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
     ],
     [
         0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x01, 0x40, 0x45,
-        0x10, 0x00, 0x10, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+        0x10, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55,
     ],
     [
@@ -1631,7 +1631,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
         0x55, 0x55,
     ],
     [
-        0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x40,
+        0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x40,
         0x55, 0x44, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
         0x55, 0x55,
     ],
@@ -1994,7 +1994,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
 /// Sorted list of codepoint ranges (inclusive)
 /// that are zero-width but not `Joining_Type=Transparent`
 /// FIXME: can we get better compression?
-static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [
+static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 53] = [
     ([0x05, 0x06, 0x00], [0x05, 0x06, 0x00]),
     ([0x90, 0x08, 0x00], [0x91, 0x08, 0x00]),
     ([0xE2, 0x08, 0x00], [0xE2, 0x08, 0x00]),
@@ -2010,6 +2010,7 @@ static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [
     ([0xCA, 0x0C, 0x00], [0xCB, 0x0C, 0x00]),
     ([0xD5, 0x0C, 0x00], [0xD6, 0x0C, 0x00]),
     ([0x3E, 0x0D, 0x00], [0x3E, 0x0D, 0x00]),
+    ([0x4E, 0x0D, 0x00], [0x4E, 0x0D, 0x00]),
     ([0x57, 0x0D, 0x00], [0x57, 0x0D, 0x00]),
     ([0xCF, 0x0D, 0x00], [0xCF, 0x0D, 0x00]),
     ([0xDF, 0x0D, 0x00], [0xDF, 0x0D, 0x00]),
@@ -2028,12 +2029,19 @@ static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [
     ([0xCB, 0xD7, 0x00], [0xFB, 0xD7, 0x00]),
     ([0x9E, 0xFF, 0x00], [0xA0, 0xFF, 0x00]),
     ([0xF0, 0xFF, 0x00], [0xF8, 0xFF, 0x00]),
+    ([0xC2, 0x11, 0x01], [0xC3, 0x11, 0x01]),
     ([0x3E, 0x13, 0x01], [0x3E, 0x13, 0x01]),
     ([0x57, 0x13, 0x01], [0x57, 0x13, 0x01]),
     ([0xB0, 0x14, 0x01], [0xB0, 0x14, 0x01]),
     ([0xBD, 0x14, 0x01], [0xBD, 0x14, 0x01]),
     ([0xAF, 0x15, 0x01], [0xAF, 0x15, 0x01]),
     ([0x30, 0x19, 0x01], [0x30, 0x19, 0x01]),
+    ([0x3F, 0x19, 0x01], [0x3F, 0x19, 0x01]),
+    ([0x41, 0x19, 0x01], [0x41, 0x19, 0x01]),
+    ([0x3A, 0x1A, 0x01], [0x3A, 0x1A, 0x01]),
+    ([0x84, 0x1A, 0x01], [0x89, 0x1A, 0x01]),
+    ([0x46, 0x1D, 0x01], [0x46, 0x1D, 0x01]),
+    ([0x02, 0x1F, 0x01], [0x02, 0x1F, 0x01]),
     ([0x65, 0xD1, 0x01], [0x65, 0xD1, 0x01]),
     ([0x6E, 0xD1, 0x01], [0x72, 0xD1, 0x01]),
     ([0x00, 0x00, 0x0E], [0x00, 0x00, 0x0E]),
diff --git a/tests/tests.rs b/tests/tests.rs
index 4f713e7..8ff0c6b 100644
--- a/tests/tests.rs
+++ b/tests/tests.rs
@@ -110,6 +110,12 @@ fn test_prepended_concatenation_marks() {
     }
 }
 
+#[test]
+fn test_gcb_prepend() {
+    assert_width!("ൎഉ", 1, 1); // U+0D4E (Prepend, zero-width) before a base letter: total 1
+    assert_width!("\u{11A89}", 0, 0); // a lone Prepend character measures 0
+}
+
 #[test]
 fn test_interlinear_annotation_chars() {
     assert_width!('\u{FFF9}', Some(1), Some(1));