rust-lang
diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs
Lines changed: 2 additions & 1 deletion
diff --git a/src/tools/unicode-table-generator/src/range_search.rs b/library/core/src/unicode/rt.rs
Renamed: src/tools/unicode-table-generator/src/range_search.rs to library/core/src/unicode/rt.rs
Lines changed: 40 additions & 6 deletions
diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs
Lines changed: 1322 additions & 1215 deletions
diff --git a/library/coretests/tests/lib.rs b/library/coretests/tests/lib.rs
Lines changed: 1 addition & 0 deletions
diff --git a/library/coretests/tests/unicode.rs b/library/coretests/tests/unicode.rs
Lines changed: 96 additions & 0 deletions
@@ -18,8 +18,9 @@ pub(crate) use unicode_data::white_space::lookup as White_Space;
 
 pub(crate) mod printable;
 
+mod rt;
 #[allow(unreachable_pub)]
-mod unicode_data;
+pub mod unicode_data;
 
 /// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of
 /// `char` and `str` methods are based on.
 
@@ -1,5 +1,7 @@
+//! Runtime support for `unicode_data`.
+
 #[inline(always)]
-const fn bitset_search<
+pub(super) const fn bitset_search<
     const N: usize,
     const CHUNK_SIZE: usize,
     const N1: usize,
@@ -46,23 +48,23 @@ const fn bitset_search<
 }
 
 #[repr(transparent)]
+/// A start index and a prefix sum packed into a single `u32`: the start index
+/// occupies the upper 11 bits and the prefix sum the lower 21 bits.
-struct ShortOffsetRunHeader(u32);
+pub(super) struct ShortOffsetRunHeader(pub(super) u32);
 
 impl ShortOffsetRunHeader {
+    /// Packs `start_index` and `prefix_sum` into one header.
+    ///
+    /// # Panics
+    /// Panics if `start_index` does not fit in 11 bits or `prefix_sum` does
+    /// not fit in 21 bits.
-    const fn new(start_index: usize, prefix_sum: u32) -> Self {
+    pub(super) const fn new(start_index: usize, prefix_sum: u32) -> Self {
         assert!(start_index < (1 << 11));
         assert!(prefix_sum < (1 << 21));
 
         Self((start_index as u32) << 21 | prefix_sum)
     }
 
+    /// Returns the start index stored in the upper 11 bits.
     #[inline]
-    const fn start_index(&self) -> usize {
+    pub(super) const fn start_index(&self) -> usize {
         (self.0 >> 21) as usize
     }
 
+    /// Returns the prefix sum stored in the lower 21 bits.
     #[inline]
-    const fn prefix_sum(&self) -> u32 {
+    pub(super) const fn prefix_sum(&self) -> u32 {
         self.0 & ((1 << 21) - 1)
     }
 }
@@ -72,7 +74,7 @@ impl ShortOffsetRunHeader {
 /// - The last element of `short_offset_runs` must be greater than `std::char::MAX`.
 /// - The start indices of all elements in `short_offset_runs` must be less than `OFFSETS`.
 #[inline(always)]
-unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
+pub(super) unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
     needle: char,
     short_offset_runs: &[ShortOffsetRunHeader; SOR],
     offsets: &[u8; OFFSETS],
@@ -126,3 +128,35 @@ unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
     }
     offset_idx % 2 == 1
 }
+
+/// Converts the case of `c`, returning up to three `char`s padded with `'\0'`.
+///
+/// ASCII input is handed to `ascii_fn`. Any other character is looked up in
+/// `table` by binary search on the first tuple component; a character with no
+/// entry is returned unchanged. The second component of a matching entry is
+/// used directly when it is a valid `char`; otherwise its lower 22 bits select
+/// a three-character expansion from `multi`.
+///
+/// # Safety
+/// The second component of each tuple in `table` must either be:
+/// - A valid `char`
+/// - A value with bit 22 (`1 << 22`) set, and the lower 22 bits
+///   being a valid index into `multi`.
+#[inline(always)]
+pub(super) unsafe fn case_conversion(
+    c: char,
+    ascii_fn: fn(char) -> char,
+    table: &[(char, u32)],
+    multi: &[[char; 3]],
+) -> [char; 3] {
+    // Bit 22 tags an entry as an index into `multi`; the bits below it are the index.
+    const MULTI_TAG: u32 = 1 << 22;
+    const MULTI_INDEX_BITS: u32 = MULTI_TAG - 1;
+
+    if c.is_ascii() {
+        return [ascii_fn(c), '\0', '\0'];
+    }
+
+    let entry = match table.binary_search_by_key(&c, |&(key, _)| key) {
+        Ok(pos) => table[pos].1,
+        // No mapping recorded: the character converts to itself.
+        Err(_) => return [c, '\0', '\0'],
+    };
+
+    if let Some(single) = char::from_u32(entry) {
+        return [single, '\0', '\0'];
+    }
+
+    // SAFETY: `entry` is not a valid `char`, so by this function's safety
+    // contract its lower 22 bits are a valid index into `multi`.
+    unsafe { *multi.get_unchecked((entry & MULTI_INDEX_BITS) as usize) }
+}
@@ -113,6 +113,7 @@
 #![feature(try_find)]
 #![feature(try_trait_v2)]
 #![feature(uint_bit_width)]
+#![feature(unicode_internals)]
 #![feature(unsize)]
 #![feature(unwrap_infallible)]
 // tidy-alphabetical-end
 
@@ -1,5 +1,101 @@
+use core::unicode::unicode_data;
+use std::ops::RangeInclusive;
+
+mod test_data;
+
 #[test]
 pub fn version() {
+    // Sanity check: only the major component of `UNICODE_VERSION` is inspected,
+    // and it must be at least 10.
     let (major, _minor, _update) = core::char::UNICODE_VERSION;
     assert!(major >= 10);
 }
+
+/// Asserts that `lookup` agrees with `ranges` for every `char` from U+0080
+/// upward: `true` inside each range, `false` in the gaps before, between and
+/// after them.
+///
+/// `ranges` is assumed to be sorted in ascending order and non-overlapping.
+/// Panics (with the offending character in the message) on the first
+/// disagreement.
+#[track_caller]
+fn test_boolean_property(ranges: &[RangeInclusive<char>], lookup: fn(char) -> bool) {
+    // Next scalar value whose membership is still unchecked; `None` once a
+    // range has reached `char::MAX` and nothing is left to scan.
+    let mut next = Some('\u{80}');
+    for range in ranges {
+        if let Some(gap_start) = next {
+            for c in gap_start..*range.start() {
+                assert!(!lookup(c), "{c:?}");
+            }
+        }
+        for c in range.clone() {
+            assert!(lookup(c), "{c:?}");
+        }
+        // Step through the `char` range instead of adding 1 to the code point:
+        // this skips the surrogate gap after U+D7FF and yields `None` after
+        // `char::MAX`, where `char::from_u32(end + 1).unwrap()` would panic.
+        next = (*range.end()..=char::MAX).nth(1);
+    }
+    if let Some(tail_start) = next {
+        for c in tail_start..=char::MAX {
+            assert!(!lookup(c), "{c:?}");
+        }
+    }
+}
+
+/// Asserts that `lookup` agrees with `ranges` for every `char` from U+0080
+/// upward: each listed key must map to its listed value, and every other
+/// character must map to itself padded with `'\0'`.
+///
+/// `ranges` is assumed to be sorted by key in ascending order. Panics (with
+/// the offending character in the message) on the first disagreement.
+#[track_caller]
+fn test_case_mapping(ranges: &[(char, [char; 3])], lookup: fn(char) -> [char; 3]) {
+    // Next scalar value still to be checked; `None` once a key has reached
+    // `char::MAX` and nothing is left to scan.
+    let mut next = Some('\u{80}');
+    for &(key, val) in ranges {
+        if let Some(gap_start) = next {
+            for c in gap_start..key {
+                assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
+            }
+        }
+        assert_eq!(lookup(key), val, "{key:?}");
+        // Step through the `char` range instead of adding 1 to the code point:
+        // this skips the surrogate gap after U+D7FF and yields `None` after
+        // `char::MAX`, where `char::from_u32(key + 1).unwrap()` would panic.
+        next = (key..=char::MAX).nth(1);
+    }
+    if let Some(tail_start) = next {
+        for c in tail_start..=char::MAX {
+            assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
+        }
+    }
+}
+
+/// Checks `unicode_data::alphabetic::lookup` against the reference ranges in
+/// `test_data::ALPHABETIC`.
+#[test]
+#[cfg_attr(miri, ignore)]
+fn alphabetic() {
+    let lookup: fn(char) -> bool = unicode_data::alphabetic::lookup;
+    test_boolean_property(test_data::ALPHABETIC, lookup);
+}
+
+/// Checks `unicode_data::case_ignorable::lookup` against the reference ranges
+/// in `test_data::CASE_IGNORABLE`.
+#[test]
+#[cfg_attr(miri, ignore)]
+fn case_ignorable() {
+    let lookup: fn(char) -> bool = unicode_data::case_ignorable::lookup;
+    test_boolean_property(test_data::CASE_IGNORABLE, lookup);
+}
+
+/// Checks `unicode_data::cased::lookup` against the reference ranges in
+/// `test_data::CASED`.
+#[test]
+#[cfg_attr(miri, ignore)]
+fn cased() {
+    let lookup: fn(char) -> bool = unicode_data::cased::lookup;
+    test_boolean_property(test_data::CASED, lookup);
+}
+
+/// Checks `unicode_data::grapheme_extend::lookup` against the reference ranges
+/// in `test_data::GRAPHEME_EXTEND`.
+#[test]
+#[cfg_attr(miri, ignore)]
+fn grapheme_extend() {
+    let lookup: fn(char) -> bool = unicode_data::grapheme_extend::lookup;
+    test_boolean_property(test_data::GRAPHEME_EXTEND, lookup);
+}
+
+/// Checks `unicode_data::lowercase::lookup` against the reference ranges in
+/// `test_data::LOWERCASE`.
+#[test]
+#[cfg_attr(miri, ignore)]
+fn lowercase() {
+    let lookup: fn(char) -> bool = unicode_data::lowercase::lookup;
+    test_boolean_property(test_data::LOWERCASE, lookup);
+}
+
+/// Checks `unicode_data::n::lookup` against the reference ranges in
+/// `test_data::N`.
+#[test]
+// Same full scan over every `char` from U+0080 upward as the other property
+// tests in this file, so it is skipped under Miri like they are.
+#[cfg_attr(miri, ignore)]
+fn n() {
+    test_boolean_property(test_data::N, unicode_data::n::lookup);
+}
+
+/// Checks `unicode_data::uppercase::lookup` against the reference ranges in
+/// `test_data::UPPERCASE`.
+#[test]
+#[cfg_attr(miri, ignore)]
+fn uppercase() {
+    let lookup: fn(char) -> bool = unicode_data::uppercase::lookup;
+    test_boolean_property(test_data::UPPERCASE, lookup);
+}
+
+/// Checks `unicode_data::white_space::lookup` against the reference ranges in
+/// `test_data::WHITE_SPACE`.
+#[test]
+#[cfg_attr(miri, ignore)]
+fn white_space() {
+    let lookup: fn(char) -> bool = unicode_data::white_space::lookup;
+    test_boolean_property(test_data::WHITE_SPACE, lookup);
+}
+
+/// Checks `unicode_data::conversions::to_lower` against the reference mappings
+/// in `test_data::TO_LOWER`.
+#[test]
+#[cfg_attr(miri, ignore)]
+fn to_lowercase() {
+    let lookup: fn(char) -> [char; 3] = unicode_data::conversions::to_lower;
+    test_case_mapping(test_data::TO_LOWER, lookup);
+}
+
+/// Checks `unicode_data::conversions::to_upper` against the reference mappings
+/// in `test_data::TO_UPPER`.
+#[test]
+#[cfg_attr(miri, ignore)]
+fn to_uppercase() {
+    let lookup: fn(char) -> [char; 3] = unicode_data::conversions::to_upper;
+    test_case_mapping(test_data::TO_UPPER, lookup);
+}