Skip to content

Commit f29ba69

Browse files
authored
Unrolled build for #147622
Rollup merge of #147622 - Kmeakin:km/unicode-data/refactors, r=joboet `unicode_data` refactors Minor refactors to `unicode_data` that occurred to me while trying to reduce the size of the tables. Splitting into a separate PR. NFC
2 parents 17e7324 + 0e6131c commit f29ba69

File tree

13 files changed

+4700
-1592
lines changed

13 files changed

+4700
-1592
lines changed

library/core/src/unicode/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,9 @@ pub(crate) use unicode_data::white_space::lookup as White_Space;
1818

1919
pub(crate) mod printable;
2020

21+
mod rt;
2122
#[allow(unreachable_pub)]
22-
mod unicode_data;
23+
pub mod unicode_data;
2324

2425
/// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of
2526
/// `char` and `str` methods are based on.

src/tools/unicode-table-generator/src/range_search.rs renamed to library/core/src/unicode/rt.rs

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1+
//! Runtime support for `unicode_data`.
2+
13
#[inline(always)]
2-
const fn bitset_search<
4+
pub(super) const fn bitset_search<
35
const N: usize,
46
const CHUNK_SIZE: usize,
57
const N1: usize,
@@ -46,23 +48,23 @@ const fn bitset_search<
4648
}
4749

4850
/// A `u32` that packs two fields: a start index in the bits above bit 20 and
/// a prefix sum in the low 21 bits (see `new`, `start_index`, `prefix_sum`).
#[repr(transparent)]
49-
struct ShortOffsetRunHeader(u32);
51+
pub(super) struct ShortOffsetRunHeader(pub(super) u32);
5052

5153
impl ShortOffsetRunHeader {
/// Packs `start_index` and `prefix_sum` into one header.
///
/// Asserts `start_index < 2^11` and `prefix_sum < 2^21`, so the shifted
/// index and the sum cannot overlap inside the `u32`.
52-
const fn new(start_index: usize, prefix_sum: u32) -> Self {
54+
pub(super) const fn new(start_index: usize, prefix_sum: u32) -> Self {
5355
assert!(start_index < (1 << 11));
5456
assert!(prefix_sum < (1 << 21));
5557

5658
Self((start_index as u32) << 21 | prefix_sum)
5759
}
5860

5961
/// Returns the start index, i.e. the header shifted right by 21 bits.
#[inline]
60-
const fn start_index(&self) -> usize {
62+
pub(super) const fn start_index(&self) -> usize {
6163
(self.0 >> 21) as usize
6264
}
6365

6466
/// Returns the prefix sum, i.e. the low 21 bits of the header.
#[inline]
65-
const fn prefix_sum(&self) -> u32 {
67+
pub(super) const fn prefix_sum(&self) -> u32 {
6668
self.0 & ((1 << 21) - 1)
6769
}
6870
}
@@ -72,7 +74,7 @@ impl ShortOffsetRunHeader {
7274
/// - The last element of `short_offset_runs` must be greater than `std::char::MAX`.
7375
/// - The start indices of all elements in `short_offset_runs` must be less than `OFFSETS`.
7476
#[inline(always)]
75-
unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
77+
pub(super) unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
7678
needle: char,
7779
short_offset_runs: &[ShortOffsetRunHeader; SOR],
7880
offsets: &[u8; OFFSETS],
@@ -126,3 +128,35 @@ unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
126128
}
127129
offset_idx % 2 == 1
128130
}
131+
132+
/// # Safety
133+
/// The second component of each tuple in `table` must either be:
134+
/// - A valid `char`
135+
/// - A value with the high bit (1 << 22) set, and the lower 22 bits
136+
/// being a valid index into `multi`.
137+
#[inline(always)]
138+
pub(super) unsafe fn case_conversion(
139+
c: char,
140+
ascii_fn: fn(char) -> char,
141+
table: &[(char, u32)],
142+
multi: &[[char; 3]],
143+
) -> [char; 3] {
144+
const INDEX_MASK: u32 = 1 << 22;
145+
146+
if c.is_ascii() {
147+
return [ascii_fn(c), '\0', '\0'];
148+
}
149+
150+
let Ok(i) = table.binary_search_by(|&(key, _)| key.cmp(&c)) else {
151+
return [c, '\0', '\0'];
152+
};
153+
154+
let u = table[i].1;
155+
match char::from_u32(u) {
156+
Option::Some(c) => [c, '\0', '\0'],
157+
Option::None => {
158+
// SAFETY: Index comes from statically generated table
159+
unsafe { *multi.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
160+
}
161+
}
162+
}

library/core/src/unicode/unicode_data.rs

Lines changed: 1322 additions & 1215 deletions
Large diffs are not rendered by default.

library/coretests/tests/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@
113113
#![feature(try_find)]
114114
#![feature(try_trait_v2)]
115115
#![feature(uint_bit_width)]
116+
#![feature(unicode_internals)]
116117
#![feature(unsize)]
117118
#![feature(unwrap_infallible)]
118119
// tidy-alphabetical-end

library/coretests/tests/unicode.rs

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,101 @@
1+
use core::unicode::unicode_data;
2+
use std::ops::RangeInclusive;
3+
4+
mod test_data;
5+
16
/// Sanity check: the major component of `core::char::UNICODE_VERSION` is at
/// least 10.
#[test]
pub fn version() {
    let unicode_major = core::char::UNICODE_VERSION.0;
    assert!(unicode_major >= 10);
}
11+
12+
/// Exhaustively checks a boolean property `lookup` against the expected
/// `ranges`.
///
/// Every `char` inside one of `ranges` must make `lookup` return `true`.
/// Every `char` in the gaps (starting at U+0080 before the first range, then
/// between ranges, then after the last range up to `char::MAX`) must make it
/// return `false`.
///
/// The scan resumes right after each range's end, so `ranges` is assumed to
/// be sorted ascending and non-overlapping.
///
/// # Panics
/// Panics on the first mismatch, with the offending `char` as the message.
#[track_caller]
fn test_boolean_property(ranges: &[RangeInclusive<char>], lookup: fn(char) -> bool) {
    // The cursor is kept as a raw code point: the successor of a range end
    // may be a surrogate (after U+D7FF) or lie past `char::MAX`, neither of
    // which is a `char`. `filter_map(char::from_u32)` skips such values
    // instead of panicking on them.
    let mut start = '\u{80}' as u32;
    for range in ranges {
        for c in (start..*range.start() as u32).filter_map(char::from_u32) {
            assert!(!lookup(c), "{c:?}");
        }
        for c in range.clone() {
            assert!(lookup(c), "{c:?}");
        }
        start = *range.end() as u32 + 1;
    }
    // Empty when the last range ended at `char::MAX`.
    for c in (start..=char::MAX as u32).filter_map(char::from_u32) {
        assert!(!lookup(c), "{c:?}");
    }
}
28+
29+
/// Exhaustively checks a case-mapping function `lookup` against the expected
/// `ranges` table.
///
/// For every `(key, val)` entry, `lookup(key)` must equal `val`. Every other
/// `char` from U+0080 up to `char::MAX` must map to itself, i.e.
/// `[c, '\0', '\0']`.
///
/// The scan resumes right after each key, so `ranges` is assumed to be sorted
/// ascending by key without duplicates.
///
/// # Panics
/// Panics on the first mismatch, with the offending `char` as the message.
#[track_caller]
fn test_case_mapping(ranges: &[(char, [char; 3])], lookup: fn(char) -> [char; 3]) {
    // The cursor is kept as a raw code point: the successor of a key may be a
    // surrogate (after U+D7FF) or lie past `char::MAX`, neither of which is a
    // `char`. `filter_map(char::from_u32)` skips such values instead of
    // panicking on them.
    let mut start = '\u{80}' as u32;
    for &(key, val) in ranges {
        for c in (start..key as u32).filter_map(char::from_u32) {
            assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
        }
        assert_eq!(lookup(key), val, "{key:?}");
        start = key as u32 + 1;
    }
    // Empty when the last key was `char::MAX`.
    for c in (start..=char::MAX as u32).filter_map(char::from_u32) {
        assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
    }
}
43+
44+
/// Checks `unicode_data::alphabetic::lookup` against `test_data::ALPHABETIC`
/// via `test_boolean_property`.
// Ignored under Miri; presumably the exhaustive scan is too slow there (TODO confirm).
#[test]
45+
#[cfg_attr(miri, ignore)]
46+
fn alphabetic() {
47+
test_boolean_property(test_data::ALPHABETIC, unicode_data::alphabetic::lookup);
48+
}
49+
50+
/// Checks `unicode_data::case_ignorable::lookup` against
/// `test_data::CASE_IGNORABLE` via `test_boolean_property`.
// Ignored under Miri; presumably the exhaustive scan is too slow there (TODO confirm).
#[test]
51+
#[cfg_attr(miri, ignore)]
52+
fn case_ignorable() {
53+
test_boolean_property(test_data::CASE_IGNORABLE, unicode_data::case_ignorable::lookup);
54+
}
55+
56+
/// Checks `unicode_data::cased::lookup` against `test_data::CASED` via
/// `test_boolean_property`.
// Ignored under Miri; presumably the exhaustive scan is too slow there (TODO confirm).
#[test]
57+
#[cfg_attr(miri, ignore)]
58+
fn cased() {
59+
test_boolean_property(test_data::CASED, unicode_data::cased::lookup);
60+
}
61+
62+
/// Checks `unicode_data::grapheme_extend::lookup` against
/// `test_data::GRAPHEME_EXTEND` via `test_boolean_property`.
// Ignored under Miri; presumably the exhaustive scan is too slow there (TODO confirm).
#[test]
63+
#[cfg_attr(miri, ignore)]
64+
fn grapheme_extend() {
65+
test_boolean_property(test_data::GRAPHEME_EXTEND, unicode_data::grapheme_extend::lookup);
66+
}
67+
68+
/// Checks `unicode_data::lowercase::lookup` against `test_data::LOWERCASE`
/// via `test_boolean_property`.
// Ignored under Miri; presumably the exhaustive scan is too slow there (TODO confirm).
#[test]
69+
#[cfg_attr(miri, ignore)]
70+
fn lowercase() {
71+
test_boolean_property(test_data::LOWERCASE, unicode_data::lowercase::lookup);
72+
}
73+
74+
/// Checks `unicode_data::n::lookup` against `test_data::N` via
/// `test_boolean_property`.
// Ignored under Miri for consistency with the other exhaustive property
// tests in this file, which all carry the same attribute.
#[test]
#[cfg_attr(miri, ignore)]
fn n() {
    test_boolean_property(test_data::N, unicode_data::n::lookup);
}
78+
79+
/// Checks `unicode_data::uppercase::lookup` against `test_data::UPPERCASE`
/// via `test_boolean_property`.
// Ignored under Miri; presumably the exhaustive scan is too slow there (TODO confirm).
#[test]
80+
#[cfg_attr(miri, ignore)]
81+
fn uppercase() {
82+
test_boolean_property(test_data::UPPERCASE, unicode_data::uppercase::lookup);
83+
}
84+
85+
/// Checks `unicode_data::white_space::lookup` against
/// `test_data::WHITE_SPACE` via `test_boolean_property`.
// Ignored under Miri; presumably the exhaustive scan is too slow there (TODO confirm).
#[test]
86+
#[cfg_attr(miri, ignore)]
87+
fn white_space() {
88+
test_boolean_property(test_data::WHITE_SPACE, unicode_data::white_space::lookup);
89+
}
90+
91+
/// Checks `unicode_data::conversions::to_lower` against `test_data::TO_LOWER`
/// via `test_case_mapping`.
// Ignored under Miri; presumably the exhaustive scan is too slow there (TODO confirm).
#[test]
92+
#[cfg_attr(miri, ignore)]
93+
fn to_lowercase() {
94+
test_case_mapping(test_data::TO_LOWER, unicode_data::conversions::to_lower);
95+
}
96+
97+
/// Checks `unicode_data::conversions::to_upper` against `test_data::TO_UPPER`
/// via `test_case_mapping`.
// Ignored under Miri; presumably the exhaustive scan is too slow there (TODO confirm).
#[test]
98+
#[cfg_attr(miri, ignore)]
99+
fn to_uppercase() {
100+
test_case_mapping(test_data::TO_UPPER, unicode_data::conversions::to_upper);
101+
}

0 commit comments

Comments
 (0)