Skip to content

Commit dc0dcf5

Browse files
committed
refactor: move more functions to unicode/rt.rs
Move the logic for traversing the lookup tables for case conversions into `rt.rs` along with the other helper functions.
1 parent 7f00217 commit dc0dcf5

File tree

3 files changed

+56
-80
lines changed

3 files changed

+56
-80
lines changed

library/core/src/unicode/rt.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,3 +128,30 @@ pub(super) unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
128128
}
129129
offset_idx % 2 == 1
130130
}
131+
132+
#[inline(always)]
133+
pub(super) fn case_conversion(
134+
c: char,
135+
ascii_fn: fn(char) -> char,
136+
table: &[(char, u32)],
137+
multi: &[[char; 3]],
138+
) -> [char; 3] {
139+
const INDEX_MASK: u32 = 1 << 22;
140+
141+
if c.is_ascii() {
142+
return [ascii_fn(c), '\0', '\0'];
143+
}
144+
145+
let Ok(i) = table.binary_search_by(|&(key, _)| key.cmp(&c)) else {
146+
return [c, '\0', '\0'];
147+
};
148+
149+
let u = table[i].1;
150+
match char::from_u32(u) {
151+
Option::Some(c) => [c, '\0', '\0'],
152+
Option::None => {
153+
// SAFETY: Index comes from statically generated table
154+
unsafe { *multi.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
155+
}
156+
}
157+
}

library/core/src/unicode/unicode_data.rs

Lines changed: 14 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -762,46 +762,14 @@ pub mod white_space {
762762
}
763763

764764
pub mod conversions {
765-
const INDEX_MASK: u32 = 1 << 22;
766-
767765
pub fn to_lower(c: char) -> [char; 3] {
768-
if c.is_ascii() {
769-
[(c as u8).to_ascii_lowercase() as char, '\0', '\0']
770-
} else {
771-
LOWERCASE_TABLE
772-
.binary_search_by(|&(key, _)| key.cmp(&c))
773-
.map(|i| {
774-
let u = LOWERCASE_TABLE[i].1;
775-
char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
776-
// SAFETY: Index comes from statically generated table
777-
unsafe {
778-
*LOWERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize)
779-
}
780-
})
781-
})
782-
.unwrap_or([c, '\0', '\0'])
783-
}
766+
super::case_conversion(
767+
c,
768+
|c| c.to_ascii_lowercase(),
769+
LOWERCASE_TABLE,
770+
LOWERCASE_TABLE_MULTI,
771+
)
784772
}
785-
786-
pub fn to_upper(c: char) -> [char; 3] {
787-
if c.is_ascii() {
788-
[(c as u8).to_ascii_uppercase() as char, '\0', '\0']
789-
} else {
790-
UPPERCASE_TABLE
791-
.binary_search_by(|&(key, _)| key.cmp(&c))
792-
.map(|i| {
793-
let u = UPPERCASE_TABLE[i].1;
794-
char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
795-
// SAFETY: Index comes from statically generated table
796-
unsafe {
797-
*UPPERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize)
798-
}
799-
})
800-
})
801-
.unwrap_or([c, '\0', '\0'])
802-
}
803-
}
804-
805773
#[rustfmt::skip]
806774
static LOWERCASE_TABLE: &[(char, u32); 1462] = &[
807775
('\u{c0}', 224), ('\u{c1}', 225), ('\u{c2}', 226), ('\u{c3}', 227), ('\u{c4}', 228),
@@ -1164,6 +1132,14 @@ pub mod conversions {
11641132
['i', '\u{307}', '\u{0}'],
11651133
];
11661134

1135+
pub fn to_upper(c: char) -> [char; 3] {
1136+
super::case_conversion(
1137+
c,
1138+
|c| c.to_ascii_uppercase(),
1139+
UPPERCASE_TABLE,
1140+
UPPERCASE_TABLE_MULTI,
1141+
)
1142+
}
11671143
#[rustfmt::skip]
11681144
static UPPERCASE_TABLE: &[(char, u32); 1554] = &[
11691145
('\u{b5}', 924), ('\u{df}', 4194304), ('\u{e0}', 192), ('\u{e1}', 193), ('\u{e2}', 194),

src/tools/unicode-table-generator/src/case_mapping.rs

Lines changed: 15 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,17 @@ pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2])
99
let (lower_tables, lower_size) = generate_tables("LOWER", &data.to_lower);
1010
let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper);
1111
let file = format!(
12-
"{HEADER}
12+
"
1313
{lower_tables}
1414
{upper_tables}"
1515
);
1616
(file, [lower_size, upper_size])
1717
}
1818

19-
fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize) {
19+
fn generate_tables(prop: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize) {
20+
let prop_lower = prop.to_lowercase();
21+
let prop_upper = prop.to_uppercase();
22+
2023
let mut mappings = Vec::with_capacity(data.len());
2124
let mut multis = Vec::new();
2225

@@ -45,8 +48,16 @@ fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize
4548
let size = size_of_val(mappings.as_slice()) + size_of_val(multis.as_slice());
4649
let file = format!(
4750
"
48-
#[rustfmt::skip]\nstatic {case}CASE_TABLE: &[(char, u32); {mappings_len}] = &[{mappings}];
49-
#[rustfmt::skip]\nstatic {case}CASE_TABLE_MULTI: &[[char; 3]; {multis_len}] = &[{multis}];",
51+
pub fn to_{prop_lower}(c: char) -> [char; 3] {{
52+
super::case_conversion(
53+
c,
54+
|c| c.to_ascii_{prop_lower}case(),
55+
{prop_upper}CASE_TABLE,
56+
{prop_upper}CASE_TABLE_MULTI,
57+
)
58+
}}
59+
#[rustfmt::skip]\nstatic {prop_upper}CASE_TABLE: &[(char, u32); {mappings_len}] = &[{mappings}];
60+
#[rustfmt::skip]\nstatic {prop_upper}CASE_TABLE_MULTI: &[[char; 3]; {multis_len}] = &[{multis}];",
5061
mappings = fmt_list(&mappings),
5162
mappings_len = mappings.len(),
5263
multis = fmt_list(&multis),
@@ -55,41 +66,3 @@ fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, usize
5566

5667
(file, size)
5768
}
58-
59-
static HEADER: &str = r"
60-
const INDEX_MASK: u32 = 1 << 22;
61-
62-
pub fn to_lower(c: char) -> [char; 3] {
63-
if c.is_ascii() {
64-
[(c as u8).to_ascii_lowercase() as char, '\0', '\0']
65-
} else {
66-
LOWERCASE_TABLE
67-
.binary_search_by(|&(key, _)| key.cmp(&c))
68-
.map(|i| {
69-
let u = LOWERCASE_TABLE[i].1;
70-
char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
71-
// SAFETY: Index comes from statically generated table
72-
unsafe { *LOWERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
73-
})
74-
})
75-
.unwrap_or([c, '\0', '\0'])
76-
}
77-
}
78-
79-
pub fn to_upper(c: char) -> [char; 3] {
80-
if c.is_ascii() {
81-
[(c as u8).to_ascii_uppercase() as char, '\0', '\0']
82-
} else {
83-
UPPERCASE_TABLE
84-
.binary_search_by(|&(key, _)| key.cmp(&c))
85-
.map(|i| {
86-
let u = UPPERCASE_TABLE[i].1;
87-
char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
88-
// SAFETY: Index comes from statically generated table
89-
unsafe { *UPPERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
90-
})
91-
})
92-
.unwrap_or([c, '\0', '\0'])
93-
}
94-
}
95-
";

0 commit comments

Comments
 (0)