Skip to content

Commit a8c6694

Browse files
committed
optimization: Don't include ASCII characters in Unicode tables
The ASCII subset of Unicode is fixed and will never change, so we don't need to generate tables for it with every new Unicode version. This saves a few bytes of static data and speeds up `char::is_control` and `char::is_grapheme_extended` on ASCII inputs. Since the table lookup functions exported from the `unicode` module now return nonsensical results on ASCII input (and will in fact panic in debug mode, via a `debug_assert!`), I had to add some internal wrapper methods to `char` (doc-hidden and unstable) which check for ASCII-ness first.
1 parent fbd8f95 commit a8c6694

File tree

7 files changed

+320
-247
lines changed

7 files changed

+320
-247
lines changed

library/alloc/src/str.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -418,9 +418,8 @@ impl str {
418418
}
419419

420420
fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
421-
use core::unicode::{Case_Ignorable, Cased};
422-
match iter.skip_while(|&c| Case_Ignorable(c)).next() {
423-
Some(c) => Cased(c),
421+
match iter.skip_while(|&c| c.is_case_ignorable()).next() {
422+
Some(c) => c.is_cased(),
424423
None => false,
425424
}
426425
}

library/core/src/char/methods.rs

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -969,7 +969,43 @@ impl char {
969969
#[must_use]
970970
#[inline]
971971
pub(crate) fn is_grapheme_extended(self) -> bool {
972-
unicode::Grapheme_Extend(self)
972+
!self.is_ascii() && unicode::Grapheme_Extend(self)
973+
}
974+
975+
/// Returns `true` if this `char` has the `Cased` property.
976+
///
977+
/// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
978+
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
979+
///
980+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
981+
/// [ucd]: https://www.unicode.org/reports/tr44/
982+
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
983+
#[must_use]
984+
#[inline]
985+
#[doc(hidden)]
986+
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
987+
pub fn is_cased(self) -> bool {
988+
if self.is_ascii() { self.is_ascii_alphabetic() } else { unicode::Cased(self) }
989+
}
990+
991+
/// Returns `true` if this `char` has the `Case_Ignorable` property.
992+
///
993+
/// `Case_Ignorable` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
994+
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
995+
///
996+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
997+
/// [ucd]: https://www.unicode.org/reports/tr44/
998+
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
999+
#[must_use]
1000+
#[inline]
1001+
#[doc(hidden)]
1002+
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
1003+
pub fn is_case_ignorable(self) -> bool {
1004+
if self.is_ascii() {
1005+
matches!(self, '\'' | '.' | ':' | '^' | '`')
1006+
} else {
1007+
unicode::Case_Ignorable(self)
1008+
}
9731009
}
9741010

9751011
/// Returns `true` if this `char` has one of the general categories for numbers.

library/core/src/unicode/unicode_data.rs

Lines changed: 276 additions & 243 deletions
Large diffs are not rendered by default.

src/tools/unicode-table-generator/src/cascading_map.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ impl RawEmitter {
6464

6565
writeln!(&mut self.file, "#[inline]").unwrap();
6666
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
67+
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
6768
writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap();
6869
for arm in arms {
6970
writeln!(&mut self.file, " {arm},").unwrap();

src/tools/unicode-table-generator/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ fn load_data() -> UnicodeData {
195195
.into_iter()
196196
.flatten()
197197
.flat_map(|cp| cp.scalar())
198+
.filter(|c| !c.is_ascii())
198199
.map(u32::from)
199200
.collect::<Vec<_>>();
200201
(prop, ranges_from_set(&codepoints))

src/tools/unicode-table-generator/src/raw_emitter.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ impl RawEmitter {
9898
self.blank_line();
9999

100100
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
101+
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
101102
if first_code_point > 0x7f {
102103
writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} &&").unwrap();
103104
}

src/tools/unicode-table-generator/src/skiplist.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ impl RawEmitter {
9999
if first_code_point > 0x7f {
100100
writeln!(&mut self.file, "#[inline]").unwrap();
101101
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
102+
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
102103
writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} && lookup_slow(c)")
103104
.unwrap();
104105
writeln!(&mut self.file, "}}").unwrap();
@@ -107,6 +108,7 @@ impl RawEmitter {
107108
writeln!(&mut self.file, "fn lookup_slow(c: char) -> bool {{").unwrap();
108109
} else {
109110
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
111+
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
110112
}
111113
writeln!(&mut self.file, " const {{").unwrap();
112114
writeln!(

0 commit comments

Comments
 (0)