Skip to content

Add complex case mapping and title case mapping. #26039

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jun 9, 2015
97 changes: 72 additions & 25 deletions src/etc/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,9 @@ def is_surrogate(n):
def load_unicode_data(f):
fetch(f)
gencats = {}
upperlower = {}
lowerupper = {}
to_lower = {}
to_upper = {}
to_title = {}
combines = {}
canon_decomp = {}
compat_decomp = {}
Expand Down Expand Up @@ -103,12 +104,16 @@ def load_unicode_data(f):

# generate char to char direct common and simple conversions
# uppercase to lowercase
if gencat == "Lu" and lowcase != "" and code_org != lowcase:
upperlower[code] = int(lowcase, 16)
if lowcase != "" and code_org != lowcase:
to_lower[code] = (int(lowcase, 16), 0, 0)

# lowercase to uppercase
if gencat == "Ll" and upcase != "" and code_org != upcase:
lowerupper[code] = int(upcase, 16)
if upcase != "" and code_org != upcase:
to_upper[code] = (int(upcase, 16), 0, 0)

# title case
if titlecase.strip() != "" and code_org != titlecase:
to_title[code] = (int(titlecase, 16), 0, 0)

# store decomposition, if given
if decomp != "":
Expand Down Expand Up @@ -144,7 +149,32 @@ def load_unicode_data(f):
gencats = group_cats(gencats)
combines = to_combines(group_cats(combines))

return (canon_decomp, compat_decomp, gencats, combines, lowerupper, upperlower)
return (canon_decomp, compat_decomp, gencats, combines, to_upper, to_lower, to_title)

def load_special_casing(f, to_upper, to_lower, to_title):
    """Merge the unconditional mappings found in file `f` into the case maps.

    `f` is fetched and then read line by line. Everything after a '#' on a
    line is dropped, and the rest is split on ';'. Lines are expected to
    look like one of:

        code; lower; title; upper; <trailing field>
        code; lower; title; upper; condition; <trailing field>

    Lines with any other number of fields are ignored, and so are lines
    whose condition field is non-empty.

    `to_upper`, `to_lower` and `to_title` are dicts that are updated in
    place: for every mapping whose text differs from the code point itself,
    the entry for that code point is replaced by a 3-tuple of code points,
    padded on the right with zeros. Nothing is returned.
    """
    fetch(f)
    for line in fileinput.input(f):
        data = line.split('#')[0].split(';')
        if len(data) == 5:
            code, lower, title, upper, _comment = data
        elif len(data) == 6:
            code, lower, title, upper, condition, _comment = data
            if condition.strip():  # Only keep unconditional mappings
                continue
        else:
            # Blank lines, comment-only lines, and anything else unexpected.
            continue
        code = code.strip()
        lower = lower.strip()
        title = title.strip()
        upper = upper.strip()
        key = int(code, 16)
        for (map_, text) in [(to_lower, lower), (to_upper, upper), (to_title, title)]:
            # A mapping spelled exactly like the code point adds nothing.
            if text == code:
                continue
            values = [int(i, 16) for i in text.split()]
            # Pad with zeros so every entry has exactly three slots.
            values += [0] * (3 - len(values))
            assert len(values) == 3, "mapping for %s has more than 3 code points" % code
            # Stored as a tuple, the same shape load_unicode_data uses for
            # the entries it puts in these maps.
            map_[key] = tuple(values)

def group_cats(cats):
cats_out = {}
Expand Down Expand Up @@ -279,7 +309,7 @@ def load_east_asian_width(want_widths, except_cats):
return widths

def escape_char(c):
return "'\\u{%x}'" % c
return "'\\u{%x}'" % c if c != 0 else "'\\0'"

def emit_bsearch_range_table(f):
f.write("""
Expand Down Expand Up @@ -319,7 +349,7 @@ def emit_property_module(f, mod, tbl, emit):
f.write(" }\n\n")
f.write("}\n\n")

def emit_conversions_module(f, lowerupper, upperlower):
def emit_conversions_module(f, to_upper, to_lower, to_title):
f.write("pub mod conversions {")
f.write("""
use core::cmp::Ordering::{Equal, Less, Greater};
Expand All @@ -328,21 +358,28 @@ def emit_conversions_module(f, lowerupper, upperlower):
use core::option::Option::{Some, None};
use core::result::Result::{Ok, Err};

pub fn to_lower(c: char) -> char {
match bsearch_case_table(c, LuLl_table) {
None => c,
Some(index) => LuLl_table[index].1
pub fn to_lower(c: char) -> [char; 3] {
match bsearch_case_table(c, to_lowercase_table) {
None => [c, '\\0', '\\0'],
Some(index) => to_lowercase_table[index].1
}
}

pub fn to_upper(c: char) -> [char; 3] {
match bsearch_case_table(c, to_uppercase_table) {
None => [c, '\\0', '\\0'],
Some(index) => to_uppercase_table[index].1
}
}

pub fn to_upper(c: char) -> char {
match bsearch_case_table(c, LlLu_table) {
None => c,
Some(index) => LlLu_table[index].1
pub fn to_title(c: char) -> [char; 3] {
match bsearch_case_table(c, to_titlecase_table) {
None => [c, '\\0', '\\0'],
Some(index) => to_titlecase_table[index].1
}
}

fn bsearch_case_table(c: char, table: &'static [(char, char)]) -> Option<usize> {
fn bsearch_case_table(c: char, table: &'static [(char, [char; 3])]) -> Option<usize> {
match table.binary_search_by(|&(key, _)| {
if c == key { Equal }
else if key < c { Less }
Expand All @@ -354,10 +391,18 @@ def emit_conversions_module(f, lowerupper, upperlower):
}

""")
emit_table(f, "LuLl_table",
sorted(upperlower.iteritems(), key=operator.itemgetter(0)), is_pub=False)
emit_table(f, "LlLu_table",
sorted(lowerupper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
t_type = "&'static [(char, [char; 3])]"
pfun = lambda x: "(%s,[%s,%s,%s])" % (
escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2]))
emit_table(f, "to_lowercase_table",
sorted(to_lower.iteritems(), key=operator.itemgetter(0)),
is_pub=False, t_type = t_type, pfun=pfun)
emit_table(f, "to_uppercase_table",
sorted(to_upper.iteritems(), key=operator.itemgetter(0)),
is_pub=False, t_type = t_type, pfun=pfun)
emit_table(f, "to_titlecase_table",
sorted(to_title.iteritems(), key=operator.itemgetter(0)),
is_pub=False, t_type = t_type, pfun=pfun)
f.write("}\n\n")

def emit_grapheme_module(f, grapheme_table, grapheme_cats):
Expand Down Expand Up @@ -591,8 +636,10 @@ def optimize_width_table(wtable):
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
""" % unicode_version)
(canon_decomp, compat_decomp, gencats, combines,
lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
to_upper, to_lower, to_title) = load_unicode_data("UnicodeData.txt")
load_special_casing("SpecialCasing.txt", to_upper, to_lower, to_title)
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase",
"Cased", "Case_Ignorable"]
derived = load_properties("DerivedCoreProperties.txt", want_derived)
scripts = load_properties("Scripts.txt", [])
props = load_properties("PropList.txt",
Expand All @@ -611,7 +658,7 @@ def optimize_width_table(wtable):

# normalizations and conversions module
emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
emit_conversions_module(rf, lowerupper, upperlower)
emit_conversions_module(rf, to_upper, to_lower, to_title)

### character width module
width_table = []
Expand Down
30 changes: 27 additions & 3 deletions src/libcollections/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1851,11 +1851,35 @@ impl str {
/// let s = "HELLO";
/// assert_eq!(s.to_lowercase(), "hello");
/// ```
#[unstable(feature = "collections")]
#[stable(feature = "unicode_case_mapping", since = "1.2.0")]
pub fn to_lowercase(&self) -> String {
let mut s = String::with_capacity(self.len());
s.extend(self[..].chars().flat_map(|c| c.to_lowercase()));
for (i, c) in self[..].char_indices() {
if c == 'Σ' {
map_uppercase_sigma(self, i, &mut s)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a comment explaining why this is special-cased here?

} else {
s.extend(c.to_lowercase());
}
}
return s;

#[cold]
#[inline(never)]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Were you able to get a measurable improvement with these attributes? If not I'd personally recommend leaving them off for now.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did not take any measurement, I just imitated Vec::push. I’ll remove them.

fn map_uppercase_sigma(from: &str, i: usize, to: &mut String) {
debug_assert!('Σ'.len_utf8() == 2);
let is_word_final =
case_ignoreable_then_cased(from[..i].chars().rev()) &&
!case_ignoreable_then_cased(from[i + 2..].chars());
to.push_str(if is_word_final { "ς" } else { "σ" });
}

fn case_ignoreable_then_cased<I: Iterator<Item=char>>(iter: I) -> bool {
use rustc_unicode::derived_property::{Cased, Case_Ignorable};
match iter.skip_while(|&c| Case_Ignorable(c)).next() {
Some(c) => Cased(c),
None => false,
}
}
}

/// Returns the uppercase equivalent of this string.
Expand All @@ -1868,7 +1892,7 @@ impl str {
/// let s = "hello";
/// assert_eq!(s.to_uppercase(), "HELLO");
/// ```
#[unstable(feature = "collections")]
#[stable(feature = "unicode_case_mapping", since = "1.2.0")]
pub fn to_uppercase(&self) -> String {
let mut s = String::with_capacity(self.len());
s.extend(self[..].chars().flat_map(|c| c.to_uppercase()));
Expand Down
42 changes: 42 additions & 0 deletions src/libcollectionstest/char.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use collections::vec::Vec;

/// `char::to_lowercase` yields the expected characters for a few inputs.
#[test]
fn char_to_lowercase() {
    // (input, expected lowercase expansion)
    let cases: [(char, &[char]); 3] = [
        ('A', &['a']),
        ('É', &['é']),
        ('Dž', &['dž']),
    ];
    for &(input, expected) in cases.iter() {
        assert_iter_eq(input.to_lowercase(), expected);
    }
}

/// `char::to_uppercase` yields the expected characters, including inputs
/// whose uppercase form is more than one character long.
#[test]
fn char_to_uppercase() {
    // (input, expected uppercase expansion)
    let cases: [(char, &[char]); 6] = [
        ('a', &['A']),
        ('é', &['É']),
        ('Dž', &['DŽ']),
        ('ß', &['S', 'S']),
        ('fi', &['F', 'I']),
        ('ᾀ', &['Ἀ', 'Ι']),
    ];
    for &(input, expected) in cases.iter() {
        assert_iter_eq(input.to_uppercase(), expected);
    }
}

/// `char::to_titlecase` yields the expected characters, including inputs
/// whose titlecase form is more than one character long.
#[test]
fn char_to_titlecase() {
    // (input, expected titlecase expansion)
    let cases: [(char, &[char]); 6] = [
        ('a', &['A']),
        ('é', &['É']),
        ('DŽ', &['Dž']),
        ('ß', &['S', 's']),
        ('fi', &['F', 'i']),
        ('ᾀ', &['ᾈ']),
    ];
    for &(input, expected) in cases.iter() {
        assert_iter_eq(input.to_titlecase(), expected);
    }
}

/// Panics unless `iter` yields exactly the characters in `expected`, in order.
fn assert_iter_eq<I>(iter: I, expected: &[char])
where
    I: Iterator<Item = char>,
{
    let produced: Vec<_> = iter.collect();
    assert_eq!(produced, expected);
}
1 change: 1 addition & 0 deletions src/libcollectionstest/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ extern crate rustc_unicode;
mod binary_heap;
mod bit;
mod btree;
mod char; // char isn't really a collection, but didn't find a better place for this.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mumble mumble collection of code points mumble mumble

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A char is one code point.

mod enum_set;
mod fmt;
mod linked_list;
Expand Down
13 changes: 13 additions & 0 deletions src/libcollectionstest/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1687,6 +1687,19 @@ fn trim_ws() {
"");
}

#[test]
fn to_lowercase() {
assert_eq!("".to_lowercase(), "");
// https://github.com/rust-lang/rust/issues/26035
assert_eq!("'Σ AÉΣ'Σ'' Σ DžΣ".to_lowercase(), "'σ aéσ'ς'' σ džς");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are a few other conditions in the check for the sigma at the end; could this add some test cases for those?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What cases are you thinking of? End of a word, of the string?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, specifically: there's a boolean is_word_final, and this only exercises one of the two code paths (i.e. only the true case, I believe).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All σ’s come from the false case, but I wrote more tests anyway.

}

/// `str::to_uppercase` on an empty string and on a string mixing several
/// kinds of characters.
#[test]
fn to_uppercase() {
    let empty = "";
    assert_eq!(empty.to_uppercase(), "");

    // Some of these characters uppercase to more than one character, so the
    // expected string is longer than the input.
    let input = "aéDžßfiᾀ";
    let expected = "AÉDŽSSFIἈΙ";
    assert_eq!(input.to_uppercase(), expected);
}

mod pattern {
use std::str::pattern::Pattern;
use std::str::pattern::{Searcher, ReverseSearcher};
Expand Down
Loading