From 592d99d3388ea0764bebb621346c7b292d2e464b Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 11 Jul 2025 12:12:22 +0800 Subject: [PATCH 01/13] add benchmark --- Cargo.toml | 6 ++++++ benches/chars.rs | 4 ++-- benches/unicode_word_indices.rs | 37 +++++++++++++++++++++++++++++++++ benches/word_bounds.rs | 2 +- benches/words.rs | 4 ++-- 5 files changed, 48 insertions(+), 5 deletions(-) create mode 100644 benches/unicode_word_indices.rs diff --git a/Cargo.toml b/Cargo.toml index 404f053..1aac6ea 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ no_std = [] # This is a no-op, preserved for backward compatibility only. [dev-dependencies] quickcheck = "0.7" criterion = "0.5" +proptest = "1.7.0" [[bench]] name = "chars" @@ -36,3 +37,8 @@ harness = false [[bench]] name = "word_bounds" harness = false + +[[bench]] +name = "unicode_word_indices" +harness = false + diff --git a/benches/chars.rs b/benches/chars.rs index bacffa1..2654a26 100644 --- a/benches/chars.rs +++ b/benches/chars.rs @@ -41,7 +41,7 @@ fn bench_all(c: &mut Criterion) { for file in FILES { group.bench_with_input( BenchmarkId::new("grapheme", file), - &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(), |b, content| b.iter(|| grapheme(content)), ); } @@ -49,7 +49,7 @@ fn bench_all(c: &mut Criterion) { for file in FILES { group.bench_with_input( BenchmarkId::new("scalar", file), - &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(), |b, content| b.iter(|| scalar(content)), ); } diff --git a/benches/unicode_word_indices.rs b/benches/unicode_word_indices.rs new file mode 100644 index 0000000..4c09404 --- /dev/null +++ b/benches/unicode_word_indices.rs @@ -0,0 +1,37 @@ +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; + +use std::fs; +use unicode_segmentation::UnicodeSegmentation; + +const FILES: &[&str] = &[ + "log", //"arabic", + "english", + //"hindi", + "japanese", + //"korean", + //"mandarin", + //"russian", + //"source_code", +]; + +#[inline(always)] +fn grapheme(text: &str) { + for w in text.unicode_word_indices() { + black_box(w); + } +} + +fn bench_all(c: &mut Criterion) { + let mut group = c.benchmark_group("unicode_word_indices"); + + for file in FILES { + let input = fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(); + group.throughput(criterion::Throughput::Bytes(input.len() as u64)); + group.bench_with_input(BenchmarkId::from_parameter(file), &input, |b, content| { + b.iter(|| grapheme(content)) + }); + } +} + +criterion_group!(benches, bench_all); +criterion_main!(benches); diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs index 42d50ff..f1af7c4 100644 --- a/benches/word_bounds.rs +++ b/benches/word_bounds.rs @@ -27,7 +27,7 @@ fn bench_all(c: &mut Criterion) { for file in FILES { group.bench_with_input( BenchmarkId::new("grapheme", file), - &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + &fs::read_to_string(format!("benches/texts/{file}.txt",)).unwrap(), |b, content| b.iter(|| grapheme(content)), ); } diff --git a/benches/words.rs b/benches/words.rs index 86785d5..508bc9f 100644 --- a/benches/words.rs +++ b/benches/words.rs @@ -41,7 +41,7 @@ fn bench_all(c: &mut Criterion) { for file in FILES { group.bench_with_input( BenchmarkId::new("grapheme", file), - &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(), |b, content| b.iter(|| grapheme(content)), ); } @@ -49,7 +49,7 @@ fn bench_all(c: &mut Criterion) { for file in FILES { group.bench_with_input( BenchmarkId::new("scalar", file), - &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(), |b, content| b.iter(|| scalar(content)), ); } From eca90432d2943af1d5040d61b39e05a16780949e Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 11 Jul 2025 13:08:47 +0800 Subject: [PATCH 02/13] add ascii fastpath --- src/lib.rs | 25 +++-- src/word.rs | 306 +++++++++++++++++++++++++++++++--------------------- 2 files changed, 196 insertions(+), 135 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c8ec5b5..7672eb2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -56,11 +56,14 @@ )] #![no_std] +#[cfg(test)] +extern crate std; + pub use grapheme::{GraphemeCursor, GraphemeIncomplete}; pub use grapheme::{GraphemeIndices, Graphemes}; pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences}; pub use tables::UNICODE_VERSION; -pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords}; +pub use word::{UWordBoundIndices, UWordBounds}; mod grapheme; mod sentence; @@ -133,7 +136,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uw1[..], b); /// ``` - fn unicode_words(&self) -> UnicodeWords<'_>; + fn unicode_words(&self) -> impl Iterator; /// Returns an iterator over the words of `self`, separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their @@ -157,7 +160,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uwi1[..], b); /// ``` - fn unicode_word_indices(&self) -> UnicodeWordIndices<'_>; + fn unicode_word_indices(&self) -> impl Iterator; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). @@ -173,7 +176,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swu1[..], b); /// ``` - fn split_word_bounds(&self) -> UWordBounds<'_>; + fn split_word_bounds(&self) -> impl DoubleEndedIterator; /// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries, /// and their offsets. See `split_word_bounds()` for more information. @@ -188,7 +191,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swi1[..], b); /// ``` - fn split_word_bound_indices(&self) -> UWordBoundIndices<'_>; + fn split_word_bound_indices(&self) -> impl DoubleEndedIterator; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). @@ -210,7 +213,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&us1[..], b); /// ``` - fn unicode_sentences(&self) -> UnicodeSentences<'_>; + fn unicode_sentences(&self) -> impl Iterator; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). @@ -258,27 +261,27 @@ impl UnicodeSegmentation for str { } #[inline] - fn unicode_words(&self) -> UnicodeWords { + fn unicode_words(&self) -> impl Iterator { word::new_unicode_words(self) } #[inline] - fn unicode_word_indices(&self) -> UnicodeWordIndices { + fn unicode_word_indices(&self) -> impl Iterator { word::new_unicode_word_indices(self) } #[inline] - fn split_word_bounds(&self) -> UWordBounds { + fn split_word_bounds(&self) -> impl DoubleEndedIterator { word::new_word_bounds(self) } #[inline] - fn split_word_bound_indices(&self) -> UWordBoundIndices { + fn split_word_bound_indices(&self) -> impl DoubleEndedIterator { word::new_word_bound_indices(self) } #[inline] - fn unicode_sentences(&self) -> UnicodeSentences { + fn unicode_sentences(&self) -> impl Iterator { sentence::new_unicode_sentences(self) } diff --git a/src/word.rs b/src/word.rs index b2a85ae..964cdc0 100644 --- a/src/word.rs +++ b/src/word.rs @@ -9,85 +9,11 @@ // except according to those terms. use core::cmp; -use core::iter::Filter; -use crate::tables::word::WordCat; - -/// An iterator over the substrings of a string which, after splitting the string on -/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), -/// contain any characters with the -/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) -/// property, or with -/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). -/// -/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See -/// its documentation for more. -/// -/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words -/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html -#[derive(Debug)] -pub struct UnicodeWords<'a> { - inner: Filter, fn(&&str) -> bool>, -} - -impl<'a> Iterator for UnicodeWords<'a> { - type Item = &'a str; - - #[inline] - fn next(&mut self) -> Option<&'a str> { - self.inner.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() - } -} -impl<'a> DoubleEndedIterator for UnicodeWords<'a> { - #[inline] - fn next_back(&mut self) -> Option<&'a str> { - self.inner.next_back() - } -} - -/// An iterator over the substrings of a string which, after splitting the string on -/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), -/// contain any characters with the -/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) -/// property, or with -/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). -/// This iterator also provides the byte offsets for each substring. -/// -/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See -/// its documentation for more. -/// -/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices -/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html -#[derive(Debug)] -pub struct UnicodeWordIndices<'a> { - #[allow(clippy::type_complexity)] - inner: Filter, fn(&(usize, &str)) -> bool>, -} +extern crate alloc; +use alloc::boxed::Box; -impl<'a> Iterator for UnicodeWordIndices<'a> { - type Item = (usize, &'a str); - - #[inline] - fn next(&mut self) -> Option<(usize, &'a str)> { - self.inner.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() - } -} -impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> { - #[inline] - fn next_back(&mut self) -> Option<(usize, &'a str)> { - self.inner.next_back() - } -} +use crate::tables::word::WordCat; /// External iterator for a string's /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). @@ -117,24 +43,6 @@ pub struct UWordBoundIndices<'a> { iter: UWordBounds<'a>, } -impl<'a> UWordBoundIndices<'a> { - #[inline] - /// View the underlying data (the part yet to be iterated) as a slice of the original string. - /// - /// ```rust - /// # use unicode_segmentation::UnicodeSegmentation; - /// let mut iter = "Hello world".split_word_bound_indices(); - /// assert_eq!(iter.as_str(), "Hello world"); - /// iter.next(); - /// assert_eq!(iter.as_str(), " world"); - /// iter.next(); - /// assert_eq!(iter.as_str(), "world"); - /// ``` - pub fn as_str(&self) -> &'a str { - self.iter.as_str() - } -} - impl<'a> Iterator for UWordBoundIndices<'a> { type Item = (usize, &'a str); @@ -677,22 +585,6 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> { } impl<'a> UWordBounds<'a> { - #[inline] - /// View the underlying data (the part yet to be iterated) as a slice of the original string. - /// - /// ```rust - /// # use unicode_segmentation::UnicodeSegmentation; - /// let mut iter = "Hello world".split_word_bounds(); - /// assert_eq!(iter.as_str(), "Hello world"); - /// iter.next(); - /// assert_eq!(iter.as_str(), " world"); - /// iter.next(); - /// assert_eq!(iter.as_str(), "world"); - /// ``` - pub fn as_str(&self) -> &'a str { - self.string - } - #[inline] fn get_next_cat(&self, idx: usize) -> Option { use crate::tables::word as wd; @@ -736,33 +628,161 @@ pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> { #[inline] fn has_alphanumeric(s: &&str) -> bool { - use crate::tables::util::is_alphanumeric; - - s.chars().any(is_alphanumeric) + s.chars().any(|c| c.is_alphanumeric()) } #[inline] -pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { - use super::UnicodeSegmentation; +fn has_ascii_alphanumeric(s: &&str) -> bool { + s.bytes().any(|b| b.is_ascii_alphanumeric()) +} + +/// Fast-path for ASCII-only word segmentation, matching `unicode-segmentation` on pure ASCII: +/// • runs of ASCII spaces are grouped (`" "`) +/// • core-runs (letters, digits, underscore + infix) +/// • any other ASCII char emits as one token, except CR+LF emits as a single two-char token +pub fn new_ascii_word_bound_indices<'a>(s: &'a str) -> impl Iterator + 'a { + #[inline(always)] + fn is_core(b: u8) -> bool { + b.is_ascii_alphanumeric() || b == b'_' + } + #[inline(always)] + fn is_infix(b: u8, prev: u8, next: u8) -> bool { + match b { + // numeric separators + b'.' | b',' | b';' | b'\'' if prev.is_ascii_digit() && next.is_ascii_digit() => true, + // apostrophe in contractions + b'\'' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true, + // dot/colon inside letters + b'.' | b':' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true, + _ => false, + } + } + + use core::iter::from_fn; + let mut rest = s; + let mut offset = 0; + + from_fn(move || { + if rest.is_empty() { + return None; + } + let bytes = rest.as_bytes(); + let len = bytes.len(); + + // 1) Group runs of spaces + if bytes[0] == b' ' { + let mut i = 1; + while i < len && bytes[i] == b' ' { + i += 1; + } + let word = &rest[..i]; + let pos = offset; + rest = &rest[i..]; + offset += i; + return Some((pos, word)); + } - UnicodeWords { - inner: s.split_word_bounds().filter(has_alphanumeric), + // 2) Core-run (letters/digits/underscore + infix) + if is_core(bytes[0]) { + let mut i = 1; + while i < len { + let b = bytes[i]; + if is_core(b) || (i + 1 < len && is_infix(b, bytes[i - 1], bytes[i + 1])) { + i += 1; + } else { + break; + } + } + let word = &rest[..i]; + let pos = offset; + rest = &rest[i..]; + offset += i; + return Some((pos, word)); + } + + // 3) Non-core: CR+LF as one token, otherwise single char + if bytes[0] == b'\r' && len >= 2 && bytes[1] == b'\n' { + let word = &rest[..2]; + let pos = offset; + rest = &rest[2..]; + offset += 2; + Some((pos, word)) + } else { + // emit exactly one byte (whitespace/control/punct) + let word = &rest[..1]; + let pos = offset; + rest = &rest[1..]; + offset += 1; + Some((pos, word)) + } + }) +} +/// An iterator over the substrings of a string which, after splitting the string on +/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), +/// contain any characters with the +/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) +/// property, or with +/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). +/// +/// This method is accessed by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See +/// its documentation for more. +/// +/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words +/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +#[inline] +pub(crate) fn new_unicode_words(s: &str) -> Box + '_> { + if s.is_ascii() { + Box::new(new_unicode_words_ascii(s)) + } else { + Box::new(new_unicode_words_general(s)) } } #[inline] -pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> { - use super::UnicodeSegmentation; +fn new_unicode_words_ascii<'a>(s: &'a str) -> impl Iterator + 'a { + new_ascii_word_bound_indices(s) + .map(|(_, w)| w) + .filter(|w| w.chars().any(|c| c.is_ascii_alphanumeric())) +} + +#[inline] +fn new_unicode_words_general<'a>(s: &'a str) -> impl Iterator + 'a { + new_word_bounds(s).filter(has_alphanumeric) +} - UnicodeWordIndices { - inner: s - .split_word_bound_indices() - .filter(|(_, c)| has_alphanumeric(c)), +/// An iterator over the substrings of a string which, after splitting the string on +/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), +/// contain any characters with the +/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) +/// property, or with +/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). +/// This iterator also provides the byte offsets for each substring. +/// +/// This method is accessed by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See +/// its documentation for more. +/// +/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices +/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +#[inline] +pub fn new_unicode_word_indices<'a>(s: &'a str) -> Box + 'a> { + if s.is_ascii() { + Box::new(new_ascii_word_bound_indices(s).filter(|(_, w)| has_ascii_alphanumeric(w))) + } else { + Box::new(new_word_bound_indices(s).filter(|(_, w)| has_alphanumeric(w))) } } #[cfg(test)] mod tests { + use crate::word::{ + new_ascii_word_bound_indices, new_unicode_words_ascii, new_word_bound_indices, + }; + use std::string::String; + use std::vec::Vec; + use std::{format, vec}; + + use proptest::prelude::*; + #[test] fn test_syriac_abbr_mark() { use crate::tables::word as wd; @@ -776,4 +796,42 @@ mod tests { let (_, _, cat) = wd::word_category('\u{6dd}'); assert_eq!(cat, wd::WC_Numeric); } + + #[test] + fn test_ascii_word_indices_various_cases() { + let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com"; + let words: Vec<&str> = new_unicode_words_ascii(s).collect(); + let expected = vec![ + ("Hello"), // simple letters + ("world"), // skip comma+space, stop at '!' + ("can't"), // apostrophe joins letters + ("e.g"), + ("var1"), + ("123,456"), // digits+comma+digits + ("foo_bar"), + ("example.com"), + ]; + assert_eq!(words, expected); + } + + /// Strategy that yields every code-point from NUL (0) to DEL (127). + fn ascii_char() -> impl Strategy { + (0u8..=127).prop_map(|b| b as char) + } + + proptest! { + #![proptest_config(ProptestConfig::with_cases(10000))] + /// Fast path must equal general path for any ASCII input. + #[test] + fn proptest_ascii_matches_unicode_word_indices( + // Vec → String, length 0‒99 + s in proptest::collection::vec(ascii_char(), 0..100) + .prop_map(|v| v.into_iter().collect::()) + ) { + let fast: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).collect(); + let uni: Vec<(usize, &str)> = new_word_bound_indices(&s).collect(); + + prop_assert_eq!(fast, uni); + } + } } From b5ed407d84bdac57ffae8f968973bf33496a0326 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 11 Jul 2025 14:27:30 +0800 Subject: [PATCH 03/13] add test case IP --- src/word.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/word.rs b/src/word.rs index 964cdc0..c690eb6 100644 --- a/src/word.rs +++ b/src/word.rs @@ -799,7 +799,7 @@ mod tests { #[test] fn test_ascii_word_indices_various_cases() { - let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com"; + let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com 127.0.0.1:9090"; let words: Vec<&str> = new_unicode_words_ascii(s).collect(); let expected = vec![ ("Hello"), // simple letters @@ -810,6 +810,8 @@ mod tests { ("123,456"), // digits+comma+digits ("foo_bar"), ("example.com"), + ("127.0.0.1"), + ("9090"), // port number ]; assert_eq!(words, expected); } From 9b1b7f998451d87c0fb8481c756778a4961f122c Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Sun, 13 Jul 2025 18:30:30 +0800 Subject: [PATCH 04/13] add log to benches --- benches/texts/log.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 benches/texts/log.txt diff --git a/benches/texts/log.txt b/benches/texts/log.txt new file mode 100644 index 0000000..e18ca32 --- /dev/null +++ b/benches/texts/log.txt @@ -0,0 +1 @@ +2018-07-12 13:59:01 UTC | ERROR | (worker.go:131 in process) | Too many errors for endpoint 'dummy/api/v1/check_run?api_key=*************************00000': retrying later From 6f96a23b0234a078e05028563198d8c04c48b2e5 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 15 Jul 2025 19:55:08 +0800 Subject: [PATCH 05/13] restore iterators --- src/lib.rs | 18 ++-- src/word.rs | 280 ++++++++++++++++++++++++++++++++++------------------ 2 files changed, 193 insertions(+), 105 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 7672eb2..1dbdd73 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -65,6 +65,8 @@ pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences}; pub use tables::UNICODE_VERSION; pub use word::{UWordBoundIndices, UWordBounds}; +use crate::word::{UnicodeWordIndices, UnicodeWords}; + mod grapheme; mod sentence; #[rustfmt::skip] @@ -136,7 +138,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uw1[..], b); /// ``` - fn unicode_words(&self) -> impl Iterator; + fn unicode_words(&self) -> UnicodeWords; /// Returns an iterator over the words of `self`, separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their @@ -160,7 +162,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uwi1[..], b); /// ``` - fn unicode_word_indices(&self) -> impl Iterator; + fn unicode_word_indices(&self) -> UnicodeWordIndices; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). @@ -176,7 +178,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swu1[..], b); /// ``` - fn split_word_bounds(&self) -> impl DoubleEndedIterator; + fn split_word_bounds(&self) -> UWordBounds; /// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries, /// and their offsets. See `split_word_bounds()` for more information. @@ -191,7 +193,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swi1[..], b); /// ``` - fn split_word_bound_indices(&self) -> impl DoubleEndedIterator; + fn split_word_bound_indices(&self) -> UWordBoundIndices; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). @@ -261,22 +263,22 @@ impl UnicodeSegmentation for str { } #[inline] - fn unicode_words(&self) -> impl Iterator { + fn unicode_words(&self) -> UnicodeWords { word::new_unicode_words(self) } #[inline] - fn unicode_word_indices(&self) -> impl Iterator { + fn unicode_word_indices(&self) -> UnicodeWordIndices { word::new_unicode_word_indices(self) } #[inline] - fn split_word_bounds(&self) -> impl DoubleEndedIterator { + fn split_word_bounds(&self) -> UWordBounds { word::new_word_bounds(self) } #[inline] - fn split_word_bound_indices(&self) -> impl DoubleEndedIterator { + fn split_word_bound_indices(&self) -> UWordBoundIndices { word::new_word_bound_indices(self) } diff --git a/src/word.rs b/src/word.rs index c690eb6..1d1d69c 100644 --- a/src/word.rs +++ b/src/word.rs @@ -8,13 +8,82 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use core::cmp; - extern crate alloc; use alloc::boxed::Box; +use core::cmp; +use core::iter::Filter; use crate::tables::word::WordCat; +/// An iterator over the substrings of a string which, after splitting the string on +/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), +/// contain any characters with the +/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) +/// property, or with +/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). +/// +/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See +/// its documentation for more. +/// +/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words +/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +pub struct UnicodeWords<'a> { + inner: Box + 'a>, +} + +impl<'a> Iterator for UnicodeWords<'a> { + type Item = &'a str; + + #[inline] + fn next(&mut self) -> Option<&'a str> { + self.inner.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +/// An iterator over the substrings of a string which, after splitting the string on +/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), +/// contain any characters with the +/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) +/// property, or with +/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). +/// This iterator also provides the byte offsets for each substring. +/// +/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See +/// its documentation for more. +/// +/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices +/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +#[derive(Debug)] +pub struct UnicodeWordIndices<'a> { + #[allow(clippy::type_complexity)] + inner: Filter, fn(&(usize, &str)) -> bool>, +} + +impl<'a> Iterator for UnicodeWordIndices<'a> { + type Item = (usize, &'a str); + + #[inline] + fn next(&mut self) -> Option<(usize, &'a str)> { + self.inner.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} +impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> { + #[inline] + fn next_back(&mut self) -> Option<(usize, &'a str)> { + self.inner.next_back() + } +} + /// External iterator for a string's /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). /// @@ -43,6 +112,24 @@ pub struct UWordBoundIndices<'a> { iter: UWordBounds<'a>, } +impl<'a> UWordBoundIndices<'a> { + #[inline] + /// View the underlying data (the part yet to be iterated) as a slice of the original string. + /// + /// ```rust + /// # use unicode_segmentation::UnicodeSegmentation; + /// let mut iter = "Hello world".split_word_bound_indices(); + /// assert_eq!(iter.as_str(), "Hello world"); + /// iter.next(); + /// assert_eq!(iter.as_str(), " world"); + /// iter.next(); + /// assert_eq!(iter.as_str(), "world"); + /// ``` + pub fn as_str(&self) -> &'a str { + self.iter.as_str() + } +} + impl<'a> Iterator for UWordBoundIndices<'a> { type Item = (usize, &'a str); @@ -585,6 +672,22 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> { } impl<'a> UWordBounds<'a> { + #[inline] + /// View the underlying data (the part yet to be iterated) as a slice of the original string. + /// + /// ```rust + /// # use unicode_segmentation::UnicodeSegmentation; + /// let mut iter = "Hello world".split_word_bounds(); + /// assert_eq!(iter.as_str(), "Hello world"); + /// iter.next(); + /// assert_eq!(iter.as_str(), " world"); + /// iter.next(); + /// assert_eq!(iter.as_str(), "world"); + /// ``` + pub fn as_str(&self) -> &'a str { + self.string + } + #[inline] fn get_next_cat(&self, idx: usize) -> Option { use crate::tables::word as wd; @@ -609,42 +712,21 @@ impl<'a> UWordBounds<'a> { } } -#[inline] -pub fn new_word_bounds(s: &str) -> UWordBounds<'_> { - UWordBounds { - string: s, - cat: None, - catb: None, - } +pub struct AsciiWordBoundIter<'a> { + rest: &'a str, + offset: usize, } -#[inline] -pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> { - UWordBoundIndices { - start_offset: s.as_ptr() as usize, - iter: new_word_bounds(s), +impl<'a> AsciiWordBoundIter<'a> { + pub fn new(s: &'a str) -> Self { + AsciiWordBoundIter { rest: s, offset: 0 } } -} - -#[inline] -fn has_alphanumeric(s: &&str) -> bool { - s.chars().any(|c| c.is_alphanumeric()) -} -#[inline] -fn has_ascii_alphanumeric(s: &&str) -> bool { - s.bytes().any(|b| b.is_ascii_alphanumeric()) -} - -/// Fast-path for ASCII-only word segmentation, matching `unicode-segmentation` on pure ASCII: -/// • runs of ASCII spaces are grouped (`" "`) -/// • core-runs (letters, digits, underscore + infix) -/// • any other ASCII char emits as one token, except CR+LF emits as a single two-char token -pub fn new_ascii_word_bound_indices<'a>(s: &'a str) -> impl Iterator + 'a { #[inline(always)] fn is_core(b: u8) -> bool { b.is_ascii_alphanumeric() || b == b'_' } + #[inline(always)] fn is_infix(b: u8, prev: u8, next: u8) -> bool { match b { @@ -657,16 +739,17 @@ pub fn new_ascii_word_bound_indices<'a>(s: &'a str) -> impl Iterator false, } } +} - use core::iter::from_fn; - let mut rest = s; - let mut offset = 0; +impl<'a> Iterator for AsciiWordBoundIter<'a> { + type Item = (usize, &'a str); - from_fn(move || { - if rest.is_empty() { + fn next(&mut self) -> Option { + if self.rest.is_empty() { return None; } - let bytes = rest.as_bytes(); + + let bytes = self.rest.as_bytes(); let len = bytes.len(); // 1) Group runs of spaces @@ -675,69 +758,79 @@ pub fn new_ascii_word_bound_indices<'a>(s: &'a str) -> impl Iterator= 2 && bytes[1] == b'\n' { - let word = &rest[..2]; - let pos = offset; - rest = &rest[2..]; - offset += 2; + let word = &self.rest[..2]; + let pos = self.offset; + self.rest = &self.rest[2..]; + self.offset += 2; Some((pos, word)) } else { - // emit exactly one byte (whitespace/control/punct) - let word = &rest[..1]; - let pos = offset; - rest = &rest[1..]; - offset += 1; + let word = &self.rest[..1]; + let pos = self.offset; + self.rest = &self.rest[1..]; + self.offset += 1; Some((pos, word)) } - }) + } } -/// An iterator over the substrings of a string which, after splitting the string on -/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), -/// contain any characters with the -/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) -/// property, or with -/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). -/// -/// This method is accessed by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See -/// its documentation for more. -/// -/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words -/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html + #[inline] -pub(crate) fn new_unicode_words(s: &str) -> Box + '_> { - if s.is_ascii() { - Box::new(new_unicode_words_ascii(s)) - } else { - Box::new(new_unicode_words_general(s)) +pub fn new_word_bounds(s: &str) -> UWordBounds<'_> { + UWordBounds { + string: s, + cat: None, + catb: None, + } +} + +#[inline] +pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> { + UWordBoundIndices { + start_offset: s.as_ptr() as usize, + iter: new_word_bounds(s), } } +#[inline] +pub fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> { + AsciiWordBoundIter::new(s) +} + +#[inline] +fn has_alphanumeric(s: &&str) -> bool { + use crate::tables::util::is_alphanumeric; + + s.chars().any(is_alphanumeric) +} + #[inline] fn new_unicode_words_ascii<'a>(s: &'a str) -> impl Iterator + 'a { new_ascii_word_bound_indices(s) @@ -750,25 +843,25 @@ fn new_unicode_words_general<'a>(s: &'a str) -> impl Iterator + new_word_bounds(s).filter(has_alphanumeric) } -/// An iterator over the substrings of a string which, after splitting the string on -/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), -/// contain any characters with the -/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) -/// property, or with -/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). -/// This iterator also provides the byte offsets for each substring. -/// -/// This method is accessed by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See -/// its documentation for more. -/// -/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices -/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html #[inline] -pub fn new_unicode_word_indices<'a>(s: &'a str) -> Box + 'a> { - if s.is_ascii() { - Box::new(new_ascii_word_bound_indices(s).filter(|(_, w)| has_ascii_alphanumeric(w))) +pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { + let iter: Box> = if s.is_ascii() { + Box::new(new_unicode_words_ascii(s)) } else { - Box::new(new_word_bound_indices(s).filter(|(_, w)| has_alphanumeric(w))) + Box::new(new_unicode_words_general(s)) + }; + + UnicodeWords { inner: iter } +} + +#[inline] +pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> { + use super::UnicodeSegmentation; + + UnicodeWordIndices { + inner: s + .split_word_bound_indices() + .filter(|(_, c)| has_alphanumeric(c)), } } @@ -785,13 +878,6 @@ mod tests { #[test] fn test_syriac_abbr_mark() { - use crate::tables::word as wd; - let (_, _, cat) = wd::word_category('\u{70f}'); - assert_eq!(cat, wd::WC_ALetter); - } - - #[test] - fn test_end_of_ayah_cat() { use crate::tables::word as wd; let (_, _, cat) = wd::word_category('\u{6dd}'); assert_eq!(cat, wd::WC_Numeric); From 7beb8a60b73dbeecd42d16bb8f32cdfb0da2a9e3 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 15 Jul 2025 20:15:55 +0800 Subject: [PATCH 06/13] add backwards iterator --- src/lib.rs | 24 +++++------ src/word.rs | 120 +++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 113 insertions(+), 31 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 1dbdd73..d15ac0b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -138,7 +138,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uw1[..], b); /// ``` - fn unicode_words(&self) -> UnicodeWords; + fn unicode_words(&self) -> UnicodeWords<'_>; /// Returns an iterator over the words of `self`, separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their @@ -162,7 +162,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uwi1[..], b); /// ``` - fn unicode_word_indices(&self) -> UnicodeWordIndices; + fn unicode_word_indices(&self) -> UnicodeWordIndices<'_>; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). @@ -178,7 +178,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swu1[..], b); /// ``` - fn split_word_bounds(&self) -> UWordBounds; + fn split_word_bounds(&self) -> UWordBounds<'_>; /// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries, /// and their offsets. See `split_word_bounds()` for more information. @@ -193,7 +193,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swi1[..], b); /// ``` - fn split_word_bound_indices(&self) -> UWordBoundIndices; + fn split_word_bound_indices(&self) -> UWordBoundIndices<'_>; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). @@ -215,7 +215,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&us1[..], b); /// ``` - fn unicode_sentences(&self) -> impl Iterator; + fn unicode_sentences(&self) -> UnicodeSentences<'_>; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). @@ -253,7 +253,7 @@ pub trait UnicodeSegmentation { impl UnicodeSegmentation for str { #[inline] - fn graphemes(&self, is_extended: bool) -> Graphemes { + fn graphemes(&self, is_extended: bool) -> Graphemes<'_> { grapheme::new_graphemes(self, is_extended) } @@ -263,32 +263,32 @@ impl UnicodeSegmentation for str { } #[inline] - fn unicode_words(&self) -> UnicodeWords { + fn unicode_words(&self) -> UnicodeWords<'_> { word::new_unicode_words(self) } #[inline] - fn unicode_word_indices(&self) -> UnicodeWordIndices { + fn unicode_word_indices(&self) -> UnicodeWordIndices<'_> { word::new_unicode_word_indices(self) } #[inline] - fn split_word_bounds(&self) -> UWordBounds { + fn split_word_bounds(&self) -> UWordBounds<'_> { word::new_word_bounds(self) } #[inline] - fn split_word_bound_indices(&self) -> UWordBoundIndices { + fn split_word_bound_indices(&self) -> UWordBoundIndices<'_> { word::new_word_bound_indices(self) } #[inline] - fn unicode_sentences(&self) -> impl Iterator { + fn unicode_sentences(&self) -> UnicodeSentences<'_> { sentence::new_unicode_sentences(self) } #[inline] - fn split_sentence_bounds(&self) -> USentenceBounds { + fn split_sentence_bounds(&self) -> USentenceBounds<'_> { sentence::new_sentence_bounds(self) } diff --git a/src/word.rs b/src/word.rs index 1d1d69c..c835445 100644 --- a/src/word.rs +++ b/src/word.rs @@ -11,7 +11,6 @@ extern crate alloc; use alloc::boxed::Box; use core::cmp; -use core::iter::Filter; use crate::tables::word::WordCat; @@ -28,7 +27,7 @@ use crate::tables::word::WordCat; /// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html pub struct UnicodeWords<'a> { - inner: Box + 'a>, + inner: Box + 'a>, } impl<'a> Iterator for UnicodeWords<'a> { @@ -45,6 +44,13 @@ impl<'a> Iterator for UnicodeWords<'a> { } } +impl<'a> DoubleEndedIterator for UnicodeWords<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + self.inner.next_back() + } +} + /// An iterator over the substrings of a string which, after splitting the string on /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), /// contain any characters with the @@ -58,16 +64,15 @@ impl<'a> Iterator for UnicodeWords<'a> { /// /// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html -#[derive(Debug)] pub struct UnicodeWordIndices<'a> { #[allow(clippy::type_complexity)] - inner: Filter, fn(&(usize, &str)) -> bool>, + inner: Box + 'a>, } impl<'a> Iterator for UnicodeWordIndices<'a> { type Item = (usize, &'a str); - #[inline] + #[inline(always)] fn next(&mut self) -> Option<(usize, &'a str)> { self.inner.next() } @@ -722,12 +727,12 @@ impl<'a> AsciiWordBoundIter<'a> { AsciiWordBoundIter { rest: s, offset: 0 } } - #[inline(always)] + #[inline] fn is_core(b: u8) -> bool { b.is_ascii_alphanumeric() || b == b'_' } - #[inline(always)] + #[inline] fn is_infix(b: u8, prev: u8, next: u8) -> bool { match b { // numeric separators @@ -744,6 +749,7 @@ impl<'a> AsciiWordBoundIter<'a> { impl<'a> Iterator for AsciiWordBoundIter<'a> { type Item = (usize, &'a str); + #[inline] fn next(&mut self) -> Option { if self.rest.is_empty() { return None; @@ -802,6 +808,66 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> { } } +impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { + fn next_back(&mut self) -> Option<(usize, &'a str)> { + let rest = self.rest; + if rest.is_empty() { + return None; + } + let bytes = rest.as_bytes(); + let len = bytes.len(); + + // 1) Trailing spaces + if bytes[len - 1] == b' ' { + // find start of this last run of spaces + let mut start = len - 1; + while start > 0 && bytes[start - 1] == b' ' { + start -= 1; + } + let word = &rest[start..]; + let pos = self.offset + start; + self.rest = &rest[..start]; + return Some((pos, word)); + } + + // 2) Trailing core-run (letters/digits/underscore + infix) + if Self::is_core(bytes[len - 1]) { + // scan backwards as long as we see `is_core` or an `is_infix` + let mut start = len - 1; + while start > 0 { + let b = bytes[start - 1]; + let prev = if start >= 2 { bytes[start - 2] } else { b }; + let next = bytes[start]; // the byte we just included + if Self::is_core(b) || Self::is_infix(b, prev, next) { + start -= 1; + } else { + break; + } + } + let word = &rest[start..]; + let pos = self.offset + start; + self.rest = &rest[..start]; + return Some((pos, word)); + } + + // 3) CR+LF at end + if len >= 2 && bytes[len - 2] == b'\r' && bytes[len - 1] == b'\n' { + let start = len - 2; + let word = &rest[start..]; + let pos = self.offset + start; + self.rest = &rest[..start]; + return Some((pos, word)); + } + + // 4) Single non-core byte + let start = len - 1; + let word = &rest[start..]; + let pos = self.offset + start; + self.rest = &rest[..start]; + Some((pos, word)) + } +} + #[inline] pub fn new_word_bounds(s: &str) -> UWordBounds<'_> { UWordBounds { @@ -832,20 +898,25 @@ fn has_alphanumeric(s: &&str) -> bool { } #[inline] -fn new_unicode_words_ascii<'a>(s: &'a str) -> impl Iterator + 'a { +fn has_ascii_alphanumeric(s: &&str) -> bool { + s.chars().any(|c| c.is_ascii_alphanumeric()) +} + +#[inline] +fn new_unicode_words_ascii<'a>(s: &'a str) -> impl DoubleEndedIterator + 'a { new_ascii_word_bound_indices(s) .map(|(_, w)| w) - .filter(|w| w.chars().any(|c| c.is_ascii_alphanumeric())) + .filter(has_ascii_alphanumeric) } #[inline] -fn new_unicode_words_general<'a>(s: &'a str) -> impl Iterator + 'a { +fn new_unicode_words_general<'a>(s: &'a str) -> impl DoubleEndedIterator + 'a { new_word_bounds(s).filter(has_alphanumeric) } #[inline] pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { - let iter: Box> = if s.is_ascii() { + let iter: Box> = if s.is_ascii() { Box::new(new_unicode_words_ascii(s)) } else { Box::new(new_unicode_words_general(s)) @@ -855,14 +926,13 @@ pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { } #[inline] -pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> { - use super::UnicodeSegmentation; - - UnicodeWordIndices { - inner: s - .split_word_bound_indices() - .filter(|(_, c)| has_alphanumeric(c)), - } +pub fn new_unicode_word_indices<'a>(s: &'a str) -> UnicodeWordIndices<'a> { + let iter: Box> = if s.is_ascii() { + Box::new(new_ascii_word_bound_indices(s).filter(|(_, w)| has_ascii_alphanumeric(w))) + } else { + Box::new(new_word_bound_indices(s).filter(|(_, w)| has_alphanumeric(w))) + }; + UnicodeWordIndices { inner: iter } } #[cfg(test)] @@ -921,5 +991,17 @@ mod tests { prop_assert_eq!(fast, uni); } + + /// Fast path must equal general path for any ASCII input, forwards and backwards. + #[test] + fn proptest_ascii_matches_unicode_word_indices_rev( + // Vec → String, length 0‒99 + s in proptest::collection::vec(ascii_char(), 0..100) + .prop_map(|v| v.into_iter().collect::()) + ) { + let fast_rev: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).rev().collect(); + let uni_rev : Vec<(usize, &str)> = new_word_bound_indices(&s).rev().collect(); + prop_assert_eq!(fast_rev, uni_rev); + } } } From a3881da71bb3b4747713d149ef61444b91a43faf Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 15 Jul 2025 20:40:35 +0800 Subject: [PATCH 07/13] restore test --- src/word.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/word.rs b/src/word.rs index c835445..b6e042b 100644 --- a/src/word.rs +++ b/src/word.rs @@ -948,6 +948,13 @@ mod tests { #[test] fn test_syriac_abbr_mark() { + use crate::tables::word as wd; + let (_, _, cat) = wd::word_category('\u{70f}'); + assert_eq!(cat, wd::WC_ALetter); + } + + #[test] + fn test_end_of_ayah_cat() { use crate::tables::word as wd; let (_, _, cat) = wd::word_category('\u{6dd}'); assert_eq!(cat, wd::WC_Numeric); From 7599d624aa21b1bdd8bda7b1d12dfa37a2a690ef Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Wed, 16 Jul 2025 20:15:20 +0800 Subject: [PATCH 08/13] replace Box with Enum --- src/word.rs | 135 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 91 insertions(+), 44 deletions(-) diff --git a/src/word.rs b/src/word.rs index b6e042b..f4bd9e1 100644 --- a/src/word.rs +++ b/src/word.rs @@ -9,7 +9,6 @@ // except according to those terms. extern crate alloc; -use alloc::boxed::Box; use core::cmp; use crate::tables::word::WordCat; @@ -27,27 +26,33 @@ use crate::tables::word::WordCat; /// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html pub struct UnicodeWords<'a> { - inner: Box + 'a>, + inner: WordsIter<'a>, } impl<'a> Iterator for UnicodeWords<'a> { type Item = &'a str; - #[inline] - fn next(&mut self) -> Option<&'a str> { - self.inner.next() + fn next(&mut self) -> Option { + match &mut self.inner { + WordsIter::Ascii(i) => i.next(), + WordsIter::Unicode(i) => i.next(), + } } - #[inline] fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() + match &self.inner { + WordsIter::Ascii(i) => i.size_hint(), + WordsIter::Unicode(i) => i.size_hint(), + } } } - impl<'a> DoubleEndedIterator for UnicodeWords<'a> { #[inline] - fn next_back(&mut self) -> Option<&'a str> { - self.inner.next_back() + fn next_back(&mut self) -> Option { + match &mut self.inner { + WordsIter::Ascii(i) => i.next_back(), + WordsIter::Unicode(i) => i.next_back(), + } } } @@ -65,27 +70,33 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> { /// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html pub struct UnicodeWordIndices<'a> { - #[allow(clippy::type_complexity)] - inner: Box + 'a>, + inner: IndicesIter<'a>, } impl<'a> Iterator for UnicodeWordIndices<'a> { type Item = (usize, &'a str); - - #[inline(always)] - fn next(&mut self) -> Option<(usize, &'a str)> { - self.inner.next() + #[inline] + fn next(&mut self) -> Option { + match &mut self.inner { + IndicesIter::Ascii(i) => i.next(), + IndicesIter::Unicode(i) => i.next(), + } } - #[inline] fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() + match &self.inner { + IndicesIter::Ascii(i) => i.size_hint(), + IndicesIter::Unicode(i) => i.size_hint(), + } } } impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> { #[inline] - fn next_back(&mut self) -> Option<(usize, &'a str)> { - self.inner.next_back() + fn next_back(&mut self) -> Option { + match &mut self.inner { + IndicesIter::Ascii(i) => i.next_back(), + IndicesIter::Unicode(i) => i.next_back(), + } } } @@ -868,6 +879,58 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { } } +#[inline] +fn ascii_word_ok(t: &(usize, &str)) -> bool { + has_ascii_alphanumeric(&t.1) +} +#[inline] +fn unicode_word_ok(t: &(usize, &str)) -> bool { + has_alphanumeric(&t.1) +} + +type AsciiWordsIter<'a> = core::iter::Filter< + core::iter::Map, fn((usize, &'a str)) -> &'a str>, + fn(&&'a str) -> bool, +>; + +type UnicodeWordsIter<'a> = core::iter::Filter, fn(&&'a str) -> bool>; + +type AsciiIndicesIter<'a> = + core::iter::Filter, fn(&(usize, &'a str)) -> bool>; + +type UnicodeIndicesIter<'a> = + core::iter::Filter, fn(&(usize, &'a str)) -> bool>; + +enum WordsIter<'a> { + Ascii(AsciiWordsIter<'a>), + Unicode(UnicodeWordsIter<'a>), +} + +enum IndicesIter<'a> { + Ascii(AsciiIndicesIter<'a>), + Unicode(UnicodeIndicesIter<'a>), +} + +#[inline] +pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { + let inner = if s.is_ascii() { + WordsIter::Ascii(new_unicode_words_ascii(s)) + } else { + WordsIter::Unicode(new_unicode_words_general(s)) + }; + UnicodeWords { inner } +} + +#[inline] +pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> { + let inner = if s.is_ascii() { + IndicesIter::Ascii(new_ascii_word_bound_indices(s).filter(ascii_word_ok)) + } else { + IndicesIter::Unicode(new_word_bound_indices(s).filter(unicode_word_ok)) + }; + UnicodeWordIndices { inner } +} + #[inline] pub fn new_word_bounds(s: &str) -> UWordBounds<'_> { UWordBounds { @@ -902,39 +965,23 @@ fn has_ascii_alphanumeric(s: &&str) -> bool { s.chars().any(|c| c.is_ascii_alphanumeric()) } +#[inline(always)] +fn strip_pos((_, w): (usize, &str)) -> &str { + w +} + #[inline] -fn new_unicode_words_ascii<'a>(s: &'a str) -> impl DoubleEndedIterator + 'a { +fn new_unicode_words_ascii<'a>(s: &'a str) -> AsciiWordsIter<'a> { new_ascii_word_bound_indices(s) - .map(|(_, w)| w) + .map(strip_pos as fn(_) -> _) .filter(has_ascii_alphanumeric) } #[inline] -fn new_unicode_words_general<'a>(s: &'a str) -> impl DoubleEndedIterator + 'a { +fn new_unicode_words_general<'a>(s: &'a str) -> UnicodeWordsIter<'a> { new_word_bounds(s).filter(has_alphanumeric) } -#[inline] -pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { - let iter: Box> = if s.is_ascii() { - Box::new(new_unicode_words_ascii(s)) - } else { - Box::new(new_unicode_words_general(s)) - }; - - UnicodeWords { inner: iter } -} - -#[inline] -pub fn new_unicode_word_indices<'a>(s: &'a str) -> UnicodeWordIndices<'a> { - let iter: Box> = if s.is_ascii() { - Box::new(new_ascii_word_bound_indices(s).filter(|(_, w)| has_ascii_alphanumeric(w))) - } else { - Box::new(new_word_bound_indices(s).filter(|(_, w)| has_alphanumeric(w))) - }; - UnicodeWordIndices { inner: iter } -} - #[cfg(test)] mod tests { use crate::word::{ From e29c432017fbb58b05b8703eba0766e9caa5b914 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 17 Jul 2025 13:04:37 +0800 Subject: [PATCH 09/13] add comments with reference to the spec --- src/word.rs | 61 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/src/word.rs b/src/word.rs index f4bd9e1..b2d1c37 100644 --- a/src/word.rs +++ b/src/word.rs @@ -728,6 +728,31 @@ impl<'a> UWordBounds<'a> { } } +/// ASCII‑fast‑path word‑boundary iterator for strings that contain only ASCII characters. +/// +/// Since we handle only ASCII characters, we can use a much simpler set of +/// word break values than the full Unicode algorithm. +/// https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values +/// +/// | Word_Break value | ASCII code points that belong to it | +/// | -----------------| --------------------------------------------------------------- | +/// | CR | U+000D (CR) | +/// | LF | U+000A (LF) | +/// | Newline | U+000B (VT), U+000C (FF) | +/// | Single_Quote | U+0027 (') | +/// | Double_Quote | U+0022 (") | +/// | MidNumLet | U+002E (.) FULL STOP | +/// | MidLetter | U+003A (:) COLON | +/// | MidNum | U+002C (,), U+003B (;) | +/// | Numeric | U+0030 – U+0039 (0 … 9) | +/// | ALetter | U+0041 – U+005A (A … Z), U+0061 – U+007A (a … z) | +/// | ExtendNumLet | U+005F (_) underscore | +/// | WSegSpace | U+0020 (SPACE) | +/// +/// The macro MidNumLetQ boils down to: U+002E (.) FULL STOP and U+0027 (') +/// AHLetter is the same as ALetter, so we don't need to distinguish it. +/// +/// Any other single ASCII byte is its own boundary (the default WB999). pub struct AsciiWordBoundIter<'a> { rest: &'a str, offset: usize, @@ -746,12 +771,17 @@ impl<'a> AsciiWordBoundIter<'a> { #[inline] fn is_infix(b: u8, prev: u8, next: u8) -> bool { match b { - // numeric separators + // Numeric separators such as "1,000" or "3.14" (WB11/WB12) + // + // "Numeric (MidNum | MidNumLetQ) Numeric" b'.' | b',' | b';' | b'\'' if prev.is_ascii_digit() && next.is_ascii_digit() => true, - // apostrophe in contractions - b'\'' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true, - // dot/colon inside letters - b'.' | b':' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true, + + // Dot or colon inside an alphabetic word ("e.g.", "http://") (WB6/WB7) + // + // "(MidLetter | MidNumLetQ) AHLetter (MidLetter | MidNumLetQ)" + // MidLetter = b':' + // MidNumLetQ = b'.' | b'\'' + b'\'' | b'.' | b':' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true, _ => false, } } @@ -769,7 +799,8 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> { let bytes = self.rest.as_bytes(); let len = bytes.len(); - // 1) Group runs of spaces + // 1) Keep horizontal whitespace together. + // Spec: WB3d joins adjacent *WSegSpace* into a single segment. if bytes[0] == b' ' { let mut i = 1; while i < len && bytes[i] == b' ' { @@ -783,6 +814,7 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> { } // 2) Core-run (letters/digits/underscore + infix) + // Spec: ALetter × ALetter, Numeric × Numeric etc. (WB5–WB13b) if Self::is_core(bytes[0]) { let mut i = 1; while i < len { @@ -802,7 +834,8 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> { return Some((pos, word)); } - // 3) Non-core: CR+LF as one token, otherwise single char + // 3) Do not break within CRLF. + // Spec: WB3 treats CR+LF as a single non‑breaking pair. if bytes[0] == b'\r' && len >= 2 && bytes[1] == b'\n' { let word = &self.rest[..2]; let pos = self.offset; @@ -810,6 +843,8 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> { self.offset += 2; Some((pos, word)) } else { + // 4) Otherwise, break everywhere + // Spec: the catch‑all rule WB999. let word = &self.rest[..1]; let pos = self.offset; self.rest = &self.rest[1..]; @@ -828,7 +863,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { let bytes = rest.as_bytes(); let len = bytes.len(); - // 1) Trailing spaces + // 1) Group runs of spaces + // Spec: WB3d joins adjacent *WSegSpace* into a single segment. if bytes[len - 1] == b' ' { // find start of this last run of spaces let mut start = len - 1; @@ -841,7 +877,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { return Some((pos, word)); } - // 2) Trailing core-run (letters/digits/underscore + infix) + // 2) Trailing Core-run (letters/digits/underscore + infix) + // Spec: ALetter × ALetter, Numeric × Numeric etc. (WB5–WB13b) if Self::is_core(bytes[len - 1]) { // scan backwards as long as we see `is_core` or an `is_infix` let mut start = len - 1; @@ -861,7 +898,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { return Some((pos, word)); } - // 3) CR+LF at end + // 3) Non-core: CR+LF as one token, otherwise single char + // Spec: WB3 treats CR+LF as a single non‑breaking pair. if len >= 2 && bytes[len - 2] == b'\r' && bytes[len - 1] == b'\n' { let start = len - 2; let word = &rest[start..]; @@ -870,7 +908,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { return Some((pos, word)); } - // 4) Single non-core byte + // 4) Fallback – every other byte is its own segment + // Spec: the catch‑all rule WB999. let start = len - 1; let word = &rest[start..]; let pos = self.offset + start; From 5a09f28848d33e3a960f68637ff2743cd085a3dd Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 17 Jul 2025 13:18:59 +0800 Subject: [PATCH 10/13] remove unused alloc --- src/word.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/word.rs b/src/word.rs index b2d1c37..aa6cbcd 100644 --- a/src/word.rs +++ b/src/word.rs @@ -8,7 +8,6 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -extern crate alloc; use core::cmp; use crate::tables::word::WordCat; From f76a997b57d19b829a9427e7f4d14f810dbfc3f8 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 17 Jul 2025 13:21:09 +0800 Subject: [PATCH 11/13] readd Debug derive --- src/word.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/word.rs b/src/word.rs index aa6cbcd..4077e9f 100644 --- a/src/word.rs +++ b/src/word.rs @@ -24,6 +24,7 @@ use crate::tables::word::WordCat; /// /// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +#[derive(Debug)] pub struct UnicodeWords<'a> { inner: WordsIter<'a>, } @@ -68,6 +69,7 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> { /// /// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +#[derive(Debug)] pub struct UnicodeWordIndices<'a> { inner: IndicesIter<'a>, } @@ -752,6 +754,7 @@ impl<'a> UWordBounds<'a> { /// AHLetter is the same as ALetter, so we don't need to distinguish it. /// /// Any other single ASCII byte is its own boundary (the default WB999). +#[derive(Debug)] pub struct AsciiWordBoundIter<'a> { rest: &'a str, offset: usize, @@ -939,11 +942,13 @@ type AsciiIndicesIter<'a> = type UnicodeIndicesIter<'a> = core::iter::Filter, fn(&(usize, &'a str)) -> bool>; +#[derive(Debug)] enum WordsIter<'a> { Ascii(AsciiWordsIter<'a>), Unicode(UnicodeWordsIter<'a>), } +#[derive(Debug)] enum IndicesIter<'a> { Ascii(AsciiIndicesIter<'a>), Unicode(UnicodeIndicesIter<'a>), From b556333ca894f4a547d5b6fb1dca0ef9991ec973 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 17 Jul 2025 13:22:29 +0800 Subject: [PATCH 12/13] use import --- src/word.rs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/word.rs b/src/word.rs index 4077e9f..fdd128b 100644 --- a/src/word.rs +++ b/src/word.rs @@ -9,6 +9,7 @@ // except according to those terms. use core::cmp; +use core::iter::Filter; use crate::tables::word::WordCat; @@ -929,18 +930,13 @@ fn unicode_word_ok(t: &(usize, &str)) -> bool { has_alphanumeric(&t.1) } -type AsciiWordsIter<'a> = core::iter::Filter< +type AsciiWordsIter<'a> = Filter< core::iter::Map, fn((usize, &'a str)) -> &'a str>, fn(&&'a str) -> bool, >; - -type UnicodeWordsIter<'a> = core::iter::Filter, fn(&&'a str) -> bool>; - -type AsciiIndicesIter<'a> = - core::iter::Filter, fn(&(usize, &'a str)) -> bool>; - -type UnicodeIndicesIter<'a> = - core::iter::Filter, fn(&(usize, &'a str)) -> bool>; +type UnicodeWordsIter<'a> = Filter, fn(&&'a str) -> bool>; +type AsciiIndicesIter<'a> = Filter, fn(&(usize, &'a str)) -> bool>; +type UnicodeIndicesIter<'a> = Filter, fn(&(usize, &'a str)) -> bool>; #[derive(Debug)] enum WordsIter<'a> { From 0e7674a40541b4baadfe59c65a28de68dcb8db40 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 18 Jul 2025 16:08:46 +0800 Subject: [PATCH 13/13] remove pub --- src/word.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/word.rs b/src/word.rs index fdd128b..1a46b39 100644 --- a/src/word.rs +++ b/src/word.rs @@ -756,7 +756,7 @@ impl<'a> UWordBounds<'a> { /// /// Any other single ASCII byte is its own boundary (the default WB999). #[derive(Debug)] -pub struct AsciiWordBoundIter<'a> { +struct AsciiWordBoundIter<'a> { rest: &'a str, offset: usize, } @@ -988,7 +988,7 @@ pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> { } #[inline] -pub fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> { +fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> { AsciiWordBoundIter::new(s) } @@ -1046,6 +1046,20 @@ mod tests { assert_eq!(cat, wd::WC_Numeric); } + #[test] + fn test_ascii_word_bound_indices_various_cases() { + let s = "Hello, world!"; + let words: Vec<(usize, &str)> = new_ascii_word_bound_indices(s).collect(); + let expected = vec![ + (0, "Hello"), // simple letters + (5, ","), + (6, " "), // space after comma + (7, "world"), // skip comma+space, stop at '!' + (12, "!"), // punctuation at the end + ]; + assert_eq!(words, expected); + } + #[test] fn test_ascii_word_indices_various_cases() { let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com 127.0.0.1:9090";