From 7eb95949761f69edb1ebba6ed3c0dc47e1e02097 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Thu, 22 May 2025 17:18:06 +0200 Subject: [PATCH 1/7] Keep track of maximum look-behind length --- regex-automata/src/nfa/thompson/builder.rs | 22 ++++++++++++++++++--- regex-automata/src/nfa/thompson/compiler.rs | 5 ++++- regex-automata/src/nfa/thompson/nfa.rs | 17 ++++++++++++++++ 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index 4f2f9af79..4abb6d8cd 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -342,6 +342,9 @@ pub struct Builder { start_pattern: Vec, /// The starting states for each individual look-behind sub-expression. start_look_behind: Vec, + /// The length (in bytes) of the longest string matched by any + /// look-behind sub-expression. If `None`, the length is unbounded. + maximum_look_behind_len: Option, /// A map from pattern ID to capture group index to name. (If no name /// exists, then a None entry is present. Thus, all capturing groups are /// present in this mapping.) @@ -374,7 +377,7 @@ pub struct Builder { impl Builder { /// Create a new builder for hand-assembling NFAs. pub fn new() -> Builder { - Builder::default() + Builder { maximum_look_behind_len: Some(0), ..Builder::default() } } /// Clear this builder. @@ -453,6 +456,7 @@ impl Builder { nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern); nfa.set_look_behind_starts(self.start_look_behind.as_slice()); + nfa.set_maximum_look_behind_len(self.maximum_look_behind_len); nfa.set_captures(&self.captures).map_err(BuildError::captures)?; // The idea here is to convert our intermediate states to their final // form. The only real complexity here is the process of converting @@ -711,9 +715,21 @@ impl Builder { } /// Adds the `start_id` to the set of starting states that is used when - /// running look-behind expressions. 
- pub fn start_look_behind(&mut self, start_id: StateID) { + /// running look-behind expressions. Additionally registers the maximum + /// length (in bytes) that the sub-expression of the look-behind can match. + pub fn start_look_behind( + &mut self, + start_id: StateID, + maximum_len: Option, + ) { self.start_look_behind.push(start_id); + + self.maximum_look_behind_len = + match (self.maximum_look_behind_len, maximum_len) { + (Some(l1), Some(l2)) => Some(usize::max(l1, l2)), + // A None subsumes the entire result. + (None, _) | (_, None) => None, + }; } /// Add an "empty" NFA state. diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index c79a0556a..a67bf2d8e 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1039,7 +1039,10 @@ impl Compiler { let unanchored = self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; - self.builder.borrow_mut().start_look_behind(unanchored.start); + let maximum_len = lookaround.sub().properties().maximum_len(); + self.builder + .borrow_mut() + .start_look_behind(unanchored.start, maximum_len); let sub = self.c(lookaround.sub())?; let write = self.add_write_lookaround(idx)?; diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 34458766e..9cfe5b460 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1112,6 +1112,13 @@ impl NFA { &self.0.start_look_behind } + /// Returns the length (in bytes) of the longest string matched by any + /// look-behind sub-expression. If `None`, the length is unbounded. + #[inline] + pub fn maximum_look_behind_len(&self) -> Option { + self.0.maximum_look_behind_len + } + // FIXME: The `look_set_prefix_all` computation was not correct, and it // seemed a little tricky to fix it. 
Since I wasn't actually using it for // anything, I just decided to remove it in the run up to the regex 1.9 @@ -1279,6 +1286,9 @@ pub(super) struct Inner { lookaround_count: usize, /// Contains the start state for each of the look-behind subexpressions. start_look_behind: Vec, + /// The length (in bytes) of the longest string matched by any + /// look-behind sub-expression. If `None`, the length is unbounded. + maximum_look_behind_len: Option, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state /// might use a different amount of heap, we need to keep track of this @@ -1435,6 +1445,13 @@ impl Inner { self.start_look_behind = look_behind_starts.to_vec(); } + pub(super) fn set_maximum_look_behind_len( + &mut self, + maximum_look_behind_len: Option, + ) { + self.maximum_look_behind_len = maximum_look_behind_len; + } + /// Sets the UTF-8 mode of this NFA. pub(super) fn set_utf8(&mut self, yes: bool) { self.utf8 = yes; From c9dc1ddc3888542af18e182a543ce4d14c21dfb2 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Thu, 22 May 2025 17:27:00 +0200 Subject: [PATCH 2/7] Start the look-behind threads later --- regex-automata/src/nfa/thompson/pikevm.rs | 53 +++++++++++++++++++++-- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index b18101c53..1b24ca817 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1258,6 +1258,8 @@ impl PikeVM { Some(config) => config, }; + let maximum_look_behind_len = self.nfa.maximum_look_behind_len(); + let pre = if anchored { None } else { self.get_config().get_prefilter() }; let Cache { @@ -1274,7 +1276,14 @@ impl PikeVM { if let Some(active) = match_lookaround { *curr_lookaround = active.clone(); } else if self.lookaround_count() > 0 { - // This initializes the look-behind threads from the start of the 
input + // If we know the maximum look-behind length, we do not need to + // start from 0. + let start_position = usize::saturating_sub( + input.start(), + maximum_look_behind_len.unwrap_or(input.start()), + ); + + // This initializes the look-behind threads from the `start_position` // Note: since capture groups are not allowed inside look-behinds, // there won't be any Capture epsilon transitions and hence it is ok to // use &mut [] for the slots parameter. We need to add the start states @@ -1289,14 +1298,14 @@ impl PikeVM { curr_lookaround, lookaround, input, - 0, + start_position, *look_behind_start, ); } // This is necessary for look-behinds to be able to match outside of the // input span. self.fast_forward_lookbehinds( - Span { start: 0, end: input.start() }, + Span { start: start_position, end: input.start() }, input, stack, curr_lookaround, @@ -1346,10 +1355,46 @@ impl PikeVM { None => break, Some(ref span) => { if self.lookaround_count() > 0 { + // If we know the maximum look-behind length, + // we might be able to catch up the look-behind + // threads later than starting at `at`. + let start_position = usize::max( + at, + usize::saturating_sub( + span.start, + maximum_look_behind_len + .unwrap_or(span.start), + ), + ); + // If we resume from later than `at`, we need + // to reinitialize the look-behind threads. + if start_position != at { + curr_lookaround.set.clear(); + for look_behind_start in self + .nfa + .look_behind_starts() + .iter() + .rev() + { + self.epsilon_closure( + stack, + &mut [], + curr_lookaround, + lookaround, + input, + start_position, + *look_behind_start, + ); + } + } + // We are jumping ahead due to the pre-filter, thus we must bring // the look-behind threads to the new position. 
self.fast_forward_lookbehinds( - Span { start: at, end: span.start }, + Span { + start: start_position, + end: span.start, + }, input, stack, curr_lookaround, From 2d3b61258894d340e364b381890d499a1f5dc97a Mon Sep 17 00:00:00 2001 From: shilangyu Date: Mon, 2 Jun 2025 23:31:57 +0200 Subject: [PATCH 3/7] Fix conservative computation of look-behind start offsets --- regex-automata/src/nfa/thompson/builder.rs | 37 +++++++++++++-------- regex-automata/src/nfa/thompson/compiler.rs | 30 +++++++++++++++-- regex-automata/src/nfa/thompson/nfa.rs | 18 +++++----- regex-automata/src/nfa/thompson/pikevm.rs | 7 ++-- 4 files changed, 65 insertions(+), 27 deletions(-) diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index 4abb6d8cd..29b592426 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -342,9 +342,10 @@ pub struct Builder { start_pattern: Vec, /// The starting states for each individual look-behind sub-expression. start_look_behind: Vec, - /// The length (in bytes) of the longest string matched by any - /// look-behind sub-expression. If `None`, the length is unbounded. - maximum_look_behind_len: Option, + /// Among all look-behinds, this is the furthest offset (in bytes) from + /// the beginning of the main regex that a look-behind starts at. + /// If `None`, the offset is unbounded. + maximum_lookbehind_offset_from_start: Option, /// A map from pattern ID to capture group index to name. (If no name /// exists, then a None entry is present. Thus, all capturing groups are /// present in this mapping.) @@ -377,7 +378,10 @@ pub struct Builder { impl Builder { /// Create a new builder for hand-assembling NFAs. pub fn new() -> Builder { - Builder { maximum_look_behind_len: Some(0), ..Builder::default() } + Builder { + maximum_lookbehind_offset_from_start: Some(0), + ..Builder::default() + } } /// Clear this builder. 
@@ -456,7 +460,9 @@ impl Builder { nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern); nfa.set_look_behind_starts(self.start_look_behind.as_slice()); - nfa.set_maximum_look_behind_len(self.maximum_look_behind_len); + nfa.set_maximum_lookbehind_offset_from_start( + self.maximum_lookbehind_offset_from_start, + ); nfa.set_captures(&self.captures).map_err(BuildError::captures)?; // The idea here is to convert our intermediate states to their final // form. The only real complexity here is the process of converting @@ -715,21 +721,24 @@ impl Builder { } /// Adds the `start_id` to the set of starting states that is used when - /// running look-behind expressions. Additionally registers the maximum - /// length (in bytes) that the sub-expression of the look-behind can match. + /// running look-behind expressions. Additionally registers the furthest + /// offset (in bytes) from the start of the main regex this look-behind + /// starts. pub fn start_look_behind( &mut self, start_id: StateID, - maximum_len: Option, + offset_from_start: Option, ) { self.start_look_behind.push(start_id); - self.maximum_look_behind_len = - match (self.maximum_look_behind_len, maximum_len) { - (Some(l1), Some(l2)) => Some(usize::max(l1, l2)), - // A None subsumes the entire result. - (None, _) | (_, None) => None, - }; + self.maximum_lookbehind_offset_from_start = match ( + self.maximum_lookbehind_offset_from_start, + offset_from_start, + ) { + (Some(l1), Some(l2)) => Some(usize::max(l1, l2)), + // A None subsumes the entire result. + (None, _) | (_, None) => None, + }; } /// Add an "empty" NFA state. diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index a67bf2d8e..3bd5ebe34 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -713,6 +713,11 @@ pub struct Compiler { utf8_suffix: RefCell, /// The next index to use for a look-around expression. 
lookaround_index: RefCell, + /// How far from the beginning (in bytes) of the main regex does the + /// current look-behind start at. This is updated when relativizing to + /// the current look-behind expression. When `None`, the distance can be + /// seen as infinity. + current_lookbehind_offset_from_start: RefCell>, } impl Compiler { @@ -726,6 +731,7 @@ impl Compiler { trie_state: RefCell::new(RangeTrie::new()), utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), lookaround_index: RefCell::new(SmallIndex::ZERO), + current_lookbehind_offset_from_start: RefCell::new(Some(0)), } } @@ -1022,10 +1028,13 @@ impl Compiler { } } + /// Compile a look-around expression as its own sub-automaton. Its starting + /// state is saved. fn c_lookaround( &self, lookaround: &LookAround, ) -> Result { + // Assign a unique index for this look-around. let idx = *self.lookaround_index.borrow(); *self.lookaround_index.borrow_mut() = SmallIndex::new(idx.one_more()) .map_err(|e| { @@ -1037,14 +1046,31 @@ impl Compiler { }; let check = self.add_check_lookaround(idx, pos)?; + // Compute the furthest offset from the start of the main regex + // where this look-around can begin at. We offset the current start + // offset by the maximal match length of the subexpression. + let maximum_len = lookaround.sub().properties().maximum_len(); + let relative_start = + *self.current_lookbehind_offset_from_start.borrow(); + let start_offset = match (relative_start, maximum_len) { + (Some(s), Some(l)) => Some(s + l), + (None, _) | (_, None) => None, + }; + let unanchored = self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; - let maximum_len = lookaround.sub().properties().maximum_len(); self.builder .borrow_mut() - .start_look_behind(unanchored.start, maximum_len); + .start_look_behind(unanchored.start, start_offset); + // When compiling the subexpression we temporarily change the starting + // offset and restore it after. This way, the subexpression is relativized + // to our current offset. 
+ *self.current_lookbehind_offset_from_start.borrow_mut() = start_offset; let sub = self.c(lookaround.sub())?; + *self.current_lookbehind_offset_from_start.borrow_mut() = + relative_start; + let write = self.add_write_lookaround(idx)?; self.patch(unanchored.end, sub.start)?; self.patch(sub.end, write)?; diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 9cfe5b460..20b667aac 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1115,8 +1115,8 @@ impl NFA { /// Returns the length (in bytes) of the longest string matched by any /// look-behind sub-expression. If `None`, the length is unbounded. #[inline] - pub fn maximum_look_behind_len(&self) -> Option { - self.0.maximum_look_behind_len + pub fn maximum_lookbehind_offset_from_start(&self) -> Option { + self.0.maximum_lookbehind_offset_from_start } // FIXME: The `look_set_prefix_all` computation was not correct, and it @@ -1286,9 +1286,10 @@ pub(super) struct Inner { lookaround_count: usize, /// Contains the start state for each of the look-behind subexpressions. start_look_behind: Vec, - /// The length (in bytes) of the longest string matched by any - /// look-behind sub-expression. If `None`, the length is unbounded. - maximum_look_behind_len: Option, + /// Among all look-behinds, this is the furthest offset (in bytes) from + /// the beginning of the main regex that a look-behind starts at. + /// If `None`, the offset is unbounded. + maximum_lookbehind_offset_from_start: Option, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). 
Since each state /// might use a different amount of heap, we need to keep track of this @@ -1445,11 +1446,12 @@ impl Inner { self.start_look_behind = look_behind_starts.to_vec(); } - pub(super) fn set_maximum_look_behind_len( + pub(super) fn set_maximum_lookbehind_offset_from_start( &mut self, - maximum_look_behind_len: Option, + maximum_lookbehind_offset_from_start: Option, ) { - self.maximum_look_behind_len = maximum_look_behind_len; + self.maximum_lookbehind_offset_from_start = + maximum_lookbehind_offset_from_start; } /// Sets the UTF-8 mode of this NFA. diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 1b24ca817..0a568629d 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1258,7 +1258,8 @@ impl PikeVM { Some(config) => config, }; - let maximum_look_behind_len = self.nfa.maximum_look_behind_len(); + let maximum_lookbehind_offset_from_start = + self.nfa.maximum_lookbehind_offset_from_start(); let pre = if anchored { None } else { self.get_config().get_prefilter() }; @@ -1280,7 +1281,7 @@ impl PikeVM { // start from 0. 
let start_position = usize::saturating_sub( input.start(), - maximum_look_behind_len.unwrap_or(input.start()), + maximum_lookbehind_offset_from_start.unwrap_or(input.start()), ); // This initializes the look-behind threads from the `start_position` @@ -1362,7 +1363,7 @@ impl PikeVM { at, usize::saturating_sub( span.start, - maximum_look_behind_len + maximum_lookbehind_offset_from_start .unwrap_or(span.start), ), ); From bac64f431ed611bcdec4de3f377c3c64bd26377e Mon Sep 17 00:00:00 2001 From: shilangyu Date: Tue, 3 Jun 2025 20:08:03 +0200 Subject: [PATCH 4/7] Generalize the NFA compiler info and specialize the PikeVM --- regex-automata/src/nfa/thompson/builder.rs | 42 ++++-------- regex-automata/src/nfa/thompson/compiler.rs | 2 +- regex-automata/src/nfa/thompson/nfa.rs | 76 +++++++++++++-------- regex-automata/src/nfa/thompson/pikevm.rs | 53 +++++++++----- 4 files changed, 97 insertions(+), 76 deletions(-) diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index 29b592426..a7fb46d23 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -5,7 +5,7 @@ use alloc::{sync::Arc, vec, vec::Vec}; use crate::{ nfa::thompson::{ error::BuildError, - nfa::{self, SparseTransitions, Transition, NFA}, + nfa::{self, LookBehindInfo, SparseTransitions, Transition, NFA}, }, util::{ look::{Look, LookMatcher}, @@ -340,12 +340,11 @@ pub struct Builder { /// contains a single regex, then `start_pattern[0]` and `start_anchored` /// are always equivalent. start_pattern: Vec, - /// The starting states for each individual look-behind sub-expression. - start_look_behind: Vec, - /// Among all look-behinds, this is the furthest offset (in bytes) from - /// the beginning of the main regex that a look-behind starts at. - /// If `None`, the offset is unbounded. - maximum_lookbehind_offset_from_start: Option, + /// A vector of meta-data information about each look-behind in this NFA. 
+ /// + /// Must be stored in a depth-first pre-order with regards to the nesting + /// of look-behinds. + lookbehinds: Vec, /// A map from pattern ID to capture group index to name. (If no name /// exists, then a None entry is present. Thus, all capturing groups are /// present in this mapping.) @@ -378,10 +377,7 @@ pub struct Builder { impl Builder { /// Create a new builder for hand-assembling NFAs. pub fn new() -> Builder { - Builder { - maximum_lookbehind_offset_from_start: Some(0), - ..Builder::default() - } + Builder::default() } /// Clear this builder. @@ -394,7 +390,7 @@ impl Builder { self.pattern_id = None; self.states.clear(); self.start_pattern.clear(); - self.start_look_behind.clear(); + self.lookbehinds.clear(); self.captures.clear(); self.memory_states = 0; } @@ -459,10 +455,7 @@ impl Builder { remap.resize(self.states.len(), StateID::ZERO); nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern); - nfa.set_look_behind_starts(self.start_look_behind.as_slice()); - nfa.set_maximum_lookbehind_offset_from_start( - self.maximum_lookbehind_offset_from_start, - ); + nfa.set_lookbehinds(self.lookbehinds.as_slice()); nfa.set_captures(&self.captures).map_err(BuildError::captures)?; // The idea here is to convert our intermediate states to their final // form. The only real complexity here is the process of converting @@ -724,21 +717,16 @@ impl Builder { /// running look-behind expressions. Additionally registers the furthest /// offset (in bytes) from the start of the main regex this look-behind /// starts. - pub fn start_look_behind( + /// + /// Look-behinds must be started in a depth-first pre-order fashion with + /// regards to the nesting of look-behinds. 
+ pub fn start_lookbehind( &mut self, start_id: StateID, offset_from_start: Option, ) { - self.start_look_behind.push(start_id); - - self.maximum_lookbehind_offset_from_start = match ( - self.maximum_lookbehind_offset_from_start, - offset_from_start, - ) { - (Some(l1), Some(l2)) => Some(usize::max(l1, l2)), - // A None subsumes the entire result. - (None, _) | (_, None) => None, - }; + self.lookbehinds + .push(LookBehindInfo::new(start_id, offset_from_start)); } /// Add an "empty" NFA state. diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 3bd5ebe34..3a1853a1a 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -1061,7 +1061,7 @@ impl Compiler { self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; self.builder .borrow_mut() - .start_look_behind(unanchored.start, start_offset); + .start_lookbehind(unanchored.start, start_offset); // When compiling the subexpression we temporarily change the starting // offset and restore it after. This way, the subexpression is relativized diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 20b667aac..67857dfaa 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1108,15 +1108,8 @@ impl NFA { /// Returns the starting states for initializing look-behind evaluation. #[inline] - pub fn look_behind_starts(&self) -> &Vec { - &self.0.start_look_behind - } - - /// Returns the length (in bytes) of the longest string matched by any - /// look-behind sub-expression. If `None`, the length is unbounded. 
- #[inline] - pub fn maximum_lookbehind_offset_from_start(&self) -> Option { - self.0.maximum_lookbehind_offset_from_start + pub fn lookbehinds(&self) -> &Vec { + &self.0.lookbehinds } // FIXME: The `look_set_prefix_all` computation was not correct, and it @@ -1284,12 +1277,11 @@ pub(super) struct Inner { /// This is needed to initialize the table for storing the result of /// look-around evaluation. lookaround_count: usize, - /// Contains the start state for each of the look-behind subexpressions. - start_look_behind: Vec, - /// Among all look-behinds, this is the furthest offset (in bytes) from - /// the beginning of the main regex that a look-behind starts at. - /// If `None`, the offset is unbounded. - maximum_lookbehind_offset_from_start: Option, + /// A vector of meta-data information about each look-behind in this NFA. + /// + /// Must be stored in a depth-first pre-order with regards to the nesting + /// of look-behinds. + lookbehinds: Vec, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state /// might use a different amount of heap, we need to keep track of this @@ -1297,6 +1289,36 @@ pub(super) struct Inner { memory_extra: usize, } +/// Information about a look-behind needed for execution. +#[derive(Clone, Copy, Debug)] +pub struct LookBehindInfo { + /// The id of the start state of the look-behind subexpression. + start_id: StateID, + /// The offset (in bytes) from the beginning of the main regex that a + /// look-behind starts at. If `None`, the offset is unbounded. + offset_from_start: Option, +} + +impl LookBehindInfo { + pub(super) fn new( + start_id: StateID, + offset_from_start: Option, + ) -> Self { + Self { start_id, offset_from_start } + } + + /// Start states of the look-behind subexpression. 
+ pub(super) fn start_state(&self) -> StateID { + self.start_id + } + + /// The offset (in bytes) from the beginning of the main regex that a + /// look-behind starts at. If `None`, the offset is unbounded. + pub(super) fn offset_from_start(&self) -> Option { + self.offset_from_start + } +} + impl Inner { /// Runs any last finalization bits and turns this into a full NFA. pub(super) fn into_nfa(mut self) -> NFA { @@ -1439,19 +1461,12 @@ impl Inner { self.start_pattern = start_pattern.to_vec(); } - pub(super) fn set_look_behind_starts( - &mut self, - look_behind_starts: &[StateID], - ) { - self.start_look_behind = look_behind_starts.to_vec(); - } - - pub(super) fn set_maximum_lookbehind_offset_from_start( - &mut self, - maximum_lookbehind_offset_from_start: Option, - ) { - self.maximum_lookbehind_offset_from_start = - maximum_lookbehind_offset_from_start; + /// Sets the look-behind information of this NFA. + /// + /// The slice must be in a depth-first pre-order with regards to the + /// nesting of look-behinds. + pub(super) fn set_lookbehinds(&mut self, lookbehinds: &[LookBehindInfo]) { + self.lookbehinds = lookbehinds.to_vec(); } /// Sets the UTF-8 mode of this NFA. @@ -1507,7 +1522,8 @@ impl Inner { for id in self.start_pattern.iter_mut() { *id = old_to_new[*id]; } - for id in self.start_look_behind.iter_mut() { + for LookBehindInfo { start_id: id, .. } in self.lookbehinds.iter_mut() + { *id = old_to_new[*id]; } } @@ -1521,7 +1537,7 @@ impl fmt::Debug for Inner { '^' } else if sid == self.start_unanchored { '>' - } else if self.start_look_behind.contains(&sid) { + } else if self.lookbehinds.iter().any(|i| i.start_state() == sid) { '<' } else { ' ' diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 0a568629d..86d95dd21 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -278,7 +278,24 @@ impl Builder { /// given here is already built. 
pub fn build_from_nfa(&self, nfa: NFA) -> Result { nfa.look_set_any().available().map_err(BuildError::word)?; - Ok(PikeVM { config: self.config.clone(), nfa }) + + // The reverse of a depth-first pre-order is the depth-first + // reverse-post-order. This means, a look-around is always before its + // surrounding look-behinds in this vector. + let lookbehind_starts = + nfa.lookbehinds().iter().map(|i| i.start_state()).rev().collect(); + + let maximum_lookbehind_offset_from_start = + nfa.lookbehinds().iter().try_fold(0, |acc, curr| { + curr.offset_from_start().map(|o| usize::max(acc, o)) + }); + + Ok(PikeVM { + config: self.config.clone(), + nfa, + lookbehind_starts, + maximum_lookbehind_offset_from_start, + }) } /// Apply the given `PikeVM` configuration options to this builder. @@ -387,6 +404,13 @@ impl Builder { pub struct PikeVM { config: Config, nfa: NFA, + /// Stored depth-first reverse-post-order with regards to the nesting + /// of look-behinds. + lookbehind_starts: Vec, + /// Among all look-behinds, this is the furthest offset (in bytes) from + /// the beginning of the main regex that a look-behind starts at. + /// If `None`, the offset is unbounded. + maximum_lookbehind_offset_from_start: Option, } impl PikeVM { @@ -1258,9 +1282,6 @@ impl PikeVM { Some(config) => config, }; - let maximum_lookbehind_offset_from_start = - self.nfa.maximum_lookbehind_offset_from_start(); - let pre = if anchored { None } else { self.get_config().get_prefilter() }; let Cache { @@ -1281,18 +1302,17 @@ impl PikeVM { // start from 0. let start_position = usize::saturating_sub( input.start(), - maximum_lookbehind_offset_from_start.unwrap_or(input.start()), + self.maximum_lookbehind_offset_from_start + .unwrap_or(input.start()), ); // This initializes the look-behind threads from the `start_position` // Note: since capture groups are not allowed inside look-behinds, // there won't be any Capture epsilon transitions and hence it is ok to - // use &mut [] for the slots parameter. 
We need to add the start states - // in reverse because more deeply nested look-behinds have a higher index - // but must be executed first, so that the result is available for the - // outer expression. - for look_behind_start in self.nfa.look_behind_starts().iter().rev() - { + // use &mut [] for the slots parameter. Since the start states are stored + // in depth-first reverse-post-order, more deeply nested look-behinds are + // executed first, so that the result is available for the outer expression. + for look_behind_start in &self.lookbehind_starts { self.epsilon_closure( stack, &mut [], @@ -1363,7 +1383,7 @@ impl PikeVM { at, usize::saturating_sub( span.start, - maximum_lookbehind_offset_from_start + self.maximum_lookbehind_offset_from_start .unwrap_or(span.start), ), ); @@ -1371,11 +1391,8 @@ impl PikeVM { // to reinitialize the look-behind threads. if start_position != at { curr_lookaround.set.clear(); - for look_behind_start in self - .nfa - .look_behind_starts() - .iter() - .rev() + for look_behind_start in + &self.lookbehind_starts { self.epsilon_closure( stack, @@ -1598,7 +1615,7 @@ impl PikeVM { match_lookaround: _, } = cache; - for look_behind_start in self.nfa.look_behind_starts().iter().rev() { + for look_behind_start in &self.lookbehind_starts { self.epsilon_closure( stack, &mut [], From 6b66a15930d8e9694fe45587256a1c16dbd2adb1 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Thu, 5 Jun 2025 17:49:16 +0200 Subject: [PATCH 5/7] Store look-behind offsets separately --- regex-automata/src/nfa/thompson/builder.rs | 23 +- regex-automata/src/nfa/thompson/compiler.rs | 20 +- regex-automata/src/nfa/thompson/nfa.rs | 96 +++++-- regex-automata/src/nfa/thompson/pikevm.rs | 264 +++++++++++--------- 4 files changed, 241 insertions(+), 162 deletions(-) diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index a7fb46d23..9f9f88004 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ 
b/regex-automata/src/nfa/thompson/builder.rs @@ -5,7 +5,7 @@ use alloc::{sync::Arc, vec, vec::Vec}; use crate::{ nfa::thompson::{ error::BuildError, - nfa::{self, LookBehindInfo, SparseTransitions, Transition, NFA}, + nfa::{self, LookBehindTree, SparseTransitions, Transition, NFA}, }, util::{ look::{Look, LookMatcher}, @@ -340,11 +340,9 @@ pub struct Builder { /// contains a single regex, then `start_pattern[0]` and `start_anchored` /// are always equivalent. start_pattern: Vec, - /// A vector of meta-data information about each look-behind in this NFA. - /// - /// Must be stored in a depth-first pre-order with regards to the nesting - /// of look-behinds. - lookbehinds: Vec, + /// A vector of look-behinds appearing in the regex. Order reflects the + /// order in the regex. + lookbehinds: Vec, /// A map from pattern ID to capture group index to name. (If no name /// exists, then a None entry is present. Thus, all capturing groups are /// present in this mapping.) @@ -719,14 +717,21 @@ impl Builder { /// starts. /// /// Look-behinds must be started in a depth-first pre-order fashion with - /// regards to the nesting of look-behinds. + /// regards to the nesting of look-behinds. The nesting path is stored + /// as indices in `path`. pub fn start_lookbehind( &mut self, start_id: StateID, offset_from_start: Option, + path: &[usize], ) { - self.lookbehinds - .push(LookBehindInfo::new(start_id, offset_from_start)); + let mut current = &mut self.lookbehinds; + + for index in path { + current = current[*index].children_mut(); + } + + current.push(LookBehindTree::new(start_id, offset_from_start)); } /// Add an "empty" NFA state. diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index 3a1853a1a..979ff8d7d 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -718,6 +718,8 @@ pub struct Compiler { /// the current look-behind expression. 
When `None`, the distance can be /// seen as infinity. current_lookbehind_offset_from_start: RefCell>, + /// The current path of look-behind nesting. + lookbehind_nesting_path: RefCell>, } impl Compiler { @@ -732,6 +734,7 @@ impl Compiler { utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), lookaround_index: RefCell::new(SmallIndex::ZERO), current_lookbehind_offset_from_start: RefCell::new(Some(0)), + lookbehind_nesting_path: RefCell::new(vec![0]), } } @@ -972,6 +975,8 @@ impl Compiler { .borrow_mut() .set_size_limit(self.config.get_nfa_size_limit())?; *self.lookaround_index.borrow_mut() = SmallIndex::ZERO; + *self.lookbehind_nesting_path.borrow_mut() = vec![0]; + *self.current_lookbehind_offset_from_start.borrow_mut() = Some(0); // We always add an unanchored prefix unless we were specifically told // not to (for tests only), or if we know that the regex is anchored @@ -1059,15 +1064,22 @@ impl Compiler { let unanchored = self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; - self.builder - .borrow_mut() - .start_lookbehind(unanchored.start, start_offset); + self.builder.borrow_mut().start_lookbehind( + unanchored.start, + start_offset, + self.lookbehind_nesting_path.borrow().split_last().unwrap().1, + ); // When compiling the subexpression we temporarily change the starting // offset and restore it after. This way, the subexpression is relativized - // to our current offset. + // to our current offset. We also update the path to the current lookbehind + // expression. 
+ self.lookbehind_nesting_path.borrow_mut().push(0); *self.current_lookbehind_offset_from_start.borrow_mut() = start_offset; let sub = self.c(lookaround.sub())?; + let mut path = self.lookbehind_nesting_path.borrow_mut(); + path.pop(); + *path.last_mut().unwrap() += 1; *self.current_lookbehind_offset_from_start.borrow_mut() = relative_start; diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 67857dfaa..8a48681b0 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1108,7 +1108,7 @@ impl NFA { /// Returns the starting states for initializing look-behind evaluation. #[inline] - pub fn lookbehinds(&self) -> &Vec { + pub fn lookbehinds(&self) -> &[LookBehindTree] { &self.0.lookbehinds } @@ -1277,11 +1277,9 @@ pub(super) struct Inner { /// This is needed to initialize the table for storing the result of /// look-around evaluation. lookaround_count: usize, - /// A vector of meta-data information about each look-behind in this NFA. - /// - /// Must be stored in a depth-first pre-order with regards to the nesting - /// of look-behinds. - lookbehinds: Vec, + /// A vector of look-behinds appearing in the regex. Order reflects the + /// order in the regex. + lookbehinds: Vec, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state /// might use a different amount of heap, we need to keep track of this @@ -1289,34 +1287,73 @@ pub(super) struct Inner { memory_extra: usize, } -/// Information about a look-behind needed for execution. -#[derive(Clone, Copy, Debug)] -pub struct LookBehindInfo { - /// The id of the start state of the look-behind subexpression. +/// Information about a look-behind needed for execution. It preserves the +/// nesting structure of look-behinds.
+#[derive(Clone, Debug)] +pub struct LookBehindTree { start_id: StateID, - /// The offset (in bytes) from the beginning of the main regex that a - /// look-behind starts at. If `None`, the offset is unbounded. offset_from_start: Option, + children: Vec, } -impl LookBehindInfo { - pub(super) fn new( - start_id: StateID, - offset_from_start: Option, - ) -> Self { - Self { start_id, offset_from_start } +impl LookBehindTree { + pub fn new(start_id: StateID, offset_from_start: Option) -> Self { + Self { start_id, offset_from_start, children: Vec::new() } } - /// Start states of the look-behind subexpression. - pub(super) fn start_state(&self) -> StateID { + /// The id of the start state of the look-behind subexpression. + pub fn start_id(&self) -> StateID { self.start_id } /// The offset (in bytes) from the beginning of the main regex that a /// look-behind starts at. If `None`, the offset is unbounded. - pub(super) fn offset_from_start(&self) -> Option { + pub fn offset_from_start(&self) -> Option { self.offset_from_start } + + /// The look-behinds this look-behind contains. Order reflects the order + /// in the regex. + pub fn children(&self) -> &[LookBehindTree] { + &self.children + } + + /// Calls `fun` on this look-behind tree and all of its children in pre-order. + /// `fun` should return `true` if the traversal should continue and `false` + /// if it should stop. + /// + /// The return value indicates whether the traversal was at any point stopped. + pub fn preorder(&self, fun: &impl Fn(&LookBehindTree) -> bool) -> bool { + if !fun(self) { + return false; + } + for child in &self.children { + if !child.preorder(fun) { + return false; + } + } + true + } + + /// Like [`preorder`], but allows mutating the nodes. 
+ pub fn preorder_mut( + &mut self, + fun: &impl Fn(&mut LookBehindTree) -> bool, + ) -> bool { + if !fun(self) { + return false; + } + for child in &mut self.children { + if !child.preorder_mut(fun) { + return false; + } + } + true + } + + pub fn children_mut(&mut self) -> &mut Vec { + &mut self.children + } } impl Inner { @@ -1465,7 +1502,7 @@ impl Inner { /// /// The slice must be in a depth-first pre-order with regards to the /// nesting of look-behinds. - pub(super) fn set_lookbehinds(&mut self, lookbehinds: &[LookBehindInfo]) { + pub(super) fn set_lookbehinds(&mut self, lookbehinds: &[LookBehindTree]) { self.lookbehinds = lookbehinds.to_vec(); } @@ -1522,9 +1559,12 @@ impl Inner { for id in self.start_pattern.iter_mut() { *id = old_to_new[*id]; } - for LookBehindInfo { start_id: id, .. } in self.lookbehinds.iter_mut() - { - *id = old_to_new[*id]; + + for lbs in self.lookbehinds.iter_mut() { + lbs.preorder_mut(&|e| { + e.start_id = old_to_new[e.start_id]; + true + }); } } } @@ -1537,7 +1577,11 @@ impl fmt::Debug for Inner { '^' } else if sid == self.start_unanchored { '>' - } else if self.lookbehinds.iter().any(|i| i.start_state() == sid) { + } else if self + .lookbehinds + .iter() + .any(|i| !i.preorder(&|e| e.start_id() != sid)) + { '<' } else { ' ' diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 86d95dd21..6f43b8b6c 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -7,6 +7,7 @@ resolving all spans of capturing groups that participate in a match. #[cfg(feature = "internal-instrument-pikevm")] use core::cell::RefCell; +use core::cmp::Ordering; use alloc::{vec, vec::Vec}; @@ -279,23 +280,38 @@ impl Builder { pub fn build_from_nfa(&self, nfa: NFA) -> Result { nfa.look_set_any().available().map_err(BuildError::word)?; - // The reverse of a depth-first pre-order is the depth-first - // reverse-post-order. 
This means, a look-around is always before its - // surrounding look-behinds in this vector. - let lookbehind_starts = - nfa.lookbehinds().iter().map(|i| i.start_state()).rev().collect(); - - let maximum_lookbehind_offset_from_start = - nfa.lookbehinds().iter().try_fold(0, |acc, curr| { - curr.offset_from_start().map(|o| usize::max(acc, o)) - }); - - Ok(PikeVM { - config: self.config.clone(), - nfa, - lookbehind_starts, - maximum_lookbehind_offset_from_start, - }) + // We first fill the array depth-first reverse-pre-order. + let mut lookbehinds = vec![]; + + // Depth-first traversal stack. Offsets are initially zero. + let mut stack = + nfa.lookbehinds().iter().map(|e| (e, Some(0))).collect::>(); + + while let Some((lb, offset)) = stack.pop() { + // To uphold the condition on `lookbehinds`, we must make sure that more nested look-behinds + // have an offset bigger or equal to those surrounding them. + let offset = match (offset, lb.offset_from_start()) { + (Some(o1), Some(o2)) => Some(usize::max(o1, o2)), + // A None subsumes the entire result. + (None, _) | (_, None) => None, + }; + + lookbehinds.push((lb.start_id(), offset)); + + stack.extend(lb.children().iter().map(|e| (e, offset))); + } + + // We need to change the reverse-pre-order into a post-order (to store nested look-behinds before + // those surrounding them) and then use a **stable** sort on the offsets to uphold the conditions. + lookbehinds.reverse(); + lookbehinds.sort_by(|a, b| match (a.1, b.1) { + (None, None) => Ordering::Equal, + (None, _) => Ordering::Less, + (_, None) => Ordering::Greater, + (Some(a), Some(b)) => b.cmp(&a), + }); + + Ok(PikeVM { config: self.config.clone(), nfa, lookbehinds }) } /// Apply the given `PikeVM` configuration options to this builder. @@ -404,13 +420,19 @@ impl Builder { pub struct PikeVM { config: Config, nfa: NFA, - /// Stored depth-first reverse-post-order with regards to the nesting - /// of look-behinds. 
- lookbehind_starts: Vec, - /// Among all look-behinds, this is the furthest offset (in bytes) from - /// the beginning of the main regex that a look-behind starts at. - /// If `None`, the offset is unbounded. - maximum_lookbehind_offset_from_start: Option, + /// Vector of look-behind start states together with the offset (in bytes) + /// of its start from the beginning of the main regex. An offset that is + /// `None` is unbounded. + /// + /// The order of the vector **must** uphold the following conditions: + /// 1. Elements with a `None` offset are ordered before other ones + /// 2. Elements with a larger offset are ordered before other ones + /// 3. A nested look-behind is ordered before its outer ones + /// + /// These conditions are crucial for starting the look-behind threads + /// in the correct haystack position. Offsets can be conservatively made + /// larger to uphold the previous conditions. + lookbehinds: Vec<(StateID, Option)>, } impl PikeVM { @@ -1297,36 +1319,9 @@ impl PikeVM { if let Some(active) = match_lookaround { *curr_lookaround = active.clone(); - } else if self.lookaround_count() > 0 { - // If we know the maximum look-behind length, we do not need to - // start from 0. - let start_position = usize::saturating_sub( - input.start(), - self.maximum_lookbehind_offset_from_start - .unwrap_or(input.start()), - ); - - // This initializes the look-behind threads from the `start_position` - // Note: since capture groups are not allowed inside look-behinds, - // there won't be any Capture epsilon transitions and hence it is ok to - // use &mut [] for the slots parameter. Since the start states are stored - // in depth-first reverse-post-order, more deeply nested look-behinds are - // executed first, so that the result is available for the outer expression.
- for look_behind_start in &self.lookbehind_starts { - self.epsilon_closure( - stack, - &mut [], - curr_lookaround, - lookaround, - input, - start_position, - *look_behind_start, - ); - } - // This is necessary for look-behinds to be able to match outside of the - // input span. + } else { self.fast_forward_lookbehinds( - Span { start: start_position, end: input.start() }, + Span { start: 0, end: input.start() }, input, stack, curr_lookaround, @@ -1375,51 +1370,16 @@ impl PikeVM { match pre.find(input.haystack(), span) { None => break, Some(ref span) => { - if self.lookaround_count() > 0 { - // If we know the maximum look-behind length, - // we might be able to catch up the look-behind - // threads later than starting at `at`. - let start_position = usize::max( - at, - usize::saturating_sub( - span.start, - self.maximum_lookbehind_offset_from_start - .unwrap_or(span.start), - ), - ); - // If we resume from later than `at`, we need - // to reinitialize the look-behind threads. - if start_position != at { - curr_lookaround.set.clear(); - for look_behind_start in - &self.lookbehind_starts - { - self.epsilon_closure( - stack, - &mut [], - curr_lookaround, - lookaround, - input, - start_position, - *look_behind_start, - ); - } - } - - // We are jumping ahead due to the pre-filter, thus we must bring - // the look-behind threads to the new position. - self.fast_forward_lookbehinds( - Span { - start: start_position, - end: span.start, - }, - input, - stack, - curr_lookaround, - next_lookaround, - lookaround, - ); - } + // We are jumping ahead due to the pre-filter, thus we must bring + // the look-behind threads to the new position. 
+ self.fast_forward_lookbehinds( + Span { start: at, end: span.start }, + input, + stack, + curr_lookaround, + next_lookaround, + lookaround, + ); at = span.start } } @@ -1540,21 +1500,89 @@ impl PikeVM { next_lookaround: &mut ActiveStates, lookaround: &mut Vec>, ) { - for lb_at in forward_span.start..forward_span.end { - self.nexts( - stack, - curr_lookaround, - next_lookaround, - lookaround, - input, - lb_at, - // Since capture groups are not allowed inside look-arounds, - // there won't be any Capture epsilon transitions and hence it is ok to - // use &mut [] for the slots parameter. - &mut [], + // Note: since capture groups are not allowed inside look-behinds, + // there won't be any Capture epsilon transitions and hence it is ok to + // use &mut [] for the slots parameter. + + // We check the furthest offset from forward_span.end that we must start at. + // This greatest offset is stored with the first element of `self.lookbehinds` due to + // the requirements of that vector's order. If that largest offset extends + // before forward_span.start and that start is not the beginning of the + // input, we cannot use the optimization and fall back to fast-forwarding all + // look-behind threads together through the entire span. Otherwise, we clear + // the state of look-behinds and start them one by one joining each whenever + // their offset from forward_span.end is reached. Inner look-behind threads + // are started before their outer look-behind's due to the requirements of + // the self.lookbehinds vector.
+ + if self.lookaround_count() > 0 { + let total_distance = forward_span.end - forward_span.start; + + let start_offset = usize::min( + total_distance, + self.lookbehinds[0].1.unwrap_or(total_distance), ); - core::mem::swap(curr_lookaround, next_lookaround); - next_lookaround.set.clear(); + + if forward_span.start == 0 || start_offset < total_distance { + curr_lookaround.set.clear(); + + let mut current_lookbehind = 0; + + for offset in (0..=start_offset).rev() { + let position = forward_span.end - offset; + + while let Some(start_id) = self + .lookbehinds + .get(current_lookbehind) + .and_then(|e| { + if e.1.unwrap_or(total_distance) >= offset { + Some(e.0) + } else { + None + } + }) + { + self.epsilon_closure( + stack, + &mut [], + curr_lookaround, + lookaround, + input, + position, + start_id, + ); + current_lookbehind += 1; + } + // We skip calling nexts when we are at forward_span.end. + if offset != 0 { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + position, + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } + } + } else { + for position in forward_span.start..forward_span.end { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + position, + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } + } } } @@ -1615,17 +1643,6 @@ impl PikeVM { match_lookaround: _, } = cache; - for look_behind_start in &self.lookbehind_starts { - self.epsilon_closure( - stack, - &mut [], - curr_lookaround, - lookaround, - input, - 0, - *look_behind_start, - ); - } self.fast_forward_lookbehinds( Span { start: 0, end: input.start() }, input, @@ -1634,6 +1651,7 @@ impl PikeVM { next_lookaround, lookaround, ); + for at in input.start()..=input.end() { let any_matches = !patset.is_empty(); if curr.set.is_empty() { From 0835a7289ab3e84d21570fd3bc8334f6319fd5cf Mon Sep 17 00:00:00 2001 From: shilangyu Date: Thu, 5 
Jun 2025 17:57:16 +0200 Subject: [PATCH 6/7] Change lookbehind optimization condition --- regex-automata/src/nfa/thompson/pikevm.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index 6f43b8b6c..ea18a30f4 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -1515,7 +1515,7 @@ impl PikeVM { // are started before their outer look-behind's due to the requirements of // the self.lookbehinds vector. - if self.lookaround_count() > 0 { + if !self.lookbehinds.is_empty() { let total_distance = forward_span.end - forward_span.start; let start_offset = usize::min( From 4740cebc74b56e1058a0f2d3cc3bf52baa3907b4 Mon Sep 17 00:00:00 2001 From: shilangyu Date: Fri, 27 Jun 2025 19:10:07 +0200 Subject: [PATCH 7/7] Compute memory usage well --- regex-automata/src/nfa/thompson/nfa.rs | 51 +++++++++++++++++--------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 8a48681b0..66eaf180e 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1185,7 +1185,10 @@ impl NFA { + self.0.states.len() * size_of::() + self.0.start_pattern.len() * size_of::() + self.0.group_info.memory_usage() - + self.0.start_look_behind.len() * size_of::() + + self.0.lookbehinds.iter() + .map(|b| + b.try_fold(0, &|acc, _| Some(acc + 1)).unwrap() + ).sum::() * size_of::() + self.0.memory_extra } } @@ -1319,23 +1322,30 @@ impl LookBehindTree { } /// Calls `fun` on this look-behind tree and all of its children in pre-order. - /// `fun` should return `true` if the traversal should continue and `false` + /// `fun` should return `Some` if the traversal should continue and `None` /// if it should stop. /// - /// The return value indicates whether the traversal was at any point stopped. 
- pub fn preorder(&self, fun: &impl Fn(&LookBehindTree) -> bool) -> bool { - if !fun(self) { - return false; - } - for child in &self.children { - if !child.preorder(fun) { - return false; - } + /// The return value is the fold of all `Some`s, or `None` if at any point `None` + /// was returned. + pub fn try_fold( + &self, + acc: A, + fun: &impl Fn(A, &LookBehindTree) -> Option, + ) -> Option { + if let Some(acc) = fun(acc, self) { + self.children + .iter() + .try_fold(acc, |acc, child| child.try_fold(acc, fun)) + } else { + None } - true } - /// Like [`preorder`], but allows mutating the nodes. + /// Calls `fun` on this look-behind tree and all of its children in pre-order. + /// `fun` should return `true` if the traversal should continue and `false` + /// if it should stop. + /// + /// The return value indicates whether the traversal was at any point stopped. pub fn preorder_mut( &mut self, fun: &impl Fn(&mut LookBehindTree) -> bool, @@ -1577,11 +1587,16 @@ impl fmt::Debug for Inner { '^' } else if sid == self.start_unanchored { '>' - } else if self - .lookbehinds - .iter() - .any(|i| !i.preorder(&|e| e.start_id() != sid)) - { + } else if self.lookbehinds.iter().any(|i| { + i.try_fold((), &|_, e| { + if e.start_id() == sid { + None + } else { + Some(()) + } + }) + .is_none() + }) { '<' } else { ' '