diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs index 4f2f9af79..9f9f88004 100644 --- a/regex-automata/src/nfa/thompson/builder.rs +++ b/regex-automata/src/nfa/thompson/builder.rs @@ -5,7 +5,7 @@ use alloc::{sync::Arc, vec, vec::Vec}; use crate::{ nfa::thompson::{ error::BuildError, - nfa::{self, SparseTransitions, Transition, NFA}, + nfa::{self, LookBehindTree, SparseTransitions, Transition, NFA}, }, util::{ look::{Look, LookMatcher}, @@ -340,8 +340,9 @@ pub struct Builder { /// contains a single regex, then `start_pattern[0]` and `start_anchored` /// are always equivalent. start_pattern: Vec, - /// The starting states for each individual look-behind sub-expression. - start_look_behind: Vec, + /// A vector of look-behinds appearing in the regex. Order reflects the + /// order in the regex. + lookbehinds: Vec, /// A map from pattern ID to capture group index to name. (If no name /// exists, then a None entry is present. Thus, all capturing groups are /// present in this mapping.) @@ -387,7 +388,7 @@ impl Builder { self.pattern_id = None; self.states.clear(); self.start_pattern.clear(); - self.start_look_behind.clear(); + self.lookbehinds.clear(); self.captures.clear(); self.memory_states = 0; } @@ -452,7 +453,7 @@ impl Builder { remap.resize(self.states.len(), StateID::ZERO); nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern); - nfa.set_look_behind_starts(self.start_look_behind.as_slice()); + nfa.set_lookbehinds(self.lookbehinds.as_slice()); nfa.set_captures(&self.captures).map_err(BuildError::captures)?; // The idea here is to convert our intermediate states to their final // form. The only real complexity here is the process of converting @@ -711,9 +712,26 @@ impl Builder { } /// Adds the `start_id` to the set of starting states that is used when - /// running look-behind expressions. 
- pub fn start_look_behind(&mut self, start_id: StateID) { - self.start_look_behind.push(start_id); + /// running look-behind expressions. Additionally registers the furthest + /// offset (in bytes) from the start of the main regex at which this + /// look-behind starts. + /// + /// Look-behinds must be started in a depth-first pre-order fashion with + /// regard to the nesting of look-behinds. `path` lists the child indices + /// leading to the look-behind that encloses the new one (empty at top level). + pub fn start_lookbehind( + &mut self, + start_id: StateID, + offset_from_start: Option, + path: &[usize], + ) { + let mut current = &mut self.lookbehinds; + + for index in path { + current = current[*index].children_mut(); + } + + current.push(LookBehindTree::new(start_id, offset_from_start)); } /// Add an "empty" NFA state. diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs index c79a0556a..979ff8d7d 100644 --- a/regex-automata/src/nfa/thompson/compiler.rs +++ b/regex-automata/src/nfa/thompson/compiler.rs @@ -713,6 +713,13 @@ pub struct Compiler { utf8_suffix: RefCell, /// The next index to use for a look-around expression. lookaround_index: RefCell, + /// How far (in bytes) from the beginning of the main regex the + /// current look-behind starts. This is updated when relativizing to + /// the current look-behind expression. When `None`, the distance can be + /// seen as infinity. + current_lookbehind_offset_from_start: RefCell>, + /// The current path of look-behind nesting. 
+ lookbehind_nesting_path: RefCell>, } impl Compiler { @@ -726,6 +733,8 @@ impl Compiler { trie_state: RefCell::new(RangeTrie::new()), utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), lookaround_index: RefCell::new(SmallIndex::ZERO), + current_lookbehind_offset_from_start: RefCell::new(Some(0)), + lookbehind_nesting_path: RefCell::new(vec![0]), } } @@ -966,6 +975,8 @@ impl Compiler { .borrow_mut() .set_size_limit(self.config.get_nfa_size_limit())?; *self.lookaround_index.borrow_mut() = SmallIndex::ZERO; + *self.lookbehind_nesting_path.borrow_mut() = vec![0]; + *self.current_lookbehind_offset_from_start.borrow_mut() = Some(0); // We always add an unanchored prefix unless we were specifically told // not to (for tests only), or if we know that the regex is anchored @@ -1022,10 +1033,13 @@ impl Compiler { } } + /// Compile a look-around expression as its own sub-automaton. Its starting + /// state is saved. fn c_lookaround( &self, lookaround: &LookAround, ) -> Result { + // Assign a unique index for this look-around. let idx = *self.lookaround_index.borrow(); *self.lookaround_index.borrow_mut() = SmallIndex::new(idx.one_more()) .map_err(|e| { @@ -1037,11 +1051,38 @@ impl Compiler { }; let check = self.add_check_lookaround(idx, pos)?; + // Compute the furthest offset from the start of the main regex + // where this look-around can begin at. We offset the current start + // offset by the maximal match length of the subexpression. 
+ let maximum_len = lookaround.sub().properties().maximum_len(); + let relative_start = + *self.current_lookbehind_offset_from_start.borrow(); + let start_offset = match (relative_start, maximum_len) { + (Some(s), Some(l)) => Some(s + l), + (None, _) | (_, None) => None, + }; + let unanchored = self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?; - self.builder.borrow_mut().start_look_behind(unanchored.start); + self.builder.borrow_mut().start_lookbehind( + unanchored.start, + start_offset, + self.lookbehind_nesting_path.borrow().split_last().unwrap().1, + ); + // When compiling the subexpression we temporarily change the starting + // offset and restore it after. This way, the subexpression is relativized + // to our current offset. We also update the path to the current lookbehind + // expression. + self.lookbehind_nesting_path.borrow_mut().push(0); + *self.current_lookbehind_offset_from_start.borrow_mut() = start_offset; let sub = self.c(lookaround.sub())?; + let mut path = self.lookbehind_nesting_path.borrow_mut(); + path.pop(); + *path.last_mut().unwrap() += 1; + *self.current_lookbehind_offset_from_start.borrow_mut() = + relative_start; + let write = self.add_write_lookaround(idx)?; self.patch(unanchored.end, sub.start)?; self.patch(sub.end, write)?; diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index 34458766e..66eaf180e 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1108,8 +1108,8 @@ impl NFA { /// Returns the starting states for initializing look-behind evaluation. 
#[inline] - pub fn look_behind_starts(&self) -> &Vec { - &self.0.start_look_behind + pub fn lookbehinds(&self) -> &[LookBehindTree] { + &self.0.lookbehinds } // FIXME: The `look_set_prefix_all` computation was not correct, and it @@ -1185,7 +1185,10 @@ impl NFA { + self.0.states.len() * size_of::() + self.0.start_pattern.len() * size_of::() + self.0.group_info.memory_usage() - + self.0.start_look_behind.len() * size_of::() + + self.0.lookbehinds.iter() + .map(|b| + b.try_fold(0, &|acc, _| Some(acc + 1)).unwrap() + ).sum::() * size_of::() + self.0.memory_extra } } @@ -1277,8 +1280,9 @@ pub(super) struct Inner { /// This is needed to initialize the table for storing the result of /// look-around evaluation. lookaround_count: usize, - /// Contains the start state for each of the look-behind subexpressions. - start_look_behind: Vec, + /// A vector of look-behinds appearing in the regex. Order reflects the + /// order in the regex. + lookbehinds: Vec, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state /// might use a different amount of heap, we need to keep track of this @@ -1286,6 +1290,82 @@ pub(super) struct Inner { memory_extra: usize, } +/// Information about a look-behind needed for execution. It preserves the +/// nesting structure of look-behinds. +#[derive(Clone, Debug)] +pub struct LookBehindTree { + start_id: StateID, + offset_from_start: Option, + children: Vec, +} + +impl LookBehindTree { + pub fn new(start_id: StateID, offset_from_start: Option) -> Self { + Self { start_id, offset_from_start, children: Vec::new() } + } + + /// The id of the start state of the look-behind subexpression. + pub fn start_id(&self) -> StateID { + self.start_id + } + + /// The offset (in bytes) from the beginning of the main regex that a + /// look-behind starts at. If `None`, the offset is unbounded. 
+ pub fn offset_from_start(&self) -> Option { + self.offset_from_start + } + + /// The look-behinds this look-behind contains. Order reflects the order + /// in the regex. + pub fn children(&self) -> &[LookBehindTree] { + &self.children + } + + /// Calls `fun` on this look-behind tree and all of its descendants in pre-order. + /// `fun` should return `Some` if the traversal should continue and `None` + /// if it should stop. + /// + /// The return value is the fold of all `Some`s, or `None` if at any point `None` + /// was returned. + pub fn try_fold( + &self, + acc: A, + fun: &impl Fn(A, &LookBehindTree) -> Option, + ) -> Option { + if let Some(acc) = fun(acc, self) { + self.children + .iter() + .try_fold(acc, |acc, child| child.try_fold(acc, fun)) + } else { + None + } + } + + /// Calls `fun` on this look-behind tree and all of its descendants in pre-order. + /// `fun` should return `true` if the traversal should continue and `false` + /// if it should stop. + /// + /// Returns `true` if every node was visited and `false` if `fun` stopped the traversal. + pub fn preorder_mut( + &mut self, + fun: &impl Fn(&mut LookBehindTree) -> bool, + ) -> bool { + if !fun(self) { + return false; + } + for child in &mut self.children { + if !child.preorder_mut(fun) { + return false; + } + } + true + } + + pub fn children_mut(&mut self) -> &mut Vec { + &mut self.children + } +} + impl Inner { /// Runs any last finalization bits and turns this into a full NFA. pub(super) fn into_nfa(mut self) -> NFA { @@ -1428,11 +1508,12 @@ impl Inner { self.start_pattern = start_pattern.to_vec(); } - pub(super) fn set_look_behind_starts( - &mut self, - look_behind_starts: &[StateID], - ) { - self.start_look_behind = look_behind_starts.to_vec(); + /// Sets the look-behind information of this NFA. + /// + /// The slice must be in a depth-first pre-order with regard to the + /// nesting of look-behinds. 
+ pub(super) fn set_lookbehinds(&mut self, lookbehinds: &[LookBehindTree]) { + self.lookbehinds = lookbehinds.to_vec(); } /// Sets the UTF-8 mode of this NFA. @@ -1488,8 +1569,12 @@ impl Inner { for id in self.start_pattern.iter_mut() { *id = old_to_new[*id]; } - for id in self.start_look_behind.iter_mut() { - *id = old_to_new[*id]; + + for lbs in self.lookbehinds.iter_mut() { + lbs.preorder_mut(&|e| { + e.start_id = old_to_new[e.start_id]; + true + }); } } } @@ -1502,7 +1587,16 @@ impl fmt::Debug for Inner { '^' } else if sid == self.start_unanchored { '>' - } else if self.start_look_behind.contains(&sid) { + } else if self.lookbehinds.iter().any(|i| { + i.try_fold((), &|_, e| { + if e.start_id() == sid { + None + } else { + Some(()) + } + }) + .is_none() + }) { '<' } else { ' ' diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs index b18101c53..ea18a30f4 100644 --- a/regex-automata/src/nfa/thompson/pikevm.rs +++ b/regex-automata/src/nfa/thompson/pikevm.rs @@ -7,6 +7,7 @@ resolving all spans of capturing groups that participate in a match. #[cfg(feature = "internal-instrument-pikevm")] use core::cell::RefCell; +use core::cmp::Ordering; use alloc::{vec, vec::Vec}; @@ -278,7 +279,39 @@ impl Builder { /// given here is already built. pub fn build_from_nfa(&self, nfa: NFA) -> Result { nfa.look_set_any().available().map_err(BuildError::word)?; - Ok(PikeVM { config: self.config.clone(), nfa }) + + // We first fill the array depth-first reverse-pre-order. + let mut lookbehinds = vec![]; + + // Depth-first traversal stack. Offsets are initially zero. + let mut stack = + nfa.lookbehinds().iter().map(|e| (e, Some(0))).collect::>(); + + while let Some((lb, offset)) = stack.pop() { + // To uphold the condition on `lookbehinds`, we must make sure that more nested look-behinds + // have an offset bigger or equal to those surrounding them. 
+ let offset = match (offset, lb.offset_from_start()) { + (Some(o1), Some(o2)) => Some(usize::max(o1, o2)), + // A None subsumes the entire result. + (None, _) | (_, None) => None, + }; + + lookbehinds.push((lb.start_id(), offset)); + + stack.extend(lb.children().iter().map(|e| (e, offset))); + } + + // We need to change the reverse-pre-order into a post-order (to store nested look-behinds before + // those surrounding them) and then use a **stable** sort on the offsets to uphold the conditions. + lookbehinds.reverse(); + lookbehinds.sort_by(|a, b| match (a.1, b.1) { + (None, None) => Ordering::Equal, + (None, _) => Ordering::Less, + (_, None) => Ordering::Greater, + (Some(a), Some(b)) => b.cmp(&a), + }); + + Ok(PikeVM { config: self.config.clone(), nfa, lookbehinds }) } /// Apply the given `PikeVM` configuration options to this builder. @@ -387,6 +420,19 @@ impl Builder { pub struct PikeVM { config: Config, nfa: NFA, + /// Vector of look-behind start states together with the offset (in bytes) + /// of its start from the beginning of the main regex. An offset that is + /// `None` is unbounded. + /// + /// The order of the vector **must** uphold the following conditions: + /// 1. Elements with a `None` offset are ordered before other ones + /// 2. Elements with a larger offset are ordered before those with a smaller one + /// 3. A nested look-behind is ordered before its outer ones + /// + /// These conditions are crucial for starting the look-behind threads + /// in the correct haystack position. Offsets can be conservatively made + /// larger to uphold the previous conditions. 
+ lookbehinds: Vec<(StateID, Option)>, } impl PikeVM { @@ -1273,28 +1319,7 @@ impl PikeVM { if let Some(active) = match_lookaround { *curr_lookaround = active.clone(); - } else if self.lookaround_count() > 0 { - // This initializes the look-behind threads from the start of the input - // Note: since capture groups are not allowed inside look-behinds, - // there won't be any Capture epsilon transitions and hence it is ok to - // use &mut [] for the slots parameter. We need to add the start states - // in reverse because more deeply nested look-behinds have a higher index - // but must be executed first, so that the result is available for the - // outer expression. - for look_behind_start in self.nfa.look_behind_starts().iter().rev() - { - self.epsilon_closure( - stack, - &mut [], - curr_lookaround, - lookaround, - input, - 0, - *look_behind_start, - ); - } - // This is necessary for look-behinds to be able to match outside of the - // input span. + } else { self.fast_forward_lookbehinds( Span { start: 0, end: input.start() }, input, @@ -1345,18 +1370,16 @@ impl PikeVM { match pre.find(input.haystack(), span) { None => break, Some(ref span) => { - if self.lookaround_count() > 0 { - // We are jumping ahead due to the pre-filter, thus we must bring - // the look-behind threads to the new position. - self.fast_forward_lookbehinds( - Span { start: at, end: span.start }, - input, - stack, - curr_lookaround, - next_lookaround, - lookaround, - ); - } + // We are jumping ahead due to the pre-filter, thus we must bring + // the look-behind threads to the new position. 
+ self.fast_forward_lookbehinds( + Span { start: at, end: span.start }, + input, + stack, + curr_lookaround, + next_lookaround, + lookaround, + ); at = span.start } } @@ -1477,21 +1500,89 @@ impl PikeVM { next_lookaround: &mut ActiveStates, lookaround: &mut Vec>, ) { - for lb_at in forward_span.start..forward_span.end { - self.nexts( - stack, - curr_lookaround, - next_lookaround, - lookaround, - input, - lb_at, - // Since capture groups are not allowed inside look-arounds, - // there won't be any Capture epsilon transitions and hence it is ok to - // use &mut [] for the slots parameter. - &mut [], + // Note: since capture groups are not allowed inside look-behinds, + // there won't be any Capture epsilon transitions and hence it is ok to + // use &mut [] for the slots parameter. + + // We check the furthest offset from forward_span.end that we must start at. + // This greatest offset is stored with the first element of `self.lookbehinds` + // due to the requirements of that vector's order. If that largest offset reaches + // back to forward_span.start or beyond and that start is not the beginning of + // the input, we cannot use the optimization and fall back to fast-forwarding all + // look-behind threads together through the entire span. Otherwise, we clear + // the state of look-behinds and start them one by one, adding each when + // its offset from forward_span.end is reached. Inner look-behind threads + // are started before their outer look-behinds due to the requirements of + // the self.lookbehinds vector. 
+ + if !self.lookbehinds.is_empty() { + let total_distance = forward_span.end - forward_span.start; + + let start_offset = usize::min( + total_distance, + self.lookbehinds[0].1.unwrap_or(total_distance), ); - core::mem::swap(curr_lookaround, next_lookaround); - next_lookaround.set.clear(); + + if forward_span.start == 0 || start_offset < total_distance { + curr_lookaround.set.clear(); + + let mut current_lookbehind = 0; + + for offset in (0..=start_offset).rev() { + let position = forward_span.end - offset; + + while let Some(start_id) = self + .lookbehinds + .get(current_lookbehind) + .and_then(|e| { + if e.1.unwrap_or(total_distance) >= offset { + Some(e.0) + } else { + None + } + }) + { + self.epsilon_closure( + stack, + &mut [], + curr_lookaround, + lookaround, + input, + position, + start_id, + ); + current_lookbehind += 1; + } + // We skip calling nexts when we are at forward_span.end. + if offset != 0 { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + position, + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } + } + } else { + for position in forward_span.start..forward_span.end { + self.nexts( + stack, + curr_lookaround, + next_lookaround, + lookaround, + input, + position, + &mut [], + ); + core::mem::swap(curr_lookaround, next_lookaround); + next_lookaround.set.clear(); + } + } } } @@ -1552,17 +1643,6 @@ impl PikeVM { match_lookaround: _, } = cache; - for look_behind_start in self.nfa.look_behind_starts().iter().rev() { - self.epsilon_closure( - stack, - &mut [], - curr_lookaround, - lookaround, - input, - 0, - *look_behind_start, - ); - } self.fast_forward_lookbehinds( Span { start: 0, end: input.start() }, input, @@ -1571,6 +1651,7 @@ impl PikeVM { next_lookaround, lookaround, ); + for at in input.start()..=input.end() { let any_matches = !patset.is_empty(); if curr.set.is_empty() {