Skip to content

Optimize the start position of bounded look-behinds #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: captureless-lookbehinds
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 26 additions & 8 deletions regex-automata/src/nfa/thompson/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use alloc::{sync::Arc, vec, vec::Vec};
use crate::{
nfa::thompson::{
error::BuildError,
nfa::{self, SparseTransitions, Transition, NFA},
nfa::{self, LookBehindTree, SparseTransitions, Transition, NFA},
},
util::{
look::{Look, LookMatcher},
Expand Down Expand Up @@ -340,8 +340,9 @@ pub struct Builder {
/// contains a single regex, then `start_pattern[0]` and `start_anchored`
/// are always equivalent.
start_pattern: Vec<StateID>,
/// The starting states for each individual look-behind sub-expression.
start_look_behind: Vec<StateID>,
/// A vector of look-behinds appearing in the regex. Order reflects the
/// order in the regex.
lookbehinds: Vec<LookBehindTree>,
/// A map from pattern ID to capture group index to name. (If no name
/// exists, then a None entry is present. Thus, all capturing groups are
/// present in this mapping.)
Expand Down Expand Up @@ -387,7 +388,7 @@ impl Builder {
self.pattern_id = None;
self.states.clear();
self.start_pattern.clear();
self.start_look_behind.clear();
self.lookbehinds.clear();
self.captures.clear();
self.memory_states = 0;
}
Expand Down Expand Up @@ -452,7 +453,7 @@ impl Builder {
remap.resize(self.states.len(), StateID::ZERO);

nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern);
nfa.set_look_behind_starts(self.start_look_behind.as_slice());
nfa.set_lookbehinds(self.lookbehinds.as_slice());
nfa.set_captures(&self.captures).map_err(BuildError::captures)?;
// The idea here is to convert our intermediate states to their final
// form. The only real complexity here is the process of converting
Expand Down Expand Up @@ -711,9 +712,26 @@ impl Builder {
}

/// Adds the `start_id` to the set of starting states that is used when
/// running look-behind expressions.
pub fn start_look_behind(&mut self, start_id: StateID) {
self.start_look_behind.push(start_id);
/// running look-behind expressions. Additionally registers the furthest
/// offset (in bytes) from the start of the main regex at which this
/// look-behind starts.
///
/// Look-behinds must be started in a depth-first pre-order fashion with
/// regards to the nesting of look-behinds. The nesting path is stored
/// as indices in `path`.
///
/// # Panics
///
/// Panics if any index in `path` is out of bounds for the list of
/// look-behinds at its nesting level.
pub fn start_lookbehind(
    &mut self,
    start_id: StateID,
    offset_from_start: Option<usize>,
    path: &[usize],
) {
    // Walk down the tree one nesting level per index in `path`; an empty
    // path leaves us at the top-level list of look-behinds.
    let siblings = path
        .iter()
        .fold(&mut self.lookbehinds, |level, &index| {
            level[index].children_mut()
        });

    // The new look-behind becomes the last entry at that nesting level.
    siblings.push(LookBehindTree::new(start_id, offset_from_start));
}

/// Add an "empty" NFA state.
Expand Down
43 changes: 42 additions & 1 deletion regex-automata/src/nfa/thompson/compiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -713,6 +713,13 @@ pub struct Compiler {
utf8_suffix: RefCell<Utf8SuffixMap>,
/// The next index to use for a look-around expression.
lookaround_index: RefCell<SmallIndex>,
/// How far (in bytes) from the beginning of the main regex the current
/// look-behind starts. This is updated when relativizing to the current
/// look-behind expression. When `None`, the distance can be seen as
/// infinity.
current_lookbehind_offset_from_start: RefCell<Option<usize>>,
/// The current path of look-behind nesting.
lookbehind_nesting_path: RefCell<Vec<usize>>,
}

impl Compiler {
Expand All @@ -726,6 +733,8 @@ impl Compiler {
trie_state: RefCell::new(RangeTrie::new()),
utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
lookaround_index: RefCell::new(SmallIndex::ZERO),
current_lookbehind_offset_from_start: RefCell::new(Some(0)),
lookbehind_nesting_path: RefCell::new(vec![0]),
}
}

Expand Down Expand Up @@ -966,6 +975,8 @@ impl Compiler {
.borrow_mut()
.set_size_limit(self.config.get_nfa_size_limit())?;
*self.lookaround_index.borrow_mut() = SmallIndex::ZERO;
*self.lookbehind_nesting_path.borrow_mut() = vec![0];
*self.current_lookbehind_offset_from_start.borrow_mut() = Some(0);

// We always add an unanchored prefix unless we were specifically told
// not to (for tests only), or if we know that the regex is anchored
Expand Down Expand Up @@ -1022,10 +1033,13 @@ impl Compiler {
}
}

/// Compile a look-around expression as its own sub-automaton. Its starting
/// state is saved.
fn c_lookaround(
&self,
lookaround: &LookAround,
) -> Result<ThompsonRef, BuildError> {
// Assign a unique index for this look-around.
let idx = *self.lookaround_index.borrow();
*self.lookaround_index.borrow_mut() = SmallIndex::new(idx.one_more())
.map_err(|e| {
Expand All @@ -1037,11 +1051,38 @@ impl Compiler {
};
let check = self.add_check_lookaround(idx, pos)?;

// Compute the furthest offset from the start of the main regex
// at which this look-around can begin. We offset the current start
// offset by the maximal match length of the subexpression.
let maximum_len = lookaround.sub().properties().maximum_len();
let relative_start =
*self.current_lookbehind_offset_from_start.borrow();
let start_offset = match (relative_start, maximum_len) {
(Some(s), Some(l)) => Some(s + l),
(None, _) | (_, None) => None,
};

let unanchored =
self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?;
self.builder.borrow_mut().start_look_behind(unanchored.start);
self.builder.borrow_mut().start_lookbehind(
unanchored.start,
start_offset,
self.lookbehind_nesting_path.borrow().split_last().unwrap().1,
);

// When compiling the subexpression we temporarily change the starting
// offset and restore it after. This way, the subexpression is relativized
// to our current offset. We also update the path to the current lookbehind
// expression.
self.lookbehind_nesting_path.borrow_mut().push(0);
*self.current_lookbehind_offset_from_start.borrow_mut() = start_offset;
let sub = self.c(lookaround.sub())?;
let mut path = self.lookbehind_nesting_path.borrow_mut();
path.pop();
*path.last_mut().unwrap() += 1;
*self.current_lookbehind_offset_from_start.borrow_mut() =
relative_start;

let write = self.add_write_lookaround(idx)?;
self.patch(unanchored.end, sub.start)?;
self.patch(sub.end, write)?;
Expand Down
120 changes: 107 additions & 13 deletions regex-automata/src/nfa/thompson/nfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1108,8 +1108,8 @@ impl NFA {

/// Returns the starting states for initializing look-behind evaluation.
#[inline]
pub fn look_behind_starts(&self) -> &Vec<StateID> {
&self.0.start_look_behind
pub fn lookbehinds(&self) -> &[LookBehindTree] {
&self.0.lookbehinds
}

// FIXME: The `look_set_prefix_all` computation was not correct, and it
Expand Down Expand Up @@ -1185,7 +1185,10 @@ impl NFA {
+ self.0.states.len() * size_of::<State>()
+ self.0.start_pattern.len() * size_of::<StateID>()
+ self.0.group_info.memory_usage()
+ self.0.start_look_behind.len() * size_of::<StateID>()
+ self.0.lookbehinds.iter()
.map(|b|
b.try_fold(0, &|acc, _| Some(acc + 1)).unwrap()
).sum::<usize>() * size_of::<LookBehindTree>()
+ self.0.memory_extra
}
}
Expand Down Expand Up @@ -1277,15 +1280,92 @@ pub(super) struct Inner {
/// This is needed to initialize the table for storing the result of
/// look-around evaluation.
lookaround_count: usize,
/// Contains the start state for each of the look-behind subexpressions.
start_look_behind: Vec<StateID>,
/// A vector of look-behinds appearing in the regex. Order reflects the
/// order in the regex.
lookbehinds: Vec<LookBehindTree>,
/// Heap memory used indirectly by NFA states and other things (like the
/// various capturing group representations above). Since each state
/// might use a different amount of heap, we need to keep track of this
/// incrementally.
memory_extra: usize,
}

/// Information about a look-behind that is needed for execution. The tree
/// shape preserves the nesting structure of look-behinds.
#[derive(Clone, Debug)]
pub struct LookBehindTree {
    /// The id of the start state of the look-behind subexpression.
    start_id: StateID,
    /// The offset (in bytes) from the beginning of the main regex that this
    /// look-behind starts at, or `None` if the offset is unbounded.
    offset_from_start: Option<usize>,
    /// The look-behinds this look-behind contains, in the order they appear
    /// in the regex.
    children: Vec<LookBehindTree>,
}

impl LookBehindTree {
    /// Creates a look-behind node with the given start state and offset and
    /// without any nested look-behinds.
    pub fn new(start_id: StateID, offset_from_start: Option<usize>) -> Self {
        Self { children: Vec::new(), start_id, offset_from_start }
    }

    /// The id of the start state of the look-behind subexpression.
    pub fn start_id(&self) -> StateID {
        self.start_id
    }

    /// The offset (in bytes) from the beginning of the main regex that a
    /// look-behind starts at. If `None`, the offset is unbounded.
    pub fn offset_from_start(&self) -> Option<usize> {
        self.offset_from_start
    }

    /// The look-behinds this look-behind contains. Order reflects the order
    /// in the regex.
    pub fn children(&self) -> &[LookBehindTree] {
        self.children.as_slice()
    }

    /// Folds `fun` over this look-behind and all of its descendants in
    /// pre-order, threading the accumulator through every call.
    ///
    /// `fun` returns `Some` with the updated accumulator to continue and
    /// `None` to stop. The result is the final accumulator, or `None` if
    /// `fun` returned `None` for any node (in which case no further nodes
    /// are visited).
    pub fn try_fold<A>(
        &self,
        acc: A,
        fun: &impl Fn(A, &LookBehindTree) -> Option<A>,
    ) -> Option<A> {
        // Visit this node first, then each subtree from left to right; `?`
        // aborts the whole traversal as soon as `fun` yields `None`.
        let mut acc = fun(acc, self)?;
        for child in &self.children {
            acc = child.try_fold(acc, fun)?;
        }
        Some(acc)
    }

    /// Calls `fun` on this look-behind and all of its descendants in
    /// pre-order, giving it mutable access to each node.
    ///
    /// `fun` returns `true` to continue and `false` to stop. The result is
    /// `true` if every node was visited and `false` if `fun` stopped the
    /// traversal early.
    pub fn preorder_mut(
        &mut self,
        fun: &impl Fn(&mut LookBehindTree) -> bool,
    ) -> bool {
        // Both `&&` and `all` short-circuit, so nothing after the first
        // `false` is visited.
        fun(self)
            && self.children.iter_mut().all(|child| child.preorder_mut(fun))
    }

    /// Mutable access to the list of look-behinds nested directly inside
    /// this one.
    pub fn children_mut(&mut self) -> &mut Vec<LookBehindTree> {
        &mut self.children
    }
}

impl Inner {
/// Runs any last finalization bits and turns this into a full NFA.
pub(super) fn into_nfa(mut self) -> NFA {
Expand Down Expand Up @@ -1428,11 +1508,12 @@ impl Inner {
self.start_pattern = start_pattern.to_vec();
}

pub(super) fn set_look_behind_starts(
&mut self,
look_behind_starts: &[StateID],
) {
self.start_look_behind = look_behind_starts.to_vec();
/// Replaces the look-behind information stored in this NFA with a copy of
/// `lookbehinds`.
///
/// The slice must be in a depth-first pre-order with regards to the
/// nesting of look-behinds.
pub(super) fn set_lookbehinds(&mut self, lookbehinds: &[LookBehindTree]) {
    // Clone the caller's trees into an owned vector.
    let owned: Vec<LookBehindTree> = Vec::from(lookbehinds);
    self.lookbehinds = owned;
}

/// Sets the UTF-8 mode of this NFA.
Expand Down Expand Up @@ -1488,8 +1569,12 @@ impl Inner {
for id in self.start_pattern.iter_mut() {
*id = old_to_new[*id];
}
for id in self.start_look_behind.iter_mut() {
*id = old_to_new[*id];

for lbs in self.lookbehinds.iter_mut() {
lbs.preorder_mut(&|e| {
e.start_id = old_to_new[e.start_id];
true
});
}
}
}
Expand All @@ -1502,7 +1587,16 @@ impl fmt::Debug for Inner {
'^'
} else if sid == self.start_unanchored {
'>'
} else if self.start_look_behind.contains(&sid) {
} else if self.lookbehinds.iter().any(|i| {
i.try_fold((), &|_, e| {
if e.start_id() == sid {
None
} else {
Some(())
}
})
.is_none()
}) {
'<'
} else {
' '
Expand Down
Loading