diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs
index 4f2f9af79..9f9f88004 100644
--- a/regex-automata/src/nfa/thompson/builder.rs
+++ b/regex-automata/src/nfa/thompson/builder.rs
@@ -5,7 +5,7 @@ use alloc::{sync::Arc, vec, vec::Vec};
 use crate::{
     nfa::thompson::{
         error::BuildError,
-        nfa::{self, SparseTransitions, Transition, NFA},
+        nfa::{self, LookBehindTree, SparseTransitions, Transition, NFA},
     },
     util::{
         look::{Look, LookMatcher},
@@ -340,8 +340,9 @@ pub struct Builder {
     /// contains a single regex, then `start_pattern[0]` and `start_anchored`
     /// are always equivalent.
     start_pattern: Vec<StateID>,
-    /// The starting states for each individual look-behind sub-expression.
-    start_look_behind: Vec<StateID>,
+    /// A vector of look-behinds appearing in the regex. Order reflects the
+    /// order in the regex.
+    lookbehinds: Vec<LookBehindTree>,
     /// A map from pattern ID to capture group index to name. (If no name
     /// exists, then a None entry is present. Thus, all capturing groups are
     /// present in this mapping.)
@@ -387,7 +388,7 @@ impl Builder {
         self.pattern_id = None;
         self.states.clear();
         self.start_pattern.clear();
-        self.start_look_behind.clear();
+        self.lookbehinds.clear();
         self.captures.clear();
         self.memory_states = 0;
     }
@@ -452,7 +453,7 @@ impl Builder {
         remap.resize(self.states.len(), StateID::ZERO);
 
         nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern);
-        nfa.set_look_behind_starts(self.start_look_behind.as_slice());
+        nfa.set_lookbehinds(self.lookbehinds.as_slice());
         nfa.set_captures(&self.captures).map_err(BuildError::captures)?;
         // The idea here is to convert our intermediate states to their final
         // form. The only real complexity here is the process of converting
@@ -711,9 +712,26 @@ impl Builder {
     }
 
     /// Adds the `start_id` to the set of starting states that is used when
-    /// running look-behind expressions.
-    pub fn start_look_behind(&mut self, start_id: StateID) {
-        self.start_look_behind.push(start_id);
+    /// running look-behind expressions. `offset_from_start` records the
+    /// furthest offset (in bytes) from the start of the main regex at which
+    /// this look-behind starts; `None` means the offset is unbounded.
+    ///
+    /// Look-behinds must be registered in depth-first pre-order with respect
+    /// to their nesting. `path` lists the child index to follow at each
+    /// nesting level; an index that was never registered causes a panic.
+    pub fn start_lookbehind(
+        &mut self,
+        start_id: StateID,
+        offset_from_start: Option<usize>,
+        path: &[usize],
+    ) {
+        // Descend one nesting level per entry of `path`; the vector reached
+        // at the end holds the siblings of the look-behind being registered.
+        let siblings = path
+            .iter()
+            .fold(&mut self.lookbehinds, |level, &i| level[i].children_mut());
+
+        siblings.push(LookBehindTree::new(start_id, offset_from_start));
     }
 
     /// Add an "empty" NFA state.
diff --git a/regex-automata/src/nfa/thompson/compiler.rs b/regex-automata/src/nfa/thompson/compiler.rs
index c79a0556a..979ff8d7d 100644
--- a/regex-automata/src/nfa/thompson/compiler.rs
+++ b/regex-automata/src/nfa/thompson/compiler.rs
@@ -713,6 +713,13 @@ pub struct Compiler {
     utf8_suffix: RefCell<Utf8SuffixMap>,
     /// The next index to use for a look-around expression.
     lookaround_index: RefCell<SmallIndex>,
+    /// How far (in bytes) from the beginning of the main regex the current
+    /// look-behind starts. This is updated when relativizing to the current
+    /// look-behind expression. When `None`, the distance can be seen as
+    /// infinity.
+    current_lookbehind_offset_from_start: RefCell<Option<usize>>,
+    /// The current path of look-behind nesting.
+    lookbehind_nesting_path: RefCell<Vec<usize>>,
 }
 
 impl Compiler {
@@ -726,6 +733,8 @@ impl Compiler {
             trie_state: RefCell::new(RangeTrie::new()),
             utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
             lookaround_index: RefCell::new(SmallIndex::ZERO),
+            current_lookbehind_offset_from_start: RefCell::new(Some(0)),
+            lookbehind_nesting_path: RefCell::new(vec![0]),
         }
     }
 
@@ -966,6 +975,8 @@ impl Compiler {
             .borrow_mut()
             .set_size_limit(self.config.get_nfa_size_limit())?;
         *self.lookaround_index.borrow_mut() = SmallIndex::ZERO;
+        *self.lookbehind_nesting_path.borrow_mut() = vec![0];
+        *self.current_lookbehind_offset_from_start.borrow_mut() = Some(0);
 
         // We always add an unanchored prefix unless we were specifically told
         // not to (for tests only), or if we know that the regex is anchored
@@ -1022,10 +1033,13 @@ impl Compiler {
         }
     }
 
+    /// Compile a look-around expression as its own sub-automaton. Its starting
+    /// state is saved.
     fn c_lookaround(
         &self,
         lookaround: &LookAround,
     ) -> Result<ThompsonRef, BuildError> {
+        // Assign a unique index for this look-around.
         let idx = *self.lookaround_index.borrow();
         *self.lookaround_index.borrow_mut() = SmallIndex::new(idx.one_more())
             .map_err(|e| {
@@ -1037,11 +1051,38 @@ impl Compiler {
         };
         let check = self.add_check_lookaround(idx, pos)?;
 
+        // Compute the furthest offset from the start of the main regex at
+        // which this look-around can begin. We offset the current start
+        // offset by the maximal match length of the subexpression.
+        let maximum_len = lookaround.sub().properties().maximum_len();
+        let relative_start =
+            *self.current_lookbehind_offset_from_start.borrow();
+        let start_offset = match (relative_start, maximum_len) {
+            (Some(s), Some(l)) => Some(s + l),
+            (None, _) | (_, None) => None,
+        };
+
         let unanchored =
             self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?;
-        self.builder.borrow_mut().start_look_behind(unanchored.start);
+        self.builder.borrow_mut().start_lookbehind(
+            unanchored.start,
+            start_offset,
+            self.lookbehind_nesting_path.borrow().split_last().unwrap().1,
+        );
 
+        // When compiling the subexpression we temporarily change the starting
+        // offset and restore it after. This way, the subexpression is relativized
+        // to our current offset. We also update the path to the current lookbehind
+        // expression.
+        self.lookbehind_nesting_path.borrow_mut().push(0);
+        *self.current_lookbehind_offset_from_start.borrow_mut() = start_offset;
         let sub = self.c(lookaround.sub())?;
+        let mut path = self.lookbehind_nesting_path.borrow_mut();
+        path.pop();
+        *path.last_mut().unwrap() += 1;
+        *self.current_lookbehind_offset_from_start.borrow_mut() =
+            relative_start;
+
         let write = self.add_write_lookaround(idx)?;
         self.patch(unanchored.end, sub.start)?;
         self.patch(sub.end, write)?;
diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs
index 34458766e..66eaf180e 100644
--- a/regex-automata/src/nfa/thompson/nfa.rs
+++ b/regex-automata/src/nfa/thompson/nfa.rs
@@ -1108,8 +1108,8 @@ impl NFA {
 
     /// Returns the starting states for initializing look-behind evaluation.
     #[inline]
-    pub fn look_behind_starts(&self) -> &Vec<StateID> {
-        &self.0.start_look_behind
+    pub fn lookbehinds(&self) -> &[LookBehindTree] {
+        &self.0.lookbehinds
     }
 
     // FIXME: The `look_set_prefix_all` computation was not correct, and it
@@ -1185,7 +1185,10 @@ impl NFA {
             + self.0.states.len() * size_of::<State>()
             + self.0.start_pattern.len() * size_of::<StateID>()
             + self.0.group_info.memory_usage()
-            + self.0.start_look_behind.len() * size_of::<StateID>()
+            + self.0.lookbehinds.iter()
+                .map(|b|
+                    b.try_fold(0, &|acc, _| Some(acc + 1)).unwrap()
+                ).sum::<usize>() * size_of::<LookBehindTree>()
             + self.0.memory_extra
     }
 }
@@ -1277,8 +1280,9 @@ pub(super) struct Inner {
     /// This is needed to initialize the table for storing the result of
     /// look-around evaluation.
     lookaround_count: usize,
-    /// Contains the start state for each of the look-behind subexpressions.
-    start_look_behind: Vec<StateID>,
+    /// A vector of look-behinds appearing in the regex. Order reflects the
+    /// order in the regex.
+    lookbehinds: Vec<LookBehindTree>,
     /// Heap memory used indirectly by NFA states and other things (like the
     /// various capturing group representations above). Since each state
     /// might use a different amount of heap, we need to keep track of this
@@ -1286,6 +1290,82 @@ pub(super) struct Inner {
     memory_extra: usize,
 }
 
+/// Information about a look-behind that is needed for execution. The
+/// nesting structure of look-behinds is preserved through `children`.
+#[derive(Clone, Debug)]
+pub struct LookBehindTree {
+    start_id: StateID,
+    offset_from_start: Option<usize>,
+    children: Vec<LookBehindTree>,
+}
+
+impl LookBehindTree {
+    /// Creates a look-behind node with the given start state and offset
+    /// (in bytes) from the start of the main regex. The node initially has
+    /// no children.
+    pub fn new(start_id: StateID, offset_from_start: Option<usize>) -> Self {
+        Self { start_id, offset_from_start, children: Vec::new() }
+    }
+
+    /// The id of the start state of the look-behind subexpression.
+    pub fn start_id(&self) -> StateID {
+        self.start_id
+    }
+
+    /// The offset (in bytes) from the beginning of the main regex that a
+    /// look-behind starts at. If `None`, the offset is unbounded.
+    pub fn offset_from_start(&self) -> Option<usize> {
+        self.offset_from_start
+    }
+
+    /// The look-behinds this look-behind contains. Order reflects the order
+    /// in the regex.
+    pub fn children(&self) -> &[LookBehindTree] {
+        &self.children
+    }
+
+    /// Calls `fun` on this look-behind tree and all of its descendants in
+    /// pre-order, threading an accumulator through the calls. `fun` should
+    /// return `Some` with the updated accumulator if the traversal should
+    /// continue and `None` if it should stop.
+    ///
+    /// The return value is the final accumulator, or `None` if `fun` returned
+    /// `None` at any point.
+    pub fn try_fold<A>(
+        &self,
+        acc: A,
+        fun: &impl Fn(A, &LookBehindTree) -> Option<A>,
+    ) -> Option<A> {
+        // `?` ends the traversal as soon as `fun` asks to stop.
+        let acc = fun(acc, self)?;
+        self.children
+            .iter()
+            .try_fold(acc, |acc, child| child.try_fold(acc, fun))
+    }
+
+    /// Calls `fun` on this look-behind tree and all of its descendants in
+    /// pre-order. `fun` should return `true` if the traversal should continue
+    /// and `false` if it should stop.
+    ///
+    /// Returns `true` if every node was visited and `false` if `fun` stopped
+    /// the traversal early.
+    pub fn preorder_mut(
+        &mut self,
+        fun: &impl Fn(&mut LookBehindTree) -> bool,
+    ) -> bool {
+        // Both `&&` and `all` short-circuit, so nothing is visited after the
+        // first `false` returned by `fun`.
+        fun(self)
+            && self.children.iter_mut().all(|child| child.preorder_mut(fun))
+    }
+
+    /// Mutable access to the look-behinds this look-behind contains. New
+    /// children can be pushed through the returned vector.
+    pub fn children_mut(&mut self) -> &mut Vec<LookBehindTree> {
+        &mut self.children
+    }
+}
+
 impl Inner {
     /// Runs any last finalization bits and turns this into a full NFA.
     pub(super) fn into_nfa(mut self) -> NFA {
@@ -1428,11 +1508,12 @@ impl Inner {
         self.start_pattern = start_pattern.to_vec();
     }
 
-    pub(super) fn set_look_behind_starts(
-        &mut self,
-        look_behind_starts: &[StateID],
-    ) {
-        self.start_look_behind = look_behind_starts.to_vec();
+    /// Stores a copy of `lookbehinds` as this NFA's look-behind information.
+    ///
+    /// The slice is expected to be in depth-first pre-order with respect to
+    /// the nesting of look-behinds.
+    pub(super) fn set_lookbehinds(&mut self, lookbehinds: &[LookBehindTree]) {
+        self.lookbehinds = Vec::from(lookbehinds);
     }
 
     /// Sets the UTF-8 mode of this NFA.
@@ -1488,8 +1569,12 @@ impl Inner {
         for id in self.start_pattern.iter_mut() {
             *id = old_to_new[*id];
         }
-        for id in self.start_look_behind.iter_mut() {
-            *id = old_to_new[*id];
+
+        for lbs in self.lookbehinds.iter_mut() {
+            lbs.preorder_mut(&|e| {
+                e.start_id = old_to_new[e.start_id];
+                true
+            });
         }
     }
 }
@@ -1502,7 +1587,16 @@ impl fmt::Debug for Inner {
                 '^'
             } else if sid == self.start_unanchored {
                 '>'
-            } else if self.start_look_behind.contains(&sid) {
+            } else if self.lookbehinds.iter().any(|i| {
+                i.try_fold((), &|_, e| {
+                    if e.start_id() == sid {
+                        None
+                    } else {
+                        Some(())
+                    }
+                })
+                .is_none()
+            }) {
                 '<'
             } else {
                 ' '
diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs
index b18101c53..ea18a30f4 100644
--- a/regex-automata/src/nfa/thompson/pikevm.rs
+++ b/regex-automata/src/nfa/thompson/pikevm.rs
@@ -7,6 +7,7 @@ resolving all spans of capturing groups that participate in a match.
 
 #[cfg(feature = "internal-instrument-pikevm")]
 use core::cell::RefCell;
+use core::cmp::Ordering;
 
 use alloc::{vec, vec::Vec};
 
@@ -278,7 +279,39 @@ impl Builder {
     /// given here is already built.
     pub fn build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError> {
         nfa.look_set_any().available().map_err(BuildError::word)?;
-        Ok(PikeVM { config: self.config.clone(), nfa })
+
+        // We first fill the array depth-first reverse-pre-order.
+        let mut lookbehinds = vec![];
+
+        // Depth-first traversal stack. Offsets are initially zero.
+        let mut stack =
+            nfa.lookbehinds().iter().map(|e| (e, Some(0))).collect::<Vec<_>>();
+
+        while let Some((lb, offset)) = stack.pop() {
+            // To uphold the condition on `lookbehinds`, we must make sure that more nested look-behinds
+            // have an offset greater than or equal to that of those surrounding them.
+            let offset = match (offset, lb.offset_from_start()) {
+                (Some(o1), Some(o2)) => Some(usize::max(o1, o2)),
+                // A None subsumes the entire result.
+                (None, _) | (_, None) => None,
+            };
+
+            lookbehinds.push((lb.start_id(), offset));
+
+            stack.extend(lb.children().iter().map(|e| (e, offset)));
+        }
+
+        // We need to change the reverse-pre-order into a post-order (to store nested look-behinds before
+        // those surrounding them) and then use a **stable** sort on the offsets to uphold the conditions.
+        lookbehinds.reverse();
+        lookbehinds.sort_by(|a, b| match (a.1, b.1) {
+            (None, None) => Ordering::Equal,
+            (None, _) => Ordering::Less,
+            (_, None) => Ordering::Greater,
+            (Some(a), Some(b)) => b.cmp(&a),
+        });
+
+        Ok(PikeVM { config: self.config.clone(), nfa, lookbehinds })
     }
 
     /// Apply the given `PikeVM` configuration options to this builder.
@@ -387,6 +420,19 @@ impl Builder {
 pub struct PikeVM {
     config: Config,
     nfa: NFA,
+    /// Vector of look-behind start states together with the offset (in bytes)
+    /// of its start from the beginning of the main regex. An offset that is
+    /// `None` is unbounded.
+    ///
+    /// The order of the vector **must** uphold the following conditions:
+    /// 1. Elements with a `None` offset are ordered before other ones
+    /// 2. Elements with a larger offset are ordered before other ones
+    /// 3. A nested look-behind is ordered before its outer ones
+    ///
+    /// These conditions are crucial for starting the look-behind threads
+    /// in the correct haystack position. Offsets can be conservatively made
+    /// larger to uphold the previous conditions.
+    lookbehinds: Vec<(StateID, Option<usize>)>,
 }
 
 impl PikeVM {
@@ -1273,28 +1319,7 @@ impl PikeVM {
 
         if let Some(active) = match_lookaround {
             *curr_lookaround = active.clone();
-        } else if self.lookaround_count() > 0 {
-            // This initializes the look-behind threads from the start of the input
-            // Note: since capture groups are not allowed inside look-behinds,
-            // there won't be any Capture epsilon transitions and hence it is ok to
-            // use &mut [] for the slots parameter. We need to add the start states
-            // in reverse because more deeply nested look-behinds have a higher index
-            // but must be executed first, so that the result is available for the
-            // outer expression.
-            for look_behind_start in self.nfa.look_behind_starts().iter().rev()
-            {
-                self.epsilon_closure(
-                    stack,
-                    &mut [],
-                    curr_lookaround,
-                    lookaround,
-                    input,
-                    0,
-                    *look_behind_start,
-                );
-            }
-            // This is necessary for look-behinds to be able to match outside of the
-            // input span.
+        } else {
             self.fast_forward_lookbehinds(
                 Span { start: 0, end: input.start() },
                 input,
@@ -1345,18 +1370,16 @@ impl PikeVM {
                     match pre.find(input.haystack(), span) {
                         None => break,
                         Some(ref span) => {
-                            if self.lookaround_count() > 0 {
-                                // We are jumping ahead due to the pre-filter, thus we must bring
-                                // the look-behind threads to the new position.
-                                self.fast_forward_lookbehinds(
-                                    Span { start: at, end: span.start },
-                                    input,
-                                    stack,
-                                    curr_lookaround,
-                                    next_lookaround,
-                                    lookaround,
-                                );
-                            }
+                            // We are jumping ahead due to the pre-filter, thus we must bring
+                            // the look-behind threads to the new position.
+                            self.fast_forward_lookbehinds(
+                                Span { start: at, end: span.start },
+                                input,
+                                stack,
+                                curr_lookaround,
+                                next_lookaround,
+                                lookaround,
+                            );
                             at = span.start
                         }
                     }
@@ -1477,21 +1500,89 @@ impl PikeVM {
         next_lookaround: &mut ActiveStates,
         lookaround: &mut Vec<Option<NonMaxUsize>>,
     ) {
-        for lb_at in forward_span.start..forward_span.end {
-            self.nexts(
-                stack,
-                curr_lookaround,
-                next_lookaround,
-                lookaround,
-                input,
-                lb_at,
-                // Since capture groups are not allowed inside look-arounds,
-                // there won't be any Capture epsilon transitions and hence it is ok to
-                // use &mut [] for the slots parameter.
-                &mut [],
+        // Note: since capture groups are not allowed inside look-behinds,
+        // there won't be any Capture epsilon transitions and hence it is ok to
+        // use &mut [] for the slots parameter.
+
+        // We check the furthest offset from forward_span.end that we must start at.
+        // This greatest offset is stored with the first element of
+        // `self.lookbehinds` due to the requirements on that vector's order. If
+        // that offset extends to or before forward_span.start and that start is
+        // not the beginning of the input, we cannot use the optimization and fall
+        // back to fast-forwarding all look-behind threads together through the
+        // entire span. Otherwise, we clear the state of look-behinds and start
+        // them one by one, each joining when its offset from forward_span.end is
+        // reached. Inner look-behind threads are started before their outer
+        // look-behinds' threads due to the requirements on `self.lookbehinds`.
+
+        if !self.lookbehinds.is_empty() {
+            let total_distance = forward_span.end - forward_span.start;
+
+            let start_offset = usize::min(
+                total_distance,
+                self.lookbehinds[0].1.unwrap_or(total_distance),
             );
-            core::mem::swap(curr_lookaround, next_lookaround);
-            next_lookaround.set.clear();
+
+            if forward_span.start == 0 || start_offset < total_distance {
+                curr_lookaround.set.clear();
+
+                let mut current_lookbehind = 0;
+
+                for offset in (0..=start_offset).rev() {
+                    let position = forward_span.end - offset;
+
+                    while let Some(start_id) = self
+                        .lookbehinds
+                        .get(current_lookbehind)
+                        .and_then(|e| {
+                            if e.1.unwrap_or(total_distance) >= offset {
+                                Some(e.0)
+                            } else {
+                                None
+                            }
+                        })
+                    {
+                        self.epsilon_closure(
+                            stack,
+                            &mut [],
+                            curr_lookaround,
+                            lookaround,
+                            input,
+                            position,
+                            start_id,
+                        );
+                        current_lookbehind += 1;
+                    }
+                    // We skip calling nexts when we are at forward_span.end.
+                    if offset != 0 {
+                        self.nexts(
+                            stack,
+                            curr_lookaround,
+                            next_lookaround,
+                            lookaround,
+                            input,
+                            position,
+                            &mut [],
+                        );
+                        core::mem::swap(curr_lookaround, next_lookaround);
+                        next_lookaround.set.clear();
+                    }
+                }
+            } else {
+                for position in forward_span.start..forward_span.end {
+                    self.nexts(
+                        stack,
+                        curr_lookaround,
+                        next_lookaround,
+                        lookaround,
+                        input,
+                        position,
+                        &mut [],
+                    );
+                    core::mem::swap(curr_lookaround, next_lookaround);
+                    next_lookaround.set.clear();
+                }
+            }
         }
     }
 
@@ -1552,17 +1643,6 @@ impl PikeVM {
             match_lookaround: _,
         } = cache;
 
-        for look_behind_start in self.nfa.look_behind_starts().iter().rev() {
-            self.epsilon_closure(
-                stack,
-                &mut [],
-                curr_lookaround,
-                lookaround,
-                input,
-                0,
-                *look_behind_start,
-            );
-        }
         self.fast_forward_lookbehinds(
             Span { start: 0, end: input.start() },
             input,
@@ -1571,6 +1651,7 @@ impl PikeVM {
             next_lookaround,
             lookaround,
         );
+
         for at in input.start()..=input.end() {
             let any_matches = !patset.is_empty();
             if curr.set.is_empty() {