Skip to content

Commit 103f20e

Browse files
author
Ethan Pailes
committed
Fix documentation and style issues.
This patch just has a bunch of style and doc fixes that I noticed when going over the diff on github.
1 parent e173c42 commit 103f20e

File tree

3 files changed

+78
-35
lines changed

3 files changed

+78
-35
lines changed

src/analysis.rs

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ impl IsOnePassVisitor {
7777
self.0 = self.0 && !fsets_clash(&es.iter().collect::<Vec<_>>());
7878
}
7979

80-
// Unicode classes are really big alternatives from the byte
80+
// Unicode classes are really just big alternatives from the byte
8181
// oriented point of view.
8282
//
8383
// This function translates a unicode class into the
@@ -100,7 +100,7 @@ impl IsOnePassVisitor {
100100
}
101101
}
102102
}
103-
_ => {} // FALLTHROUGH
103+
_ => {}
104104
}
105105
}
106106

@@ -129,14 +129,14 @@ fn fsets_clash(es: &[&Hir]) -> bool {
129129

130130
/// Compute the first set of a given regular expression.
131131
///
132-
/// The first set of a regular expression is the set of all characters
132+
/// The first set of a regular expression is the set of all bytes
133133
/// which might begin it. This is a less general version of the
134134
/// notion of a regular expression preview (the first set can be
135135
/// thought of as the 1-preview of a regular expression).
136136
///
137137
/// Note that first sets are byte-oriented because the DFA is
138138
/// byte oriented. This means an expression like /Δ|δ/ is actually not
139-
/// one-pass, even though there is clearly no non-determinism inherent
139+
/// onepass, even though there is clearly no non-determinism inherent
140140
/// to the regex at a unicode code point level (big delta and little
141141
/// delta start with the same byte).
142142
fn fset_of(expr: &Hir) -> FirstSet {
@@ -204,7 +204,8 @@ fn fset_of(expr: &Hir) -> FirstSet {
204204

205205
// The most involved case. We need to strip leading empty-looks
206206
// as well as take the union of the first sets of the first n+1
207-
// expressions where n is the number of leading repetitions.
207+
// expressions where n is the number of leading expressions which
208+
// accept the empty string.
208209
&HirKind::Concat(ref es) => {
209210
let mut fset = FirstSet::empty();
210211
for (i, e) in es.iter().enumerate() {
@@ -223,9 +224,9 @@ fn fset_of(expr: &Hir) -> FirstSet {
223224
if !inner_fset.accepts_empty() {
224225
// We can stop accumulating after we stop seeing
225226
// first sets which contain epsilon.
226-
// Also, a contatination which terminated by
227+
// Also, a concatenation which is terminated by
227228
// one or more expressions which do not accept
228-
// epsilon itself does not acceept epsilon.
229+
// epsilon itself does not accept epsilon.
229230
fset.accepts_empty = false;
230231
break;
231232
}
@@ -246,8 +247,8 @@ fn fset_of(expr: &Hir) -> FirstSet {
246247

247248
/// The first byte of a unicode code point.
248249
///
249-
/// We only ever care about the first byte of a particular character,
250-
/// because the onepass DFA is implemented in the byte space, not the
250+
/// We only ever care about the first byte of a particular character
251+
/// because the onepass DFA is implemented in the byte space not the
251252
/// character space. This means, for example, that a branch between
252253
/// lowercase delta and uppercase delta is actually non-deterministic.
253254
fn first_byte(c: char) -> u8 {

src/exec.rs

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -449,8 +449,6 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
449449
let mut slots = vec![None; self.slots_len()];
450450
if op.exec(&mut slots, text, start) {
451451
slots[0]
452-
// TODO(ethan): why did I chain?
453-
// slots[1].and_then(|_| slots[0])
454452
} else {
455453
None
456454
}
@@ -513,13 +511,6 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
513511
Some(ref op) => {
514512
let mut slots = vec![None; self.slots_len()];
515513
op.exec(&mut slots, text, start)
516-
/* TODO: why did I check capture slots??
517-
if op.exec(&mut slots, text, start) {
518-
slots[0].is_some() && slots[1].is_some()
519-
} else {
520-
false
521-
}
522-
*/
523514
}
524515
None => unreachable!(),
525516
}
@@ -1340,9 +1331,7 @@ enum MatchType {
13401331
/// can be decomposed into unambiguous literal search.
13411332
Literal(MatchLiteralType),
13421333
/// A onepass DFA search. If a onepass search is impossible, we just
1343-
/// fall back to an `Nfa(MatchNfaType::Auto)` search. The execution
1344-
/// planner will never select the `OnePassDfa` match type when
1345-
/// there will be a fallback.
1334+
/// fall back to an automatically chosen search.
13461335
OnePassDfa(Box<Option<MatchType>>),
13471336
/// A normal DFA search.
13481337
Dfa,

src/onepass.rs

Lines changed: 67 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -286,11 +286,15 @@ impl OnePass {
286286
// No need to mask because no flags are set.
287287
state_ptr = self.follow(state_ptr as usize, byte_class);
288288
} else {
289+
// STATE_HALT and STATE_DEAD must always be checked
290+
// first because they have STATE_ACTION and STATE_MATCH
291+
// set, even though those flags don't apply. It would
292+
// probably be better for performance to check them last,
293+
// so it may be worthwhile to try to rejigger the
294+
// representation of StatePtrs.
289295
if state_ptr == STATE_HALT {
290296
break;
291-
}
292-
293-
if state_ptr == STATE_DEAD {
297+
} else if state_ptr == STATE_DEAD {
294298
trace!("::exec_ drain-dead");
295299
slots[FULL_MATCH_CAPTURE_END] = last_match;
296300
return last_match.is_some();
@@ -474,7 +478,7 @@ pub struct OnePassCompiler {
474478
/// if they should have the STATE_MATCH flag set.
475479
accepting_states: Vec<bool>,
476480

477-
/// A DAG of forwarding relationship indicating when
481+
/// A DAG of forwarding relationships indicating when
478482
/// a state needs to be forwarded to an Action state
479483
/// once that Action state has been fully constructed.
480484
forwards: Forwards,
@@ -595,6 +599,13 @@ impl OnePassCompiler {
595599
Ok(self.onepass)
596600
}
597601

602+
/// Compile the stage 1 transition table for the state corresponding
603+
/// to the given instruction.
604+
///
605+
/// The result of `inst_trans` will end up in `self.transitions`.
606+
///
607+
/// Returns a list of child instructions which must be compiled
608+
/// via `inst_trans`.
598609
fn inst_trans(
599610
&mut self,
600611
inst_idx: usize
@@ -671,6 +682,45 @@ impl OnePassCompiler {
671682
Ok(children)
672683
}
673684

685+
/// Topologically sort the forwarding jobs so that we
686+
/// start with jobs that have no dependencies and then
687+
/// shuffle the transitions over. Mutate `self.transitions`
688+
/// in place.
689+
///
690+
/// To make that a little more concrete, consider the program snippet:
691+
///
692+
/// 0000: Bytes(a, a)
693+
/// 0001: Save(2)
694+
/// 0002: Bytes(b, b)
695+
///
696+
/// Here the state for `Bytes(a, a)` needs to transition to
697+
/// the state for `Save(2)`, but it does not know when to do
698+
/// so. The right answer is that it should transition to
699+
/// the `Save(2)` state when it sees a `b`, but it is hard
700+
/// to know what children `Save(2)` has from where `Bytes(a, a)`
701+
/// stands. To handle this we just emit a forwarding job
702+
/// that says "when you know enough about the `Save(2)` state,
703+
/// please forward `Bytes(a, a)` to `Save(2)`.". We need to use
704+
/// a full DAG for this because there could be multiple forwarding
705+
/// states in a row:
706+
///
707+
/// 0000: Bytes(a, a)
708+
/// 0001: Save(2)
709+
/// 0002: Save(3)
710+
/// 0003: Bytes(b, b)
711+
///
712+
/// Here we will end up with two forwarding jobs:
713+
///
714+
/// 1. Forward from `Bytes(a, a)` to `Save(2)`.
715+
/// 2. Forward from `Save(2)` to `Save(3)`.
716+
///
717+
/// Which we structure as a dag that looks like:
718+
///
719+
/// (2) --> (1)
720+
///
721+
/// The arrow flows in a funny direction because we want the jobs
722+
/// with no dependencies to live at the roots of the DAG so that
723+
/// we can process them first.
674724
fn solve_forwards(&mut self) -> Result<(), OnePassError> {
675725
// TODO(ethan):yakshaving drop the clone
676726
for fwd in self.forwards.clone().into_iter_topo() {
@@ -718,9 +768,9 @@ impl OnePassCompiler {
718768
}
719769

720770
// Finally, if a match instruction is reachable through
721-
// a save fwd, the from state is accepting.
722-
match (&self.prog[fwd.to], &self.prog[fwd.from]) {
723-
(&Inst::Save(_), _) => {
771+
// a save fwd (which can never fail), the from state is accepting.
772+
match &self.prog[fwd.to] {
773+
&Inst::Save(_) => {
724774
self.accepting_states[fwd.from] =
725775
self.accepting_states[fwd.to];
726776
}
@@ -731,11 +781,12 @@ impl OnePassCompiler {
731781
Ok(())
732782
}
733783

734-
// Once all the per-instruction transition tables have been worked
735-
// out, we can bake them into the single flat transition table we
736-
// are going to use for the actual DFA.
784+
/// Once all the per-instruction transition tables have been worked
785+
/// out, we can bake them into the single flat transition table we
786+
/// are going to use for the actual DFA. This function creates the
787+
/// baked form, storing it in `self.onepass.table`.
737788
fn emit_transitions(&mut self) {
738-
// pre-compute the state indicies
789+
// pre-compute the state indices
739790
let mut state_starts = Vec::with_capacity(self.prog.len());
740791
let mut off = 0;
741792
for inst_idx in 0..self.prog.len() {
@@ -760,10 +811,12 @@ impl OnePassCompiler {
760811
p
761812
};
762813

814+
self.onepass.table.reserve(
815+
state_starts[state_starts.len() - 1]
816+
+ self.onepass.num_byte_classes);
763817
for inst_idx in 0..self.prog.len() {
764818
let mut trans = Vec::with_capacity(
765-
state_starts[state_starts.len() - 1]
766-
+ self.onepass.num_byte_classes);
819+
self.onepass.num_byte_classes * 2);
767820

768821
match &self.transitions[inst_idx] {
769822
&None => continue,
@@ -1102,7 +1155,7 @@ const STATE_DEAD: StatePtr = STATE_MATCH + 1;
11021155
/// from DEAD only in that an accepting state that transitions to HALT
11031156
/// still accepts, while an accepting state which transitions to DEAD
11041157
/// does not.
1105-
const STATE_HALT: StatePtr = STATE_MATCH + 2;
1158+
const STATE_HALT: StatePtr = STATE_ACTION + 1;
11061159

11071160
/// The maximum state pointer. This is useful to mask out the "valid" state
11081161
/// pointer from a state with the "start" or "match" bits set.

0 commit comments

Comments
 (0)