Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,14 @@ jobs:
os: ubuntu-latest
rust: stable
target: i686-unknown-linux-gnu
- build: stable-mips
- build: stable-powerpc64
os: ubuntu-latest
rust: stable
target: mips64-unknown-linux-gnuabi64
target: powerpc64-unknown-linux-gnu
- build: stable-s390x
os: ubuntu-latest
rust: stable
target: s390x-unknown-linux-gnu
- build: beta
os: ubuntu-latest
rust: beta
Expand Down
13 changes: 11 additions & 2 deletions regex-automata/src/util/prefilter/aho_corasick.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,20 @@ impl AhoCorasick {
}
#[cfg(feature = "perf-literal-multisubstring")]
{
// We used to use `aho_corasick::MatchKind::Standard` here when
// `kind` was `MatchKind::All`, but this is not correct. The
// "standard" Aho-Corasick match semantics are to report a match
// immediately as soon as it is seen, but `All` isn't like that.
// In particular, with "standard" semantics, given the needles
// "abc" and "b" and the haystack "abc," it would report a match
// at offset 1 before a match at offset 0. This is never what we
// want in the context of the regex engine, regardless of whether
// we have leftmost-first or 'all' semantics. Namely, we always
// want the leftmost match.
let ac_match_kind = match kind {
MatchKind::LeftmostFirst => {
MatchKind::LeftmostFirst | MatchKind::All => {
aho_corasick::MatchKind::LeftmostFirst
}
MatchKind::All => aho_corasick::MatchKind::Standard,
};
// This is kind of just an arbitrary number, but basically, if we
// have a small enough set of literals, then we try to use the VERY
Expand Down
9 changes: 0 additions & 9 deletions regex-automata/src/util/prefilter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -195,15 +195,6 @@ impl Prefilter {
/// Some(Span::from(6..9)),
/// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
/// );
/// // Now we put 'samwise' back before 'sam', but change the match
/// // semantics to 'All'. In this case, there is no preference
/// // order semantics and the first match detected is returned.
/// let pre = Prefilter::new(MatchKind::All, &["samwise", "sam"])
/// .expect("a prefilter");
/// assert_eq!(
/// Some(Span::from(6..9)),
/// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
Expand Down
9 changes: 7 additions & 2 deletions regex-automata/src/util/prefilter/teddy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,17 @@ impl Teddy {
// theory we could at least support leftmost-longest, as the
// aho-corasick crate does, but regex-automata doesn't know about
// leftmost-longest currently.
//
// And like the aho-corasick prefilter, if we're using `All`
// semantics, then we can still use leftmost semantics for a
// prefilter. (This might be a suspicious choice for the literal
// engine, which uses a prefilter as a regex engine directly, but
// that only happens when using leftmost-first semantics.)
let (packed_match_kind, ac_match_kind) = match kind {
MatchKind::LeftmostFirst => (
MatchKind::LeftmostFirst | MatchKind::All => (
aho_corasick::packed::MatchKind::LeftmostFirst,
aho_corasick::MatchKind::LeftmostFirst,
),
_ => return None,
};
let minimum_len =
needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0);
Expand Down
26 changes: 26 additions & 0 deletions testdata/regression.toml
Original file line number Diff line number Diff line change
Expand Up @@ -756,3 +756,29 @@ name = "reverse-inner-short"
regex = '(?:([0-9][0-9][0-9]):)?([0-9][0-9]):([0-9][0-9])'
haystack = '102:12:39'
matches = [[[0, 9], [0, 3], [4, 6], [7, 9]]]

# This regression test was found via the RegexSet APIs. It triggered a
# particular code path where a regex was compiled with 'All' match semantics
# (to support overlapping search), but got funneled down into a standard
# leftmost search when calling 'is_match'. This is fine on its own, but the
# leftmost search will use a prefilter and that's where this went awry.
#
# Namely, since 'All' semantics were used, the aho-corasick prefilter was
# incorrectly compiled with 'Standard' semantics. This was wrong because
# 'Standard' immediately attempts to report a match at every position, even if
# that would mean reporting a match past the leftmost match before reporting
# the leftmost match. This breaks the prefilter contract of never having false
# negatives and leads overall to the engine not finding a match.
#
# See: https://github.com/rust-lang/regex/issues/1070
[[test]]
name = "prefilter-with-aho-corasick-standard-semantics"
regex = '(?m)^ *v [0-9]'
haystack = 'v 0'
matches = [
{ id = 0, spans = [[0, 3]] },
]
match-kind = "all"
search-kind = "overlapping"
unicode = true
utf8 = true