1515use mem;
1616use char;
1717use clone:: Clone ;
18+ use cmp;
1819use cmp:: { Eq , TotalEq } ;
1920use container:: Container ;
2021use default:: Default ;
2122use iter:: { Filter , Map , Iterator } ;
2223use iter:: { Rev , DoubleEndedIterator , ExactSize } ;
24+ use iter:: range;
2325use num:: Saturating ;
2426use option:: { None , Option , Some } ;
2527use raw:: Repr ;
2628use slice:: { ImmutableVector , Vector } ;
2729use slice;
30+ use uint;
2831
2932/*
3033Section: Creating a string
@@ -316,13 +319,207 @@ impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> {
316319 }
317320}
318321
322+ /// The internal state of an iterator that searches for matches of a substring
323+ /// within a larger string using naive search
324+ #[ deriving( Clone ) ]
325+ struct NaiveSearcher {
326+ position : uint
327+ }
328+
329+ impl NaiveSearcher {
330+ fn new ( ) -> NaiveSearcher {
331+ NaiveSearcher { position : 0 }
332+ }
333+
334+ fn next ( & mut self , haystack : & [ u8 ] , needle : & [ u8 ] ) -> Option < ( uint , uint ) > {
335+ while self . position + needle. len ( ) <= haystack. len ( ) {
336+ if haystack. slice ( self . position , self . position + needle. len ( ) ) == needle {
337+ let matchPos = self . position ;
338+ self . position += needle. len ( ) ; // add 1 for all matches
339+ return Some ( ( matchPos, matchPos + needle. len ( ) ) ) ;
340+ } else {
341+ self . position += 1 ;
342+ }
343+ }
344+ None
345+ }
346+ }
347+
348+ /// The internal state of an iterator that searches for matches of a substring
349+ /// within a larger string using two-way search
350+ #[ deriving( Clone ) ]
351+ struct TwoWaySearcher {
352+ // constants
353+ critPos : uint ,
354+ period : uint ,
355+ byteset : u64 ,
356+
357+ // variables
358+ position : uint ,
359+ memory : uint
360+ }
361+
362+ impl TwoWaySearcher {
363+ fn new ( needle : & [ u8 ] ) -> TwoWaySearcher {
364+ let ( critPos1, period1) = TwoWaySearcher :: maximal_suffix ( needle, false ) ;
365+ let ( critPos2, period2) = TwoWaySearcher :: maximal_suffix ( needle, true ) ;
366+
367+ let critPos;
368+ let period;
369+ if critPos1 > critPos2 {
370+ critPos = critPos1;
371+ period = period1;
372+ } else {
373+ critPos = critPos2;
374+ period = period2;
375+ }
376+
377+ let byteset = needle. iter ( ) . fold ( 0 , |a, & b| ( 1 << ( b & 0x3f ) ) | a) ;
378+
379+ if needle. slice_to ( critPos) == needle. slice_from ( needle. len ( ) - critPos) {
380+ TwoWaySearcher {
381+ critPos : critPos,
382+ period : period,
383+ byteset : byteset,
384+
385+ position : 0 ,
386+ memory : 0
387+ }
388+ } else {
389+ TwoWaySearcher {
390+ critPos : critPos,
391+ period : cmp:: max ( critPos, needle. len ( ) - critPos) + 1 ,
392+ byteset : byteset,
393+
394+ position : 0 ,
395+ memory : uint:: MAX // Dummy value to signify that the period is long
396+ }
397+ }
398+ }
399+
400+ #[ inline]
401+ fn next ( & mut self , haystack : & [ u8 ] , needle : & [ u8 ] , longPeriod : bool ) -> Option < ( uint , uint ) > {
402+ ' search: loop {
403+ // Check that we have room to search in
404+ if self . position + needle. len ( ) > haystack. len ( ) {
405+ return None ;
406+ }
407+
408+ // Quickly skip by large portions unrelated to our substring
409+ if ( self . byteset >> ( haystack[ self . position + needle. len ( ) - 1 ] & 0x3f ) ) & 1 == 0 {
410+ self . position += needle. len ( ) ;
411+ continue ' search;
412+ }
413+
414+ // See if the right part of the needle matches
415+ let start = if longPeriod { self . critPos } else { cmp:: max ( self . critPos , self . memory ) } ;
416+ for i in range ( start, needle. len ( ) ) {
417+ if needle[ i] != haystack[ self . position + i] {
418+ self . position += i - self . critPos + 1 ;
419+ if !longPeriod {
420+ self . memory = 0 ;
421+ }
422+ continue ' search;
423+ }
424+ }
425+
426+ // See if the left part of the needle matches
427+ let start = if longPeriod { 0 } else { self . memory } ;
428+ for i in range ( start, self . critPos ) . rev ( ) {
429+ if needle[ i] != haystack[ self . position + i] {
430+ self . position += self . period ;
431+ if !longPeriod {
432+ self . memory = needle. len ( ) - self . period ;
433+ }
434+ continue ' search;
435+ }
436+ }
437+
438+ // We have found a match!
439+ let matchPos = self . position ;
440+ self . position += needle. len ( ) ; // add self.period for all matches
441+ if !longPeriod {
442+ self . memory = 0 ; // set to needle.len() - self.period for all matches
443+ }
444+ return Some ( ( matchPos, matchPos + needle. len ( ) ) ) ;
445+ }
446+ }
447+
448+ #[ inline]
449+ fn maximal_suffix ( arr : & [ u8 ] , reversed : bool ) -> ( uint , uint ) {
450+ let mut left = -1 ; // Corresponds to i in the paper
451+ let mut right = 0 ; // Corresponds to j in the paper
452+ let mut offset = 1 ; // Corresponds to k in the paper
453+ let mut period = 1 ; // Corresponds to p in the paper
454+
455+ while right + offset < arr. len ( ) {
456+ let a;
457+ let b;
458+ if reversed {
459+ a = arr[ left + offset] ;
460+ b = arr[ right + offset] ;
461+ } else {
462+ a = arr[ right + offset] ;
463+ b = arr[ left + offset] ;
464+ }
465+ if a < b {
466+ // Suffix is smaller, period is entire prefix so far.
467+ right += offset;
468+ offset = 1 ;
469+ period = right - left;
470+ } else if a == b {
471+ // Advance through repetition of the current period.
472+ if offset == period {
473+ right += offset;
474+ offset = 1 ;
475+ } else {
476+ offset += 1 ;
477+ }
478+ } else {
479+ // Suffix is larger, start over from current location.
480+ left = right;
481+ right += 1 ;
482+ offset = 1 ;
483+ period = 1 ;
484+ }
485+ }
486+ ( left + 1 , period)
487+ }
488+ }
489+
490+ /// The internal state of an iterator that searches for matches of a substring
491+ /// within a larger string using a dynamically chosed search algorithm
492+ #[ deriving( Clone ) ]
493+ enum Searcher {
494+ Naive ( NaiveSearcher ) ,
495+ TwoWay ( TwoWaySearcher ) ,
496+ TwoWayLong ( TwoWaySearcher )
497+ }
498+
499+ impl Searcher {
500+ fn new ( haystack : & [ u8 ] , needle : & [ u8 ] ) -> Searcher {
501+ // FIXME: Tune this.
502+ if needle. len ( ) > haystack. len ( ) - 20 {
503+ Naive ( NaiveSearcher :: new ( ) )
504+ } else {
505+ let searcher = TwoWaySearcher :: new ( needle) ;
506+ if searcher. memory == uint:: MAX { // If the period is long
507+ TwoWayLong ( searcher)
508+ } else {
509+ TwoWay ( searcher)
510+ }
511+ }
512+ }
513+ }
514+
319515/// An iterator over the start and end indices of the matches of a
320516/// substring within a larger string
321517#[ deriving( Clone ) ]
322518pub struct MatchIndices < ' a > {
519+ // constants
323520 haystack : & ' a str ,
324521 needle : & ' a str ,
325- position : uint ,
522+ searcher : Searcher
326523}
327524
328525/// An iterator over the substrings of a string separated by a given
@@ -337,31 +534,14 @@ pub struct StrSplits<'a> {
337534impl < ' a > Iterator < ( uint , uint ) > for MatchIndices < ' a > {
338535 #[ inline]
339536 fn next ( & mut self ) -> Option < ( uint , uint ) > {
340- // See Issue #1932 for why this is a naive search
341- let ( h_len, n_len) = ( self . haystack . len ( ) , self . needle . len ( ) ) ;
342- let mut match_start = 0 ;
343- let mut match_i = 0 ;
344-
345- while self . position < h_len {
346- if self . haystack [ self . position ] == self . needle [ match_i] {
347- if match_i == 0 { match_start = self . position ; }
348- match_i += 1 ;
349- self . position += 1 ;
350-
351- if match_i == n_len {
352- // found a match!
353- return Some ( ( match_start, self . position ) ) ;
354- }
355- } else {
356- // failed match, backtrack
357- if match_i > 0 {
358- match_i = 0 ;
359- self . position = match_start;
360- }
361- self . position += 1 ;
362- }
537+ match self . searcher {
538+ Naive ( ref mut searcher)
539+ => searcher. next ( self . haystack . as_bytes ( ) , self . needle . as_bytes ( ) ) ,
540+ TwoWay ( ref mut searcher)
541+ => searcher. next ( self . haystack . as_bytes ( ) , self . needle . as_bytes ( ) , false ) ,
542+ TwoWayLong ( ref mut searcher)
543+ => searcher. next ( self . haystack . as_bytes ( ) , self . needle . as_bytes ( ) , true )
363544 }
364- None
365545 }
366546}
367547
@@ -1581,7 +1761,7 @@ impl<'a> StrSlice<'a> for &'a str {
15811761 MatchIndices {
15821762 haystack : * self ,
15831763 needle : sep,
1584- position : 0
1764+ searcher : Searcher :: new ( self . as_bytes ( ) , sep . as_bytes ( ) )
15851765 }
15861766 }
15871767
0 commit comments