@@ -35,8 +35,8 @@ pub use unicode_xid::UNICODE_VERSION as UNICODE_XID_VERSION;
3535
3636use  self :: LiteralKind :: * ; 
3737use  self :: TokenKind :: * ; 
38- pub  use  crate :: cursor:: Cursor ; 
3938use  crate :: cursor:: EOF_CHAR ; 
39+ pub  use  crate :: cursor:: { Cursor ,  FrontmatterAllowed } ; 
4040
4141/// Parsed token. 
4242/// It doesn't contain information about data that has been parsed, 
@@ -57,17 +57,27 @@ impl Token {
5757#[ derive( Clone ,  Copy ,  Debug ,  PartialEq ,  Eq ) ]  
5858pub  enum  TokenKind  { 
5959    /// A line comment, e.g. `// comment`. 
60-      LineComment  {  doc_style :  Option < DocStyle >  } , 
60+      LineComment  { 
61+         doc_style :  Option < DocStyle > , 
62+     } , 
6163
6264    /// A block comment, e.g. `/* block comment */`. 
6365     /// 
6466     /// Block comments can be recursive, so a sequence like `/* /* */` 
6567     /// will not be considered terminated and will result in a parsing error. 
66-      BlockComment  {  doc_style :  Option < DocStyle > ,  terminated :  bool  } , 
68+      BlockComment  { 
69+         doc_style :  Option < DocStyle > , 
70+         terminated :  bool , 
71+     } , 
6772
6873    /// Any whitespace character sequence. 
6974     Whitespace , 
7075
76+     Frontmatter  { 
77+         has_invalid_preceding_whitespace :  bool , 
78+         invalid_infostring :  bool , 
79+     } , 
80+ 
7181    /// An identifier or keyword, e.g. `ident` or `continue`. 
7282     Ident , 
7383
@@ -109,10 +119,15 @@ pub enum TokenKind {
109119     /// this type will need to check for and reject that case. 
110120     /// 
111121     /// See [LiteralKind] for more details. 
112-      Literal  {  kind :  LiteralKind ,  suffix_start :  u32  } , 
122+      Literal  { 
123+         kind :  LiteralKind , 
124+         suffix_start :  u32 , 
125+     } , 
113126
114127    /// A lifetime, e.g. `'a`. 
115-      Lifetime  {  starts_with_number :  bool  } , 
128+      Lifetime  { 
129+         starts_with_number :  bool , 
130+     } , 
116131
117132    /// `;` 
118133     Semi , 
@@ -280,7 +295,7 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
280295#[ inline]  
281296pub  fn  validate_raw_str ( input :  & str ,  prefix_len :  u32 )  -> Result < ( ) ,  RawStrError >  { 
282297    debug_assert ! ( !input. is_empty( ) ) ; 
283-     let  mut  cursor = Cursor :: new ( input) ; 
298+     let  mut  cursor = Cursor :: new ( input,   FrontmatterAllowed :: No ) ; 
284299    // Move past the leading `r` or `br`. 
285300    for  _ in  0 ..prefix_len { 
286301        cursor. bump ( ) . unwrap ( ) ; 
@@ -290,7 +305,7 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
290305
291306/// Creates an iterator that produces tokens from the input string. 
292307pub  fn  tokenize ( input :  & str )  -> impl  Iterator < Item  = Token >  { 
293-     let  mut  cursor = Cursor :: new ( input) ; 
308+     let  mut  cursor = Cursor :: new ( input,   FrontmatterAllowed :: No ) ; 
294309    std:: iter:: from_fn ( move  || { 
295310        let  token = cursor. advance_token ( ) ; 
296311        if  token. kind  != TokenKind :: Eof  {  Some ( token)  }  else  {  None  } 
@@ -361,7 +376,34 @@ impl Cursor<'_> {
361376            Some ( c)  => c, 
362377            None  => return  Token :: new ( TokenKind :: Eof ,  0 ) , 
363378        } ; 
379+ 
364380        let  token_kind = match  first_char { 
381+             c if  matches ! ( self . frontmatter_allowed,  FrontmatterAllowed :: Yes ) 
382+                 && is_whitespace ( c)  =>
383+             { 
384+                 let  mut  last = first_char; 
385+                 while  is_whitespace ( self . first ( ) )  { 
386+                     let  Some ( c)  = self . bump ( )  else  { 
387+                         break ; 
388+                     } ; 
389+                     last = c; 
390+                 } 
391+                 // invalid frontmatter opening as whitespace preceding it isn't newline. 
392+                 // combine the whitespace and the frontmatter to a single token as we shall 
393+                 // error later. 
394+                 if  last != '\n'  && self . as_str ( ) . starts_with ( "---" )  { 
395+                     self . bump ( ) ; 
396+                     self . frontmatter ( true ) 
397+                 }  else  { 
398+                     Whitespace 
399+                 } 
400+             } 
401+             '-'  if  matches ! ( self . frontmatter_allowed,  FrontmatterAllowed :: Yes ) 
402+                 && self . as_str ( ) . starts_with ( "--" )  =>
403+             { 
404+                 // happy path 
405+                 self . frontmatter ( false ) 
406+             } 
365407            // Slash, comment or block comment. 
366408            '/'  => match  self . first ( )  { 
367409                '/'  => self . line_comment ( ) , 
@@ -464,11 +506,110 @@ impl Cursor<'_> {
464506            c if  !c. is_ascii ( )  && c. is_emoji_char ( )  => self . invalid_ident ( ) , 
465507            _ => Unknown , 
466508        } ; 
509+         if  matches ! ( self . frontmatter_allowed,  FrontmatterAllowed :: Yes ) 
510+             && !matches ! ( token_kind,  Whitespace ) 
511+         { 
512+             // stop allowing frontmatters after first non-whitespace token 
513+             self . frontmatter_allowed  = FrontmatterAllowed :: No ; 
514+         } 
467515        let  res = Token :: new ( token_kind,  self . pos_within_token ( ) ) ; 
468516        self . reset_pos_within_token ( ) ; 
469517        res
470518    } 
471519
520+     /// Given that one `-` was eaten, eat the rest of the frontmatter. 
521+      fn  frontmatter ( & mut  self ,  has_invalid_preceding_whitespace :  bool )  -> TokenKind  { 
522+         debug_assert_eq ! ( '-' ,  self . prev( ) ) ; 
523+ 
524+         let  pos = self . pos_within_token ( ) ; 
525+         self . eat_while ( |c| c == '-' ) ; 
526+ 
527+         // one `-` is eaten by the caller. 
528+         let  length_opening = self . pos_within_token ( )  - pos + 1 ; 
529+ 
530+         // must be ensured by the caller 
531+         debug_assert ! ( length_opening >= 3 ) ; 
532+ 
533+         // whitespace between the opening and the infostring. 
534+         self . eat_while ( |ch| ch != '\n'  && is_whitespace ( ch) ) ; 
535+ 
536+         // copied from `eat_identifier`, but allows `.` in infostring to allow something like 
537+         // `---Cargo.toml` as a valid opener 
538+         if  is_id_start ( self . first ( ) )  { 
539+             self . bump ( ) ; 
540+             self . eat_while ( |c| is_id_continue ( c)  || c == '.' ) ; 
541+         } 
542+ 
543+         self . eat_while ( |ch| ch != '\n'  && is_whitespace ( ch) ) ; 
544+         let  invalid_infostring = self . first ( )  != '\n' ; 
545+ 
546+         let  mut  s = self . as_str ( ) ; 
547+         let  mut  found = false ; 
548+         while  let  Some ( closing)  = s. find ( & "-" . repeat ( length_opening as  usize ) )  { 
549+             let  preceding_chars_start = s[ ..closing] . rfind ( "\n " ) . map_or ( 0 ,  |i| i + 1 ) ; 
550+             if  s[ preceding_chars_start..closing] . chars ( ) . all ( is_whitespace)  { 
551+                 // candidate found 
552+                 self . bump_bytes ( closing) ; 
553+                 // in case like 
554+                 // ---cargo 
555+                 // --- blahblah 
556+                 // or 
557+                 // ---cargo 
558+                 // ---- 
559+                 // combine those stuff into this frontmatter token such that it gets detected later. 
560+                 self . eat_until ( b'\n' ) ; 
561+                 found = true ; 
562+                 break ; 
563+             }  else  { 
564+                 s = & s[ closing + length_opening as  usize ..] ; 
565+             } 
566+         } 
567+ 
568+         if  !found { 
569+             // recovery strategy: a closing statement might have precending whitespace/newline 
570+             // but not have enough dashes to properly close. In this case, we eat until there, 
571+             // and report a mismatch in the parser. 
572+             let  mut  rest = self . as_str ( ) ; 
573+             // We can look for a shorter closing (starting with four dashes but closing with three) 
574+             // and other indications that Rust has started and the infostring has ended. 
575+             let  mut  potential_closing = rest
576+                 . find ( "\n ---" ) 
577+                 // n.b. only in the case where there are dashes, we move the index to the line where 
578+                 // the dashes start as we eat to include that line. For other cases those are Rust code 
579+                 // and not included in the frontmatter. 
580+                 . map ( |x| x + 1 ) 
581+                 . or_else ( || rest. find ( "\n use" ) ) 
582+                 . or_else ( || rest. find ( "\n //!" ) ) 
583+                 . or_else ( || rest. find ( "\n #![" ) ) ; 
584+ 
585+             if  potential_closing. is_none ( )  { 
586+                 // a less fortunate recovery if all else fails which finds any dashes preceded by whitespace 
587+                 // on a standalone line. Might be wrong. 
588+                 while  let  Some ( closing)  = rest. find ( "---" )  { 
589+                     let  preceding_chars_start = rest[ ..closing] . rfind ( "\n " ) . map_or ( 0 ,  |i| i + 1 ) ; 
590+                     if  rest[ preceding_chars_start..closing] . chars ( ) . all ( is_whitespace)  { 
591+                         // candidate found 
592+                         potential_closing = Some ( closing) ; 
593+                         break ; 
594+                     }  else  { 
595+                         rest = & rest[ closing + 3 ..] ; 
596+                     } 
597+                 } 
598+             } 
599+ 
600+             if  let  Some ( potential_closing)  = potential_closing { 
601+                 // bump to the potential closing, and eat everything on that line. 
602+                 self . bump_bytes ( potential_closing) ; 
603+                 self . eat_until ( b'\n' ) ; 
604+             }  else  { 
605+                 // eat everything. this will get reported as an unclosed frontmatter. 
606+                 self . eat_while ( |_| true ) ; 
607+             } 
608+         } 
609+ 
610+         Frontmatter  {  has_invalid_preceding_whitespace,  invalid_infostring } 
611+     } 
612+ 
472613    fn  line_comment ( & mut  self )  -> TokenKind  { 
473614        debug_assert ! ( self . prev( )  == '/'  && self . first( )  == '/' ) ; 
474615        self . bump ( ) ; 
0 commit comments