From 1f23b4849272186b9294347c8268571e07fdcb49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Le=C3=B3n=20Orell=20Valerian=20Liehr?= Date: Wed, 29 Oct 2025 02:49:16 +0100 Subject: [PATCH 1/3] rustdoc: Properly highlight shebang and frontmatter --- src/librustdoc/html/highlight.rs | 200 ++++++++++-------- tests/rustdoc/jump-to-def/shebang.rs | 15 ++ .../rustdoc/source-code-pages/frontmatter.rs | 10 + tests/rustdoc/source-code-pages/shebang.rs | 6 + 4 files changed, 137 insertions(+), 94 deletions(-) create mode 100644 tests/rustdoc/jump-to-def/shebang.rs create mode 100644 tests/rustdoc/source-code-pages/frontmatter.rs create mode 100644 tests/rustdoc/source-code-pages/shebang.rs diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index c37736f137df9..81dde140fa819 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -566,52 +566,52 @@ pub(super) fn write_code( }; let mut current_expansion = get_expansion(&mut token_handler, expanded_codes, file_span); - Classifier::new( + classify( &src, - token_handler.href_context.as_ref().map(|c| c.file_span).unwrap_or(DUMMY_SP), + token_handler.href_context.as_ref().map_or(DUMMY_SP, |c| c.file_span), decoration_info, - ) - .highlight(&mut |span, highlight| match highlight { - Highlight::Token { text, class } => { - token_handler.push_token(class, Cow::Borrowed(text)); - - if text == "\n" { - if current_expansion.is_none() { - current_expansion = get_expansion(&mut token_handler, expanded_codes, span); - } - if let Some(ref current_expansion) = current_expansion - && current_expansion.span.lo() == span.hi() - { - token_handler.add_expanded_code(current_expansion); - } - } else { - let mut need_end = false; - if let Some(ref current_expansion) = current_expansion { - if current_expansion.span.lo() == span.hi() { - token_handler.add_expanded_code(current_expansion); - } else if current_expansion.end_line == token_handler.line - && span.hi() >= current_expansion.span.hi() + &mut |span, highlight| match highlight { + Highlight::Token { text, class } => { + token_handler.push_token(class, Cow::Borrowed(text)); + + if text == "\n" { + if current_expansion.is_none() { + current_expansion = get_expansion(&mut token_handler, expanded_codes, span); + } + if let Some(ref current_expansion) = current_expansion + && current_expansion.span.lo() == span.hi() { - need_end = true; + token_handler.add_expanded_code(current_expansion); + } + } else { + let mut need_end = false; + if let Some(ref current_expansion) = current_expansion { + if current_expansion.span.lo() == span.hi() { + token_handler.add_expanded_code(current_expansion); + } else if current_expansion.end_line == token_handler.line + && span.hi() >= current_expansion.span.hi() + { + need_end = true; + } + } + if need_end { + current_expansion = end_expansion(&mut token_handler, expanded_codes, span); } - } - if need_end { - current_expansion = end_expansion(&mut token_handler, expanded_codes, span); } } - } - Highlight::EnterSpan { class } => { - token_handler.class_stack.enter_elem( - token_handler.out, - &token_handler.href_context, - class, - None, - ); - } - Highlight::ExitSpan => { - token_handler.class_stack.exit_elem(); - } - }); + Highlight::EnterSpan { class } => { + token_handler.class_stack.enter_elem( + token_handler.out, + &token_handler.href_context, + class, + None, + ); + } + Highlight::ExitSpan => { + token_handler.class_stack.exit_elem(); + } + }, + ); } fn write_footer(playground_button: Option<&str>) -> impl Display { @@ -735,6 +735,12 @@ struct TokenIter<'a> { cursor: Cursor<'a>, } +impl<'a> TokenIter<'a> { + fn new(src: &'a str) -> Self { + Self { src, cursor: Cursor::new(src, FrontmatterAllowed::Yes) } + } +} + impl<'a> Iterator for TokenIter<'a> { type Item = (TokenKind, &'a str); fn next(&mut self) -> Option<(TokenKind, &'a str)> { @@ -843,6 +849,54 @@ fn new_span(lo: u32, text: &str, file_span: Span) -> Span { file_span.with_lo(file_lo + BytePos(lo)).with_hi(file_lo + BytePos(hi)) } +fn classify<'src>( + src: &'src str, + file_span: Span, + decoration_info: Option<&DecorationInfo>, + sink: &mut dyn FnMut(Span, Highlight<'src>), +) { + let offset = rustc_lexer::strip_shebang(src); + + if let Some(offset) = offset { + sink(DUMMY_SP, Highlight::Token { text: &src[..offset], class: Some(Class::Comment) }); + } + + let mut classifier = + Classifier::new(src, offset.unwrap_or_default(), file_span, decoration_info); + + loop { + if let Some(decs) = classifier.decorations.as_mut() { + let byte_pos = classifier.byte_pos; + let n_starts = decs.starts.iter().filter(|(i, _)| byte_pos >= *i).count(); + for (_, kind) in decs.starts.drain(0..n_starts) { + sink(DUMMY_SP, Highlight::EnterSpan { class: Class::Decoration(kind) }); + } + + let n_ends = decs.ends.iter().filter(|i| byte_pos >= **i).count(); + for _ in decs.ends.drain(0..n_ends) { + sink(DUMMY_SP, Highlight::ExitSpan); + } + } + + if let Some((TokenKind::Colon | TokenKind::Ident, _)) = classifier.tokens.peek() { + let tokens = classifier.get_full_ident_path(); + for &(token, start, end) in &tokens { + let text = &classifier.src[start..end]; + classifier.advance(token, text, sink, start as u32); + classifier.byte_pos += text.len() as u32; + } + if !tokens.is_empty() { + continue; + } + } + if let Some((token, text, before)) = classifier.next() { + classifier.advance(token, text, sink, before); + } else { + break; + } + } +} + /// Processes program tokens, classifying strings of text by highlighting /// category (`Class`). struct Classifier<'src> { @@ -857,21 +911,23 @@ struct Classifier<'src> { } impl<'src> Classifier<'src> { - /// Takes as argument the source code to HTML-ify, the rust edition to use and the source code - /// file span which will be used later on by the `span_correspondence_map`. - fn new(src: &'src str, file_span: Span, decoration_info: Option<&DecorationInfo>) -> Self { - let tokens = - PeekIter::new(TokenIter { src, cursor: Cursor::new(src, FrontmatterAllowed::Yes) }); - let decorations = decoration_info.map(Decorations::new); + /// Takes as argument the source code to HTML-ify and the source code file span + /// which will be used later on by the `span_correspondence_map`. + fn new( + src: &'src str, + byte_pos: usize, + file_span: Span, + decoration_info: Option<&DecorationInfo>, + ) -> Self { Classifier { - tokens, + tokens: PeekIter::new(TokenIter::new(&src[byte_pos..])), in_attribute: false, in_macro: false, in_macro_nonterminal: false, - byte_pos: 0, + byte_pos: byte_pos as u32, file_span, src, - decorations, + decorations: decoration_info.map(Decorations::new), } } @@ -938,50 +994,6 @@ impl<'src> Classifier<'src> { } } - /// Exhausts the `Classifier` writing the output into `sink`. - /// - /// The general structure for this method is to iterate over each token, - /// possibly giving it an HTML span with a class specifying what flavor of - /// token is used. - fn highlight(mut self, sink: &mut dyn FnMut(Span, Highlight<'src>)) { - loop { - if let Some(decs) = self.decorations.as_mut() { - let byte_pos = self.byte_pos; - let n_starts = decs.starts.iter().filter(|(i, _)| byte_pos >= *i).count(); - for (_, kind) in decs.starts.drain(0..n_starts) { - sink(DUMMY_SP, Highlight::EnterSpan { class: Class::Decoration(kind) }); - } - - let n_ends = decs.ends.iter().filter(|i| byte_pos >= **i).count(); - for _ in decs.ends.drain(0..n_ends) { - sink(DUMMY_SP, Highlight::ExitSpan); - } - } - - if self - .tokens - .peek() - .map(|t| matches!(t.0, TokenKind::Colon | TokenKind::Ident)) - .unwrap_or(false) - { - let tokens = self.get_full_ident_path(); - for (token, start, end) in &tokens { - let text = &self.src[*start..*end]; - self.advance(*token, text, sink, *start as u32); - self.byte_pos += text.len() as u32; - } - if !tokens.is_empty() { - continue; - } - } - if let Some((token, text, before)) = self.next() { - self.advance(token, text, sink, before); - } else { - break; - } - } - } - /// Single step of highlighting. This will classify `token`, but maybe also a couple of /// following ones as well. /// @@ -1019,6 +1031,7 @@ impl<'src> Classifier<'src> { Class::Comment } } + TokenKind::Frontmatter { .. } => Class::Comment, // Consider this as part of a macro invocation if there was a // leading identifier. TokenKind::Bang if self.in_macro => { @@ -1117,7 +1130,6 @@ impl<'src> Classifier<'src> { | TokenKind::At | TokenKind::Tilde | TokenKind::Colon - | TokenKind::Frontmatter { .. } | TokenKind::Unknown => return no_highlight(sink), TokenKind::Question => Class::QuestionMark, diff --git a/tests/rustdoc/jump-to-def/shebang.rs b/tests/rustdoc/jump-to-def/shebang.rs new file mode 100644 index 0000000000000..a631762554b1a --- /dev/null +++ b/tests/rustdoc/jump-to-def/shebang.rs @@ -0,0 +1,15 @@ +#!/path/to/my/interpreter +//@ compile-flags: -Zunstable-options --generate-link-to-definition + +// Ensure that we can successfully generate links to definitions in the presence of shebang. +// Implementation-wise, shebang is not a token that's emitted by the lexer. Instead, we need +// to offset the actual lexing which is tricky due to all the byte index and span calculations +// in the Classifier. + +fn scope() { +//@ has 'src/shebang/shebang.rs.html' +//@ has - '//a[@href="#15"]' 'function' + function(); +} + +fn function() {} diff --git a/tests/rustdoc/source-code-pages/frontmatter.rs b/tests/rustdoc/source-code-pages/frontmatter.rs new file mode 100644 index 0000000000000..c352504bab2b2 --- /dev/null +++ b/tests/rustdoc/source-code-pages/frontmatter.rs @@ -0,0 +1,10 @@ +--- json +{"edition": "2024"} +--- +#![feature(frontmatter)] + +// Test that we highlight frontmatter as comments on source code pages. + +//@ has 'src/frontmatter/frontmatter.rs.html' +//@ has - '//pre[@class="rust"]//span[@class="comment"]' \ +// '--- json {"edition": "2024"} ---' diff --git a/tests/rustdoc/source-code-pages/shebang.rs b/tests/rustdoc/source-code-pages/shebang.rs new file mode 100644 index 0000000000000..975ca3a31858b --- /dev/null +++ b/tests/rustdoc/source-code-pages/shebang.rs @@ -0,0 +1,6 @@ +#!/path/to/somewhere 0 if false "" + +// Test that we highlight shebang as comments on source code pages. + +//@ has 'src/shebang/shebang.rs.html' +//@ has - '//pre[@class="rust"]//span[@class="comment"]' '#!/path/to/somewhere 0 if false ""' From a5f07138ac19744fc283c07e278d21b61360e43e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Le=C3=B3n=20Orell=20Valerian=20Liehr?= Date: Tue, 28 Oct 2025 21:21:54 +0100 Subject: [PATCH 2/3] rustdoc: Recognize more weak keywords when highlighting Rust code --- src/librustdoc/html/highlight.rs | 78 ++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 33 deletions(-) diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index 81dde140fa819..7c22a7ab91f66 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -764,7 +764,8 @@ fn get_real_ident_class(text: &str, allow_path_keywords: bool) -> Option Some(match text { "ref" | "mut" => Class::RefKeyWord, "false" | "true" => Class::Bool, - _ if Symbol::intern(text).is_reserved(|| Edition::Edition2021) => Class::KeyWord, + // FIXME(#148221): Don't hard-code the edition. The classifier should take it as an argument. + _ if Symbol::intern(text).is_reserved(|| Edition::Edition2024) => Class::KeyWord, _ => return None, }) } @@ -1201,7 +1202,7 @@ impl<'src> Classifier<'src> { }, TokenKind::GuardedStrPrefix => return no_highlight(sink), TokenKind::Ident | TokenKind::RawIdent - if self.peek_non_whitespace() == Some(TokenKind::Bang) => + if let Some((TokenKind::Bang, _)) = self.peek_non_trivia() => { self.in_macro = true; let span = new_span(before, text, file_span); @@ -1209,26 +1210,22 @@ impl<'src> Classifier<'src> { sink(span, Highlight::Token { text, class: None }); return; } - TokenKind::Ident => { - match get_real_ident_class(text, false) { - None => match text { - "Option" | "Result" => Class::PreludeTy(new_span(before, text, file_span)), - "Some" | "None" | "Ok" | "Err" => { - Class::PreludeVal(new_span(before, text, file_span)) - } - // "union" is a weak keyword and is only considered as a keyword when declaring - // a union type. - "union" if self.check_if_is_union_keyword() => Class::KeyWord, - _ if self.in_macro_nonterminal => { - self.in_macro_nonterminal = false; - Class::MacroNonTerminal - } - "self" | "Self" => Class::Self_(new_span(before, text, file_span)), - _ => Class::Ident(new_span(before, text, file_span)), - }, - Some(c) => c, - } - } + TokenKind::Ident => match get_real_ident_class(text, false) { + None => match text { + "Option" | "Result" => Class::PreludeTy(new_span(before, text, file_span)), + "Some" | "None" | "Ok" | "Err" => { + Class::PreludeVal(new_span(before, text, file_span)) + } + _ if self.is_weak_keyword(text) => Class::KeyWord, + _ if self.in_macro_nonterminal => { + self.in_macro_nonterminal = false; + Class::MacroNonTerminal + } + "self" | "Self" => Class::Self_(new_span(before, text, file_span)), + _ => Class::Ident(new_span(before, text, file_span)), + }, + Some(c) => c, + }, TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => { Class::Ident(new_span(before, text, file_span)) } @@ -1249,25 +1246,40 @@ impl<'src> Classifier<'src> { } } + fn is_weak_keyword(&mut self, text: &str) -> bool { + // NOTE: `yeet` (`do yeet $expr`), `catch` (`do catch $block`), `default` (specialization), + // `contract_{ensures,requires}`, `builtin` (builtin_syntax) & `reuse` (fn_delegation) are + // too difficult or annoying to properly detect under this simple scheme. + + let matches = match text { + "auto" => |text| text == "trait", // `auto trait Trait {}` (`auto_traits`) + "pin" => |text| text == "const" || text == "mut", // `&pin mut Type` (`pin_ergonomics`) + "raw" => |text| text == "const" || text == "mut", // `&raw const local` + "safe" => |text| text == "fn" || text == "extern", // `unsafe extern { safe fn f(); }` + "union" => |_| true, // `union Untagged { field: () }` + _ => return false, + }; + matches!(self.peek_non_trivia(), Some((TokenKind::Ident, text)) if matches(text)) + } + fn peek(&mut self) -> Option { - self.tokens.peek().map(|(token_kind, _text)| *token_kind) + self.tokens.peek().map(|&(kind, _)| kind) } - fn peek_non_whitespace(&mut self) -> Option { - while let Some((token_kind, _)) = self.tokens.peek_next() { - if *token_kind != TokenKind::Whitespace { - let token_kind = *token_kind; - self.tokens.stop_peeking(); - return Some(token_kind); + fn peek_non_trivia(&mut self) -> Option<(TokenKind, &str)> { + while let Some(&token @ (kind, _)) = self.tokens.peek_next() { + if let TokenKind::Whitespace + | TokenKind::LineComment { doc_style: None } + | TokenKind::BlockComment { doc_style: None, .. } = kind + { + continue; } + self.tokens.stop_peeking(); + return Some(token); } self.tokens.stop_peeking(); None } - - fn check_if_is_union_keyword(&mut self) -> bool { - self.peek_non_whitespace().is_some_and(|kind| kind == TokenKind::Ident) - } } fn generate_link_to_def( From 0582085e6fac50865a0d7217d74ef9260f05a6ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Le=C3=B3n=20Orell=20Valerian=20Liehr?= Date: Tue, 28 Oct 2025 22:30:58 +0100 Subject: [PATCH 3/3] rustdoc: Refactor keyword highlighting and make metavars take precedence --- src/librustdoc/html/highlight.rs | 83 ++++++++++++++------------------ 1 file changed, 37 insertions(+), 46 deletions(-) diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index 7c22a7ab91f66..bc872573cc184 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -754,22 +754,6 @@ impl<'a> Iterator for TokenIter<'a> { } } -/// Classifies into identifier class; returns `None` if this is a non-keyword identifier. -fn get_real_ident_class(text: &str, allow_path_keywords: bool) -> Option { - let ignore: &[&str] = - if allow_path_keywords { &["self", "Self", "super", "crate"] } else { &["self", "Self"] }; - if ignore.contains(&text) { - return None; - } - Some(match text { - "ref" | "mut" => Class::RefKeyWord, - "false" | "true" => Class::Bool, - // FIXME(#148221): Don't hard-code the edition. The classifier should take it as an argument. - _ if Symbol::intern(text).is_reserved(|| Edition::Edition2024) => Class::KeyWord, - _ => return None, - }) -} - /// This iterator comes from the same idea than "Peekable" except that it allows to "peek" more than /// just the next item by using `peek_next`. The `peek` method always returns the next item after /// the current one whereas `peek_next` will return the next item after the last one peeked. @@ -787,16 +771,16 @@ impl<'a> PeekIter<'a> { Self { stored: VecDeque::new(), peek_pos: 0, iter } } /// Returns the next item after the current one. It doesn't interfere with `peek_next` output. - fn peek(&mut self) -> Option<&(TokenKind, &'a str)> { + fn peek(&mut self) -> Option<(TokenKind, &'a str)> { if self.stored.is_empty() && let Some(next) = self.iter.next() { self.stored.push_back(next); } - self.stored.front() + self.stored.front().copied() } /// Returns the next item after the last one peeked. It doesn't interfere with `peek` output. - fn peek_next(&mut self) -> Option<&(TokenKind, &'a str)> { + fn peek_next(&mut self) -> Option<(TokenKind, &'a str)> { self.peek_pos += 1; if self.peek_pos - 1 < self.stored.len() { self.stored.get(self.peek_pos - 1) @@ -806,6 +790,7 @@ impl<'a> PeekIter<'a> { } else { None } + .copied() } fn stop_peeking(&mut self) { @@ -956,15 +941,10 @@ impl<'src> Classifier<'src> { } } - if let Some((None, text)) = self.tokens.peek().map(|(token, text)| { - if *token == TokenKind::Ident { - let class = get_real_ident_class(text, true); - (class, text) - } else { - // Doesn't matter which Class we put in here... - (Some(Class::Comment), text) - } - }) { + if let Some((TokenKind::Ident, text)) = self.tokens.peek() + && let symbol = Symbol::intern(text) + && (symbol.is_path_segment_keyword() || !is_keyword(symbol)) + { // We only "add" the colon if there is an ident behind. pos += text.len() + nb; has_ident = true; @@ -1210,22 +1190,7 @@ impl<'src> Classifier<'src> { sink(span, Highlight::Token { text, class: None }); return; } - TokenKind::Ident => match get_real_ident_class(text, false) { - None => match text { - "Option" | "Result" => Class::PreludeTy(new_span(before, text, file_span)), - "Some" | "None" | "Ok" | "Err" => { - Class::PreludeVal(new_span(before, text, file_span)) - } - _ if self.is_weak_keyword(text) => Class::KeyWord, - _ if self.in_macro_nonterminal => { - self.in_macro_nonterminal = false; - Class::MacroNonTerminal - } - "self" | "Self" => Class::Self_(new_span(before, text, file_span)), - _ => Class::Ident(new_span(before, text, file_span)), - }, - Some(c) => c, - }, + TokenKind::Ident => self.classify_ident(before, text), TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => { Class::Ident(new_span(before, text, file_span)) } @@ -1246,6 +1211,27 @@ impl<'src> Classifier<'src> { } } + fn classify_ident(&mut self, before: u32, text: &'src str) -> Class { + // Macro non-terminals (meta vars) take precedence. + if self.in_macro_nonterminal { + self.in_macro_nonterminal = false; + return Class::MacroNonTerminal; + } + + let file_span = self.file_span; + let span = || new_span(before, text, file_span); + + match text { + "ref" | "mut" => Class::RefKeyWord, + "false" | "true" => Class::Bool, + "self" | "Self" => Class::Self_(span()), + "Option" | "Result" => Class::PreludeTy(span()), + "Some" | "None" | "Ok" | "Err" => Class::PreludeVal(span()), + _ if self.is_weak_keyword(text) || is_keyword(Symbol::intern(text)) => Class::KeyWord, + _ => Class::Ident(span()), + } + } + fn is_weak_keyword(&mut self, text: &str) -> bool { // NOTE: `yeet` (`do yeet $expr`), `catch` (`do catch $block`), `default` (specialization), // `contract_{ensures,requires}`, `builtin` (builtin_syntax) & `reuse` (fn_delegation) are @@ -1263,11 +1249,11 @@ impl<'src> Classifier<'src> { } fn peek(&mut self) -> Option { - self.tokens.peek().map(|&(kind, _)| kind) + self.tokens.peek().map(|(kind, _)| kind) } fn peek_non_trivia(&mut self) -> Option<(TokenKind, &str)> { - while let Some(&token @ (kind, _)) = self.tokens.peek_next() { + while let Some(token @ (kind, _)) = self.tokens.peek_next() { if let TokenKind::Whitespace | TokenKind::LineComment { doc_style: None } | TokenKind::BlockComment { doc_style: None, .. } = kind @@ -1282,6 +1268,11 @@ impl<'src> Classifier<'src> { } } +fn is_keyword(symbol: Symbol) -> bool { + // FIXME(#148221): Don't hard-code the edition. The classifier should take it as an argument. + symbol.is_reserved(|| Edition::Edition2024) +} + fn generate_link_to_def( out: &mut impl Write, text_s: &str,