Skip to content

Commit 1ff7c32

Browse files
committed
[AST] Added a helper to extract a user-friendly text of a comment.
Summary:
The helper is used in clangd for documentation shown in code completion and for storing the docs in the symbols. See D45999.

This patch reuses the code of the Doxygen comment lexer, disabling the bits that do command and HTML tag parsing.

The new helper works on all comments, including non-Doxygen comments. However, it does not understand or transform any Doxygen directives, i.e. it cannot extract brief text, etc.

Reviewers: sammccall, hokein, ioeric
Reviewed By: ioeric
Subscribers: mgorny, cfe-commits
Differential Revision: https://reviews.llvm.org/D46000
llvm-svn: 332458
1 parent a3f955b commit 1ff7c32

File tree

6 files changed

+379
-126
lines changed

6 files changed

+379
-126
lines changed

clang/include/clang/AST/CommentLexer.h

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,11 @@ class Lexer {
281281
/// command, including command marker.
282282
SmallString<16> VerbatimBlockEndCommandName;
283283

284+
/// If true, the commands, HTML tags, etc. will be parsed and reported as
285+
/// separate tokens inside the comment body. If false, the comment text will
286+
/// be parsed into text and newline tokens.
287+
bool ParseCommands;
288+
284289
/// Given a character reference name (e.g., "lt"), return the character that
285290
/// it stands for (e.g., "<").
286291
StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
@@ -315,12 +320,11 @@ class Lexer {
315320
/// Eat string matching regexp \code \s*\* \endcode.
316321
void skipLineStartingDecorations();
317322

318-
/// Lex stuff inside comments. CommentEnd should be set correctly.
323+
/// Lex comment text, including commands if ParseCommands is set to true.
319324
void lexCommentText(Token &T);
320325

321-
void setupAndLexVerbatimBlock(Token &T,
322-
const char *TextBegin,
323-
char Marker, const CommandInfo *Info);
326+
void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
327+
const CommandInfo *Info);
324328

325329
void lexVerbatimBlockFirstLine(Token &T);
326330

@@ -343,14 +347,13 @@ class Lexer {
343347

344348
public:
345349
Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
346-
const CommandTraits &Traits,
347-
SourceLocation FileLoc,
348-
const char *BufferStart, const char *BufferEnd);
350+
const CommandTraits &Traits, SourceLocation FileLoc,
351+
const char *BufferStart, const char *BufferEnd,
352+
bool ParseCommands = true);
349353

350354
void lex(Token &T);
351355

352-
StringRef getSpelling(const Token &Tok,
353-
const SourceManager &SourceMgr,
356+
StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr,
354357
bool *Invalid = nullptr) const;
355358
};
356359

clang/include/clang/AST/RawCommentList.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,30 @@ class RawComment {
111111
return extractBriefText(Context);
112112
}
113113

114+
/// Returns sanitized comment text, suitable for presentation in editor UIs.
115+
/// E.g. will transform:
116+
/// // This is a long multiline comment.
117+
/// // Parts of it might be indented.
118+
/// /* The comment styles might be mixed. */
119+
/// into
120+
/// "This is a long multiline comment.\n"
121+
/// " Parts of it might be indented.\n"
122+
/// "The comment styles might be mixed."
123+
/// Also removes leading indentation and sanitizes some common cases:
124+
/// /* This is a first line.
125+
/// * This is a second line. It is indented.
126+
/// * This is a third line. */
127+
/// and
128+
/// /* This is a first line.
129+
/// This is a second line. It is indented.
130+
/// This is a third line. */
131+
/// will both turn into:
132+
/// "This is a first line.\n"
133+
/// " This is a second line. It is indented.\n"
134+
/// "This is a third line."
135+
std::string getFormattedText(const SourceManager &SourceMgr,
136+
DiagnosticsEngine &Diags) const;
137+
114138
/// Parse the comment, assuming it is attached to decl \c D.
115139
comments::FullComment *parse(const ASTContext &Context,
116140
const Preprocessor *PP, const Decl *D) const;

clang/lib/AST/CommentLexer.cpp

Lines changed: 129 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,39 @@ void Lexer::lexCommentText(Token &T) {
294294
assert(CommentState == LCS_InsideBCPLComment ||
295295
CommentState == LCS_InsideCComment);
296296

297+
// Handles lexing non-command text, i.e. text and newline.
298+
auto HandleNonCommandToken = [&]() -> void {
299+
assert(State == LS_Normal);
300+
301+
const char *TokenPtr = BufferPtr;
302+
assert(TokenPtr < CommentEnd);
303+
switch (*TokenPtr) {
304+
case '\n':
305+
case '\r':
306+
TokenPtr = skipNewline(TokenPtr, CommentEnd);
307+
formTokenWithChars(T, TokenPtr, tok::newline);
308+
309+
if (CommentState == LCS_InsideCComment)
310+
skipLineStartingDecorations();
311+
return;
312+
313+
default: {
314+
StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r";
315+
size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
316+
.find_first_of(TokStartSymbols);
317+
if (End != StringRef::npos)
318+
TokenPtr += End;
319+
else
320+
TokenPtr = CommentEnd;
321+
formTextToken(T, TokenPtr);
322+
return;
323+
}
324+
}
325+
};
326+
327+
if (!ParseCommands)
328+
return HandleNonCommandToken();
329+
297330
switch (State) {
298331
case LS_Normal:
299332
break;
@@ -315,136 +348,116 @@ void Lexer::lexCommentText(Token &T) {
315348
}
316349

317350
assert(State == LS_Normal);
318-
319351
const char *TokenPtr = BufferPtr;
320352
assert(TokenPtr < CommentEnd);
321-
while (TokenPtr != CommentEnd) {
322-
switch(*TokenPtr) {
323-
case '\\':
324-
case '@': {
325-
// Commands that start with a backslash and commands that start with
326-
// 'at' have equivalent semantics. But we keep information about the
327-
// exact syntax in AST for comments.
328-
tok::TokenKind CommandKind =
329-
(*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
353+
switch(*TokenPtr) {
354+
case '\\':
355+
case '@': {
356+
// Commands that start with a backslash and commands that start with
357+
// 'at' have equivalent semantics. But we keep information about the
358+
// exact syntax in AST for comments.
359+
tok::TokenKind CommandKind =
360+
(*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
361+
TokenPtr++;
362+
if (TokenPtr == CommentEnd) {
363+
formTextToken(T, TokenPtr);
364+
return;
365+
}
366+
char C = *TokenPtr;
367+
switch (C) {
368+
default:
369+
break;
370+
371+
case '\\': case '@': case '&': case '$':
372+
case '#': case '<': case '>': case '%':
373+
case '\"': case '.': case ':':
374+
// This is one of \\ \@ \& \$ etc escape sequences.
330375
TokenPtr++;
331-
if (TokenPtr == CommentEnd) {
332-
formTextToken(T, TokenPtr);
333-
return;
334-
}
335-
char C = *TokenPtr;
336-
switch (C) {
337-
default:
338-
break;
339-
340-
case '\\': case '@': case '&': case '$':
341-
case '#': case '<': case '>': case '%':
342-
case '\"': case '.': case ':':
343-
// This is one of \\ \@ \& \$ etc escape sequences.
376+
if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
377+
// This is the \:: escape sequence.
344378
TokenPtr++;
345-
if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
346-
// This is the \:: escape sequence.
347-
TokenPtr++;
348-
}
349-
StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
350-
formTokenWithChars(T, TokenPtr, tok::text);
351-
T.setText(UnescapedText);
352-
return;
353379
}
380+
StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
381+
formTokenWithChars(T, TokenPtr, tok::text);
382+
T.setText(UnescapedText);
383+
return;
384+
}
354385

355-
// Don't make zero-length commands.
356-
if (!isCommandNameStartCharacter(*TokenPtr)) {
357-
formTextToken(T, TokenPtr);
358-
return;
359-
}
386+
// Don't make zero-length commands.
387+
if (!isCommandNameStartCharacter(*TokenPtr)) {
388+
formTextToken(T, TokenPtr);
389+
return;
390+
}
360391

361-
TokenPtr = skipCommandName(TokenPtr, CommentEnd);
362-
unsigned Length = TokenPtr - (BufferPtr + 1);
363-
364-
// Hardcoded support for lexing LaTeX formula commands
365-
// \f$ \f[ \f] \f{ \f} as a single command.
366-
if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
367-
C = *TokenPtr;
368-
if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
369-
TokenPtr++;
370-
Length++;
371-
}
372-
}
392+
TokenPtr = skipCommandName(TokenPtr, CommentEnd);
393+
unsigned Length = TokenPtr - (BufferPtr + 1);
373394

374-
StringRef CommandName(BufferPtr + 1, Length);
375-
376-
const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
377-
if (!Info) {
378-
if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
379-
StringRef CorrectedName = Info->Name;
380-
SourceLocation Loc = getSourceLocation(BufferPtr);
381-
SourceLocation EndLoc = getSourceLocation(TokenPtr);
382-
SourceRange FullRange = SourceRange(Loc, EndLoc);
383-
SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
384-
Diag(Loc, diag::warn_correct_comment_command_name)
385-
<< FullRange << CommandName << CorrectedName
386-
<< FixItHint::CreateReplacement(CommandRange, CorrectedName);
387-
} else {
388-
formTokenWithChars(T, TokenPtr, tok::unknown_command);
389-
T.setUnknownCommandName(CommandName);
390-
Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
391-
<< SourceRange(T.getLocation(), T.getEndLocation());
392-
return;
393-
}
394-
}
395-
if (Info->IsVerbatimBlockCommand) {
396-
setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
397-
return;
398-
}
399-
if (Info->IsVerbatimLineCommand) {
400-
setupAndLexVerbatimLine(T, TokenPtr, Info);
401-
return;
395+
// Hardcoded support for lexing LaTeX formula commands
396+
// \f$ \f[ \f] \f{ \f} as a single command.
397+
if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
398+
C = *TokenPtr;
399+
if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
400+
TokenPtr++;
401+
Length++;
402402
}
403-
formTokenWithChars(T, TokenPtr, CommandKind);
404-
T.setCommandID(Info->getID());
405-
return;
406403
}
407404

408-
case '&':
409-
lexHTMLCharacterReference(T);
410-
return;
411-
412-
case '<': {
413-
TokenPtr++;
414-
if (TokenPtr == CommentEnd) {
415-
formTextToken(T, TokenPtr);
405+
StringRef CommandName(BufferPtr + 1, Length);
406+
407+
const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
408+
if (!Info) {
409+
if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
410+
StringRef CorrectedName = Info->Name;
411+
SourceLocation Loc = getSourceLocation(BufferPtr);
412+
SourceLocation EndLoc = getSourceLocation(TokenPtr);
413+
SourceRange FullRange = SourceRange(Loc, EndLoc);
414+
SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
415+
Diag(Loc, diag::warn_correct_comment_command_name)
416+
<< FullRange << CommandName << CorrectedName
417+
<< FixItHint::CreateReplacement(CommandRange, CorrectedName);
418+
} else {
419+
formTokenWithChars(T, TokenPtr, tok::unknown_command);
420+
T.setUnknownCommandName(CommandName);
421+
Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
422+
<< SourceRange(T.getLocation(), T.getEndLocation());
416423
return;
417424
}
418-
const char C = *TokenPtr;
419-
if (isHTMLIdentifierStartingCharacter(C))
420-
setupAndLexHTMLStartTag(T);
421-
else if (C == '/')
422-
setupAndLexHTMLEndTag(T);
423-
else
424-
formTextToken(T, TokenPtr);
425+
}
426+
if (Info->IsVerbatimBlockCommand) {
427+
setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
425428
return;
426429
}
427-
428-
case '\n':
429-
case '\r':
430-
TokenPtr = skipNewline(TokenPtr, CommentEnd);
431-
formTokenWithChars(T, TokenPtr, tok::newline);
432-
433-
if (CommentState == LCS_InsideCComment)
434-
skipLineStartingDecorations();
430+
if (Info->IsVerbatimLineCommand) {
431+
setupAndLexVerbatimLine(T, TokenPtr, Info);
435432
return;
433+
}
434+
formTokenWithChars(T, TokenPtr, CommandKind);
435+
T.setCommandID(Info->getID());
436+
return;
437+
}
436438

437-
default: {
438-
size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
439-
find_first_of("\n\r\\@&<");
440-
if (End != StringRef::npos)
441-
TokenPtr += End;
442-
else
443-
TokenPtr = CommentEnd;
439+
case '&':
440+
lexHTMLCharacterReference(T);
441+
return;
442+
443+
case '<': {
444+
TokenPtr++;
445+
if (TokenPtr == CommentEnd) {
444446
formTextToken(T, TokenPtr);
445447
return;
446448
}
449+
const char C = *TokenPtr;
450+
if (isHTMLIdentifierStartingCharacter(C))
451+
setupAndLexHTMLStartTag(T);
452+
else if (C == '/')
453+
setupAndLexHTMLEndTag(T);
454+
else
455+
formTextToken(T, TokenPtr);
456+
return;
447457
}
458+
459+
default:
460+
return HandleNonCommandToken();
448461
}
449462
}
450463

@@ -727,14 +740,13 @@ void Lexer::lexHTMLEndTag(Token &T) {
727740
}
728741

729742
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
730-
const CommandTraits &Traits,
731-
SourceLocation FileLoc,
732-
const char *BufferStart, const char *BufferEnd):
733-
Allocator(Allocator), Diags(Diags), Traits(Traits),
734-
BufferStart(BufferStart), BufferEnd(BufferEnd),
735-
FileLoc(FileLoc), BufferPtr(BufferStart),
736-
CommentState(LCS_BeforeComment), State(LS_Normal) {
737-
}
743+
const CommandTraits &Traits, SourceLocation FileLoc,
744+
const char *BufferStart, const char *BufferEnd,
745+
bool ParseCommands)
746+
: Allocator(Allocator), Diags(Diags), Traits(Traits),
747+
BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
748+
BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal),
749+
ParseCommands(ParseCommands) {}
738750

739751
void Lexer::lex(Token &T) {
740752
again:

0 commit comments

Comments
 (0)