Skip to content

Commit dba238e

Browse files
committed
[llvm][mustache] Use single pass when tokenizing
The old implementation used many string searches over the same portions of the strings. This version sacrifices some API niceness for perf wins.

| Metric        | Baseline | Single-Pass | Change |
| ------------- | -------- | ----------- | ------ |
| Time (ms)     | 36.09    | 35.78       | -0.86% |
| Cycles        | 35.3M    | 35.0M       | -0.79% |
| Instructions  | 86.7M    | 85.8M       | -1.03% |
| Branch Misses | 116K     | 114K        | -1.91% |
| Cache Misses  | 244K     | 232K        | -4.98% |
1 parent 5b2c23c commit dba238e

File tree

1 file changed

+73
-113
lines changed

1 file changed

+73
-113
lines changed

llvm/lib/Support/Mustache.cpp

Lines changed: 73 additions & 113 deletions
Original file line number | Diff line number | Diff line change
@@ -371,141 +371,101 @@ static const char *jsonKindToString(json::Value::Kind K) {
371371
llvm_unreachable("Unknown json::Value::Kind");
372372
}
373373

374-
static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open,
375-
StringRef Close) {
376-
const StringLiteral TripleOpen("{{{");
377-
const StringLiteral TripleClose("}}}");
378-
379-
size_t NormalOpenPos = Template.find(Open, StartPos);
380-
size_t TripleOpenPos = Template.find(TripleOpen, StartPos);
381-
382-
Tag Result;
383-
384-
// Determine which tag comes first.
385-
if (TripleOpenPos != StringRef::npos &&
386-
(NormalOpenPos == StringRef::npos || TripleOpenPos <= NormalOpenPos)) {
387-
// Found a triple mustache tag.
388-
size_t EndPos =
389-
Template.find(TripleClose, TripleOpenPos + TripleOpen.size());
390-
if (EndPos == StringRef::npos)
391-
return Result; // No closing tag found.
392-
393-
Result.TagKind = Tag::Kind::Triple;
394-
Result.StartPosition = TripleOpenPos;
395-
size_t ContentStart = TripleOpenPos + TripleOpen.size();
396-
Result.Content = Template.substr(ContentStart, EndPos - ContentStart);
397-
Result.FullMatch = Template.substr(
398-
TripleOpenPos, (EndPos + TripleClose.size()) - TripleOpenPos);
399-
} else if (NormalOpenPos != StringRef::npos) {
400-
// Found a normal mustache tag.
401-
size_t EndPos = Template.find(Close, NormalOpenPos + Open.size());
402-
if (EndPos == StringRef::npos)
403-
return Result; // No closing tag found.
404-
405-
Result.TagKind = Tag::Kind::Normal;
406-
Result.StartPosition = NormalOpenPos;
407-
size_t ContentStart = NormalOpenPos + Open.size();
408-
Result.Content = Template.substr(ContentStart, EndPos - ContentStart);
409-
Result.FullMatch =
410-
Template.substr(NormalOpenPos, (EndPos + Close.size()) - NormalOpenPos);
411-
}
412-
413-
return Result;
414-
}
415-
416-
static std::optional<std::pair<StringRef, StringRef>>
417-
processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) {
418-
LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content
419-
<< ", Kind: " << tagKindToString(T.TagKind) << "\n");
420-
if (T.TagKind == Tag::Kind::Triple) {
421-
Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx);
422-
return std::nullopt;
423-
}
424-
StringRef Interpolated = T.Content;
425-
if (!Interpolated.trim().starts_with("=")) {
426-
char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front();
427-
Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx);
428-
return std::nullopt;
429-
}
430-
Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx);
431-
StringRef DelimSpec = Interpolated.trim();
432-
DelimSpec = DelimSpec.drop_front(1);
433-
DelimSpec = DelimSpec.take_until([](char C) { return C == '='; });
434-
DelimSpec = DelimSpec.trim();
435-
436-
auto [NewOpen, NewClose] = DelimSpec.split(' ');
437-
LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen
438-
<< ", NewClose: " << NewClose << "\n");
439-
return std::make_pair(NewOpen, NewClose);
440-
}
441-
442374
// Simple tokenizer that splits the template into tokens.
443-
// The mustache spec allows {{{ }}} to unescape variables,
444-
// but we don't support that here. An unescape variable
445-
// is represented only by {{& variable}}.
446375
static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) {
447376
LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n");
448377
SmallVector<Token> Tokens;
449378
SmallString<8> Open("{{");
450379
SmallString<8> Close("}}");
451-
size_t Start = 0;
380+
size_t Cursor = 0;
381+
size_t TextStart = 0;
382+
383+
const StringLiteral TripleOpen("{{{");
384+
const StringLiteral TripleClose("}}}");
452385

453-
while (Start < Template.size()) {
454-
LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start=" << Start << ", Open='" << Open
455-
<< "', Close='" << Close << "'\n");
456-
Tag T = findNextTag(Template, Start, Open, Close);
386+
while (Cursor < Template.size()) {
387+
StringRef TemplateSuffix = Template.substr(Cursor);
388+
StringRef TagOpen, TagClose;
389+
Tag::Kind Kind;
390+
391+
// Determine which tag we've encountered.
392+
if (TemplateSuffix.starts_with(TripleOpen)) {
393+
Kind = Tag::Kind::Triple;
394+
TagOpen = TripleOpen;
395+
TagClose = TripleClose;
396+
} else if (TemplateSuffix.starts_with(Open)) {
397+
Kind = Tag::Kind::Normal;
398+
TagOpen = Open;
399+
TagClose = Close;
400+
} else {
401+
// Not at a tag, continue scanning.
402+
++Cursor;
403+
continue;
404+
}
457405

458-
if (T.TagKind == Tag::Kind::None) {
459-
// No more tags, the rest is text.
460-
Tokens.emplace_back(Template.substr(Start));
461-
break;
406+
// Found a tag, first add the preceding text.
407+
if (Cursor > TextStart) {
408+
Tokens.emplace_back(Template.slice(TextStart, Cursor));
462409
}
463410

464-
// Add the text before the tag.
465-
if (T.StartPosition > Start) {
466-
StringRef Text = Template.substr(Start, T.StartPosition - Start);
467-
Tokens.emplace_back(Text);
411+
// Find the closing tag.
412+
size_t EndPos = Template.find(TagClose, Cursor + TagOpen.size());
413+
if (EndPos == StringRef::npos) {
414+
// No closing tag, the rest is text.
415+
Tokens.emplace_back(Template.substr(Cursor));
416+
TextStart = Cursor = Template.size();
417+
break;
468418
}
469419

470-
if (auto NewDelims = processTag(T, Tokens, Ctx)) {
471-
std::tie(Open, Close) = *NewDelims;
420+
// Extract tag content and full match.
421+
size_t ContentStart = Cursor + TagOpen.size();
422+
StringRef Content = Template.substr(ContentStart, EndPos - ContentStart);
423+
StringRef FullMatch =
424+
Template.substr(Cursor, (EndPos + TagClose.size()) - Cursor);
425+
426+
// Process the tag (inlined logic from processTag).
427+
LLVM_DEBUG(dbgs() << "[Tag] " << FullMatch << ", Content: " << Content
428+
<< ", Kind: " << tagKindToString(Kind) << "\n");
429+
if (Kind == Tag::Kind::Triple) {
430+
Tokens.emplace_back(FullMatch, Ctx.Saver.save("&" + Content), '&', Ctx);
431+
} else { // Normal Tag
432+
StringRef Interpolated = Content;
433+
if (!Interpolated.trim().starts_with("=")) {
434+
char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front();
435+
Tokens.emplace_back(FullMatch, Interpolated, Front, Ctx);
436+
} else { // Set Delimiter
437+
Tokens.emplace_back(FullMatch, Interpolated, '=', Ctx);
438+
StringRef DelimSpec = Interpolated.trim();
439+
DelimSpec = DelimSpec.drop_front(1);
440+
DelimSpec = DelimSpec.take_until([](char C) { return C == '='; });
441+
DelimSpec = DelimSpec.trim();
442+
443+
auto [NewOpen, NewClose] = DelimSpec.split(' ');
444+
LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen
445+
<< ", NewClose: " << NewClose << "\n");
446+
Open = NewOpen;
447+
Close = NewClose;
448+
}
472449
}
473450

474-
// Move past the tag.
475-
Start = T.StartPosition + T.FullMatch.size();
451+
// Move past the tag for the next iteration.
452+
Cursor += FullMatch.size();
453+
TextStart = Cursor;
476454
}
477455

478-
// Fix up white spaces for:
479-
// - open sections
480-
// - inverted sections
481-
// - close sections
482-
// - comments
483-
//
484-
// This loop attempts to find standalone tokens and tries to trim out
485-
// the surrounding whitespace.
486-
// For example:
487-
// if you have the template string
488-
// {{#section}} \n Example \n{{/section}}
489-
// The output should be
490-
// For example:
491-
// \n Example \n
456+
// Add any remaining text after the last tag.
457+
if (TextStart < Template.size()) {
458+
Tokens.emplace_back(Template.substr(TextStart));
459+
}
460+
461+
// Fix up white spaces for standalone tags.
492462
size_t LastIdx = Tokens.size() - 1;
493463
for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) {
494464
Token &CurrentToken = Tokens[Idx];
495465
Token::Type CurrentType = CurrentToken.getType();
496-
// Check if token type requires cleanup.
497-
bool RequiresCleanUp = requiresCleanUp(CurrentType);
498-
499-
if (!RequiresCleanUp)
466+
if (!requiresCleanUp(CurrentType))
500467
continue;
501468

502-
// We adjust the token body if there's no text behind or ahead.
503-
// A token is considered to have no text ahead if the right of the previous
504-
// token is a newline followed by spaces.
505-
// A token is considered to have no text behind if the left of the next
506-
// token is spaces followed by a newline.
507-
// eg.
508-
// "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3"
509469
bool HasTextBehind = hasTextBehind(Idx, Tokens);
510470
bool HasTextAhead = hasTextAhead(Idx, Tokens);
511471

0 commit comments

Comments
 (0)