Skip to content

[Lexer] Add Unicode identifier and whitespace recognition #23

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: dil-main
Choose a base branch
from
4 changes: 2 additions & 2 deletions lldb/include/lldb/ValueObject/DILLexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,8 +227,8 @@ class DILLexer {
m_expr(dil_expr), m_lexed_tokens(std::move(lexed_tokens)),
m_tokens_idx(0) {}

static llvm::Expected<Token> Lex(llvm::StringRef expr,
llvm::StringRef &remainder);
static llvm::Expected<Token>
Lex(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &position);

bool IsStringLiteral(Token::Kind kind) {
return (kind == Token::string_literal ||
Expand Down
120 changes: 66 additions & 54 deletions lldb/source/ValueObject/DILLexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,39 +12,14 @@
//===----------------------------------------------------------------------===//

#include "lldb/ValueObject/DILLexer.h"
//#include "llvm/ADT/StringMap.h"
#include "clang/Basic/CharInfo.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Unicode.h"
#include <tuple>

namespace lldb_private::dil {

/*
const llvm::StringMap<Token::Kind> Keywords = {
{"bool", Token::kw_bool},
{"char", Token::kw_char},
{"char16_t", Token::kw_char16_t},
{"char32_t", Token::kw_char32_t},
{"const", Token::kw_const},
{"double", Token::kw_double},
{"dynamic_cast", Token::kw_dynamic_cast},
{"false", Token::kw_false},
{"float", Token::kw_float},
{"int", Token::kw_int},
{"long", Token::kw_long},
{"namespace", Token::kw_namespace},
{"nullptr", Token::kw_nullptr},
{"reinterpret_cast", Token::kw_reinterpret_cast},
{"short", Token::kw_short},
{"signed", Token::kw_signed},
{"sizeof", Token::kw_sizeof},
{"static_cast", Token::kw_static_cast},
{"this", Token::kw_this},
{"true", Token::kw_true},
{"unsigned", Token::kw_unsigned},
{"void", Token::kw_void},
{"volatile", Token::kw_volatile},
{"wchar_t", Token::kw_wchar_t}};
*/

llvm::StringRef Token::GetTokenName(Kind kind) {
switch (kind){
case Token::amp: return "amp";
Expand Down Expand Up @@ -153,22 +128,53 @@ llvm::StringRef Token::GetTokenName(Kind kind) {
}
}

static bool IsLetter (char c) {
/// Returns true if \p c is an ASCII letter, i.e. in 'a'..'z' or 'A'..'Z'.
static bool IsLetter(char c) {
  const bool is_lowercase = c >= 'a' && c <= 'z';
  const bool is_uppercase = c >= 'A' && c <= 'Z';
  return is_lowercase || is_uppercase;
}

static bool IsDigit (char c) { return ('0' <= c && c <= '9'); }
/// Returns true if \p c is an ASCII decimal digit, i.e. in '0'..'9'.
static bool IsDigit(char c) {
  return c >= '0' && c <= '9';
}

/// Returns true if clang's character-info table flags \p c as CHAR_PUNCT or
/// CHAR_PERIOD.
inline bool IsOperator(unsigned char c) {
  // Combine both flags once, then test the table entry against the mask.
  constexpr auto operator_mask =
      clang::charinfo::CHAR_PUNCT | clang::charinfo::CHAR_PERIOD;
  return (clang::charinfo::InfoTable[c] & operator_mask) != 0;
}

/// Returns true if \p c may appear inside an identifier: either it is '$',
/// or it is neither an operator character (see IsOperator) nor whitespace.
static bool IsValidIdentifierContinuation(char c) {
  // '$' is accepted explicitly, before the operator/whitespace checks run.
  const bool is_dollar = (c == '$');
  return is_dollar || !(IsOperator(c) || clang::isWhitespace(c));
}

static std::optional<llvm::StringRef> IsWord(llvm::StringRef &remainder) {
llvm::StringRef::iterator cur_pos = remainder.begin();
llvm::StringRef::iterator start = cur_pos;

if (IsDigit(*cur_pos))
return std::nullopt;

while (cur_pos < remainder.end()) {
uint8_t c = *cur_pos;
if (c < 0x80) {
if (IsValidIdentifierContinuation(c)) {
cur_pos++;
continue;
} else
break;
}
if (llvm::isLegalUTF8Sequence((const llvm::UTF8 *)cur_pos,
(const llvm::UTF8 *)remainder.end())) {
cur_pos += llvm::getNumBytesForUTF8(*cur_pos);
continue;
}
break;
}

static std::optional<llvm::StringRef> IsWord(llvm::StringRef expr,
llvm::StringRef &remainder) {
// Find the longest prefix consisting of letters, digits, underscores and
// '$'. If it doesn't start with a digit, then it's a word.
llvm::StringRef candidate = remainder.take_while(
[](char c) { return IsDigit(c) || IsLetter(c) || c == '_' || c == '$'; });
if (candidate.empty() || IsDigit(candidate[0]))
if (cur_pos == start)
return std::nullopt;
remainder = remainder.drop_front(candidate.size());
return candidate;

auto length = cur_pos - start;
remainder = remainder.drop_front(length);
return llvm::StringRef(start, length);
}

static void ConsumeNumberBody(uint32_t &length, char &prev_ch,
Expand Down Expand Up @@ -224,35 +230,37 @@ static std::optional<llvm::StringRef> IsNumber(llvm::StringRef expr,
llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
std::vector<Token> tokens;
llvm::StringRef remainder = expr;
uint32_t position = 0;
do {
if (llvm::Expected<Token> t = Lex(expr, remainder)) {
if (llvm::Expected<Token> t = Lex(expr, remainder, position)) {
tokens.push_back(std::move(*t));
} else {
return t.takeError();
}
} while (tokens.back().GetKind() != Token::eof);

return DILLexer(expr, std::move(tokens));
}


llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
llvm::StringRef &remainder) {
// Skip over whitespace (spaces).
llvm::StringRef &remainder,
uint32_t &position) {
llvm::StringRef::iterator start = remainder.begin();
remainder = remainder.ltrim();
llvm::StringRef::iterator cur_pos = remainder.begin();
position += remainder.begin() - start;

// Check to see if we've reached the end of our input string.
if (remainder.empty())
return Token(Token::eof, "", (uint32_t)expr.size());
return Token(Token::eof, "", position);

uint32_t position = cur_pos - expr.begin();;
llvm::StringRef::iterator start = cur_pos;
std::optional<llvm::StringRef> maybe_number = IsNumber(expr, remainder);
if (maybe_number) {
std::string number = (*maybe_number).str();
return Token(Token::numeric_constant, number, position);
auto token = Token(Token::numeric_constant, number, position);
position += number.size();
return token;
} else {
std::optional<llvm::StringRef> maybe_word = IsWord(expr, remainder);
std::optional<llvm::StringRef> maybe_word = IsWord(remainder);
if (maybe_word) {
llvm::StringRef word = *maybe_word;
Token::Kind kind = llvm::StringSwitch<Token::Kind>(word)
Expand Down Expand Up @@ -281,11 +289,12 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
.Case("volatile", Token::kw_volatile)
.Case("wchar_t", Token::kw_wchar_t)
.Default(Token::identifier);
return Token(kind, word.str(), (uint32_t)position);
auto token = Token(kind, word.str(), position);
position += llvm::sys::unicode::columnWidthUTF8(word.str());
return token;
}
}

cur_pos = start;
constexpr std::pair<Token::Kind, const char *> operators[] = {
{Token::l_square, "["},
{Token::r_square, "]"},
Expand Down Expand Up @@ -332,8 +341,11 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
{Token::tilde, "~"},
};
for (auto [kind, str] : operators) {
if (remainder.consume_front(str))
return Token(kind, str, position);
if (remainder.consume_front(str)) {
auto token = Token(kind, str, position);
position += strlen(str);
return token;
}
}

// Unrecognized character(s) in string; unable to lex it.
Expand Down
5 changes: 3 additions & 2 deletions lldb/unittests/DIL/DILLexerTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,9 @@ TEST(DILLexerTests, MultiTokenLexTest) {
TEST(DILLexerTests, IdentifiersTest) {
// These strings should lex into identifier tokens.
std::vector<std::string> valid_identifiers = {
"$My_name1", "$pc", "abcd", "_", "_a", "_a_", "$",
"a_b", "kw_this", "self", "a", "MyName", "kw_namespace"};
"$My_name1", "$pc", "abcd", "_", "_a", "_a_", "$",
"a_b", "kw_this", "self", "a", "MyName", "kw_namespace", "föo",
"🍫", "שלום"};

// The lexer can lex these strings, but they should not be identifiers.
std::vector<std::string> invalid_identifiers = {"", "::", "(", ")", "234", "2"};
Expand Down
17 changes: 17 additions & 0 deletions lldb/unittests/DIL/DILTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3680,3 +3680,20 @@ TEST_F(EvalTest, DISABLED_TestStringParsing) {
EXPECT_THAT(Eval("*\"abc\""), IsError("string literals are not supported"));
}
#endif

// Checks that expressions containing non-ASCII (UTF-8) identifiers evaluate,
// and that diagnostics for such expressions report the expected column and
// caret placement.
TEST_F(EvalTest, TestUnicodeInput) {
// The expected results are consistent with the locals declared in
// TestUnicodeInput() in Inputs/test_binary.cc (フー = 1, föo = 3, שלום = 4),
// each with 1 added. The identifier is tested on both sides of the operator.
EXPECT_THAT(Eval("フー + 1"), IsEqual("2"));
EXPECT_THAT(Eval("1 + フー"), IsEqual("2"));
EXPECT_THAT(Eval("föo + 1"), IsEqual("4"));
EXPECT_THAT(Eval("שלום + 1"), IsEqual("5"));

// Check diagnostic pointer location
// NOTE(review): 'бар' is expected at column 6 after "фу" but at column 8
// after "フー", although both prefixes are two characters long. Presumably
// columns are counted in terminal display width, where each of the wide
// characters in "フー" takes two columns -- confirm against the lexer.
EXPECT_THAT(Eval("фу + бар"),
IsError("<expr:1:6>: use of undeclared identifier 'бар'\n"
"фу + бар\n"
" ^"));
EXPECT_THAT(Eval("フー + бар"),
IsError("<expr:1:8>: use of undeclared identifier 'бар'\n"
"フー + бар\n"
" ^"));
}
9 changes: 9 additions & 0 deletions lldb/unittests/DIL/Inputs/test_binary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1196,6 +1196,14 @@ static void TestStringParsing() {
// BREAK(TestStringParsing)
}

// Declares int locals whose names contain non-ASCII characters, each with a
// distinct value.
// NOTE(review): presumably these are inspected by the TestUnicodeInput case
// in DILTests.cpp while stopped at the BREAK marker below -- confirm. The
// names and values must stay in sync with the expectations in that test.
static void TestUnicodeInput() {
int フー = 1;
int фу = 2;
int föo = 3;
int שלום = 4;
// BREAK(TestUnicodeInput)
}

namespace test_binary {

void main() {
Expand Down Expand Up @@ -1250,6 +1258,7 @@ void main() {

TestCharParsing();
TestStringParsing();
TestUnicodeInput();

// BREAK HERE
}
Expand Down