From 56aa0dec7ce284626431e9f03b9c470cf8b60565 Mon Sep 17 00:00:00 2001 From: Ilia Kuklin Date: Fri, 21 Feb 2025 23:52:30 +0500 Subject: [PATCH 1/9] Add basic support for UTF-8 identifiers --- lldb/include/lldb/ValueObject/DILLexer.h | 4 +- lldb/source/ValueObject/DILLexer.cpp | 126 +++++++++++++++++++++-- lldb/unittests/DIL/DILTests.cpp | 9 ++ lldb/unittests/DIL/Inputs/test_binary.cc | 8 ++ 4 files changed, 135 insertions(+), 12 deletions(-) diff --git a/lldb/include/lldb/ValueObject/DILLexer.h b/lldb/include/lldb/ValueObject/DILLexer.h index 02c4e4611e02..e45057c9b84d 100644 --- a/lldb/include/lldb/ValueObject/DILLexer.h +++ b/lldb/include/lldb/ValueObject/DILLexer.h @@ -227,8 +227,8 @@ class DILLexer { m_expr(dil_expr), m_lexed_tokens(std::move(lexed_tokens)), m_tokens_idx(0) {} - static llvm::Expected Lex(llvm::StringRef expr, - llvm::StringRef &remainder); + static llvm::Expected + Lex(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &position); bool IsStringLiteral(Token::Kind kind) { return (kind == Token::string_literal || diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp index f41889fef921..dd5f1f463694 100644 --- a/lldb/source/ValueObject/DILLexer.cpp +++ b/lldb/source/ValueObject/DILLexer.cpp @@ -12,8 +12,11 @@ //===----------------------------------------------------------------------===// #include "lldb/ValueObject/DILLexer.h" +#include "clang/Basic/CharInfo.h" //#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/ConvertUTF.h" +#include "llvm/Support/UnicodeCharRanges.h" namespace lldb_private::dil { @@ -45,6 +48,69 @@ const llvm::StringMap Keywords = { {"wchar_t", Token::kw_wchar_t}}; */ +using clang::isDigit; +static bool isValidIdentifierContinuationCodePoint(uint32_t c) { + // N1518: Recommendations for extended identifier characters for C and C++ + // Proposed Annex X.1: Ranges of characters allowed + return c == 0x00A8 || c == 0x00AA || c == 0x00AD || c == 0x00AF || + (c >= 0x00B2 && c <= 0x00B5) || (c >= 0x00B7 && c <= 0x00BA) || + (c >= 0x00BC && c <= 0x00BE) || (c >= 0x00C0 && c <= 0x00D6) || + (c >= 0x00D8 && c <= 0x00F6) || (c >= 0x00F8 && c <= 0x00FF) + + || (c >= 0x0100 && c <= 0x167F) || (c >= 0x1681 && c <= 0x180D) || + (c >= 0x180F && c <= 0x1FFF) + + || (c >= 0x200B && c <= 0x200D) || (c >= 0x202A && c <= 0x202E) || + (c >= 0x203F && c <= 0x2040) || c == 0x2054 || + (c >= 0x2060 && c <= 0x206F) + + || (c >= 0x2070 && c <= 0x218F) || (c >= 0x2460 && c <= 0x24FF) || + (c >= 0x2776 && c <= 0x2793) || (c >= 0x2C00 && c <= 0x2DFF) || + (c >= 0x2E80 && c <= 0x2FFF) + + || (c >= 0x3004 && c <= 0x3007) || (c >= 0x3021 && c <= 0x302F) || + (c >= 0x3031 && c <= 0x303F) + + || (c >= 0x3040 && c <= 0xD7FF) + + || (c >= 0xF900 && c <= 0xFD3D) || (c >= 0xFD40 && c <= 0xFDCF) || + (c >= 0xFDF0 && c <= 0xFE44) || (c >= 0xFE47 && c <= 0xFFF8) + + || (c >= 0x10000 && c <= 0x1FFFD) || (c >= 0x20000 && c <= 0x2FFFD) || + (c >= 0x30000 && c <= 0x3FFFD) || (c >= 0x40000 && c <= 0x4FFFD) || + (c >= 0x50000 && c <= 0x5FFFD) || (c >= 0x60000 && c <= 0x6FFFD) || + (c >= 0x70000 && c <= 0x7FFFD) || (c >= 0x80000 && c <= 0x8FFFD) || + (c >= 0x90000 && c <= 0x9FFFD) || (c >= 0xA0000 && c <= 0xAFFFD) || + (c >= 0xB0000 && c <= 0xBFFFD) || (c >= 0xC0000 && c <= 0xCFFFD) || + (c >= 0xD0000 && c <= 0xDFFFD) || (c >= 0xE0000 && c <= 0xEFFFD); +} + +static bool isValidIdentifierStartCodePoint(uint32_t c) { + if (!isValidIdentifierContinuationCodePoint(c)) + return false; + if (c < 0x80 && (isDigit(c) || c == '$')) + return false; + + // N1518: Recommendations for extended identifier characters for C and C++ + // Proposed Annex X.2: Ranges of characters disallowed initially + if ((c >= 0x0300 && c <= 0x036F) || (c >= 0x1DC0 && c <= 0x1DFF) || + (c >= 0x20D0 && c <= 0x20FF) || (c >= 0xFE20 && c <= 0xFE2F)) + return false; + + return true; +} + +static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = { + {0x0085, 0x0085}, {0x00A0, 0x00A0}, {0x1680, 0x1680}, + {0x180E, 0x180E}, {0x2000, 0x200A}, {0x2028, 0x2029}, + {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}}; + +static bool isUnicodeWhitespace(uint32_t Codepoint) { + static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( + UnicodeWhitespaceCharRanges); + return UnicodeWhitespaceChars.contains(Codepoint); +} + llvm::StringRef Token::GetTokenName(Kind kind) { switch (kind){ case Token::amp: return "amp"; @@ -224,8 +290,9 @@ static std::optional IsNumber(llvm::StringRef expr, llvm::Expected DILLexer::Create(llvm::StringRef expr) { std::vector tokens; llvm::StringRef remainder = expr; + uint32_t position = 0; do { - if (llvm::Expected t = Lex(expr, remainder)) { + if (llvm::Expected t = Lex(expr, remainder, position)) { tokens.push_back(std::move(*t)); } else { return t.takeError(); @@ -234,23 +301,27 @@ llvm::Expected DILLexer::Create(llvm::StringRef expr) { return DILLexer(expr, std::move(tokens)); } - llvm::Expected DILLexer::Lex(llvm::StringRef expr, - llvm::StringRef &remainder) { + llvm::StringRef &remainder, + uint32_t &position) { + llvm::StringRef::iterator cur_pos = remainder.begin(); // Skip over whitespace (spaces). remainder = remainder.ltrim(); - llvm::StringRef::iterator cur_pos = remainder.begin(); + position += remainder.begin() - cur_pos; + cur_pos = remainder.begin(); // Check to see if we've reached the end of our input string. if (remainder.empty()) - return Token(Token::eof, "", (uint32_t)expr.size()); + return Token(Token::eof, "", position); - uint32_t position = cur_pos - expr.begin();; + // uint32_t position = cur_pos - expr.begin();; llvm::StringRef::iterator start = cur_pos; std::optional maybe_number = IsNumber(expr, remainder); if (maybe_number) { std::string number = (*maybe_number).str(); - return Token(Token::numeric_constant, number, position); + auto token = Token(Token::numeric_constant, number, position); + position += number.size(); + return token; } else { std::optional maybe_word = IsWord(expr, remainder); if (maybe_word) { @@ -281,7 +352,9 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, .Case("volatile", Token::kw_volatile) .Case("wchar_t", Token::kw_wchar_t) .Default(Token::identifier); - return Token(kind, word.str(), (uint32_t)position); + auto token = Token(kind, word.str(), position); + position += word.size(); + return token; } } @@ -332,8 +405,41 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, {Token::tilde, "~"}, }; for (auto [kind, str] : operators) { - if (remainder.consume_front(str)) - return Token(kind, str, position); + if (remainder.consume_front(str)) { + auto token = Token(kind, str, position); + position += strlen(str); + return token; + } + } + + cur_pos = start; + llvm::UTF32 CodePoint; + llvm::ConversionResult Status; + unsigned size = llvm::getNumBytesForUTF8(*cur_pos); + Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&cur_pos, + (const llvm::UTF8 *)remainder.end(), + &CodePoint, llvm::strictConversion); + if (Status == llvm::conversionOK && + isValidIdentifierStartCodePoint(CodePoint)) { + unsigned utf_length = 1; + unsigned length = size; + while (true) { + size = llvm::getNumBytesForUTF8(*cur_pos); + Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&cur_pos, + (const llvm::UTF8 *)remainder.end(), + &CodePoint, llvm::strictConversion); + if (Status != llvm::conversionOK || + !isValidIdentifierContinuationCodePoint(CodePoint)) + break; + utf_length++; + length += size; + } + + remainder = remainder.drop_front(length); + llvm::StringRef utf_token(start, length); + auto token = Token(Token::identifier, utf_token.str(), position); + position += utf_length; + return token; } // Unrecognized character(s) in string; unable to lex it. diff --git a/lldb/unittests/DIL/DILTests.cpp b/lldb/unittests/DIL/DILTests.cpp index 8cbdbb1fdc3b..1e62a85eeb0c 100644 --- a/lldb/unittests/DIL/DILTests.cpp +++ b/lldb/unittests/DIL/DILTests.cpp @@ -3680,3 +3680,12 @@ TEST_F(EvalTest, DISABLED_TestStringParsing) { EXPECT_THAT(Eval("*\"abc\""), IsError("string literals are not supported")); } #endif + +TEST_F(EvalTest, TestUnicodeIdentifiers) { + EXPECT_THAT(Eval("フー + 1"), IsEqual("2")); + EXPECT_THAT(Eval("1 + フー"), IsEqual("2")); + EXPECT_THAT(Eval("фу + бар"), + IsError(": use of undeclared identifier 'бар'\n" + "фу + бар\n" + " ^")); +} \ No newline at end of file diff --git a/lldb/unittests/DIL/Inputs/test_binary.cc b/lldb/unittests/DIL/Inputs/test_binary.cc index 64ef8dc217c3..57007ffaf5ba 100644 --- a/lldb/unittests/DIL/Inputs/test_binary.cc +++ b/lldb/unittests/DIL/Inputs/test_binary.cc @@ -1196,6 +1196,13 @@ static void TestStringParsing() { // BREAK(TestStringParsing) } +static void TestUnicodeIdentifiers() { + int フー = 1; + int фу = 2; + int föo = 3; + // BREAK(TestUnicodeIdentifiers) +} + namespace test_binary { void main() { @@ -1250,6 +1257,7 @@ void main() { TestCharParsing(); TestStringParsing(); + TestUnicodeIdentifiers(); // BREAK HERE } From 97fa50915143a001afd51925ad562199b6747b33 Mon Sep 17 00:00:00 2001 From: Ilia Kuklin Date: Mon, 24 Feb 2025 22:18:56 +0500 Subject: [PATCH 2/9] Replace IsWord with identifier UTF recognition and add ASCII characters to it --- lldb/source/ValueObject/DILLexer.cpp | 84 +++++++++++++--------------- lldb/unittests/DIL/DILTests.cpp | 3 +- 2 files changed, 41 insertions(+), 46 deletions(-) diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp index dd5f1f463694..c0d042e474bc 100644 --- a/lldb/source/ValueObject/DILLexer.cpp +++ b/lldb/source/ValueObject/DILLexer.cpp @@ -48,8 +48,10 @@ const llvm::StringMap Keywords = { {"wchar_t", Token::kw_wchar_t}}; */ -using clang::isDigit; static bool isValidIdentifierContinuationCodePoint(uint32_t c) { + if (c < 0x80) + return clang::isAsciiIdentifierContinue(c, /*dollar*/ true); + // N1518: Recommendations for extended identifier characters for C and C++ // Proposed Annex X.1: Ranges of characters allowed return c == 0x00A8 || c == 0x00AA || c == 0x00AD || c == 0x00AF || @@ -88,7 +90,7 @@ static bool isValidIdentifierContinuationCodePoint(uint32_t c) { static bool isValidIdentifierStartCodePoint(uint32_t c) { if (!isValidIdentifierContinuationCodePoint(c)) return false; - if (c < 0x80 && (isDigit(c) || c == '$')) + if (c < 0x80 && clang::isDigit(c)) return false; // N1518: Recommendations for extended identifier characters for C and C++ @@ -225,16 +227,37 @@ static bool IsLetter (char c) { static bool IsDigit (char c) { return ('0' <= c && c <= '9'); } -static std::optional IsWord(llvm::StringRef expr, - llvm::StringRef &remainder) { - // Find the longest prefix consisting of letters, digits, underscors and - // '$'. If it doesn't start with a digit, then it's a word. - llvm::StringRef candidate = remainder.take_while( - [](char c) { return IsDigit(c) || IsLetter(c) || c == '_' || c == '$'; }); - if (candidate.empty() || IsDigit(candidate[0])) - return std::nullopt; - remainder = remainder.drop_front(candidate.size()); - return candidate; +static std::optional +IsWord(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &utf_length) { + llvm::StringRef::iterator cur_pos = remainder.begin(); + llvm::StringRef::iterator start = cur_pos; + llvm::UTF32 CodePoint; + llvm::ConversionResult Status; + unsigned size = llvm::getNumBytesForUTF8(*cur_pos); + Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&cur_pos, + (const llvm::UTF8 *)remainder.end(), + &CodePoint, llvm::strictConversion); + if (Status == llvm::conversionOK && + isValidIdentifierStartCodePoint(CodePoint)) { + utf_length = 1; + unsigned length = size; + while (true) { + size = llvm::getNumBytesForUTF8(*cur_pos); + Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&cur_pos, + (const llvm::UTF8 *)remainder.end(), + &CodePoint, llvm::strictConversion); + if (Status != llvm::conversionOK || + !isValidIdentifierContinuationCodePoint(CodePoint)) + break; + utf_length++; + length += size; + } + + remainder = remainder.drop_front(length); + llvm::StringRef utf_token(start, length); + return utf_token; + } + return std::nullopt; } static void ConsumeNumberBody(uint32_t &length, char &prev_ch, @@ -314,7 +337,6 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, if (remainder.empty()) return Token(Token::eof, "", position); - // uint32_t position = cur_pos - expr.begin();; llvm::StringRef::iterator start = cur_pos; std::optional maybe_number = IsNumber(expr, remainder); if (maybe_number) { @@ -323,7 +345,9 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, position += number.size(); return token; } else { - std::optional maybe_word = IsWord(expr, remainder); + uint32_t utf_length = 0; + std::optional maybe_word = + IsWord(expr, remainder, utf_length); if (maybe_word) { llvm::StringRef word = *maybe_word; Token::Kind kind = llvm::StringSwitch(word) @@ -353,7 +377,7 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, .Case("wchar_t", Token::kw_wchar_t) .Default(Token::identifier); auto token = Token(kind, word.str(), position); - position += word.size(); + position += utf_length; return token; } } @@ -412,36 +436,6 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, } } - cur_pos = start; - llvm::UTF32 CodePoint; - llvm::ConversionResult Status; - unsigned size = llvm::getNumBytesForUTF8(*cur_pos); - Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&cur_pos, - (const llvm::UTF8 *)remainder.end(), - &CodePoint, llvm::strictConversion); - if (Status == llvm::conversionOK && - isValidIdentifierStartCodePoint(CodePoint)) { - unsigned utf_length = 1; - unsigned length = size; - while (true) { - size = llvm::getNumBytesForUTF8(*cur_pos); - Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&cur_pos, - (const llvm::UTF8 *)remainder.end(), - &CodePoint, llvm::strictConversion); - if (Status != llvm::conversionOK || - !isValidIdentifierContinuationCodePoint(CodePoint)) - break; - utf_length++; - length += size; - } - - remainder = remainder.drop_front(length); - llvm::StringRef utf_token(start, length); - auto token = Token(Token::identifier, utf_token.str(), position); - position += utf_length; - return token; - } - // Unrecognized character(s) in string; unable to lex it. return llvm::createStringError("Unable to lex input string"); } diff --git a/lldb/unittests/DIL/DILTests.cpp b/lldb/unittests/DIL/DILTests.cpp index 1e62a85eeb0c..b9f51d9437c1 100644 --- a/lldb/unittests/DIL/DILTests.cpp +++ b/lldb/unittests/DIL/DILTests.cpp @@ -3683,7 +3683,8 @@ TEST_F(EvalTest, DISABLED_TestStringParsing) { TEST_F(EvalTest, TestUnicodeIdentifiers) { EXPECT_THAT(Eval("フー + 1"), IsEqual("2")); - EXPECT_THAT(Eval("1 + フー"), IsEqual("2")); + EXPECT_THAT(Eval("2 + フー"), IsEqual("3")); + EXPECT_THAT(Eval("föo + 1"), IsEqual("4")); EXPECT_THAT(Eval("фу + бар"), IsError(": use of undeclared identifier 'бар'\n" "фу + бар\n" From 526df72545729266c35fafd7200481c6edf22b4d Mon Sep 17 00:00:00 2001 From: Ilia Kuklin Date: Tue, 25 Feb 2025 19:01:01 +0500 Subject: [PATCH 3/9] Refactor code & add unicode whitespace skipping --- lldb/source/ValueObject/DILLexer.cpp | 189 +++++++++++++++------------ lldb/unittests/DIL/DILLexerTests.cpp | 4 +- lldb/unittests/DIL/DILTests.cpp | 4 +- 3 files changed, 112 insertions(+), 85 deletions(-) diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp index c0d042e474bc..564d3b02029d 100644 --- a/lldb/source/ValueObject/DILLexer.cpp +++ b/lldb/source/ValueObject/DILLexer.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/UnicodeCharRanges.h" +#include namespace lldb_private::dil { @@ -48,71 +49,6 @@ const llvm::StringMap Keywords = { {"wchar_t", Token::kw_wchar_t}}; */ -static bool isValidIdentifierContinuationCodePoint(uint32_t c) { - if (c < 0x80) - return clang::isAsciiIdentifierContinue(c, /*dollar*/ true); - - // N1518: Recommendations for extended identifier characters for C and C++ - // Proposed Annex X.1: Ranges of characters allowed - return c == 0x00A8 || c == 0x00AA || c == 0x00AD || c == 0x00AF || - (c >= 0x00B2 && c <= 0x00B5) || (c >= 0x00B7 && c <= 0x00BA) || - (c >= 0x00BC && c <= 0x00BE) || (c >= 0x00C0 && c <= 0x00D6) || - (c >= 0x00D8 && c <= 0x00F6) || (c >= 0x00F8 && c <= 0x00FF) - - || (c >= 0x0100 && c <= 0x167F) || (c >= 0x1681 && c <= 0x180D) || - (c >= 0x180F && c <= 0x1FFF) - - || (c >= 0x200B && c <= 0x200D) || (c >= 0x202A && c <= 0x202E) || - (c >= 0x203F && c <= 0x2040) || c == 0x2054 || - (c >= 0x2060 && c <= 0x206F) - - || (c >= 0x2070 && c <= 0x218F) || (c >= 0x2460 && c <= 0x24FF) || - (c >= 0x2776 && c <= 0x2793) || (c >= 0x2C00 && c <= 0x2DFF) || - (c >= 0x2E80 && c <= 0x2FFF) - - || (c >= 0x3004 && c <= 0x3007) || (c >= 0x3021 && c <= 0x302F) || - (c >= 0x3031 && c <= 0x303F) - - || (c >= 0x3040 && c <= 0xD7FF) - - || (c >= 0xF900 && c <= 0xFD3D) || (c >= 0xFD40 && c <= 0xFDCF) || - (c >= 0xFDF0 && c <= 0xFE44) || (c >= 0xFE47 && c <= 0xFFF8) - - || (c >= 0x10000 && c <= 0x1FFFD) || (c >= 0x20000 && c <= 0x2FFFD) || - (c >= 0x30000 && c <= 0x3FFFD) || (c >= 0x40000 && c <= 0x4FFFD) || - (c >= 0x50000 && c <= 0x5FFFD) || (c >= 0x60000 && c <= 0x6FFFD) || - (c >= 0x70000 && c <= 0x7FFFD) || (c >= 0x80000 && c <= 0x8FFFD) || - (c >= 0x90000 && c <= 0x9FFFD) || (c >= 0xA0000 && c <= 0xAFFFD) || - (c >= 0xB0000 && c <= 0xBFFFD) || (c >= 0xC0000 && c <= 0xCFFFD) || - (c >= 0xD0000 && c <= 0xDFFFD) || (c >= 0xE0000 && c <= 0xEFFFD); -} - -static bool isValidIdentifierStartCodePoint(uint32_t c) { - if (!isValidIdentifierContinuationCodePoint(c)) - return false; - if (c < 0x80 && clang::isDigit(c)) - return false; - - // N1518: Recommendations for extended identifier characters for C and C++ - // Proposed Annex X.2: Ranges of characters disallowed initially - if ((c >= 0x0300 && c <= 0x036F) || (c >= 0x1DC0 && c <= 0x1DFF) || - (c >= 0x20D0 && c <= 0x20FF) || (c >= 0xFE20 && c <= 0xFE2F)) - return false; - - return true; -} - -static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = { - {0x0085, 0x0085}, {0x00A0, 0x00A0}, {0x1680, 0x1680}, - {0x180E, 0x180E}, {0x2000, 0x200A}, {0x2028, 0x2029}, - {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}}; - -static bool isUnicodeWhitespace(uint32_t Codepoint) { - static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( - UnicodeWhitespaceCharRanges); - return UnicodeWhitespaceChars.contains(Codepoint); -} - llvm::StringRef Token::GetTokenName(Kind kind) { switch (kind){ case Token::amp: return "amp"; @@ -221,31 +157,123 @@ llvm::StringRef Token::GetTokenName(Kind kind) { } } -static bool IsLetter (char c) { +static bool IsLetter(char c) { return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); } -static bool IsDigit (char c) { return ('0' <= c && c <= '9'); } +static bool IsDigit(char c) { return ('0' <= c && c <= '9'); } + +static bool isValidIdentifierContinuationCodePoint(uint32_t c) { + if (c < 0x80) + return clang::isAsciiIdentifierContinue(c, /*dollar*/ true); + + // N1518: Recommendations for extended identifier characters for C and C++ + // Proposed Annex X.1: Ranges of characters allowed + return c == 0x00A8 || c == 0x00AA || c == 0x00AD || c == 0x00AF || + (c >= 0x00B2 && c <= 0x00B5) || (c >= 0x00B7 && c <= 0x00BA) || + (c >= 0x00BC && c <= 0x00BE) || (c >= 0x00C0 && c <= 0x00D6) || + (c >= 0x00D8 && c <= 0x00F6) || (c >= 0x00F8 && c <= 0x00FF) + + || (c >= 0x0100 && c <= 0x167F) || (c >= 0x1681 && c <= 0x180D) || + (c >= 0x180F && c <= 0x1FFF) + + || (c >= 0x200B && c <= 0x200D) || (c >= 0x202A && c <= 0x202E) || + (c >= 0x203F && c <= 0x2040) || c == 0x2054 || + (c >= 0x2060 && c <= 0x206F) + + || (c >= 0x2070 && c <= 0x218F) || (c >= 0x2460 && c <= 0x24FF) || + (c >= 0x2776 && c <= 0x2793) || (c >= 0x2C00 && c <= 0x2DFF) || + (c >= 0x2E80 && c <= 0x2FFF) + + || (c >= 0x3004 && c <= 0x3007) || (c >= 0x3021 && c <= 0x302F) || + (c >= 0x3031 && c <= 0x303F) + + || (c >= 0x3040 && c <= 0xD7FF) + + || (c >= 0xF900 && c <= 0xFD3D) || (c >= 0xFD40 && c <= 0xFDCF) || + (c >= 0xFDF0 && c <= 0xFE44) || (c >= 0xFE47 && c <= 0xFFF8) + + || (c >= 0x10000 && c <= 0x1FFFD) || (c >= 0x20000 && c <= 0x2FFFD) || + (c >= 0x30000 && c <= 0x3FFFD) || (c >= 0x40000 && c <= 0x4FFFD) || + (c >= 0x50000 && c <= 0x5FFFD) || (c >= 0x60000 && c <= 0x6FFFD) || + (c >= 0x70000 && c <= 0x7FFFD) || (c >= 0x80000 && c <= 0x8FFFD) || + (c >= 0x90000 && c <= 0x9FFFD) || (c >= 0xA0000 && c <= 0xAFFFD) || + (c >= 0xB0000 && c <= 0xBFFFD) || (c >= 0xC0000 && c <= 0xCFFFD) || + (c >= 0xD0000 && c <= 0xDFFFD) || (c >= 0xE0000 && c <= 0xEFFFD); +} + +static bool isValidIdentifierStartCodePoint(uint32_t c) { + if (!isValidIdentifierContinuationCodePoint(c)) + return false; + if (c < 0x80 && IsDigit(c)) + return false; + + // N1518: Recommendations for extended identifier characters for C and C++ + // Proposed Annex X.2: Ranges of characters disallowed initially + if ((c >= 0x0300 && c <= 0x036F) || (c >= 0x1DC0 && c <= 0x1DFF) || + (c >= 0x20D0 && c <= 0x20FF) || (c >= 0xFE20 && c <= 0xFE2F)) + return false; + + return true; +} + +static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = { + {0x0085, 0x0085}, {0x00A0, 0x00A0}, {0x1680, 0x1680}, + {0x180E, 0x180E}, {0x2000, 0x200A}, {0x2028, 0x2029}, + {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}}; + +static bool isUnicodeWhitespace(uint32_t Codepoint) { + static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( + UnicodeWhitespaceCharRanges); + return UnicodeWhitespaceChars.contains(Codepoint); +} + +static std::tuple +convertUTF8SequenceAndAdvance(llvm::StringRef::iterator &cur_pos, + llvm::StringRef::iterator end) { + llvm::UTF32 CodePoint; + uint32_t size = llvm::getNumBytesForUTF8(*cur_pos); + llvm::ConversionResult Status = llvm::convertUTF8Sequence( + (const llvm::UTF8 **)&cur_pos, (const llvm::UTF8 *)end, &CodePoint, + llvm::strictConversion); + return {Status, CodePoint, size}; +} + +static void SkipUnicodeWhitespaces(llvm::StringRef &remainder) { + llvm::StringRef::iterator cur_pos = remainder.begin(); + uint32_t length = 0; + while (true) { + auto [Status, CodePoint, size] = + convertUTF8SequenceAndAdvance(cur_pos, remainder.end()); + if (Status != llvm::conversionOK || !isUnicodeWhitespace(CodePoint)) + break; + length += size; + } + remainder = remainder.drop_front(length); +} + +static void SkipWhitespaces(llvm::StringRef &remainder) { + llvm::StringRef::iterator cur_pos; + do { + cur_pos = remainder.begin(); + remainder = remainder.ltrim(); + SkipUnicodeWhitespaces(remainder); + } while (remainder.begin() != cur_pos); +} static std::optional IsWord(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &utf_length) { llvm::StringRef::iterator cur_pos = remainder.begin(); llvm::StringRef::iterator start = cur_pos; - llvm::UTF32 CodePoint; - llvm::ConversionResult Status; - unsigned size = llvm::getNumBytesForUTF8(*cur_pos); - Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&cur_pos, - (const llvm::UTF8 *)remainder.end(), - &CodePoint, llvm::strictConversion); + auto [Status, CodePoint, size] = + convertUTF8SequenceAndAdvance(cur_pos, remainder.end()); if (Status == llvm::conversionOK && isValidIdentifierStartCodePoint(CodePoint)) { utf_length = 1; unsigned length = size; while (true) { - size = llvm::getNumBytesForUTF8(*cur_pos); - Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&cur_pos, - (const llvm::UTF8 *)remainder.end(), - &CodePoint, llvm::strictConversion); + auto [Status, CodePoint, size] = + convertUTF8SequenceAndAdvance(cur_pos, remainder.end()); if (Status != llvm::conversionOK || !isValidIdentifierContinuationCodePoint(CodePoint)) break; @@ -328,16 +356,14 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &position) { llvm::StringRef::iterator cur_pos = remainder.begin(); - // Skip over whitespace (spaces). - remainder = remainder.ltrim(); + SkipWhitespaces(remainder); position += remainder.begin() - cur_pos; - cur_pos = remainder.begin(); // Check to see if we've reached the end of our input string. if (remainder.empty()) return Token(Token::eof, "", position); - llvm::StringRef::iterator start = cur_pos; + cur_pos = remainder.begin(); std::optional maybe_number = IsNumber(expr, remainder); if (maybe_number) { std::string number = (*maybe_number).str(); @@ -382,7 +408,6 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, } } - cur_pos = start; constexpr std::pair operators[] = { {Token::l_square, "["}, {Token::r_square, "]"}, diff --git a/lldb/unittests/DIL/DILLexerTests.cpp b/lldb/unittests/DIL/DILLexerTests.cpp index 026a092aa9e8..d4eb494a62e0 100644 --- a/lldb/unittests/DIL/DILLexerTests.cpp +++ b/lldb/unittests/DIL/DILLexerTests.cpp @@ -121,8 +121,8 @@ TEST(DILLexerTests, MultiTokenLexTest) { TEST(DILLexerTests, IdentifiersTest) { // These strings should lex into identifier tokens. std::vector valid_identifiers = { - "$My_name1", "$pc", "abcd", "_", "_a", "_a_", "$", - "a_b", "kw_this", "self", "a", "MyName", "kw_namespace"}; + "$My_name1", "$pc", "abcd", "_", "_a", "_a_", "$", "a_b", + "kw_this", "self", "a", "MyName", "kw_namespace", "föo", "🍫"}; // The lexer can lex these strings, but they should not be identifiers. std::vector invalid_identifiers = {"", "::", "(", ")", "234", "2"}; diff --git a/lldb/unittests/DIL/DILTests.cpp b/lldb/unittests/DIL/DILTests.cpp index b9f51d9437c1..aecddeffb268 100644 --- a/lldb/unittests/DIL/DILTests.cpp +++ b/lldb/unittests/DIL/DILTests.cpp @@ -3683,8 +3683,10 @@ TEST_F(EvalTest, DISABLED_TestStringParsing) { TEST_F(EvalTest, TestUnicodeIdentifiers) { EXPECT_THAT(Eval("フー + 1"), IsEqual("2")); - EXPECT_THAT(Eval("2 + フー"), IsEqual("3")); + EXPECT_THAT(Eval("1 + フー"), IsEqual("2")); EXPECT_THAT(Eval("föo + 1"), IsEqual("4")); + EXPECT_THAT(Eval(" 1 +   föo   "), + IsEqual("4")); // Contains Unicode whitespaces EXPECT_THAT(Eval("фу + бар"), IsError(": use of undeclared identifier 'бар'\n" "фу + бар\n" From ccab3d1a308c35cfaceb4460485e74623cecbe6e Mon Sep 17 00:00:00 2001 From: Ilia Kuklin Date: Thu, 27 Feb 2025 19:24:13 +0500 Subject: [PATCH 4/9] Add test for an identifier with a right-to-left script --- lldb/unittests/DIL/DILLexerTests.cpp | 5 +++-- lldb/unittests/DIL/DILTests.cpp | 1 + lldb/unittests/DIL/Inputs/test_binary.cc | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/lldb/unittests/DIL/DILLexerTests.cpp b/lldb/unittests/DIL/DILLexerTests.cpp index d4eb494a62e0..c6abc8674de6 100644 --- a/lldb/unittests/DIL/DILLexerTests.cpp +++ b/lldb/unittests/DIL/DILLexerTests.cpp @@ -121,8 +121,9 @@ TEST(DILLexerTests, MultiTokenLexTest) { TEST(DILLexerTests, IdentifiersTest) { // These strings should lex into identifier tokens. std::vector valid_identifiers = { - "$My_name1", "$pc", "abcd", "_", "_a", "_a_", "$", "a_b", - "kw_this", "self", "a", "MyName", "kw_namespace", "föo", "🍫"}; + "$My_name1", "$pc", "abcd", "_", "_a", "_a_", "$", + "a_b", "kw_this", "self", "a", "MyName", "kw_namespace", "föo", + "🍫", "שלום"}; // The lexer can lex these strings, but they should not be identifiers. std::vector invalid_identifiers = {"", "::", "(", ")", "234", "2"}; diff --git a/lldb/unittests/DIL/DILTests.cpp b/lldb/unittests/DIL/DILTests.cpp index aecddeffb268..715c22ea101b 100644 --- a/lldb/unittests/DIL/DILTests.cpp +++ b/lldb/unittests/DIL/DILTests.cpp @@ -3685,6 +3685,7 @@ TEST_F(EvalTest, TestUnicodeIdentifiers) { EXPECT_THAT(Eval("フー + 1"), IsEqual("2")); EXPECT_THAT(Eval("1 + フー"), IsEqual("2")); EXPECT_THAT(Eval("föo + 1"), IsEqual("4")); + EXPECT_THAT(Eval("שלום + 1"), IsEqual("5")); EXPECT_THAT(Eval(" 1 +   föo   "), IsEqual("4")); // Contains Unicode whitespaces EXPECT_THAT(Eval("фу + бар"), diff --git a/lldb/unittests/DIL/Inputs/test_binary.cc b/lldb/unittests/DIL/Inputs/test_binary.cc index 57007ffaf5ba..af4d52b166b7 100644 --- a/lldb/unittests/DIL/Inputs/test_binary.cc +++ b/lldb/unittests/DIL/Inputs/test_binary.cc @@ -1200,6 +1200,7 @@ static void TestUnicodeIdentifiers() { int フー = 1; int фу = 2; int föo = 3; + int שלום = 4; // BREAK(TestUnicodeIdentifiers) } From 4e791156df1b08391180085507ad972074181595 Mon Sep 17 00:00:00 2001 From: Ilia Kuklin Date: Mon, 3 Mar 2025 23:56:57 +0500 Subject: [PATCH 5/9] Count UTF-8 width of skipped whitespaces --- lldb/source/ValueObject/DILLexer.cpp | 9 ++++++--- lldb/unittests/DIL/DILTests.cpp | 8 ++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp index 564d3b02029d..c94556ebbeb6 100644 --- a/lldb/source/ValueObject/DILLexer.cpp +++ b/lldb/source/ValueObject/DILLexer.cpp @@ -16,6 +16,7 @@ //#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ConvertUTF.h" +#include "llvm/Support/Unicode.h" #include "llvm/Support/UnicodeCharRanges.h" #include @@ -355,15 +356,17 @@ llvm::Expected DILLexer::Create(llvm::StringRef expr) { llvm::Expected DILLexer::Lex(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &position) { - llvm::StringRef::iterator cur_pos = remainder.begin(); + llvm::StringRef::iterator start = remainder.begin(); SkipWhitespaces(remainder); - position += remainder.begin() - cur_pos; + if (start < remainder.begin()) { + llvm::StringRef skipped(start, remainder.begin() - start); + position += llvm::sys::unicode::columnWidthUTF8(skipped); + } // Check to see if we've reached the end of our input string. if (remainder.empty()) return Token(Token::eof, "", position); - cur_pos = remainder.begin(); std::optional maybe_number = IsNumber(expr, remainder); if (maybe_number) { std::string number = (*maybe_number).str(); diff --git a/lldb/unittests/DIL/DILTests.cpp b/lldb/unittests/DIL/DILTests.cpp index 715c22ea101b..cf814ea6090d 100644 --- a/lldb/unittests/DIL/DILTests.cpp +++ b/lldb/unittests/DIL/DILTests.cpp @@ -3686,10 +3686,14 @@ TEST_F(EvalTest, TestUnicodeIdentifiers) { EXPECT_THAT(Eval("1 + フー"), IsEqual("2")); EXPECT_THAT(Eval("föo + 1"), IsEqual("4")); EXPECT_THAT(Eval("שלום + 1"), IsEqual("5")); - EXPECT_THAT(Eval(" 1 +   föo   "), - IsEqual("4")); // Contains Unicode whitespaces + EXPECT_THAT(Eval(" 1 +   föo   "), // Contains Unicode whitespaces + IsEqual("4")); EXPECT_THAT(Eval("фу + бар"), IsError(": use of undeclared identifier 'бар'\n" "фу + бар\n" " ^")); + EXPECT_THAT(Eval("фу + бар"), // Wide Unicode whitespaces + IsError(": use of undeclared identifier 'бар'\n" + "фу + бар\n" + " ^")); } \ No newline at end of file From 6520218fbd4b20f989d38bb84f4fce13654bf472 Mon Sep 17 00:00:00 2001 From: Ilia Kuklin Date: Tue, 4 Mar 2025 01:07:52 +0500 Subject: [PATCH 6/9] Allow identifier to be contain anything except operators and whitespaces --- lldb/source/ValueObject/DILLexer.cpp | 123 ++++++----------------- lldb/unittests/DIL/DILTests.cpp | 8 +- lldb/unittests/DIL/Inputs/test_binary.cc | 6 +- 3 files changed, 39 insertions(+), 98 deletions(-) diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp index c94556ebbeb6..272ab86375ca 100644 --- a/lldb/source/ValueObject/DILLexer.cpp +++ b/lldb/source/ValueObject/DILLexer.cpp @@ -13,7 +13,6 @@ #include "lldb/ValueObject/DILLexer.h" #include "clang/Basic/CharInfo.h" -//#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Unicode.h" @@ -22,34 +21,6 @@ namespace lldb_private::dil { -/* -const llvm::StringMap Keywords = { - {"bool", Token::kw_bool}, - {"char", Token::kw_char}, - {"char16_t", Token::kw_char16_t}, - {"char32_t", Token::kw_char32_t}, - {"const", Token::kw_const}, - {"double", Token::kw_double}, - {"dynamic_cast", Token::kw_dynamic_cast}, - {"false", Token::kw_false}, - {"float", Token::kw_float}, - {"int", Token::kw_int}, - {"long", Token::kw_long}, - {"namespace", Token::kw_namespace}, - {"nullptr", Token::kw_nullptr}, - {"reinterpret_cast", Token::kw_reinterpret_cast}, - {"short", Token::kw_short}, - {"signed", Token::kw_signed}, - {"sizeof", Token::kw_sizeof}, - {"static_cast", Token::kw_static_cast}, - {"this", Token::kw_this}, - {"true", Token::kw_true}, - {"unsigned", Token::kw_unsigned}, - {"void", Token::kw_void}, - {"volatile", Token::kw_volatile}, - {"wchar_t", Token::kw_wchar_t}}; -*/ - llvm::StringRef Token::GetTokenName(Kind kind) { switch (kind){ case Token::amp: return "amp"; @@ -164,71 +135,39 @@ static bool IsLetter(char c) { static bool IsDigit(char c) { return ('0' <= c && c <= '9'); } -static bool isValidIdentifierContinuationCodePoint(uint32_t c) { - if (c < 0x80) - return clang::isAsciiIdentifierContinue(c, /*dollar*/ true); - - // N1518: Recommendations for extended identifier characters for C and C++ - // Proposed Annex X.1: Ranges of characters allowed - return c == 0x00A8 || c == 0x00AA || c == 0x00AD || c == 0x00AF || - (c >= 0x00B2 && c <= 0x00B5) || (c >= 0x00B7 && c <= 0x00BA) || - (c >= 0x00BC && c <= 0x00BE) || (c >= 0x00C0 && c <= 0x00D6) || - (c >= 0x00D8 && c <= 0x00F6) || (c >= 0x00F8 && c <= 0x00FF) - - || (c >= 0x0100 && c <= 0x167F) || (c >= 0x1681 && c <= 0x180D) || - (c >= 0x180F && c <= 0x1FFF) - - || (c >= 0x200B && c <= 0x200D) || (c >= 0x202A && c <= 0x202E) || - (c >= 0x203F && c <= 0x2040) || c == 0x2054 || - (c >= 0x2060 && c <= 0x206F) - - || (c >= 0x2070 && c <= 0x218F) || (c >= 0x2460 && c <= 0x24FF) || - (c >= 0x2776 && c <= 0x2793) || (c >= 0x2C00 && c <= 0x2DFF) || - (c >= 0x2E80 && c <= 0x2FFF) - - || (c >= 0x3004 && c <= 0x3007) || (c >= 0x3021 && c <= 0x302F) || - (c >= 0x3031 && c <= 0x303F) +static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = { + {0x0085, 0x0085}, {0x00A0, 0x00A0}, {0x1680, 0x1680}, + {0x180E, 0x180E}, {0x2000, 0x200A}, {0x2028, 0x2029}, + {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}}; - || (c >= 0x3040 && c <= 0xD7FF) +static bool IsUnicodeWhitespace(uint32_t Codepoint) { + static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( + UnicodeWhitespaceCharRanges); + return UnicodeWhitespaceChars.contains(Codepoint); +} - || (c >= 0xF900 && c <= 0xFD3D) || (c >= 0xFD40 && c <= 0xFDCF) || - (c >= 0xFDF0 && c <= 0xFE44) || (c >= 0xFE47 && c <= 0xFFF8) +inline bool IsOperator(unsigned char c) { + using namespace clang::charinfo; + return (InfoTable[c] & (CHAR_PUNCT | CHAR_PERIOD)) != 0; +} - || (c >= 0x10000 && c <= 0x1FFFD) || (c >= 0x20000 && c <= 0x2FFFD) || - (c >= 0x30000 && c <= 0x3FFFD) || (c >= 0x40000 && c <= 0x4FFFD) || - (c >= 0x50000 && c <= 0x5FFFD) || (c >= 0x60000 && c <= 0x6FFFD) || - (c >= 0x70000 && c <= 0x7FFFD) || (c >= 0x80000 && c <= 0x8FFFD) || - (c >= 0x90000 && c <= 0x9FFFD) || (c >= 0xA0000 && c <= 0xAFFFD) || - (c >= 0xB0000 && c <= 0xBFFFD) || (c >= 0xC0000 && c <= 0xCFFFD) || - (c >= 0xD0000 && c <= 0xDFFFD) || (c >= 0xE0000 && c <= 0xEFFFD); +static bool IsValidIdentifierContinuationCodePoint(uint32_t c) { + if (c < 0x80) { + if (c == '$') + return true; + return !IsOperator(c) && !clang::isWhitespace(c); + } + return !IsUnicodeWhitespace(c); } -static bool isValidIdentifierStartCodePoint(uint32_t c) { - if (!isValidIdentifierContinuationCodePoint(c)) +static bool IsValidIdentifierStartCodePoint(uint32_t c) { + if (!IsValidIdentifierContinuationCodePoint(c)) return false; if (c < 0x80 && IsDigit(c)) return false; - - // N1518: Recommendations for extended identifier characters for C and C++ - // Proposed Annex X.2: Ranges of characters disallowed initially - if ((c >= 0x0300 && c <= 0x036F) || (c >= 0x1DC0 && c <= 0x1DFF) || - (c >= 0x20D0 && c <= 0x20FF) || (c >= 0xFE20 && c <= 0xFE2F)) - return false; - return true; } -static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = { - {0x0085, 0x0085}, {0x00A0, 0x00A0}, {0x1680, 0x1680}, - {0x180E, 0x180E}, {0x2000, 0x200A}, {0x2028, 0x2029}, - {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}}; - -static bool isUnicodeWhitespace(uint32_t Codepoint) { - static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( - UnicodeWhitespaceCharRanges); - return UnicodeWhitespaceChars.contains(Codepoint); -} - static std::tuple convertUTF8SequenceAndAdvance(llvm::StringRef::iterator &cur_pos, llvm::StringRef::iterator end) { @@ -246,7 +185,7 @@ static void SkipUnicodeWhitespaces(llvm::StringRef &remainder) { while (true) { auto [Status, CodePoint, size] = convertUTF8SequenceAndAdvance(cur_pos, remainder.end()); - if (Status != llvm::conversionOK || !isUnicodeWhitespace(CodePoint)) + if (Status != llvm::conversionOK || !IsUnicodeWhitespace(CodePoint)) break; length += size; } @@ -262,23 +201,21 @@ static void SkipWhitespaces(llvm::StringRef &remainder) { } while (remainder.begin() != cur_pos); } -static std::optional -IsWord(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &utf_length) { +static std::optional IsWord(llvm::StringRef expr, + llvm::StringRef &remainder) { llvm::StringRef::iterator cur_pos = remainder.begin(); llvm::StringRef::iterator start = cur_pos; auto [Status, CodePoint, size] = convertUTF8SequenceAndAdvance(cur_pos, remainder.end()); if (Status == llvm::conversionOK && - isValidIdentifierStartCodePoint(CodePoint)) { - utf_length = 1; + IsValidIdentifierStartCodePoint(CodePoint)) { unsigned length = size; while (true) { auto [Status, CodePoint, size] = convertUTF8SequenceAndAdvance(cur_pos, remainder.end()); if (Status != llvm::conversionOK || - !isValidIdentifierContinuationCodePoint(CodePoint)) + !IsValidIdentifierContinuationCodePoint(CodePoint)) break; - utf_length++; length += size; } @@ -374,9 +311,7 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, position += number.size(); return token; } else { - uint32_t utf_length = 0; - std::optional maybe_word = - IsWord(expr, remainder, utf_length); + std::optional maybe_word = IsWord(expr, remainder); if (maybe_word) { llvm::StringRef word = *maybe_word; Token::Kind kind = llvm::StringSwitch(word) @@ -406,7 +341,7 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, .Case("wchar_t", Token::kw_wchar_t) .Default(Token::identifier); auto token = Token(kind, word.str(), position); - position += utf_length; + position += llvm::sys::unicode::columnWidthUTF8(word.str()); return token; } } diff --git a/lldb/unittests/DIL/DILTests.cpp b/lldb/unittests/DIL/DILTests.cpp index cf814ea6090d..32f3f03ecd63 100644 --- a/lldb/unittests/DIL/DILTests.cpp +++ b/lldb/unittests/DIL/DILTests.cpp @@ -3681,13 +3681,15 @@ TEST_F(EvalTest, DISABLED_TestStringParsing) { } #endif -TEST_F(EvalTest, TestUnicodeIdentifiers) { +TEST_F(EvalTest, TestUnicodeInput) { EXPECT_THAT(Eval("フー + 1"), IsEqual("2")); EXPECT_THAT(Eval("1 + フー"), IsEqual("2")); EXPECT_THAT(Eval("föo + 1"), IsEqual("4")); EXPECT_THAT(Eval("שלום + 1"), IsEqual("5")); EXPECT_THAT(Eval(" 1 +   föo   "), // Contains Unicode whitespaces IsEqual("4")); + + // Check diagnostic pointer location EXPECT_THAT(Eval("фу + бар"), IsError(": use of undeclared identifier 'бар'\n" "фу + бар\n" @@ -3696,4 +3698,8 @@ TEST_F(EvalTest, TestUnicodeIdentifiers) { IsError(": use of undeclared identifier 'бар'\n" "фу + бар\n" " ^")); + EXPECT_THAT(Eval("フー + бар"), + IsError(": use of undeclared identifier 'бар'\n" + "フー + бар\n" + " ^")); } \ No newline at end of file diff --git a/lldb/unittests/DIL/Inputs/test_binary.cc b/lldb/unittests/DIL/Inputs/test_binary.cc index af4d52b166b7..c289f93a1202 100644 --- a/lldb/unittests/DIL/Inputs/test_binary.cc +++ b/lldb/unittests/DIL/Inputs/test_binary.cc @@ -1196,12 +1196,12 @@ static void TestStringParsing() { // BREAK(TestStringParsing) } -static void TestUnicodeIdentifiers() { +static void TestUnicodeInput() { int フー = 1; int фу = 2; int föo = 3; int שלום = 4; - // BREAK(TestUnicodeIdentifiers) + // BREAK(TestUnicodeInput) } namespace test_binary { @@ -1258,7 +1258,7 @@ void main() { TestCharParsing(); TestStringParsing(); - TestUnicodeIdentifiers(); + TestUnicodeInput(); // BREAK HERE } From 20ee555de0f8ee3cba2a898556ff4ceb61aa233b Mon Sep 17 00:00:00 2001 From: Ilia Kuklin Date: Fri, 7 Mar 2025 17:51:16 +0500 Subject: [PATCH 7/9] Manually count identifier and whitespace width using charWidth --- lldb/source/ValueObject/DILLexer.cpp | 29 ++++++++++++++++------------ llvm/include/llvm/Support/Unicode.h | 13 +++++++++++++ llvm/lib/Support/Unicode.cpp | 13 +------------ 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp index 272ab86375ca..a476c719789b 100644 --- a/lldb/source/ValueObject/DILLexer.cpp +++ b/lldb/source/ValueObject/DILLexer.cpp @@ -179,7 +179,8 @@ convertUTF8SequenceAndAdvance(llvm::StringRef::iterator &cur_pos, return {Status, CodePoint, size}; } -static void SkipUnicodeWhitespaces(llvm::StringRef &remainder) { +static void SkipUnicodeWhitespaces(llvm::StringRef &remainder, + uint32_t &width) { llvm::StringRef::iterator cur_pos = remainder.begin(); uint32_t length = 0; while (true) { @@ -188,27 +189,30 @@ static void SkipUnicodeWhitespaces(llvm::StringRef &remainder) { if (Status != llvm::conversionOK || !IsUnicodeWhitespace(CodePoint)) break; length += size; + width += llvm::sys::unicode::charWidth(CodePoint); } remainder = remainder.drop_front(length); } -static void SkipWhitespaces(llvm::StringRef &remainder) { +static void SkipWhitespaces(llvm::StringRef &remainder, uint32_t &width) { llvm::StringRef::iterator cur_pos; do { cur_pos = remainder.begin(); remainder = remainder.ltrim(); - SkipUnicodeWhitespaces(remainder); + width += remainder.begin() - cur_pos; + SkipUnicodeWhitespaces(remainder, width); } while (remainder.begin() != cur_pos); } -static std::optional IsWord(llvm::StringRef expr, - llvm::StringRef &remainder) { +static std::optional +IsWord(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &width) { llvm::StringRef::iterator cur_pos = remainder.begin(); llvm::StringRef::iterator start = cur_pos; auto [Status, CodePoint, size] = convertUTF8SequenceAndAdvance(cur_pos, remainder.end()); if (Status == llvm::conversionOK && IsValidIdentifierStartCodePoint(CodePoint)) { + width = llvm::sys::unicode::charWidth(CodePoint); unsigned length = size; while (true) { auto [Status, CodePoint, size] = @@ -216,6 +220,7 @@ static std::optional IsWord(llvm::StringRef expr, if (Status != llvm::conversionOK || !IsValidIdentifierContinuationCodePoint(CodePoint)) break; + width += llvm::sys::unicode::charWidth(CodePoint); length += size; } @@ -287,6 +292,7 @@ llvm::Expected DILLexer::Create(llvm::StringRef expr) { return t.takeError(); } } while (tokens.back().GetKind() != Token::eof); + return DILLexer(expr, std::move(tokens)); } @@ -294,11 +300,9 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &position) { llvm::StringRef::iterator start = remainder.begin(); - SkipWhitespaces(remainder); - if (start < remainder.begin()) { - llvm::StringRef skipped(start, remainder.begin() - start); - position += llvm::sys::unicode::columnWidthUTF8(skipped); - } + uint32_t width = 0; + SkipWhitespaces(remainder, width); + position += width; // Check to see if we've reached the end of our input string. if (remainder.empty()) @@ -311,7 +315,8 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, position += number.size(); return token; } else { - std::optional maybe_word = IsWord(expr, remainder); + uint32_t width = 0; + std::optional maybe_word = IsWord(expr, remainder, width); if (maybe_word) { llvm::StringRef word = *maybe_word; Token::Kind kind = llvm::StringSwitch(word) @@ -341,7 +346,7 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, .Case("wchar_t", Token::kw_wchar_t) .Default(Token::identifier); auto token = Token(kind, word.str(), position); - position += llvm::sys::unicode::columnWidthUTF8(word.str()); + position += width; return token; } } diff --git a/llvm/include/llvm/Support/Unicode.h b/llvm/include/llvm/Support/Unicode.h index 861548728d4f..2090a0377b93 100644 --- a/llvm/include/llvm/Support/Unicode.h +++ b/llvm/include/llvm/Support/Unicode.h @@ -41,6 +41,19 @@ bool isPrintable(int UCS); // Formatting codepoints are codepoints in the Cf category. bool isFormatting(int UCS); +/// Gets the number of positions a character is likely to occupy when output +/// on a terminal ("character width"). This depends on the implementation of the +/// terminal, and there's no standard definition of character width. +/// The implementation defines it in a way that is expected to be compatible +/// with a generic Unicode-capable terminal. +/// \return Character width: +/// * ErrorNonPrintableCharacter (-1) for non-printable characters (as +/// identified by isPrintable); +/// * 0 for non-spacing and enclosing combining marks; +/// * 2 for CJK characters excluding halfwidth forms; +/// * 1 for all remaining characters. +int charWidth(int UCS); + /// Gets the number of positions the UTF8-encoded \p Text is likely to occupy /// when output on a terminal ("character width"). This depends on the /// implementation of the terminal, and there's no standard definition of diff --git a/llvm/lib/Support/Unicode.cpp b/llvm/lib/Support/Unicode.cpp index 288b75c872e1..2c79632fbeec 100644 --- a/llvm/lib/Support/Unicode.cpp +++ b/llvm/lib/Support/Unicode.cpp @@ -290,18 +290,7 @@ bool isFormatting(int UCS) { return Format.contains(UCS); } -/// Gets the number of positions a character is likely to occupy when output -/// on a terminal ("character width"). This depends on the implementation of the -/// terminal, and there's no standard definition of character width. -/// The implementation defines it in a way that is expected to be compatible -/// with a generic Unicode-capable terminal. -/// \return Character width: -/// * ErrorNonPrintableCharacter (-1) for non-printable characters (as -/// identified by isPrintable); -/// * 0 for non-spacing and enclosing combining marks; -/// * 2 for CJK characters excluding halfwidth forms; -/// * 1 for all remaining characters. -static inline int charWidth(int UCS) { +int charWidth(int UCS) { if (!isPrintable(UCS)) return ErrorNonPrintableCharacter; From ce05c759f2822d875118f1eff71ccac3215a9995 Mon Sep 17 00:00:00 2001 From: Ilia Kuklin Date: Tue, 11 Mar 2025 00:00:50 +0500 Subject: [PATCH 8/9] Remove Unicode whitespaces recognition --- lldb/source/ValueObject/DILLexer.cpp | 44 ++-------------------------- lldb/unittests/DIL/DILTests.cpp | 6 ---- 2 files changed, 3 insertions(+), 47 deletions(-) diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp index a476c719789b..9b87a7cfaff8 100644 --- a/lldb/source/ValueObject/DILLexer.cpp +++ b/lldb/source/ValueObject/DILLexer.cpp @@ -16,7 +16,6 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Unicode.h" -#include "llvm/Support/UnicodeCharRanges.h" #include namespace lldb_private::dil { @@ -135,17 +134,6 @@ static bool IsLetter(char c) { static bool IsDigit(char c) { return ('0' <= c && c <= '9'); } -static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = { - {0x0085, 0x0085}, {0x00A0, 0x00A0}, {0x1680, 0x1680}, - {0x180E, 0x180E}, {0x2000, 0x200A}, {0x2028, 0x2029}, - {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}}; - -static bool IsUnicodeWhitespace(uint32_t Codepoint) { - static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( - UnicodeWhitespaceCharRanges); - return UnicodeWhitespaceChars.contains(Codepoint); -} - inline bool IsOperator(unsigned char c) { using namespace clang::charinfo; return (InfoTable[c] & (CHAR_PUNCT | CHAR_PERIOD)) != 0; @@ -157,7 +145,7 @@ static bool IsValidIdentifierContinuationCodePoint(uint32_t c) { return true; return !IsOperator(c) && !clang::isWhitespace(c); } - return !IsUnicodeWhitespace(c); + return true; } static bool IsValidIdentifierStartCodePoint(uint32_t c) { @@ -179,31 +167,6 @@ convertUTF8SequenceAndAdvance(llvm::StringRef::iterator &cur_pos, return {Status, CodePoint, size}; } -static void SkipUnicodeWhitespaces(llvm::StringRef &remainder, - uint32_t &width) { - llvm::StringRef::iterator cur_pos = remainder.begin(); - uint32_t length = 0; - while (true) { - auto [Status, CodePoint, size] = - convertUTF8SequenceAndAdvance(cur_pos, remainder.end()); - if (Status != llvm::conversionOK || !IsUnicodeWhitespace(CodePoint)) - break; - length += size; - width += llvm::sys::unicode::charWidth(CodePoint); - } - remainder = remainder.drop_front(length); -} - -static void SkipWhitespaces(llvm::StringRef &remainder, uint32_t &width) { - llvm::StringRef::iterator cur_pos; - do { - cur_pos = remainder.begin(); - remainder = remainder.ltrim(); - width += remainder.begin() - cur_pos; - SkipUnicodeWhitespaces(remainder, width); - } while (remainder.begin() != cur_pos); -} - static std::optional IsWord(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &width) { llvm::StringRef::iterator cur_pos = remainder.begin(); @@ -300,9 +263,8 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &position) { llvm::StringRef::iterator start = remainder.begin(); - uint32_t width = 0; - SkipWhitespaces(remainder, width); - position += width; + remainder = remainder.ltrim(); + position += remainder.begin() - start; // Check to see if we've reached the end of our input string. if (remainder.empty()) diff --git a/lldb/unittests/DIL/DILTests.cpp b/lldb/unittests/DIL/DILTests.cpp index 32f3f03ecd63..08dfd985a08b 100644 --- a/lldb/unittests/DIL/DILTests.cpp +++ b/lldb/unittests/DIL/DILTests.cpp @@ -3686,18 +3686,12 @@ TEST_F(EvalTest, TestUnicodeInput) { EXPECT_THAT(Eval("1 + フー"), IsEqual("2")); EXPECT_THAT(Eval("föo + 1"), IsEqual("4")); EXPECT_THAT(Eval("שלום + 1"), IsEqual("5")); - EXPECT_THAT(Eval(" 1 +   föo   "), // Contains Unicode whitespaces - IsEqual("4")); // Check diagnostic pointer location EXPECT_THAT(Eval("фу + бар"), IsError(": use of undeclared identifier 'бар'\n" "фу + бар\n" " ^")); - EXPECT_THAT(Eval("фу + бар"), // Wide Unicode whitespaces - IsError(": use of undeclared identifier 'бар'\n" - "фу + бар\n" - " ^")); EXPECT_THAT(Eval("フー + бар"), IsError(": use of undeclared identifier 'бар'\n" "フー + бар\n" From daf1735eba492838416d8dbe8ceff6b7f11b9c59 Mon Sep 17 00:00:00 2001 From: Ilia Kuklin Date: Tue, 11 Mar 2025 20:58:29 +0500 Subject: [PATCH 9/9] Replace unicode codepoint conversion with UTF8 validity check --- lldb/source/ValueObject/DILLexer.cpp | 80 +++++++++++----------------- llvm/include/llvm/Support/Unicode.h | 13 ----- llvm/lib/Support/Unicode.cpp | 13 ++++- 3 files changed, 43 insertions(+), 63 deletions(-) diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp index 9b87a7cfaff8..88ef57cc42f6 100644 --- a/lldb/source/ValueObject/DILLexer.cpp +++ b/lldb/source/ValueObject/DILLexer.cpp @@ -139,59 +139,42 @@ inline bool IsOperator(unsigned char c) { return (InfoTable[c] & (CHAR_PUNCT | CHAR_PERIOD)) != 0; } -static bool IsValidIdentifierContinuationCodePoint(uint32_t c) { - if (c < 0x80) { - if (c == '$') - return true; - return !IsOperator(c) && !clang::isWhitespace(c); - } - return true; -} - -static bool IsValidIdentifierStartCodePoint(uint32_t c) { - if (!IsValidIdentifierContinuationCodePoint(c)) - return false; - if (c < 0x80 && IsDigit(c)) - return false; - return true; +static bool IsValidIdentifierContinuation(char c) { + if (c == '$') + return true; + return !IsOperator(c) && !clang::isWhitespace(c); } -static std::tuple -convertUTF8SequenceAndAdvance(llvm::StringRef::iterator &cur_pos, - llvm::StringRef::iterator end) { - llvm::UTF32 CodePoint; - uint32_t size = llvm::getNumBytesForUTF8(*cur_pos); - llvm::ConversionResult Status = llvm::convertUTF8Sequence( - (const llvm::UTF8 **)&cur_pos, (const llvm::UTF8 *)end, &CodePoint, - llvm::strictConversion); - return {Status, CodePoint, size}; -} - -static std::optional -IsWord(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &width) { +static std::optional IsWord(llvm::StringRef &remainder) { llvm::StringRef::iterator cur_pos = remainder.begin(); llvm::StringRef::iterator start = cur_pos; - auto [Status, CodePoint, size] = - convertUTF8SequenceAndAdvance(cur_pos, remainder.end()); - if (Status == llvm::conversionOK && - IsValidIdentifierStartCodePoint(CodePoint)) { - width = llvm::sys::unicode::charWidth(CodePoint); - unsigned length = size; - while (true) { - auto [Status, CodePoint, size] = - convertUTF8SequenceAndAdvance(cur_pos, remainder.end()); - if (Status != llvm::conversionOK || - !IsValidIdentifierContinuationCodePoint(CodePoint)) + + if (IsDigit(*cur_pos)) + return std::nullopt; + + while (cur_pos < remainder.end()) { + uint8_t c = *cur_pos; + if (c < 0x80) { + if (IsValidIdentifierContinuation(c)) { + cur_pos++; + continue; + } else break; - width += llvm::sys::unicode::charWidth(CodePoint); - length += size; } - - remainder = remainder.drop_front(length); - llvm::StringRef utf_token(start, length); - return utf_token; + if (llvm::isLegalUTF8Sequence((const llvm::UTF8 *)cur_pos, + (const llvm::UTF8 *)remainder.end())) { + cur_pos += llvm::getNumBytesForUTF8(*cur_pos); + continue; + } + break; } - return std::nullopt; + + if (cur_pos == start) + return std::nullopt; + + auto length = cur_pos - start; + remainder = remainder.drop_front(length); + return llvm::StringRef(start, length); } static void ConsumeNumberBody(uint32_t &length, char &prev_ch, @@ -277,8 +260,7 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, position += number.size(); return token; } else { - uint32_t width = 0; - std::optional maybe_word = IsWord(expr, remainder, width); + std::optional maybe_word = IsWord(remainder); if (maybe_word) { llvm::StringRef word = *maybe_word; Token::Kind kind = llvm::StringSwitch(word) @@ -308,7 +290,7 @@ llvm::Expected DILLexer::Lex(llvm::StringRef expr, .Case("wchar_t", Token::kw_wchar_t) .Default(Token::identifier); auto token = Token(kind, word.str(), position); - position += width; + position += llvm::sys::unicode::columnWidthUTF8(word.str()); return token; } } diff --git a/llvm/include/llvm/Support/Unicode.h b/llvm/include/llvm/Support/Unicode.h index 2090a0377b93..861548728d4f 100644 --- a/llvm/include/llvm/Support/Unicode.h +++ b/llvm/include/llvm/Support/Unicode.h @@ -41,19 +41,6 @@ bool isPrintable(int UCS); // Formatting codepoints are codepoints in the Cf category. bool isFormatting(int UCS); -/// Gets the number of positions a character is likely to occupy when output -/// on a terminal ("character width"). This depends on the implementation of the -/// terminal, and there's no standard definition of character width. -/// The implementation defines it in a way that is expected to be compatible -/// with a generic Unicode-capable terminal. -/// \return Character width: -/// * ErrorNonPrintableCharacter (-1) for non-printable characters (as -/// identified by isPrintable); -/// * 0 for non-spacing and enclosing combining marks; -/// * 2 for CJK characters excluding halfwidth forms; -/// * 1 for all remaining characters. -int charWidth(int UCS); - /// Gets the number of positions the UTF8-encoded \p Text is likely to occupy /// when output on a terminal ("character width"). This depends on the /// implementation of the terminal, and there's no standard definition of diff --git a/llvm/lib/Support/Unicode.cpp b/llvm/lib/Support/Unicode.cpp index 2c79632fbeec..288b75c872e1 100644 --- a/llvm/lib/Support/Unicode.cpp +++ b/llvm/lib/Support/Unicode.cpp @@ -290,7 +290,18 @@ bool isFormatting(int UCS) { return Format.contains(UCS); } -int charWidth(int UCS) { +/// Gets the number of positions a character is likely to occupy when output +/// on a terminal ("character width"). This depends on the implementation of the +/// terminal, and there's no standard definition of character width. +/// The implementation defines it in a way that is expected to be compatible +/// with a generic Unicode-capable terminal. +/// \return Character width: +/// * ErrorNonPrintableCharacter (-1) for non-printable characters (as +/// identified by isPrintable); +/// * 0 for non-spacing and enclosing combining marks; +/// * 2 for CJK characters excluding halfwidth forms; +/// * 1 for all remaining characters. +static inline int charWidth(int UCS) { if (!isPrintable(UCS)) return ErrorNonPrintableCharacter;