From 56aa0dec7ce284626431e9f03b9c470cf8b60565 Mon Sep 17 00:00:00 2001
From: Ilia Kuklin <ikuklin@accesssoftek.com>
Date: Fri, 21 Feb 2025 23:52:30 +0500
Subject: [PATCH 1/9] Add basic support for UTF-8 identifiers

---
 lldb/include/lldb/ValueObject/DILLexer.h |   4 +-
 lldb/source/ValueObject/DILLexer.cpp     | 126 +++++++++++++++++++++--
 lldb/unittests/DIL/DILTests.cpp          |   9 ++
 lldb/unittests/DIL/Inputs/test_binary.cc |   8 ++
 4 files changed, 135 insertions(+), 12 deletions(-)
diff --git a/lldb/include/lldb/ValueObject/DILLexer.h b/lldb/include/lldb/ValueObject/DILLexer.h
index 02c4e4611e02..e45057c9b84d 100644
--- a/lldb/include/lldb/ValueObject/DILLexer.h
+++ b/lldb/include/lldb/ValueObject/DILLexer.h
@@ -227,8 +227,8 @@ class DILLexer {
       m_expr(dil_expr), m_lexed_tokens(std::move(lexed_tokens)),
       m_tokens_idx(0) {}
 
-  static llvm::Expected<Token> Lex(llvm::StringRef expr,
-                                   llvm::StringRef &remainder);
+  static llvm::Expected<Token>
+  Lex(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &position);
 
   bool IsStringLiteral(Token::Kind kind) {
     return (kind == Token::string_literal ||
diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
index f41889fef921..dd5f1f463694 100644
--- a/lldb/source/ValueObject/DILLexer.cpp
+++ b/lldb/source/ValueObject/DILLexer.cpp
@@ -12,8 +12,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "lldb/ValueObject/DILLexer.h"
+#include "clang/Basic/CharInfo.h"
 //#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/UnicodeCharRanges.h"
 
 namespace lldb_private::dil {
 
@@ -45,6 +48,69 @@ const llvm::StringMap<Token::Kind> Keywords = {
     {"wchar_t", Token::kw_wchar_t}};
 */
 
+using clang::isDigit;
+static bool isValidIdentifierContinuationCodePoint(uint32_t c) {
+  // N1518: Recommendations for extended identifier characters for C and C++
+  // Proposed Annex X.1: Ranges of characters allowed
+  return c == 0x00A8 || c == 0x00AA || c == 0x00AD || c == 0x00AF ||
+         (c >= 0x00B2 && c <= 0x00B5) || (c >= 0x00B7 && c <= 0x00BA) ||
+         (c >= 0x00BC && c <= 0x00BE) || (c >= 0x00C0 && c <= 0x00D6) ||
+         (c >= 0x00D8 && c <= 0x00F6) || (c >= 0x00F8 && c <= 0x00FF)
+
+         || (c >= 0x0100 && c <= 0x167F) || (c >= 0x1681 && c <= 0x180D) ||
+         (c >= 0x180F && c <= 0x1FFF)
+
+         || (c >= 0x200B && c <= 0x200D) || (c >= 0x202A && c <= 0x202E) ||
+         (c >= 0x203F && c <= 0x2040) || c == 0x2054 ||
+         (c >= 0x2060 && c <= 0x206F)
+
+         || (c >= 0x2070 && c <= 0x218F) || (c >= 0x2460 && c <= 0x24FF) ||
+         (c >= 0x2776 && c <= 0x2793) || (c >= 0x2C00 && c <= 0x2DFF) ||
+         (c >= 0x2E80 && c <= 0x2FFF)
+
+         || (c >= 0x3004 && c <= 0x3007) || (c >= 0x3021 && c <= 0x302F) ||
+         (c >= 0x3031 && c <= 0x303F)
+
+         || (c >= 0x3040 && c <= 0xD7FF)
+
+         || (c >= 0xF900 && c <= 0xFD3D) || (c >= 0xFD40 && c <= 0xFDCF) ||
+         (c >= 0xFDF0 && c <= 0xFE44) || (c >= 0xFE47 && c <= 0xFFF8)
+
+         || (c >= 0x10000 && c <= 0x1FFFD) || (c >= 0x20000 && c <= 0x2FFFD) ||
+         (c >= 0x30000 && c <= 0x3FFFD) || (c >= 0x40000 && c <= 0x4FFFD) ||
+         (c >= 0x50000 && c <= 0x5FFFD) || (c >= 0x60000 && c <= 0x6FFFD) ||
+         (c >= 0x70000 && c <= 0x7FFFD) || (c >= 0x80000 && c <= 0x8FFFD) ||
+         (c >= 0x90000 && c <= 0x9FFFD) || (c >= 0xA0000 && c <= 0xAFFFD) ||
+         (c >= 0xB0000 && c <= 0xBFFFD) || (c >= 0xC0000 && c <= 0xCFFFD) ||
+         (c >= 0xD0000 && c <= 0xDFFFD) || (c >= 0xE0000 && c <= 0xEFFFD);
+}
+
+static bool isValidIdentifierStartCodePoint(uint32_t c) {
+  if (!isValidIdentifierContinuationCodePoint(c))
+    return false;
+  if (c < 0x80 && (isDigit(c) || c == '$'))
+    return false;
+
+  // N1518: Recommendations for extended identifier characters for C and C++
+  // Proposed Annex X.2: Ranges of characters disallowed initially
+  if ((c >= 0x0300 && c <= 0x036F) || (c >= 0x1DC0 && c <= 0x1DFF) ||
+      (c >= 0x20D0 && c <= 0x20FF) || (c >= 0xFE20 && c <= 0xFE2F))
+    return false;
+
+  return true;
+}
+
+static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = {
+    {0x0085, 0x0085}, {0x00A0, 0x00A0}, {0x1680, 0x1680},
+    {0x180E, 0x180E}, {0x2000, 0x200A}, {0x2028, 0x2029},
+    {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}};
+
+static bool isUnicodeWhitespace(uint32_t Codepoint) {
+  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
+      UnicodeWhitespaceCharRanges);
+  return UnicodeWhitespaceChars.contains(Codepoint);
+}
+
 llvm::StringRef Token::GetTokenName(Kind kind) {
   switch (kind){
     case Token::amp: return "amp";
@@ -224,8 +290,9 @@ static std::optional<llvm::StringRef> IsNumber(llvm::StringRef expr,
 llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
   std::vector<Token> tokens;
   llvm::StringRef remainder = expr;
+  uint32_t position = 0;
   do {
-    if (llvm::Expected<Token> t = Lex(expr, remainder)) {
+    if (llvm::Expected<Token> t = Lex(expr, remainder, position)) {
       tokens.push_back(std::move(*t));
     } else {
       return t.takeError();
@@ -234,23 +301,27 @@ llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
   return DILLexer(expr, std::move(tokens));
 }
 
-
 llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
-                                    llvm::StringRef &remainder) {
+                                    llvm::StringRef &remainder,
+                                    uint32_t &position) {
+  llvm::StringRef::iterator cur_pos = remainder.begin();
   // Skip over whitespace (spaces).
   remainder = remainder.ltrim();
-  llvm::StringRef::iterator cur_pos = remainder.begin();
+  position += remainder.begin() - cur_pos;
+  cur_pos = remainder.begin();
 
   // Check to see if we've reached the end of our input string.
   if (remainder.empty())
-    return Token(Token::eof, "", (uint32_t)expr.size());
+    return Token(Token::eof, "", position);
 
-  uint32_t position = cur_pos - expr.begin();;
+  // uint32_t position = cur_pos - expr.begin();;
   llvm::StringRef::iterator start = cur_pos;
   std::optional<llvm::StringRef> maybe_number = IsNumber(expr, remainder);
   if (maybe_number) {
     std::string number = (*maybe_number).str();
-    return Token(Token::numeric_constant, number, position);
+    auto token = Token(Token::numeric_constant, number, position);
+    position += number.size();
+    return token;
   } else {
     std::optional<llvm::StringRef> maybe_word = IsWord(expr, remainder);
     if (maybe_word) {
@@ -281,7 +352,9 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
                             .Case("volatile", Token::kw_volatile)
                             .Case("wchar_t", Token::kw_wchar_t)
                             .Default(Token::identifier);
-      return Token(kind, word.str(), (uint32_t)position);
+      auto token = Token(kind, word.str(), position);
+      position += word.size();
+      return token;
     }
   }
 
@@ -332,8 +405,41 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
     {Token::tilde, "~"},
   };
   for (auto [kind, str] : operators) {
-    if (remainder.consume_front(str))
-      return Token(kind, str, position);
+    if (remainder.consume_front(str)) {
+      auto token = Token(kind, str, position);
+      position += strlen(str);
+      return token;
+    }
+  }
+
+  cur_pos = start;
+  llvm::UTF32 CodePoint;
+  llvm::ConversionResult Status;
+  unsigned size = llvm::getNumBytesForUTF8(*cur_pos);
+  Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&cur_pos,
+                                     (const llvm::UTF8 *)remainder.end(),
+                                     &CodePoint, llvm::strictConversion);
+  if (Status == llvm::conversionOK &&
+      isValidIdentifierStartCodePoint(CodePoint)) {
+    unsigned utf_length = 1;
+    unsigned length = size;
+    while (true) {
+      size = llvm::getNumBytesForUTF8(*cur_pos);
+      Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&cur_pos,
+                                         (const llvm::UTF8 *)remainder.end(),
+                                         &CodePoint, llvm::strictConversion);
+      if (Status != llvm::conversionOK ||
+          !isValidIdentifierContinuationCodePoint(CodePoint))
+        break;
+      utf_length++;
+      length += size;
+    }
+
+    remainder = remainder.drop_front(length);
+    llvm::StringRef utf_token(start, length);
+    auto token = Token(Token::identifier, utf_token.str(), position);
+    position += utf_length;
+    return token;
   }
 
   // Unrecognized character(s) in string; unable to lex it.
diff --git a/lldb/unittests/DIL/DILTests.cpp b/lldb/unittests/DIL/DILTests.cpp
index 8cbdbb1fdc3b..1e62a85eeb0c 100644
--- a/lldb/unittests/DIL/DILTests.cpp
+++ b/lldb/unittests/DIL/DILTests.cpp
@@ -3680,3 +3680,12 @@ TEST_F(EvalTest, DISABLED_TestStringParsing) {
   EXPECT_THAT(Eval("*\"abc\""), IsError("string literals are not supported"));
 }
 #endif
+
+TEST_F(EvalTest, TestUnicodeIdentifiers) {
+  EXPECT_THAT(Eval("フー + 1"), IsEqual("2"));
+  EXPECT_THAT(Eval("1 + フー"), IsEqual("2"));
+  EXPECT_THAT(Eval("фу + бар"),
+              IsError("<expr:1:6>: use of undeclared identifier 'бар'\n"
+                      "фу + бар\n"
+                      "     ^"));
+}
\ No newline at end of file
diff --git a/lldb/unittests/DIL/Inputs/test_binary.cc b/lldb/unittests/DIL/Inputs/test_binary.cc
index 64ef8dc217c3..57007ffaf5ba 100644
--- a/lldb/unittests/DIL/Inputs/test_binary.cc
+++ b/lldb/unittests/DIL/Inputs/test_binary.cc
@@ -1196,6 +1196,13 @@ static void TestStringParsing() {
   // BREAK(TestStringParsing)
 }
 
+static void TestUnicodeIdentifiers() {
+  int フー = 1;
+  int фу = 2;
+  int föo = 3;
+  // BREAK(TestUnicodeIdentifiers)
+}
+
 namespace test_binary {
 
 void main() {
@@ -1250,6 +1257,7 @@ void main() {
 
   TestCharParsing();
   TestStringParsing();
+  TestUnicodeIdentifiers();
 
   // BREAK HERE
 }

From 97fa50915143a001afd51925ad562199b6747b33 Mon Sep 17 00:00:00 2001
From: Ilia Kuklin <ikuklin@accesssoftek.com>
Date: Mon, 24 Feb 2025 22:18:56 +0500
Subject: [PATCH 2/9] Replace IsWord with identifier UTF recognition and add
 ASCII characters to it

---
 lldb/source/ValueObject/DILLexer.cpp | 84 +++++++++++++---------------
 lldb/unittests/DIL/DILTests.cpp      |  3 +-
 2 files changed, 41 insertions(+), 46 deletions(-)

diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
index dd5f1f463694..c0d042e474bc 100644
--- a/lldb/source/ValueObject/DILLexer.cpp
+++ b/lldb/source/ValueObject/DILLexer.cpp
@@ -48,8 +48,10 @@ const llvm::StringMap<Token::Kind> Keywords = {
     {"wchar_t", Token::kw_wchar_t}};
 */
 
-using clang::isDigit;
 static bool isValidIdentifierContinuationCodePoint(uint32_t c) {
+  if (c < 0x80)
+    return clang::isAsciiIdentifierContinue(c, /*dollar*/ true);
+
   // N1518: Recommendations for extended identifier characters for C and C++
   // Proposed Annex X.1: Ranges of characters allowed
   return c == 0x00A8 || c == 0x00AA || c == 0x00AD || c == 0x00AF ||
@@ -88,7 +90,7 @@ static bool isValidIdentifierContinuationCodePoint(uint32_t c) {
 static bool isValidIdentifierStartCodePoint(uint32_t c) {
   if (!isValidIdentifierContinuationCodePoint(c))
     return false;
-  if (c < 0x80 && (isDigit(c) || c == '$'))
+  if (c < 0x80 && clang::isDigit(c))
     return false;
 
   // N1518: Recommendations for extended identifier characters for C and C++
@@ -225,16 +227,37 @@ static bool IsLetter (char c) {
 
 static bool IsDigit (char c) { return ('0' <= c && c <= '9'); }
 
-static std::optional<llvm::StringRef> IsWord(llvm::StringRef expr,
-                                             llvm::StringRef &remainder) {
-  // Find the longest prefix consisting of letters, digits, underscors and
-  // '$'. If it doesn't start with a digit, then it's a word.
-  llvm::StringRef candidate = remainder.take_while(
-      [](char c) { return IsDigit(c) || IsLetter(c) || c == '_' || c == '$'; });
-  if (candidate.empty() || IsDigit(candidate[0]))
-    return std::nullopt;
-  remainder = remainder.drop_front(candidate.size());
-  return candidate;
+static std::optional<llvm::StringRef>
+IsWord(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &utf_length) {
+  llvm::StringRef::iterator cur_pos = remainder.begin();
+  llvm::StringRef::iterator start = cur_pos;
+  llvm::UTF32 CodePoint;
+  llvm::ConversionResult Status;
+  unsigned size = llvm::getNumBytesForUTF8(*cur_pos);
+  Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&cur_pos,
+                                     (const llvm::UTF8 *)remainder.end(),
+                                     &CodePoint, llvm::strictConversion);
+  if (Status == llvm::conversionOK &&
+      isValidIdentifierStartCodePoint(CodePoint)) {
+    utf_length = 1;
+    unsigned length = size;
+    while (true) {
+      size = llvm::getNumBytesForUTF8(*cur_pos);
+      Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&cur_pos,
+                                         (const llvm::UTF8 *)remainder.end(),
+                                         &CodePoint, llvm::strictConversion);
+      if (Status != llvm::conversionOK ||
+          !isValidIdentifierContinuationCodePoint(CodePoint))
+        break;
+      utf_length++;
+      length += size;
+    }
+
+    remainder = remainder.drop_front(length);
+    llvm::StringRef utf_token(start, length);
+    return utf_token;
+  }
+  return std::nullopt;
 }
 
 static void ConsumeNumberBody(uint32_t &length, char &prev_ch,
@@ -314,7 +337,6 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
   if (remainder.empty())
     return Token(Token::eof, "", position);
 
-  // uint32_t position = cur_pos - expr.begin();;
   llvm::StringRef::iterator start = cur_pos;
   std::optional<llvm::StringRef> maybe_number = IsNumber(expr, remainder);
   if (maybe_number) {
@@ -323,7 +345,9 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
     position += number.size();
     return token;
   } else {
-    std::optional<llvm::StringRef> maybe_word = IsWord(expr, remainder);
+    uint32_t utf_length = 0;
+    std::optional<llvm::StringRef> maybe_word =
+        IsWord(expr, remainder, utf_length);
     if (maybe_word) {
       llvm::StringRef word = *maybe_word;
       Token::Kind kind = llvm::StringSwitch<Token::Kind>(word)
@@ -353,7 +377,7 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
                             .Case("wchar_t", Token::kw_wchar_t)
                             .Default(Token::identifier);
       auto token = Token(kind, word.str(), position);
-      position += word.size();
+      position += utf_length;
       return token;
     }
   }
@@ -412,36 +436,6 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
     }
   }
 
-  cur_pos = start;
-  llvm::UTF32 CodePoint;
-  llvm::ConversionResult Status;
-  unsigned size = llvm::getNumBytesForUTF8(*cur_pos);
-  Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&cur_pos,
-                                     (const llvm::UTF8 *)remainder.end(),
-                                     &CodePoint, llvm::strictConversion);
-  if (Status == llvm::conversionOK &&
-      isValidIdentifierStartCodePoint(CodePoint)) {
-    unsigned utf_length = 1;
-    unsigned length = size;
-    while (true) {
-      size = llvm::getNumBytesForUTF8(*cur_pos);
-      Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&cur_pos,
-                                         (const llvm::UTF8 *)remainder.end(),
-                                         &CodePoint, llvm::strictConversion);
-      if (Status != llvm::conversionOK ||
-          !isValidIdentifierContinuationCodePoint(CodePoint))
-        break;
-      utf_length++;
-      length += size;
-    }
-
-    remainder = remainder.drop_front(length);
-    llvm::StringRef utf_token(start, length);
-    auto token = Token(Token::identifier, utf_token.str(), position);
-    position += utf_length;
-    return token;
-  }
-
   // Unrecognized character(s) in string; unable to lex it.
   return llvm::createStringError("Unable to lex input string");
 }
diff --git a/lldb/unittests/DIL/DILTests.cpp b/lldb/unittests/DIL/DILTests.cpp
index 1e62a85eeb0c..b9f51d9437c1 100644
--- a/lldb/unittests/DIL/DILTests.cpp
+++ b/lldb/unittests/DIL/DILTests.cpp
@@ -3683,7 +3683,8 @@ TEST_F(EvalTest, DISABLED_TestStringParsing) {
 
 TEST_F(EvalTest, TestUnicodeIdentifiers) {
   EXPECT_THAT(Eval("フー + 1"), IsEqual("2"));
-  EXPECT_THAT(Eval("1 + フー"), IsEqual("2"));
+  EXPECT_THAT(Eval("2 + フー"), IsEqual("3"));
+  EXPECT_THAT(Eval("föo + 1"), IsEqual("4"));
   EXPECT_THAT(Eval("фу + бар"),
               IsError("<expr:1:6>: use of undeclared identifier 'бар'\n"
                       "фу + бар\n"

From 526df72545729266c35fafd7200481c6edf22b4d Mon Sep 17 00:00:00 2001
From: Ilia Kuklin <ikuklin@accesssoftek.com>
Date: Tue, 25 Feb 2025 19:01:01 +0500
Subject: [PATCH 3/9] Refactor code & add unicode whitespace skipping

---
 lldb/source/ValueObject/DILLexer.cpp | 189 +++++++++++++++------------
 lldb/unittests/DIL/DILLexerTests.cpp |   4 +-
 lldb/unittests/DIL/DILTests.cpp      |   4 +-
 3 files changed, 112 insertions(+), 85 deletions(-)

diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
index c0d042e474bc..564d3b02029d 100644
--- a/lldb/source/ValueObject/DILLexer.cpp
+++ b/lldb/source/ValueObject/DILLexer.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/UnicodeCharRanges.h"
+#include <tuple>
 
 namespace lldb_private::dil {
 
@@ -48,71 +49,6 @@ const llvm::StringMap<Token::Kind> Keywords = {
     {"wchar_t", Token::kw_wchar_t}};
 */
 
-static bool isValidIdentifierContinuationCodePoint(uint32_t c) {
-  if (c < 0x80)
-    return clang::isAsciiIdentifierContinue(c, /*dollar*/ true);
-
-  // N1518: Recommendations for extended identifier characters for C and C++
-  // Proposed Annex X.1: Ranges of characters allowed
-  return c == 0x00A8 || c == 0x00AA || c == 0x00AD || c == 0x00AF ||
-         (c >= 0x00B2 && c <= 0x00B5) || (c >= 0x00B7 && c <= 0x00BA) ||
-         (c >= 0x00BC && c <= 0x00BE) || (c >= 0x00C0 && c <= 0x00D6) ||
-         (c >= 0x00D8 && c <= 0x00F6) || (c >= 0x00F8 && c <= 0x00FF)
-
-         || (c >= 0x0100 && c <= 0x167F) || (c >= 0x1681 && c <= 0x180D) ||
-         (c >= 0x180F && c <= 0x1FFF)
-
-         || (c >= 0x200B && c <= 0x200D) || (c >= 0x202A && c <= 0x202E) ||
-         (c >= 0x203F && c <= 0x2040) || c == 0x2054 ||
-         (c >= 0x2060 && c <= 0x206F)
-
-         || (c >= 0x2070 && c <= 0x218F) || (c >= 0x2460 && c <= 0x24FF) ||
-         (c >= 0x2776 && c <= 0x2793) || (c >= 0x2C00 && c <= 0x2DFF) ||
-         (c >= 0x2E80 && c <= 0x2FFF)
-
-         || (c >= 0x3004 && c <= 0x3007) || (c >= 0x3021 && c <= 0x302F) ||
-         (c >= 0x3031 && c <= 0x303F)
-
-         || (c >= 0x3040 && c <= 0xD7FF)
-
-         || (c >= 0xF900 && c <= 0xFD3D) || (c >= 0xFD40 && c <= 0xFDCF) ||
-         (c >= 0xFDF0 && c <= 0xFE44) || (c >= 0xFE47 && c <= 0xFFF8)
-
-         || (c >= 0x10000 && c <= 0x1FFFD) || (c >= 0x20000 && c <= 0x2FFFD) ||
-         (c >= 0x30000 && c <= 0x3FFFD) || (c >= 0x40000 && c <= 0x4FFFD) ||
-         (c >= 0x50000 && c <= 0x5FFFD) || (c >= 0x60000 && c <= 0x6FFFD) ||
-         (c >= 0x70000 && c <= 0x7FFFD) || (c >= 0x80000 && c <= 0x8FFFD) ||
-         (c >= 0x90000 && c <= 0x9FFFD) || (c >= 0xA0000 && c <= 0xAFFFD) ||
-         (c >= 0xB0000 && c <= 0xBFFFD) || (c >= 0xC0000 && c <= 0xCFFFD) ||
-         (c >= 0xD0000 && c <= 0xDFFFD) || (c >= 0xE0000 && c <= 0xEFFFD);
-}
-
-static bool isValidIdentifierStartCodePoint(uint32_t c) {
-  if (!isValidIdentifierContinuationCodePoint(c))
-    return false;
-  if (c < 0x80 && clang::isDigit(c))
-    return false;
-
-  // N1518: Recommendations for extended identifier characters for C and C++
-  // Proposed Annex X.2: Ranges of characters disallowed initially
-  if ((c >= 0x0300 && c <= 0x036F) || (c >= 0x1DC0 && c <= 0x1DFF) ||
-      (c >= 0x20D0 && c <= 0x20FF) || (c >= 0xFE20 && c <= 0xFE2F))
-    return false;
-
-  return true;
-}
-
-static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = {
-    {0x0085, 0x0085}, {0x00A0, 0x00A0}, {0x1680, 0x1680},
-    {0x180E, 0x180E}, {0x2000, 0x200A}, {0x2028, 0x2029},
-    {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}};
-
-static bool isUnicodeWhitespace(uint32_t Codepoint) {
-  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
-      UnicodeWhitespaceCharRanges);
-  return UnicodeWhitespaceChars.contains(Codepoint);
-}
-
 llvm::StringRef Token::GetTokenName(Kind kind) {
   switch (kind){
     case Token::amp: return "amp";
@@ -221,31 +157,123 @@ llvm::StringRef Token::GetTokenName(Kind kind) {
   }
 }
 
-static bool IsLetter (char c) {
+static bool IsLetter(char c) {
   return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
 }
 
-static bool IsDigit (char c) { return ('0' <= c && c <= '9'); }
+static bool IsDigit(char c) { return ('0' <= c && c <= '9'); }
+
+static bool isValidIdentifierContinuationCodePoint(uint32_t c) {
+  if (c < 0x80)
+    return clang::isAsciiIdentifierContinue(c, /*dollar*/ true);
+
+  // N1518: Recommendations for extended identifier characters for C and C++
+  // Proposed Annex X.1: Ranges of characters allowed
+  return c == 0x00A8 || c == 0x00AA || c == 0x00AD || c == 0x00AF ||
+         (c >= 0x00B2 && c <= 0x00B5) || (c >= 0x00B7 && c <= 0x00BA) ||
+         (c >= 0x00BC && c <= 0x00BE) || (c >= 0x00C0 && c <= 0x00D6) ||
+         (c >= 0x00D8 && c <= 0x00F6) || (c >= 0x00F8 && c <= 0x00FF)
+
+         || (c >= 0x0100 && c <= 0x167F) || (c >= 0x1681 && c <= 0x180D) ||
+         (c >= 0x180F && c <= 0x1FFF)
+
+         || (c >= 0x200B && c <= 0x200D) || (c >= 0x202A && c <= 0x202E) ||
+         (c >= 0x203F && c <= 0x2040) || c == 0x2054 ||
+         (c >= 0x2060 && c <= 0x206F)
+
+         || (c >= 0x2070 && c <= 0x218F) || (c >= 0x2460 && c <= 0x24FF) ||
+         (c >= 0x2776 && c <= 0x2793) || (c >= 0x2C00 && c <= 0x2DFF) ||
+         (c >= 0x2E80 && c <= 0x2FFF)
+
+         || (c >= 0x3004 && c <= 0x3007) || (c >= 0x3021 && c <= 0x302F) ||
+         (c >= 0x3031 && c <= 0x303F)
+
+         || (c >= 0x3040 && c <= 0xD7FF)
+
+         || (c >= 0xF900 && c <= 0xFD3D) || (c >= 0xFD40 && c <= 0xFDCF) ||
+         (c >= 0xFDF0 && c <= 0xFE44) || (c >= 0xFE47 && c <= 0xFFF8)
+
+         || (c >= 0x10000 && c <= 0x1FFFD) || (c >= 0x20000 && c <= 0x2FFFD) ||
+         (c >= 0x30000 && c <= 0x3FFFD) || (c >= 0x40000 && c <= 0x4FFFD) ||
+         (c >= 0x50000 && c <= 0x5FFFD) || (c >= 0x60000 && c <= 0x6FFFD) ||
+         (c >= 0x70000 && c <= 0x7FFFD) || (c >= 0x80000 && c <= 0x8FFFD) ||
+         (c >= 0x90000 && c <= 0x9FFFD) || (c >= 0xA0000 && c <= 0xAFFFD) ||
+         (c >= 0xB0000 && c <= 0xBFFFD) || (c >= 0xC0000 && c <= 0xCFFFD) ||
+         (c >= 0xD0000 && c <= 0xDFFFD) || (c >= 0xE0000 && c <= 0xEFFFD);
+}
+
+static bool isValidIdentifierStartCodePoint(uint32_t c) {
+  if (!isValidIdentifierContinuationCodePoint(c))
+    return false;
+  if (c < 0x80 && IsDigit(c))
+    return false;
+
+  // N1518: Recommendations for extended identifier characters for C and C++
+  // Proposed Annex X.2: Ranges of characters disallowed initially
+  if ((c >= 0x0300 && c <= 0x036F) || (c >= 0x1DC0 && c <= 0x1DFF) ||
+      (c >= 0x20D0 && c <= 0x20FF) || (c >= 0xFE20 && c <= 0xFE2F))
+    return false;
+
+  return true;
+}
+
+static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = {
+    {0x0085, 0x0085}, {0x00A0, 0x00A0}, {0x1680, 0x1680},
+    {0x180E, 0x180E}, {0x2000, 0x200A}, {0x2028, 0x2029},
+    {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}};
+
+static bool isUnicodeWhitespace(uint32_t Codepoint) {
+  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
+      UnicodeWhitespaceCharRanges);
+  return UnicodeWhitespaceChars.contains(Codepoint);
+}
+
+static std::tuple<llvm::ConversionResult, llvm::UTF32, uint32_t>
+convertUTF8SequenceAndAdvance(llvm::StringRef::iterator &cur_pos,
+                              llvm::StringRef::iterator end) {
+  llvm::UTF32 CodePoint;
+  uint32_t size = llvm::getNumBytesForUTF8(*cur_pos);
+  llvm::ConversionResult Status = llvm::convertUTF8Sequence(
+      (const llvm::UTF8 **)&cur_pos, (const llvm::UTF8 *)end, &CodePoint,
+      llvm::strictConversion);
+  return {Status, CodePoint, size};
+}
+
+static void SkipUnicodeWhitespaces(llvm::StringRef &remainder) {
+  llvm::StringRef::iterator cur_pos = remainder.begin();
+  uint32_t length = 0;
+  while (true) {
+    auto [Status, CodePoint, size] =
+        convertUTF8SequenceAndAdvance(cur_pos, remainder.end());
+    if (Status != llvm::conversionOK || !isUnicodeWhitespace(CodePoint))
+      break;
+    length += size;
+  }
+  remainder = remainder.drop_front(length);
+}
+
+static void SkipWhitespaces(llvm::StringRef &remainder) {
+  llvm::StringRef::iterator cur_pos;
+  do {
+    cur_pos = remainder.begin();
+    remainder = remainder.ltrim();
+    SkipUnicodeWhitespaces(remainder);
+  } while (remainder.begin() != cur_pos);
+}
 
 static std::optional<llvm::StringRef>
 IsWord(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &utf_length) {
   llvm::StringRef::iterator cur_pos = remainder.begin();
   llvm::StringRef::iterator start = cur_pos;
-  llvm::UTF32 CodePoint;
-  llvm::ConversionResult Status;
-  unsigned size = llvm::getNumBytesForUTF8(*cur_pos);
-  Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&cur_pos,
-                                     (const llvm::UTF8 *)remainder.end(),
-                                     &CodePoint, llvm::strictConversion);
+  auto [Status, CodePoint, size] =
+      convertUTF8SequenceAndAdvance(cur_pos, remainder.end());
   if (Status == llvm::conversionOK &&
       isValidIdentifierStartCodePoint(CodePoint)) {
     utf_length = 1;
     unsigned length = size;
     while (true) {
-      size = llvm::getNumBytesForUTF8(*cur_pos);
-      Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&cur_pos,
-                                         (const llvm::UTF8 *)remainder.end(),
-                                         &CodePoint, llvm::strictConversion);
+      auto [Status, CodePoint, size] =
+          convertUTF8SequenceAndAdvance(cur_pos, remainder.end());
       if (Status != llvm::conversionOK ||
           !isValidIdentifierContinuationCodePoint(CodePoint))
         break;
@@ -328,16 +356,14 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
                                     llvm::StringRef &remainder,
                                     uint32_t &position) {
   llvm::StringRef::iterator cur_pos = remainder.begin();
-  // Skip over whitespace (spaces).
-  remainder = remainder.ltrim();
+  SkipWhitespaces(remainder);
   position += remainder.begin() - cur_pos;
-  cur_pos = remainder.begin();
 
   // Check to see if we've reached the end of our input string.
   if (remainder.empty())
     return Token(Token::eof, "", position);
 
-  llvm::StringRef::iterator start = cur_pos;
+  cur_pos = remainder.begin();
   std::optional<llvm::StringRef> maybe_number = IsNumber(expr, remainder);
   if (maybe_number) {
     std::string number = (*maybe_number).str();
@@ -382,7 +408,6 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
     }
   }
 
-  cur_pos = start;
   constexpr std::pair<Token::Kind, const char *> operators[] = {
     {Token::l_square, "["},
     {Token::r_square, "]"},
diff --git a/lldb/unittests/DIL/DILLexerTests.cpp b/lldb/unittests/DIL/DILLexerTests.cpp
index 026a092aa9e8..d4eb494a62e0 100644
--- a/lldb/unittests/DIL/DILLexerTests.cpp
+++ b/lldb/unittests/DIL/DILLexerTests.cpp
@@ -121,8 +121,8 @@ TEST(DILLexerTests, MultiTokenLexTest) {
 TEST(DILLexerTests, IdentifiersTest) {
   // These strings should lex into identifier tokens.
   std::vector<std::string> valid_identifiers = {
-      "$My_name1", "$pc",  "abcd", "_", "_a",     "_a_",      "$",
-      "a_b",       "kw_this", "self", "a", "MyName", "kw_namespace"};
+      "$My_name1", "$pc",  "abcd", "_",      "_a",           "_a_", "$", "a_b",
+      "kw_this",   "self", "a",    "MyName", "kw_namespace", "föo", "🍫"};
 
   // The lexer can lex these strings, but they should not be identifiers.
   std::vector<std::string> invalid_identifiers = {"", "::", "(", ")", "234", "2"};
diff --git a/lldb/unittests/DIL/DILTests.cpp b/lldb/unittests/DIL/DILTests.cpp
index b9f51d9437c1..aecddeffb268 100644
--- a/lldb/unittests/DIL/DILTests.cpp
+++ b/lldb/unittests/DIL/DILTests.cpp
@@ -3683,8 +3683,10 @@ TEST_F(EvalTest, DISABLED_TestStringParsing) {
 
 TEST_F(EvalTest, TestUnicodeIdentifiers) {
   EXPECT_THAT(Eval("フー + 1"), IsEqual("2"));
-  EXPECT_THAT(Eval("2 + フー"), IsEqual("3"));
+  EXPECT_THAT(Eval("1 + フー"), IsEqual("2"));
   EXPECT_THAT(Eval("föo + 1"), IsEqual("4"));
+  EXPECT_THAT(Eval(" 1　+   föo   "),
+              IsEqual("4")); // Contains Unicode whitespaces
   EXPECT_THAT(Eval("фу + бар"),
               IsError("<expr:1:6>: use of undeclared identifier 'бар'\n"
                       "фу + бар\n"

From ccab3d1a308c35cfaceb4460485e74623cecbe6e Mon Sep 17 00:00:00 2001
From: Ilia Kuklin <ikuklin@accesssoftek.com>
Date: Thu, 27 Feb 2025 19:24:13 +0500
Subject: [PATCH 4/9] Add test for an identifier with a right-to-left script

---
 lldb/unittests/DIL/DILLexerTests.cpp     | 5 +++--
 lldb/unittests/DIL/DILTests.cpp          | 1 +
 lldb/unittests/DIL/Inputs/test_binary.cc | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/lldb/unittests/DIL/DILLexerTests.cpp b/lldb/unittests/DIL/DILLexerTests.cpp
index d4eb494a62e0..c6abc8674de6 100644
--- a/lldb/unittests/DIL/DILLexerTests.cpp
+++ b/lldb/unittests/DIL/DILLexerTests.cpp
@@ -121,8 +121,9 @@ TEST(DILLexerTests, MultiTokenLexTest) {
 TEST(DILLexerTests, IdentifiersTest) {
   // These strings should lex into identifier tokens.
   std::vector<std::string> valid_identifiers = {
-      "$My_name1", "$pc",  "abcd", "_",      "_a",           "_a_", "$", "a_b",
-      "kw_this",   "self", "a",    "MyName", "kw_namespace", "föo", "🍫"};
+      "$My_name1", "$pc",     "abcd", "_", "_a",     "_a_",          "$",
+      "a_b",       "kw_this", "self", "a", "MyName", "kw_namespace", "föo",
+      "🍫",        "שלום"};
 
   // The lexer can lex these strings, but they should not be identifiers.
   std::vector<std::string> invalid_identifiers = {"", "::", "(", ")", "234", "2"};
diff --git a/lldb/unittests/DIL/DILTests.cpp b/lldb/unittests/DIL/DILTests.cpp
index aecddeffb268..715c22ea101b 100644
--- a/lldb/unittests/DIL/DILTests.cpp
+++ b/lldb/unittests/DIL/DILTests.cpp
@@ -3685,6 +3685,7 @@ TEST_F(EvalTest, TestUnicodeIdentifiers) {
   EXPECT_THAT(Eval("フー + 1"), IsEqual("2"));
   EXPECT_THAT(Eval("1 + フー"), IsEqual("2"));
   EXPECT_THAT(Eval("föo + 1"), IsEqual("4"));
+  EXPECT_THAT(Eval("שלום + 1"), IsEqual("5"));
   EXPECT_THAT(Eval(" 1　+   föo   "),
               IsEqual("4")); // Contains Unicode whitespaces
   EXPECT_THAT(Eval("фу + бар"),
diff --git a/lldb/unittests/DIL/Inputs/test_binary.cc b/lldb/unittests/DIL/Inputs/test_binary.cc
index 57007ffaf5ba..af4d52b166b7 100644
--- a/lldb/unittests/DIL/Inputs/test_binary.cc
+++ b/lldb/unittests/DIL/Inputs/test_binary.cc
@@ -1200,6 +1200,7 @@ static void TestUnicodeIdentifiers() {
   int フー = 1;
   int фу = 2;
   int föo = 3;
+  int שלום = 4;
   // BREAK(TestUnicodeIdentifiers)
 }
 

From 4e791156df1b08391180085507ad972074181595 Mon Sep 17 00:00:00 2001
From: Ilia Kuklin <ikuklin@accesssoftek.com>
Date: Mon, 3 Mar 2025 23:56:57 +0500
Subject: [PATCH 5/9] Count UTF-8 width of skipped whitespaces

---
 lldb/source/ValueObject/DILLexer.cpp | 9 ++++++---
 lldb/unittests/DIL/DILTests.cpp      | 8 ++++++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
index 564d3b02029d..c94556ebbeb6 100644
--- a/lldb/source/ValueObject/DILLexer.cpp
+++ b/lldb/source/ValueObject/DILLexer.cpp
@@ -16,6 +16,7 @@
 //#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/Unicode.h"
 #include "llvm/Support/UnicodeCharRanges.h"
 #include <tuple>
 
@@ -355,15 +356,17 @@ llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
 llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
                                     llvm::StringRef &remainder,
                                     uint32_t &position) {
-  llvm::StringRef::iterator cur_pos = remainder.begin();
+  llvm::StringRef::iterator start = remainder.begin();
   SkipWhitespaces(remainder);
-  position += remainder.begin() - cur_pos;
+  if (start < remainder.begin()) {
+    llvm::StringRef skipped(start, remainder.begin() - start);
+    position += llvm::sys::unicode::columnWidthUTF8(skipped);
+  }
 
   // Check to see if we've reached the end of our input string.
   if (remainder.empty())
     return Token(Token::eof, "", position);
 
-  cur_pos = remainder.begin();
   std::optional<llvm::StringRef> maybe_number = IsNumber(expr, remainder);
   if (maybe_number) {
     std::string number = (*maybe_number).str();
diff --git a/lldb/unittests/DIL/DILTests.cpp b/lldb/unittests/DIL/DILTests.cpp
index 715c22ea101b..cf814ea6090d 100644
--- a/lldb/unittests/DIL/DILTests.cpp
+++ b/lldb/unittests/DIL/DILTests.cpp
@@ -3686,10 +3686,14 @@ TEST_F(EvalTest, TestUnicodeIdentifiers) {
   EXPECT_THAT(Eval("1 + フー"), IsEqual("2"));
   EXPECT_THAT(Eval("föo + 1"), IsEqual("4"));
   EXPECT_THAT(Eval("שלום + 1"), IsEqual("5"));
-  EXPECT_THAT(Eval(" 1　+   föo   "),
-              IsEqual("4")); // Contains Unicode whitespaces
+  EXPECT_THAT(Eval(" 1　+   föo   "), // Contains Unicode whitespaces
+              IsEqual("4"));
   EXPECT_THAT(Eval("фу + бар"),
               IsError("<expr:1:6>: use of undeclared identifier 'бар'\n"
                       "фу + бар\n"
                       "     ^"));
+  EXPECT_THAT(Eval("фу　+　бар"), // Wide Unicode whitespaces
+              IsError("<expr:1:8>: use of undeclared identifier 'бар'\n"
+                      "фу　+　бар\n"
+                      "       ^"));
 }
\ No newline at end of file

From 6520218fbd4b20f989d38bb84f4fce13654bf472 Mon Sep 17 00:00:00 2001
From: Ilia Kuklin <ikuklin@accesssoftek.com>
Date: Tue, 4 Mar 2025 01:07:52 +0500
Subject: [PATCH 6/9] Allow identifier to be contain anything except operators
 and whitespaces

---
 lldb/source/ValueObject/DILLexer.cpp     | 123 ++++++-----------------
 lldb/unittests/DIL/DILTests.cpp          |   8 +-
 lldb/unittests/DIL/Inputs/test_binary.cc |   6 +-
 3 files changed, 39 insertions(+), 98 deletions(-)

diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
index c94556ebbeb6..272ab86375ca 100644
--- a/lldb/source/ValueObject/DILLexer.cpp
+++ b/lldb/source/ValueObject/DILLexer.cpp
@@ -13,7 +13,6 @@
 
 #include "lldb/ValueObject/DILLexer.h"
 #include "clang/Basic/CharInfo.h"
-//#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Unicode.h"
@@ -22,34 +21,6 @@
 
 namespace lldb_private::dil {
 
-/*
-const llvm::StringMap<Token::Kind> Keywords = {
-    {"bool", Token::kw_bool},
-    {"char", Token::kw_char},
-    {"char16_t", Token::kw_char16_t},
-    {"char32_t", Token::kw_char32_t},
-    {"const", Token::kw_const},
-    {"double", Token::kw_double},
-    {"dynamic_cast", Token::kw_dynamic_cast},
-    {"false", Token::kw_false},
-    {"float", Token::kw_float},
-    {"int", Token::kw_int},
-    {"long", Token::kw_long},
-    {"namespace", Token::kw_namespace},
-    {"nullptr", Token::kw_nullptr},
-    {"reinterpret_cast", Token::kw_reinterpret_cast},
-    {"short", Token::kw_short},
-    {"signed", Token::kw_signed},
-    {"sizeof", Token::kw_sizeof},
-    {"static_cast", Token::kw_static_cast},
-    {"this", Token::kw_this},
-    {"true", Token::kw_true},
-    {"unsigned", Token::kw_unsigned},
-    {"void", Token::kw_void},
-    {"volatile", Token::kw_volatile},
-    {"wchar_t", Token::kw_wchar_t}};
-*/
-
 llvm::StringRef Token::GetTokenName(Kind kind) {
   switch (kind){
     case Token::amp: return "amp";
@@ -164,71 +135,39 @@ static bool IsLetter(char c) {
 
 static bool IsDigit(char c) { return ('0' <= c && c <= '9'); }
 
-static bool isValidIdentifierContinuationCodePoint(uint32_t c) {
-  if (c < 0x80)
-    return clang::isAsciiIdentifierContinue(c, /*dollar*/ true);
-
-  // N1518: Recommendations for extended identifier characters for C and C++
-  // Proposed Annex X.1: Ranges of characters allowed
-  return c == 0x00A8 || c == 0x00AA || c == 0x00AD || c == 0x00AF ||
-         (c >= 0x00B2 && c <= 0x00B5) || (c >= 0x00B7 && c <= 0x00BA) ||
-         (c >= 0x00BC && c <= 0x00BE) || (c >= 0x00C0 && c <= 0x00D6) ||
-         (c >= 0x00D8 && c <= 0x00F6) || (c >= 0x00F8 && c <= 0x00FF)
-
-         || (c >= 0x0100 && c <= 0x167F) || (c >= 0x1681 && c <= 0x180D) ||
-         (c >= 0x180F && c <= 0x1FFF)
-
-         || (c >= 0x200B && c <= 0x200D) || (c >= 0x202A && c <= 0x202E) ||
-         (c >= 0x203F && c <= 0x2040) || c == 0x2054 ||
-         (c >= 0x2060 && c <= 0x206F)
-
-         || (c >= 0x2070 && c <= 0x218F) || (c >= 0x2460 && c <= 0x24FF) ||
-         (c >= 0x2776 && c <= 0x2793) || (c >= 0x2C00 && c <= 0x2DFF) ||
-         (c >= 0x2E80 && c <= 0x2FFF)
-
-         || (c >= 0x3004 && c <= 0x3007) || (c >= 0x3021 && c <= 0x302F) ||
-         (c >= 0x3031 && c <= 0x303F)
+static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = {
+    {0x0085, 0x0085}, {0x00A0, 0x00A0}, {0x1680, 0x1680},
+    {0x180E, 0x180E}, {0x2000, 0x200A}, {0x2028, 0x2029},
+    {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}};
 
-         || (c >= 0x3040 && c <= 0xD7FF)
+static bool IsUnicodeWhitespace(uint32_t Codepoint) {
+  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
+      UnicodeWhitespaceCharRanges);
+  return UnicodeWhitespaceChars.contains(Codepoint);
+}
 
-         || (c >= 0xF900 && c <= 0xFD3D) || (c >= 0xFD40 && c <= 0xFDCF) ||
-         (c >= 0xFDF0 && c <= 0xFE44) || (c >= 0xFE47 && c <= 0xFFF8)
+inline bool IsOperator(unsigned char c) {
+  using namespace clang::charinfo;
+  return (InfoTable[c] & (CHAR_PUNCT | CHAR_PERIOD)) != 0;
+}
 
-         || (c >= 0x10000 && c <= 0x1FFFD) || (c >= 0x20000 && c <= 0x2FFFD) ||
-         (c >= 0x30000 && c <= 0x3FFFD) || (c >= 0x40000 && c <= 0x4FFFD) ||
-         (c >= 0x50000 && c <= 0x5FFFD) || (c >= 0x60000 && c <= 0x6FFFD) ||
-         (c >= 0x70000 && c <= 0x7FFFD) || (c >= 0x80000 && c <= 0x8FFFD) ||
-         (c >= 0x90000 && c <= 0x9FFFD) || (c >= 0xA0000 && c <= 0xAFFFD) ||
-         (c >= 0xB0000 && c <= 0xBFFFD) || (c >= 0xC0000 && c <= 0xCFFFD) ||
-         (c >= 0xD0000 && c <= 0xDFFFD) || (c >= 0xE0000 && c <= 0xEFFFD);
+static bool IsValidIdentifierContinuationCodePoint(uint32_t c) {
+  if (c < 0x80) {
+    if (c == '$')
+      return true;
+    return !IsOperator(c) && !clang::isWhitespace(c);
+  }
+  return !IsUnicodeWhitespace(c);
 }
 
-static bool isValidIdentifierStartCodePoint(uint32_t c) {
-  if (!isValidIdentifierContinuationCodePoint(c))
+static bool IsValidIdentifierStartCodePoint(uint32_t c) {
+  if (!IsValidIdentifierContinuationCodePoint(c))
     return false;
   if (c < 0x80 && IsDigit(c))
     return false;
-
-  // N1518: Recommendations for extended identifier characters for C and C++
-  // Proposed Annex X.2: Ranges of characters disallowed initially
-  if ((c >= 0x0300 && c <= 0x036F) || (c >= 0x1DC0 && c <= 0x1DFF) ||
-      (c >= 0x20D0 && c <= 0x20FF) || (c >= 0xFE20 && c <= 0xFE2F))
-    return false;
-
   return true;
 }
 
-static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = {
-    {0x0085, 0x0085}, {0x00A0, 0x00A0}, {0x1680, 0x1680},
-    {0x180E, 0x180E}, {0x2000, 0x200A}, {0x2028, 0x2029},
-    {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}};
-
-static bool isUnicodeWhitespace(uint32_t Codepoint) {
-  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
-      UnicodeWhitespaceCharRanges);
-  return UnicodeWhitespaceChars.contains(Codepoint);
-}
-
 static std::tuple<llvm::ConversionResult, llvm::UTF32, uint32_t>
 convertUTF8SequenceAndAdvance(llvm::StringRef::iterator &cur_pos,
                               llvm::StringRef::iterator end) {
@@ -246,7 +185,7 @@ static void SkipUnicodeWhitespaces(llvm::StringRef &remainder) {
   while (true) {
     auto [Status, CodePoint, size] =
         convertUTF8SequenceAndAdvance(cur_pos, remainder.end());
-    if (Status != llvm::conversionOK || !isUnicodeWhitespace(CodePoint))
+    if (Status != llvm::conversionOK || !IsUnicodeWhitespace(CodePoint))
       break;
     length += size;
   }
@@ -262,23 +201,21 @@ static void SkipWhitespaces(llvm::StringRef &remainder) {
   } while (remainder.begin() != cur_pos);
 }
 
-static std::optional<llvm::StringRef>
-IsWord(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &utf_length) {
+static std::optional<llvm::StringRef> IsWord(llvm::StringRef expr,
+                                             llvm::StringRef &remainder) {
   llvm::StringRef::iterator cur_pos = remainder.begin();
   llvm::StringRef::iterator start = cur_pos;
   auto [Status, CodePoint, size] =
       convertUTF8SequenceAndAdvance(cur_pos, remainder.end());
   if (Status == llvm::conversionOK &&
-      isValidIdentifierStartCodePoint(CodePoint)) {
-    utf_length = 1;
+      IsValidIdentifierStartCodePoint(CodePoint)) {
     unsigned length = size;
     while (true) {
       auto [Status, CodePoint, size] =
           convertUTF8SequenceAndAdvance(cur_pos, remainder.end());
       if (Status != llvm::conversionOK ||
-          !isValidIdentifierContinuationCodePoint(CodePoint))
+          !IsValidIdentifierContinuationCodePoint(CodePoint))
         break;
-      utf_length++;
       length += size;
     }
 
@@ -374,9 +311,7 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
     position += number.size();
     return token;
   } else {
-    uint32_t utf_length = 0;
-    std::optional<llvm::StringRef> maybe_word =
-        IsWord(expr, remainder, utf_length);
+    std::optional<llvm::StringRef> maybe_word = IsWord(expr, remainder);
     if (maybe_word) {
       llvm::StringRef word = *maybe_word;
       Token::Kind kind = llvm::StringSwitch<Token::Kind>(word)
@@ -406,7 +341,7 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
                             .Case("wchar_t", Token::kw_wchar_t)
                             .Default(Token::identifier);
       auto token = Token(kind, word.str(), position);
-      position += utf_length;
+      position += llvm::sys::unicode::columnWidthUTF8(word.str());
       return token;
     }
   }
diff --git a/lldb/unittests/DIL/DILTests.cpp b/lldb/unittests/DIL/DILTests.cpp
index cf814ea6090d..32f3f03ecd63 100644
--- a/lldb/unittests/DIL/DILTests.cpp
+++ b/lldb/unittests/DIL/DILTests.cpp
@@ -3681,13 +3681,15 @@ TEST_F(EvalTest, DISABLED_TestStringParsing) {
 }
 #endif
 
-TEST_F(EvalTest, TestUnicodeIdentifiers) {
+TEST_F(EvalTest, TestUnicodeInput) {
   EXPECT_THAT(Eval("フー + 1"), IsEqual("2"));
   EXPECT_THAT(Eval("1 + フー"), IsEqual("2"));
   EXPECT_THAT(Eval("föo + 1"), IsEqual("4"));
   EXPECT_THAT(Eval("שלום + 1"), IsEqual("5"));
   EXPECT_THAT(Eval(" 1　+   föo   "), // Contains Unicode whitespaces
               IsEqual("4"));
+
+  // Check diagnostic pointer location
   EXPECT_THAT(Eval("фу + бар"),
               IsError("<expr:1:6>: use of undeclared identifier 'бар'\n"
                       "фу + бар\n"
@@ -3696,4 +3698,8 @@ TEST_F(EvalTest, TestUnicodeIdentifiers) {
               IsError("<expr:1:8>: use of undeclared identifier 'бар'\n"
                       "фу　+　бар\n"
                       "       ^"));
+  EXPECT_THAT(Eval("フー + бар"),
+              IsError("<expr:1:8>: use of undeclared identifier 'бар'\n"
+                      "フー + бар\n"
+                      "       ^"));
 }
\ No newline at end of file
diff --git a/lldb/unittests/DIL/Inputs/test_binary.cc b/lldb/unittests/DIL/Inputs/test_binary.cc
index af4d52b166b7..c289f93a1202 100644
--- a/lldb/unittests/DIL/Inputs/test_binary.cc
+++ b/lldb/unittests/DIL/Inputs/test_binary.cc
@@ -1196,12 +1196,12 @@ static void TestStringParsing() {
   // BREAK(TestStringParsing)
 }
 
-static void TestUnicodeIdentifiers() {
+static void TestUnicodeInput() {
   int フー = 1;
   int фу = 2;
   int föo = 3;
   int שלום = 4;
-  // BREAK(TestUnicodeIdentifiers)
+  // BREAK(TestUnicodeInput)
 }
 
 namespace test_binary {
@@ -1258,7 +1258,7 @@ void main() {
 
   TestCharParsing();
   TestStringParsing();
-  TestUnicodeIdentifiers();
+  TestUnicodeInput();
 
   // BREAK HERE
 }

From 20ee555de0f8ee3cba2a898556ff4ceb61aa233b Mon Sep 17 00:00:00 2001
From: Ilia Kuklin <ikuklin@accesssoftek.com>
Date: Fri, 7 Mar 2025 17:51:16 +0500
Subject: [PATCH 7/9] Manually count identifier and whitespace width using
 charWidth

---
 lldb/source/ValueObject/DILLexer.cpp | 29 ++++++++++++++++------------
 llvm/include/llvm/Support/Unicode.h  | 13 +++++++++++++
 llvm/lib/Support/Unicode.cpp         | 13 +------------
 3 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
index 272ab86375ca..a476c719789b 100644
--- a/lldb/source/ValueObject/DILLexer.cpp
+++ b/lldb/source/ValueObject/DILLexer.cpp
@@ -179,7 +179,8 @@ convertUTF8SequenceAndAdvance(llvm::StringRef::iterator &cur_pos,
   return {Status, CodePoint, size};
 }
 
-static void SkipUnicodeWhitespaces(llvm::StringRef &remainder) {
+static void SkipUnicodeWhitespaces(llvm::StringRef &remainder,
+                                   uint32_t &width) {
   llvm::StringRef::iterator cur_pos = remainder.begin();
   uint32_t length = 0;
   while (true) {
@@ -188,27 +189,30 @@ static void SkipUnicodeWhitespaces(llvm::StringRef &remainder) {
     if (Status != llvm::conversionOK || !IsUnicodeWhitespace(CodePoint))
       break;
     length += size;
+    width += llvm::sys::unicode::charWidth(CodePoint);
   }
   remainder = remainder.drop_front(length);
 }
 
-static void SkipWhitespaces(llvm::StringRef &remainder) {
+static void SkipWhitespaces(llvm::StringRef &remainder, uint32_t &width) {
   llvm::StringRef::iterator cur_pos;
   do {
     cur_pos = remainder.begin();
     remainder = remainder.ltrim();
-    SkipUnicodeWhitespaces(remainder);
+    width += remainder.begin() - cur_pos;
+    SkipUnicodeWhitespaces(remainder, width);
   } while (remainder.begin() != cur_pos);
 }
 
-static std::optional<llvm::StringRef> IsWord(llvm::StringRef expr,
-                                             llvm::StringRef &remainder) {
+static std::optional<llvm::StringRef>
+IsWord(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &width) {
   llvm::StringRef::iterator cur_pos = remainder.begin();
   llvm::StringRef::iterator start = cur_pos;
   auto [Status, CodePoint, size] =
       convertUTF8SequenceAndAdvance(cur_pos, remainder.end());
   if (Status == llvm::conversionOK &&
       IsValidIdentifierStartCodePoint(CodePoint)) {
+    width = llvm::sys::unicode::charWidth(CodePoint);
     unsigned length = size;
     while (true) {
       auto [Status, CodePoint, size] =
@@ -216,6 +220,7 @@ static std::optional<llvm::StringRef> IsWord(llvm::StringRef expr,
       if (Status != llvm::conversionOK ||
           !IsValidIdentifierContinuationCodePoint(CodePoint))
         break;
+      width += llvm::sys::unicode::charWidth(CodePoint);
       length += size;
     }
 
@@ -287,6 +292,7 @@ llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
       return t.takeError();
     }
   } while (tokens.back().GetKind() != Token::eof);
+
   return DILLexer(expr, std::move(tokens));
 }
 
@@ -294,11 +300,9 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
                                     llvm::StringRef &remainder,
                                     uint32_t &position) {
   llvm::StringRef::iterator start = remainder.begin();
-  SkipWhitespaces(remainder);
-  if (start < remainder.begin()) {
-    llvm::StringRef skipped(start, remainder.begin() - start);
-    position += llvm::sys::unicode::columnWidthUTF8(skipped);
-  }
+  uint32_t width = 0;
+  SkipWhitespaces(remainder, width);
+  position += width;
 
   // Check to see if we've reached the end of our input string.
   if (remainder.empty())
@@ -311,7 +315,8 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
     position += number.size();
     return token;
   } else {
-    std::optional<llvm::StringRef> maybe_word = IsWord(expr, remainder);
+    uint32_t width = 0;
+    std::optional<llvm::StringRef> maybe_word = IsWord(expr, remainder, width);
     if (maybe_word) {
       llvm::StringRef word = *maybe_word;
       Token::Kind kind = llvm::StringSwitch<Token::Kind>(word)
@@ -341,7 +346,7 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
                             .Case("wchar_t", Token::kw_wchar_t)
                             .Default(Token::identifier);
       auto token = Token(kind, word.str(), position);
-      position += llvm::sys::unicode::columnWidthUTF8(word.str());
+      position += width;
       return token;
     }
   }
diff --git a/llvm/include/llvm/Support/Unicode.h b/llvm/include/llvm/Support/Unicode.h
index 861548728d4f..2090a0377b93 100644
--- a/llvm/include/llvm/Support/Unicode.h
+++ b/llvm/include/llvm/Support/Unicode.h
@@ -41,6 +41,19 @@ bool isPrintable(int UCS);
 // Formatting codepoints are codepoints in the Cf category.
 bool isFormatting(int UCS);
 
+/// Gets the number of positions a character is likely to occupy when output
+/// on a terminal ("character width"). This depends on the implementation of the
+/// terminal, and there's no standard definition of character width.
+/// The implementation defines it in a way that is expected to be compatible
+/// with a generic Unicode-capable terminal.
+/// \return Character width:
+///   * ErrorNonPrintableCharacter (-1) for non-printable characters (as
+///     identified by isPrintable);
+///   * 0 for non-spacing and enclosing combining marks;
+///   * 2 for CJK characters excluding halfwidth forms;
+///   * 1 for all remaining characters.
+int charWidth(int UCS);
+
 /// Gets the number of positions the UTF8-encoded \p Text is likely to occupy
 /// when output on a terminal ("character width"). This depends on the
 /// implementation of the terminal, and there's no standard definition of
diff --git a/llvm/lib/Support/Unicode.cpp b/llvm/lib/Support/Unicode.cpp
index 288b75c872e1..2c79632fbeec 100644
--- a/llvm/lib/Support/Unicode.cpp
+++ b/llvm/lib/Support/Unicode.cpp
@@ -290,18 +290,7 @@ bool isFormatting(int UCS) {
   return Format.contains(UCS);
 }
 
-/// Gets the number of positions a character is likely to occupy when output
-/// on a terminal ("character width"). This depends on the implementation of the
-/// terminal, and there's no standard definition of character width.
-/// The implementation defines it in a way that is expected to be compatible
-/// with a generic Unicode-capable terminal.
-/// \return Character width:
-///   * ErrorNonPrintableCharacter (-1) for non-printable characters (as
-///     identified by isPrintable);
-///   * 0 for non-spacing and enclosing combining marks;
-///   * 2 for CJK characters excluding halfwidth forms;
-///   * 1 for all remaining characters.
-static inline int charWidth(int UCS) {
+int charWidth(int UCS) {
   if (!isPrintable(UCS))
     return ErrorNonPrintableCharacter;
 

From ce05c759f2822d875118f1eff71ccac3215a9995 Mon Sep 17 00:00:00 2001
From: Ilia Kuklin <ikuklin@accesssoftek.com>
Date: Tue, 11 Mar 2025 00:00:50 +0500
Subject: [PATCH 8/9] Remove Unicode whitespaces recognition

---
 lldb/source/ValueObject/DILLexer.cpp | 44 ++--------------------------
 lldb/unittests/DIL/DILTests.cpp      |  6 ----
 2 files changed, 3 insertions(+), 47 deletions(-)

diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
index a476c719789b..9b87a7cfaff8 100644
--- a/lldb/source/ValueObject/DILLexer.cpp
+++ b/lldb/source/ValueObject/DILLexer.cpp
@@ -16,7 +16,6 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Unicode.h"
-#include "llvm/Support/UnicodeCharRanges.h"
 #include <tuple>
 
 namespace lldb_private::dil {
@@ -135,17 +134,6 @@ static bool IsLetter(char c) {
 
 static bool IsDigit(char c) { return ('0' <= c && c <= '9'); }
 
-static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = {
-    {0x0085, 0x0085}, {0x00A0, 0x00A0}, {0x1680, 0x1680},
-    {0x180E, 0x180E}, {0x2000, 0x200A}, {0x2028, 0x2029},
-    {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}};
-
-static bool IsUnicodeWhitespace(uint32_t Codepoint) {
-  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
-      UnicodeWhitespaceCharRanges);
-  return UnicodeWhitespaceChars.contains(Codepoint);
-}
-
 inline bool IsOperator(unsigned char c) {
   using namespace clang::charinfo;
   return (InfoTable[c] & (CHAR_PUNCT | CHAR_PERIOD)) != 0;
@@ -157,7 +145,7 @@ static bool IsValidIdentifierContinuationCodePoint(uint32_t c) {
       return true;
     return !IsOperator(c) && !clang::isWhitespace(c);
   }
-  return !IsUnicodeWhitespace(c);
+  return true;
 }
 
 static bool IsValidIdentifierStartCodePoint(uint32_t c) {
@@ -179,31 +167,6 @@ convertUTF8SequenceAndAdvance(llvm::StringRef::iterator &cur_pos,
   return {Status, CodePoint, size};
 }
 
-static void SkipUnicodeWhitespaces(llvm::StringRef &remainder,
-                                   uint32_t &width) {
-  llvm::StringRef::iterator cur_pos = remainder.begin();
-  uint32_t length = 0;
-  while (true) {
-    auto [Status, CodePoint, size] =
-        convertUTF8SequenceAndAdvance(cur_pos, remainder.end());
-    if (Status != llvm::conversionOK || !IsUnicodeWhitespace(CodePoint))
-      break;
-    length += size;
-    width += llvm::sys::unicode::charWidth(CodePoint);
-  }
-  remainder = remainder.drop_front(length);
-}
-
-static void SkipWhitespaces(llvm::StringRef &remainder, uint32_t &width) {
-  llvm::StringRef::iterator cur_pos;
-  do {
-    cur_pos = remainder.begin();
-    remainder = remainder.ltrim();
-    width += remainder.begin() - cur_pos;
-    SkipUnicodeWhitespaces(remainder, width);
-  } while (remainder.begin() != cur_pos);
-}
-
 static std::optional<llvm::StringRef>
 IsWord(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &width) {
   llvm::StringRef::iterator cur_pos = remainder.begin();
@@ -300,9 +263,8 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
                                     llvm::StringRef &remainder,
                                     uint32_t &position) {
   llvm::StringRef::iterator start = remainder.begin();
-  uint32_t width = 0;
-  SkipWhitespaces(remainder, width);
-  position += width;
+  remainder = remainder.ltrim();
+  position += remainder.begin() - start;
 
   // Check to see if we've reached the end of our input string.
   if (remainder.empty())
diff --git a/lldb/unittests/DIL/DILTests.cpp b/lldb/unittests/DIL/DILTests.cpp
index 32f3f03ecd63..08dfd985a08b 100644
--- a/lldb/unittests/DIL/DILTests.cpp
+++ b/lldb/unittests/DIL/DILTests.cpp
@@ -3686,18 +3686,12 @@ TEST_F(EvalTest, TestUnicodeInput) {
   EXPECT_THAT(Eval("1 + フー"), IsEqual("2"));
   EXPECT_THAT(Eval("föo + 1"), IsEqual("4"));
   EXPECT_THAT(Eval("שלום + 1"), IsEqual("5"));
-  EXPECT_THAT(Eval(" 1　+   föo   "), // Contains Unicode whitespaces
-              IsEqual("4"));
 
   // Check diagnostic pointer location
   EXPECT_THAT(Eval("фу + бар"),
               IsError("<expr:1:6>: use of undeclared identifier 'бар'\n"
                       "фу + бар\n"
                       "     ^"));
-  EXPECT_THAT(Eval("фу　+　бар"), // Wide Unicode whitespaces
-              IsError("<expr:1:8>: use of undeclared identifier 'бар'\n"
-                      "фу　+　бар\n"
-                      "       ^"));
   EXPECT_THAT(Eval("フー + бар"),
               IsError("<expr:1:8>: use of undeclared identifier 'бар'\n"
                       "フー + бар\n"

From daf1735eba492838416d8dbe8ceff6b7f11b9c59 Mon Sep 17 00:00:00 2001
From: Ilia Kuklin <ikuklin@accesssoftek.com>
Date: Tue, 11 Mar 2025 20:58:29 +0500
Subject: [PATCH 9/9] Replace unicode codepoint conversion with UTF8 validity
 check

---
 lldb/source/ValueObject/DILLexer.cpp | 80 +++++++++++-----------------
 llvm/include/llvm/Support/Unicode.h  | 13 -----
 llvm/lib/Support/Unicode.cpp         | 13 ++++-
 3 files changed, 43 insertions(+), 63 deletions(-)

diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
index 9b87a7cfaff8..88ef57cc42f6 100644
--- a/lldb/source/ValueObject/DILLexer.cpp
+++ b/lldb/source/ValueObject/DILLexer.cpp
@@ -139,59 +139,42 @@ inline bool IsOperator(unsigned char c) {
   return (InfoTable[c] & (CHAR_PUNCT | CHAR_PERIOD)) != 0;
 }
 
-static bool IsValidIdentifierContinuationCodePoint(uint32_t c) {
-  if (c < 0x80) {
-    if (c == '$')
-      return true;
-    return !IsOperator(c) && !clang::isWhitespace(c);
-  }
-  return true;
-}
-
-static bool IsValidIdentifierStartCodePoint(uint32_t c) {
-  if (!IsValidIdentifierContinuationCodePoint(c))
-    return false;
-  if (c < 0x80 && IsDigit(c))
-    return false;
-  return true;
+static bool IsValidIdentifierContinuation(char c) {
+  if (c == '$')
+    return true;
+  return !IsOperator(c) && !clang::isWhitespace(c);
 }
 
-static std::tuple<llvm::ConversionResult, llvm::UTF32, uint32_t>
-convertUTF8SequenceAndAdvance(llvm::StringRef::iterator &cur_pos,
-                              llvm::StringRef::iterator end) {
-  llvm::UTF32 CodePoint;
-  uint32_t size = llvm::getNumBytesForUTF8(*cur_pos);
-  llvm::ConversionResult Status = llvm::convertUTF8Sequence(
-      (const llvm::UTF8 **)&cur_pos, (const llvm::UTF8 *)end, &CodePoint,
-      llvm::strictConversion);
-  return {Status, CodePoint, size};
-}
-
-static std::optional<llvm::StringRef>
-IsWord(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &width) {
+static std::optional<llvm::StringRef> IsWord(llvm::StringRef &remainder) {
   llvm::StringRef::iterator cur_pos = remainder.begin();
   llvm::StringRef::iterator start = cur_pos;
-  auto [Status, CodePoint, size] =
-      convertUTF8SequenceAndAdvance(cur_pos, remainder.end());
-  if (Status == llvm::conversionOK &&
-      IsValidIdentifierStartCodePoint(CodePoint)) {
-    width = llvm::sys::unicode::charWidth(CodePoint);
-    unsigned length = size;
-    while (true) {
-      auto [Status, CodePoint, size] =
-          convertUTF8SequenceAndAdvance(cur_pos, remainder.end());
-      if (Status != llvm::conversionOK ||
-          !IsValidIdentifierContinuationCodePoint(CodePoint))
+
+  if (IsDigit(*cur_pos))
+    return std::nullopt;
+
+  while (cur_pos < remainder.end()) {
+    uint8_t c = *cur_pos;
+    if (c < 0x80) {
+      if (IsValidIdentifierContinuation(c)) {
+        cur_pos++;
+        continue;
+      } else
         break;
-      width += llvm::sys::unicode::charWidth(CodePoint);
-      length += size;
     }
-
-    remainder = remainder.drop_front(length);
-    llvm::StringRef utf_token(start, length);
-    return utf_token;
+    if (llvm::isLegalUTF8Sequence((const llvm::UTF8 *)cur_pos,
+                                  (const llvm::UTF8 *)remainder.end())) {
+      cur_pos += llvm::getNumBytesForUTF8(*cur_pos);
+      continue;
+    }
+    break;
   }
-  return std::nullopt;
+
+  if (cur_pos == start)
+    return std::nullopt;
+
+  auto length = cur_pos - start;
+  remainder = remainder.drop_front(length);
+  return llvm::StringRef(start, length);
 }
 
 static void ConsumeNumberBody(uint32_t &length, char &prev_ch,
@@ -277,8 +260,7 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
     position += number.size();
     return token;
   } else {
-    uint32_t width = 0;
-    std::optional<llvm::StringRef> maybe_word = IsWord(expr, remainder, width);
+    std::optional<llvm::StringRef> maybe_word = IsWord(remainder);
     if (maybe_word) {
       llvm::StringRef word = *maybe_word;
       Token::Kind kind = llvm::StringSwitch<Token::Kind>(word)
@@ -308,7 +290,7 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
                             .Case("wchar_t", Token::kw_wchar_t)
                             .Default(Token::identifier);
       auto token = Token(kind, word.str(), position);
-      position += width;
+      position += llvm::sys::unicode::columnWidthUTF8(word.str());
       return token;
     }
   }
diff --git a/llvm/include/llvm/Support/Unicode.h b/llvm/include/llvm/Support/Unicode.h
index 2090a0377b93..861548728d4f 100644
--- a/llvm/include/llvm/Support/Unicode.h
+++ b/llvm/include/llvm/Support/Unicode.h
@@ -41,19 +41,6 @@ bool isPrintable(int UCS);
 // Formatting codepoints are codepoints in the Cf category.
 bool isFormatting(int UCS);
 
-/// Gets the number of positions a character is likely to occupy when output
-/// on a terminal ("character width"). This depends on the implementation of the
-/// terminal, and there's no standard definition of character width.
-/// The implementation defines it in a way that is expected to be compatible
-/// with a generic Unicode-capable terminal.
-/// \return Character width:
-///   * ErrorNonPrintableCharacter (-1) for non-printable characters (as
-///     identified by isPrintable);
-///   * 0 for non-spacing and enclosing combining marks;
-///   * 2 for CJK characters excluding halfwidth forms;
-///   * 1 for all remaining characters.
-int charWidth(int UCS);
-
 /// Gets the number of positions the UTF8-encoded \p Text is likely to occupy
 /// when output on a terminal ("character width"). This depends on the
 /// implementation of the terminal, and there's no standard definition of
diff --git a/llvm/lib/Support/Unicode.cpp b/llvm/lib/Support/Unicode.cpp
index 2c79632fbeec..288b75c872e1 100644
--- a/llvm/lib/Support/Unicode.cpp
+++ b/llvm/lib/Support/Unicode.cpp
@@ -290,7 +290,18 @@ bool isFormatting(int UCS) {
   return Format.contains(UCS);
 }
 
-int charWidth(int UCS) {
+/// Gets the number of positions a character is likely to occupy when output
+/// on a terminal ("character width"). This depends on the implementation of the
+/// terminal, and there's no standard definition of character width.
+/// The implementation defines it in a way that is expected to be compatible
+/// with a generic Unicode-capable terminal.
+/// \return Character width:
+///   * ErrorNonPrintableCharacter (-1) for non-printable characters (as
+///     identified by isPrintable);
+///   * 0 for non-spacing and enclosing combining marks;
+///   * 2 for CJK characters excluding halfwidth forms;
+///   * 1 for all remaining characters.
+static inline int charWidth(int UCS) {
   if (!isPrintable(UCS))
     return ErrorNonPrintableCharacter;