diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 7e85136c08851..9de6971b73294 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -393,7 +393,7 @@ add_subdirectory(time)
 
 # Requires access to uchar header which is not on macos
 # Therefore, cannot currently build this on macos in overlay mode
-if(NOT(LIBC_TARGET_OS_IS_DARWIN))
+if(NOT (LIBC_TARGET_OS_IS_DARWIN OR LIBC_TARGET_OS_IS_WINDOWS))
   add_subdirectory(wchar)
 endif()
 
diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index d3fb58ed0c71c..802441d37fe92 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -6,6 +6,19 @@ add_header_library(
     libc.hdr.types.char32_t
 )
 
+add_header_library(
+  string_converter
+  HDRS
+    string_converter.h
+  DEPENDS
+    libc.hdr.types.char8_t
+    libc.hdr.types.char32_t
+    libc.hdr.types.size_t
+    libc.src.__support.error_or
+    .mbstate
+    .character_converter
+)
+
 add_object_library(
   character_converter
   HDRS
@@ -16,6 +29,7 @@ add_object_library(
     libc.hdr.errno_macros
     libc.hdr.types.char8_t
     libc.hdr.types.char32_t
+    libc.hdr.types.size_t
     libc.src.__support.error_or
     libc.src.__support.math_extras
     .mbstate
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 3cacfa5689e4d..15d0f478a18a9 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -9,6 +9,7 @@
 #include "hdr/errno_macros.h"
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "hdr/types/size_t.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
@@ -92,6 +93,7 @@ int CharacterConverter::push(char8_t utf8_byte) {
     state->bytes_stored++;
     return 0;
   }
+
   // Invalid byte -> reset the state
   clear();
   return EILSEQ;
@@ -130,6 +132,12 @@ ErrorOr<char32_t> CharacterConverter::pop_utf32() {
   return utf32;
 }
 
+size_t CharacterConverter::sizeAsUTF32() {
+  return 1; // a single utf-32 value can fit an entire character
+}
+
+size_t CharacterConverter::sizeAsUTF8() { return state->total_bytes; }
+
 ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   if (isEmpty())
     return Error(-1);
@@ -156,6 +164,9 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   }
 
   state->bytes_stored--;
+  if (state->bytes_stored == 0)
+    clear();
+
   return static_cast<char8_t>(output);
 }
 
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index d9a63fdc0522c..b6d918f2d2edc 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -11,6 +11,7 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "hdr/types/size_t.h"
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
 #include "src/__support/wchar/mbstate.h"
@@ -30,6 +31,9 @@ class CharacterConverter {
   bool isEmpty();
   bool isValidState();
 
+  size_t sizeAsUTF32();
+  size_t sizeAsUTF8();
+
   int push(char8_t utf8_byte);
   int push(char32_t utf32);
 
diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h
new file mode 100644
index 0000000000000..0635bc57bf3e2
--- /dev/null
+++ b/libc/src/__support/wchar/string_converter.h
@@ -0,0 +1,128 @@
+//===-- Definition of a class for string conversion -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
+#define LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
+
+#include "hdr/types/char32_t.h"
+#include "hdr/types/char8_t.h"
+#include "hdr/types/size_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+// Converts a string between UTF-8 and UTF-32 one output element at a time by
+// feeding source elements of type T into a CharacterConverter and popping the
+// converted values back out.
+template <typename T> class StringConverter {
+private:
+  CharacterConverter cr;
+  const T *src;
+  size_t src_len;
+  size_t src_idx;
+
+  // # of pops we are allowed to perform (essentially size of the dest buffer)
+  size_t num_to_write;
+
+  // Pushes source elements into the CharacterConverter until it holds a full
+  // character. Returns the number of elements pushed, the error reported by
+  // CharacterConverter::push, or -1 if the source ran out first (src_idx is
+  // then advanced past the elements that were consumed).
+  ErrorOr<size_t> pushFullCharacter() {
+    size_t num_pushed;
+    for (num_pushed = 0; !cr.isFull() && src_idx + num_pushed < src_len;
+         ++num_pushed) {
+      int err = cr.push(src[src_idx + num_pushed]);
+      if (err != 0)
+        return Error(err);
+    }
+
+    // if we aren't able to read a full character from the source string
+    if (src_idx + num_pushed == src_len && !cr.isFull()) {
+      src_idx += num_pushed;
+      return Error(-1);
+    }
+
+    return num_pushed;
+  }
+
+public:
+  // s: source string; ps: conversion state handed to the CharacterConverter;
+  // dstlen: maximum number of elements that may be popped; srclen: maximum
+  // number of source elements that may be read.
+  StringConverter(const T *s, mbstate *ps, size_t dstlen,
+                  size_t srclen = SIZE_MAX)
+      : cr(ps), src(s), src_len(srclen), src_idx(0), num_to_write(dstlen) {}
+
+  // TODO: following functions are almost identical
+  // look into templating CharacterConverter pop functions
+
+  // Pops the next UTF-32 value. Returns -1 if no full character can be read
+  // from the source or the destination has no room for it, or the error from
+  // CharacterConverter::push for invalid input. Popping a null terminator
+  // ends the source string.
+  ErrorOr<char32_t> popUTF32() {
+    if (cr.isEmpty() || src_idx == 0) {
+      auto src_elements_read = pushFullCharacter();
+      if (!src_elements_read.has_value())
+        return Error(src_elements_read.error());
+
+      if (cr.sizeAsUTF32() > num_to_write) {
+        cr.clear();
+        return Error(-1);
+      }
+
+      src_idx += src_elements_read.value();
+    }
+
+    auto out = cr.pop_utf32();
+    if (out.has_value() && out.value() == U'\0')
+      src_len = src_idx;
+
+    num_to_write--;
+
+    return out;
+  }
+
+  // Pops the next UTF-8 byte. A character is only started if all of its bytes
+  // fit in the remaining destination space. Errors are the same as popUTF32.
+  ErrorOr<char8_t> popUTF8() {
+    if (cr.isEmpty() || src_idx == 0) {
+      auto src_elements_read = pushFullCharacter();
+      if (!src_elements_read.has_value())
+        return Error(src_elements_read.error());
+
+      if (cr.sizeAsUTF8() > num_to_write) {
+        cr.clear();
+        return Error(-1);
+      }
+
+      src_idx += src_elements_read.value();
+    }
+
+    auto out = cr.pop_utf8();
+    if (out.has_value() && out.value() == '\0')
+      src_len = src_idx;
+
+    num_to_write--;
+
+    return out;
+  }
+
+  // Number of source elements consumed so far.
+  size_t getSourceIndex() const { return src_idx; }
+};
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 9f626ed31cc07..578bef871fed5 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -277,6 +277,6 @@ add_subdirectory(time)
 add_subdirectory(threads)
 # Requires access to uchar header which is not on MacOS
 # Cannot currently build this on MacOS in overlay mode
-if(NOT(LIBC_TARGET_OS_IS_DARWIN))
+if(NOT (LIBC_TARGET_OS_IS_DARWIN OR LIBC_TARGET_OS_IS_WINDOWS))
   add_subdirectory(wchar)
 endif()
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 5176bfd4b024b..6982232d67544 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -19,3 +19,18 @@ add_libc_test(
   DEPENDS
     libc.src.__support.wchar.character_converter
 )
+
+add_libc_test(
+  string_converter_test
+  SUITE
+    libc-support-tests
+  SRCS
+    string_converter_test.cpp
+  DEPENDS
+    libc.src.__support.wchar.string_converter
+    libc.src.__support.wchar.mbstate
+    libc.src.__support.error_or
+    libc.hdr.errno_macros
+    libc.hdr.types.char8_t
+    libc.hdr.types.char32_t
+)
diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp
new file mode 100644
index 0000000000000..b11df19f0dafb
--- /dev/null
+++ b/libc/test/src/__support/wchar/string_converter_test.cpp
@@ -0,0 +1,344 @@
+//===-- Unittests for StringConverter class -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/errno_macros.h"
+#include "hdr/types/char32_t.h"
+#include "hdr/types/char8_t.h"
+#include "src/__support/error_or.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/string_converter.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStringConverterTest, UTF8To32) {
+  // first 4 bytes are clown emoji (🤡)
+  // next 3 bytes are sigma symbol (∑)
+  // next 2 bytes are y with diaeresis (ÿ)
+  // last byte is the letter A
+  const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91\xC3\xBF\x41";
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
+      reinterpret_cast<const char8_t *>(src), &state, SIZE_MAX);
+
+  auto res = sc.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
+
+  res = sc.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x2211);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 7);
+
+  res = sc.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xff);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 9);
+
+  res = sc.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x41);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 10);
+
+  res = sc.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 11);
+
+  res = sc.popUTF32();
+  ASSERT_FALSE(res.has_value());
+  ASSERT_EQ(res.error(), -1);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 11);
+}
+
+TEST(LlvmLibcStringConverterTest, UTF32To8) {
+  // clown emoji, sigma symbol, y with diaeresis, letter A
+  const wchar_t *src = L"\x1f921\x2211\xff\x41";
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
+      reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX);
+
+  auto res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+
+  // end of clown emoji, sigma symbol begins
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xE2);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 2);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x88);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 2);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x91);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 2);
+
+  // end of sigma symbol, y with diaeresis begins
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xC3);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 3);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xBF);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 3);
+
+  // end of y with diaeresis, letter A begins
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x41);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
+
+  // null byte
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 5);
+
+  res = sc.popUTF8();
+  ASSERT_FALSE(res.has_value());
+  ASSERT_EQ(res.error(), -1);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 5);
+}
+
+TEST(LlvmLibcStringConverterTest, UTF32To8PartialRead) {
+  const wchar_t *src = L"\x1f921\x2211"; // clown emoji, sigma symbol
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
+      reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX, 1);
+
+  auto res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+
+  // can only read 1 character from source string, so error on next pop
+  res = sc.popUTF8();
+  ASSERT_FALSE(res.has_value());
+  ASSERT_EQ(res.error(), -1);
+}
+
+TEST(LlvmLibcStringConverterTest, UTF8To32PartialRead) {
+  // first 4 bytes are clown emoji, then next 3 are sigma symbol
+  const char *src = "\xF0\x9F\xA4\xA1\xE2\x88\x91";
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
+      reinterpret_cast<const char8_t *>(src), &state, SIZE_MAX, 5);
+
+  auto res = sc.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
+
+  res = sc.popUTF32();
+  ASSERT_FALSE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.error()), -1);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 5);
+}
+
+TEST(LlvmLibcStringConverterTest, UTF32To8ErrorHandling) {
+  const wchar_t *src = L"\x1f921\xffffff"; // clown emoji, invalid utf32
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
+      reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX);
+
+  auto res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+
+  res = sc.popUTF8();
+  ASSERT_FALSE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.error()), EILSEQ);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+}
+
+TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) {
+  // first 4 bytes are clown emoji (🤡)
+  // next 3 form an invalid character
+  const char *src = "\xF0\x9F\xA4\xA1\x90\x88\x30";
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
+      reinterpret_cast<const char8_t *>(src), &state, SIZE_MAX);
+
+  auto res = sc.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
+
+  res = sc.popUTF32();
+  ASSERT_FALSE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.error()), EILSEQ);
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
+}
+
+TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) {
+  /*
+  We do NOT test partially popping a character and expecting the next
+  StringConverter to continue where we left off. This is not expected to work
+  and considered invalid.
+  */
+  const wchar_t *src = L"\x1f921\xff"; // clown emoji, y with diaeresis (ÿ)
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char32_t> sc1(
+      reinterpret_cast<const char32_t *>(src), &state, SIZE_MAX, 1);
+
+  auto res = sc1.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xF0);
+  ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 1);
+
+  res = sc1.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x9F);
+  ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 1);
+
+  res = sc1.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xA4);
+  ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 1);
+
+  res = sc1.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xA1);
+  ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 1);
+
+  // sc2 should pick up where sc1 left off and continue the conversion
+  LIBC_NAMESPACE::internal::StringConverter<char32_t> sc2(
+      reinterpret_cast<const char32_t *>(src) + sc1.getSourceIndex(), &state,
+      SIZE_MAX, 1);
+
+  res = sc2.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xC3);
+  ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 1);
+
+  res = sc2.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0xBF);
+  ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 1);
+}
+
+TEST(LlvmLibcStringConverterTest, MultipleStringConverters8To32) {
+  const char *src = "\xF0\x9F\xA4\xA1"; // clown emoji
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char8_t> sc1(
+      reinterpret_cast<const char8_t *>(src), &state, SIZE_MAX, 2);
+
+  auto res = sc1.popUTF32();
+  ASSERT_FALSE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.error()), -1);
+  ASSERT_EQ(static_cast<int>(sc1.getSourceIndex()), 2);
+
+  // sc2 should pick up where sc1 left off and continue the conversion
+  LIBC_NAMESPACE::internal::StringConverter<char8_t> sc2(
+      reinterpret_cast<const char8_t *>(src) + sc1.getSourceIndex(), &state,
+      SIZE_MAX, 3);
+
+  res = sc2.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0x1f921);
+  ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 2);
+
+  res = sc2.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(res.value()), 0);
+  ASSERT_EQ(static_cast<int>(sc2.getSourceIndex()), 3);
+}
+
+TEST(LlvmLibcStringConverterTest, DestLimitUTF8To32) {
+  const char *src = "\xF0\x9F\xA4\xA1\xF0\x9F\xA4\xA1"; // 2 clown emojis
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char8_t> sc(
+      reinterpret_cast<const char8_t *>(src), &state, 1);
+
+  auto res = sc.popUTF32();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 4);
+
+  res = sc.popUTF32(); // no space to pop this into
+  ASSERT_FALSE(res.has_value());
+}
+
+TEST(LlvmLibcStringConverterTest, DestLimitUTF32To8) {
+  const wchar_t *src = L"\x1f921\x1f921"; // 2 clown emojis
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::StringConverter<char32_t> sc(
+      reinterpret_cast<const char32_t *>(src), &state, 5);
+
+  auto res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+
+  res = sc.popUTF8();
+  ASSERT_TRUE(res.has_value());
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+
+  res = sc.popUTF8();
+  ASSERT_FALSE(res.has_value());
+  ASSERT_EQ(static_cast<int>(sc.getSourceIndex()), 1);
+}