diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp index 7f147ac26d3d1..ca709769616c3 100644 --- a/libc/src/__support/wchar/character_converter.cpp +++ b/libc/src/__support/wchar/character_converter.cpp @@ -8,7 +8,6 @@ #include "hdr/types/char32_t.h" #include "hdr/types/char8_t.h" -#include "src/__support/CPP/bit.h" #include "src/__support/common.h" #include "src/__support/error_or.h" #include "src/__support/math_extras.h" @@ -31,49 +30,6 @@ bool CharacterConverter::isComplete() { return state->bytes_processed == state->total_bytes; } -int CharacterConverter::push(char8_t utf8_byte) { - uint8_t num_ones = static_cast(cpp::countl_one(utf8_byte)); - // Checking the first byte if first push - if (state->bytes_processed == 0) { - // UTF-8 char has 1 byte total - if (num_ones == 0) { - state->total_bytes = 1; - } - // UTF-8 char has 2 through 4 bytes total - else if (num_ones >= 2 && num_ones <= 4) { - /* Since the format is 110xxxxx, 1110xxxx, and 11110xxx for 2, 3, and 4, - we will make the base mask with 7 ones and right shift it as necessary. */ - constexpr size_t SIGNIFICANT_BITS = 7; - uint32_t base_mask = mask_trailing_ones(); - state->total_bytes = num_ones; - utf8_byte &= (base_mask >> num_ones); - } - // Invalid first byte - else { - // bytes_processed and total_bytes will always be 0 here - state->partial = static_cast(0); - return -1; - } - state->partial = static_cast(utf8_byte); - state->bytes_processed++; - return 0; - } - // Any subsequent push - // Adding 6 more bits so need to left shift - constexpr size_t ENCODED_BITS_PER_UTF8 = 6; - if (num_ones == 1 && !isComplete()) { - char32_t byte = - utf8_byte & mask_trailing_ones(); - state->partial = state->partial << ENCODED_BITS_PER_UTF8; - state->partial |= byte; - state->bytes_processed++; - return 0; - } - // Invalid byte -> reset the state - clear(); - return -1; -} - int CharacterConverter::push(char32_t utf32) { // we can't be partially through a conversion when pushing a utf32 value if (!isComplete()) @@ -98,17 +54,6 @@ int CharacterConverter::push(char32_t utf32) { return -1; } -ErrorOr CharacterConverter::pop_utf32() { - // If pop is called too early, do not reset the state, use error to determine - // whether enough bytes have been pushed - if (!isComplete() || state->bytes_processed == 0) - return Error(-1); - char32_t utf32 = state->partial; - // reset if successful pop - clear(); - return utf32; -} - ErrorOr CharacterConverter::pop_utf8() { if (isComplete()) return Error(-1); diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index 9f626ed31cc07..76218a16e0cf7 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -275,8 +275,9 @@ add_subdirectory(fixed_point) add_subdirectory(HashTable) add_subdirectory(time) add_subdirectory(threads) -# Requires access to uchar header which is not on MacOS -# Cannot currently build this on MacOS in overlay mode + +# Requires access to uchar header which is not on macos +# Therefore, cannot currently build this on macos in overlay mode if(NOT(LIBC_TARGET_OS_IS_DARWIN)) add_subdirectory(wchar) endif() diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt index 5176bfd4b024b..5dff6e9115f7d 100644 --- a/libc/test/src/__support/wchar/CMakeLists.txt +++ b/libc/test/src/__support/wchar/CMakeLists.txt @@ -1,15 +1,5 @@ add_custom_target(libc-support-wchar-tests) -add_libc_test( - utf8_to_32_test - SUITE - libc-support-tests - SRCS - utf8_to_32_test.cpp - DEPENDS - libc.src.__support.wchar.character_converter -) - add_libc_test( utf32_to_8_test SUITE diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp deleted file mode 100644 index 9cb059faa9374..0000000000000 --- a/libc/test/src/__support/wchar/utf8_to_32_test.cpp +++ /dev/null @@ -1,196 +0,0 @@ -//===-- Unittests for character_converter utf8->utf32 ---------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/__support/error_or.h" -#include "src/__support/wchar/character_converter.h" -#include "src/__support/wchar/mbstate.h" -#include "test/UnitTest/Test.h" - -TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) { - LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; - state.total_bytes = 0; - char ch = 'A'; - - LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); - int err = char_conv.push(static_cast(ch)); - auto wch = char_conv.pop_utf32(); - - ASSERT_EQ(err, 0); - ASSERT_TRUE(wch.has_value()); - ASSERT_EQ(static_cast(wch.value()), 65); -} - -TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) { - LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; - state.total_bytes = 0; - const char ch[2] = {static_cast(0xC2), - static_cast(0x8E)}; // Ž car symbol - - LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); - char_conv.push(static_cast(ch[0])); - char_conv.push(static_cast(ch[1])); - auto wch = char_conv.pop_utf32(); - - ASSERT_TRUE(wch.has_value()); - ASSERT_EQ(static_cast(wch.value()), 142); -} - -TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) { - LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; - state.total_bytes = 0; - const char ch[3] = {static_cast(0xE2), static_cast(0x88), - static_cast(0x91)}; // ∑ sigma symbol - - LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); - char_conv.push(static_cast(ch[0])); - char_conv.push(static_cast(ch[1])); - char_conv.push(static_cast(ch[2])); - auto wch = char_conv.pop_utf32(); - - ASSERT_TRUE(wch.has_value()); - ASSERT_EQ(static_cast(wch.value()), 8721); -} - -TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) { - LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; - state.total_bytes = 0; - const char ch[4] = {static_cast(0xF0), static_cast(0x9F), - static_cast(0xA4), - static_cast(0xA1)}; // 🤡 clown emoji - - LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); - char_conv.push(static_cast(ch[0])); - char_conv.push(static_cast(ch[1])); - char_conv.push(static_cast(ch[2])); - char_conv.push(static_cast(ch[3])); - auto wch = char_conv.pop_utf32(); - - ASSERT_TRUE(wch.has_value()); - ASSERT_EQ(static_cast(wch.value()), 129313); -} - -TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) { - LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; - state.total_bytes = 0; - const char ch = static_cast(0x80); // invalid starting bit sequence - - LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); - int err = char_conv.push(static_cast(ch)); - - ASSERT_EQ(err, -1); -} - -TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) { - LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; - state.total_bytes = 0; - const char ch[4] = { - static_cast(0x80), static_cast(0x00), static_cast(0x80), - static_cast(0x00)}; // first and third bytes are invalid - - LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); - int err = char_conv.push(static_cast(ch[0])); - ASSERT_EQ(err, -1); - err = char_conv.push(static_cast(ch[1])); - ASSERT_EQ(err, 0); - // Prev byte was single byte so trying to push another should error. - err = char_conv.push(static_cast(ch[2])); - ASSERT_EQ(err, -1); - err = char_conv.push(static_cast(ch[3])); - ASSERT_EQ(err, 0); -} - -TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) { - LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; - state.total_bytes = 0; - // Last byte is invalid since it does not have correct starting sequence. - // 0xC0 --> 11000000 starting sequence should be 10xxxxxx - const char ch[4] = {static_cast(0xF1), static_cast(0x80), - static_cast(0x80), static_cast(0xC0)}; - - LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); - int err = char_conv.push(static_cast(ch[0])); - ASSERT_EQ(err, 0); - err = char_conv.push(static_cast(ch[1])); - ASSERT_EQ(err, 0); - err = char_conv.push(static_cast(ch[2])); - ASSERT_EQ(err, 0); - err = char_conv.push(static_cast(ch[3])); - ASSERT_EQ(err, -1); -} - -TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) { - LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; - state.total_bytes = 0; - const char ch[3] = {static_cast(0xC2), static_cast(0x8E), - static_cast(0x80)}; - - LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); - int err = char_conv.push(static_cast(ch[0])); - ASSERT_EQ(err, 0); - err = char_conv.push(static_cast(ch[1])); - ASSERT_EQ(err, 0); - // Should produce an error on 3rd byte - err = char_conv.push(static_cast(ch[2])); - ASSERT_EQ(err, -1); - - // Should produce an error since mbstate was reset - auto wch = char_conv.pop_utf32(); - ASSERT_FALSE(wch.has_value()); -} - -TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) { - LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; - state.total_bytes = 0; - const char ch[4] = {static_cast(0xC2), static_cast(0x8E), - static_cast(0xC7), static_cast(0x8C)}; - - LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); - int err = char_conv.push(static_cast(ch[0])); - ASSERT_EQ(err, 0); - err = char_conv.push(static_cast(ch[1])); - ASSERT_EQ(err, 0); - auto wch = char_conv.pop_utf32(); - ASSERT_TRUE(wch.has_value()); - ASSERT_EQ(static_cast(wch.value()), 142); - - // Second two byte character - err = char_conv.push(static_cast(ch[2])); - ASSERT_EQ(err, 0); - err = char_conv.push(static_cast(ch[3])); - ASSERT_EQ(err, 0); - wch = char_conv.pop_utf32(); - ASSERT_TRUE(wch.has_value()); - ASSERT_EQ(static_cast(wch.value()), 460); -} - -TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) { - LIBC_NAMESPACE::internal::mbstate state; - state.bytes_processed = 0; - state.total_bytes = 0; - LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); - const char ch[2] = {static_cast(0xC2), static_cast(0x8E)}; - int err = char_conv.push(static_cast(ch[0])); - ASSERT_EQ(err, 0); - auto wch = char_conv.pop_utf32(); - ASSERT_FALSE( - wch.has_value()); // Should fail since we have not read enough bytes - err = char_conv.push(static_cast(ch[1])); - ASSERT_EQ(err, 0); - wch = char_conv.pop_utf32(); - ASSERT_TRUE(wch.has_value()); - ASSERT_EQ(static_cast(wch.value()), 142); -}