Skip to content

[libc] Wchar Stringconverter #146388

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion libc/src/__support/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,7 @@ add_subdirectory(time)

# Requires access to uchar header which is not on macos
# Therefore, cannot currently build this on macos in overlay mode
if(NOT(LIBC_TARGET_OS_IS_DARWIN))
if(NOT (LIBC_TARGET_OS_IS_DARWIN OR LIBC_TARGET_OS_IS_WINDOWS))
add_subdirectory(wchar)
endif()

Expand Down
14 changes: 14 additions & 0 deletions libc/src/__support/wchar/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,19 @@ add_header_library(
libc.hdr.types.char32_t
)

add_header_library(
string_converter
HDRS
string_converter.h
DEPENDS
libc.hdr.types.char8_t
libc.hdr.types.char32_t
libc.hdr.types.size_t
libc.src.__support.error_or
.mbstate
.character_converter
)

add_object_library(
character_converter
HDRS
Expand All @@ -16,6 +29,7 @@ add_object_library(
libc.hdr.errno_macros
libc.hdr.types.char8_t
libc.hdr.types.char32_t
libc.hdr.types.size_t
libc.src.__support.error_or
libc.src.__support.math_extras
.mbstate
Expand Down
11 changes: 11 additions & 0 deletions libc/src/__support/wchar/character_converter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "hdr/errno_macros.h"
#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
#include "hdr/types/size_t.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/common.h"
#include "src/__support/error_or.h"
Expand Down Expand Up @@ -92,6 +93,7 @@ int CharacterConverter::push(char8_t utf8_byte) {
state->bytes_stored++;
return 0;
}

// Invalid byte -> reset the state
clear();
return EILSEQ;
Expand Down Expand Up @@ -130,6 +132,12 @@ ErrorOr<char32_t> CharacterConverter::pop_utf32() {
return utf32;
}

size_t CharacterConverter::sizeAsUTF32() {
return 1; // a single utf-32 value can fit an entire character
}

size_t CharacterConverter::sizeAsUTF8() { return state->total_bytes; }

ErrorOr<char8_t> CharacterConverter::pop_utf8() {
if (isEmpty())
return Error(-1);
Expand All @@ -156,6 +164,9 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
}

state->bytes_stored--;
if (state->bytes_stored == 0)
clear();

return static_cast<char8_t>(output);
}

Expand Down
4 changes: 4 additions & 0 deletions libc/src/__support/wchar/character_converter.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
#include "hdr/types/size_t.h"
#include "src/__support/common.h"
#include "src/__support/error_or.h"
#include "src/__support/wchar/mbstate.h"
Expand All @@ -30,6 +31,9 @@ class CharacterConverter {
bool isEmpty();
bool isValidState();

size_t sizeAsUTF32();
size_t sizeAsUTF8();

int push(char8_t utf8_byte);
int push(char32_t utf32);

Expand Down
110 changes: 110 additions & 0 deletions libc/src/__support/wchar/string_converter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
//===-- Definition of a class for mbstate_t and conversion -----*-- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
#define LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H

#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
#include "hdr/types/size_t.h"
#include "src/__support/common.h"
#include "src/__support/error_or.h"
#include "src/__support/wchar/character_converter.h"
#include "src/__support/wchar/mbstate.h"

namespace LIBC_NAMESPACE_DECL {
namespace internal {

template <typename T> class StringConverter {
private:
CharacterConverter cr;
const T *src;
size_t src_len;
size_t src_idx;

// # of pops we are allowed to perform (essentially size of the dest buffer)
size_t num_to_write;

ErrorOr<size_t> pushFullCharacter() {
size_t num_pushed;
for (num_pushed = 0; !cr.isFull() && src_idx + num_pushed < src_len;
++num_pushed) {
int err = cr.push(src[src_idx + num_pushed]);
if (err != 0)
return Error(err);
}

// if we aren't able to read a full character from the source string
if (src_idx + num_pushed == src_len && !cr.isFull()) {
src_idx += num_pushed;
return Error(-1);
}

return num_pushed;
}

public:
StringConverter(const T *s, mbstate *ps, size_t dstlen,
size_t srclen = SIZE_MAX)
: cr(ps), src(s), src_len(srclen), src_idx(0), num_to_write(dstlen) {}

// TODO: following functions are almost identical
// look into templating CharacterConverter pop functions
ErrorOr<char32_t> popUTF32() {
if (cr.isEmpty() || src_idx == 0) {
auto src_elements_read = pushFullCharacter();
if (!src_elements_read.has_value())
return Error(src_elements_read.error());

if (cr.sizeAsUTF32() > num_to_write) {
cr.clear();
return Error(-1);
}

src_idx += src_elements_read.value();
}

auto out = cr.pop_utf32();
if (out.has_value() && out.value() == L'\0')
src_len = src_idx;

num_to_write--;

return out;
}

ErrorOr<char8_t> popUTF8() {
if (cr.isEmpty() || src_idx == 0) {
auto src_elements_read = pushFullCharacter();
if (!src_elements_read.has_value())
return Error(src_elements_read.error());

if (cr.sizeAsUTF8() > num_to_write) {
cr.clear();
return Error(-1);
}

src_idx += src_elements_read.value();
}

auto out = cr.pop_utf8();
if (out.has_value() && out.value() == '\0')
src_len = src_idx;

num_to_write--;

return out;
}

size_t getSourceIndex() { return src_idx; }
};

} // namespace internal
} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
2 changes: 1 addition & 1 deletion libc/test/src/__support/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,6 @@ add_subdirectory(time)
add_subdirectory(threads)
# Requires access to uchar header which is not on MacOS
# Cannot currently build this on MacOS in overlay mode
if(NOT(LIBC_TARGET_OS_IS_DARWIN))
if(NOT (LIBC_TARGET_OS_IS_DARWIN OR LIBC_TARGET_OS_IS_WINDOWS))
add_subdirectory(wchar)
endif()
14 changes: 14 additions & 0 deletions libc/test/src/__support/wchar/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,17 @@ add_libc_test(
DEPENDS
libc.src.__support.wchar.character_converter
)

add_libc_test(
string_converter_test.cpp
SUITE
libc-support-tests
SRCS
string_converter_test.cpp
DEPENDS
libc.src.__support.wchar.string_converter
libc.src.__support.wchar.mbstate
libc.src.__support.error_or
libc.hdr.errno_macros
libc.hdr.types.char32_t
)
Loading
Loading