Skip to content

[libc][NFC] refactor Cortex memcpy code #148204

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions libc/src/string/memory_utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ add_header_library(
aarch64/inline_memcpy.h
aarch64/inline_memmove.h
aarch64/inline_memset.h
arm/common.h
arm/inline_memcpy.h
generic/aligned_access.h
generic/byte_per_byte.h
Expand Down
52 changes: 52 additions & 0 deletions libc/src/string/memory_utils/arm/common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
//===-- Common constants and defines for arm --------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H

#include "src/__support/macros/attributes.h" // LIBC_INLINE_VAR
#include "src/string/memory_utils/utils.h" // CPtr, Ptr, distance_to_align

#include <stddef.h> // size_t

// Branch-prediction hint macros: LIBC_ATTR_LIKELY / LIBC_ATTR_UNLIKELY.
//
// Availability of [[likely]] / [[unlikely]], per
// https://libc.llvm.org/compiler_support.html
//   [X] GCC 12.2
//   [X] Clang 12
//   [ ] Clang 11
#if defined(LIBC_COMPILER_IS_CLANG) && (LIBC_COMPILER_CLANG_VER < 1200)
// Clang releases older than 12 do not provide the attributes, so the macros
// expand to nothing there.
#define LIBC_ATTR_LIKELY
#define LIBC_ATTR_UNLIKELY
#else
#define LIBC_ATTR_LIKELY [[likely]]
#define LIBC_ATTR_UNLIKELY [[unlikely]]
#endif

namespace LIBC_NAMESPACE_DECL {

// Word size in bytes (the size of a uint32_t); used as the alignment unit by
// the helpers below.
LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t);

// Tells a block-consuming helper whether it should also update its `size`
// argument once the blocks are copied (kYes) or leave it untouched (kNo).
enum class BumpSize { kNo, kYes };
// Selects how one block is copied: as a single copy covering the whole block
// (kFull) or as a sequence of word-sized copies (kByWord).
enum class BlockOp { kFull, kByWord };

// Returns the value of `distance_to_align_down<kWordSize>(ptr)`, i.e. how far
// `ptr` sits past the word boundary below it. Callers test the result for
// non-zero to detect a pointer that is not word-aligned.
LIBC_INLINE auto misaligned(CPtr ptr) {
  const auto bytes_past_boundary = distance_to_align_down<kWordSize>(ptr);
  return bytes_past_boundary;
}

// Returns a pointer whose address bits are the bitwise OR of the address bits
// of `a` and `b`. The result is only meant to be inspected (e.g. for its low
// bits), not dereferenced.
LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) {
  const uintptr_t lhs_bits = cpp::bit_cast<uintptr_t>(a);
  const uintptr_t rhs_bits = cpp::bit_cast<uintptr_t>(b);
  return cpp::bit_cast<CPtr>(lhs_bits | rhs_bits);
}

} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H
128 changes: 48 additions & 80 deletions libc/src/string/memory_utils/arm/inline_memcpy.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,58 +10,38 @@

#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL
#include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY
#include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align

#include <stddef.h> // size_t

// https://libc.llvm.org/compiler_support.html
// Support for [[likely]] / [[unlikely]]
// [X] GCC 12.2
// [X] Clang 12
// [ ] Clang 11
#define LIBC_ATTR_LIKELY [[likely]]
#define LIBC_ATTR_UNLIKELY [[unlikely]]

#if defined(LIBC_COMPILER_IS_CLANG)
#if LIBC_COMPILER_CLANG_VER < 1200
#undef LIBC_ATTR_LIKELY
#undef LIBC_ATTR_UNLIKELY
#define LIBC_ATTR_LIKELY
#define LIBC_ATTR_UNLIKELY
#endif
#endif

namespace LIBC_NAMESPACE_DECL {

namespace {

LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t);

enum Strategy {
ForceWordLdStChain,
AssumeWordAligned,
AssumeUnaligned,
};
// Copies exactly `bytes` bytes from `src` to `dst` through `memcpy_inline`,
// after telling the compiler (via `assume_aligned`) that both pointers are
// aligned to the copy size, capped at one word.
// NOTE(review): some call sites pass pointers that are not guaranteed to be
// aligned; confirm the alignment assumption is intended there.
template <size_t bytes>
LIBC_INLINE void copy_assume_aligned(void *dst, const void *src) {
  // Assumed alignment: `bytes` for sub-word copies, `kWordSize` otherwise.
  constexpr size_t alignment = (bytes <= kWordSize) ? bytes : kWordSize;
  const auto aligned_dst = assume_aligned<alignment>(dst);
  const auto aligned_src = assume_aligned<alignment>(src);
  memcpy_inline<bytes>(aligned_dst, aligned_src);
}

template <size_t bytes, Strategy strategy = AssumeUnaligned>
LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
if constexpr (strategy == AssumeUnaligned) {
memcpy_inline<bytes>(assume_aligned<1>(dst), assume_aligned<1>(src));
} else if constexpr (strategy == AssumeWordAligned) {
static_assert(bytes >= kWordSize);
memcpy_inline<bytes>(assume_aligned<kWordSize>(dst),
assume_aligned<kWordSize>(src));
} else if constexpr (strategy == ForceWordLdStChain) {
template <size_t bytes, BlockOp block_op = BlockOp::kFull>
LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) {
if constexpr (block_op == BlockOp::kFull) {
copy_assume_aligned<bytes>(dst, src);
} else if constexpr (block_op == BlockOp::kByWord) {
// We restrict loads/stores to 4 byte to prevent the use of load/store
// multiple (LDM, STM) and load/store double (LDRD, STRD). First, they may
// fault (see notes below) and second, they use more registers which in turn
// adds push/pop instructions in the hot path.
static_assert((bytes % kWordSize == 0) && (bytes >= kWordSize));
// multiple (LDM, STM) and load/store double (LDRD, STRD). First, they
// may fault (see notes below) and second, they use more registers which
// in turn adds push/pop instructions in the hot path.
static_assert(bytes >= kWordSize);
LIBC_LOOP_UNROLL
for (size_t i = 0; i < bytes / kWordSize; ++i) {
const size_t offset = i * kWordSize;
memcpy_inline<kWordSize>(dst + offset, src + offset);
for (size_t offset = 0; offset < bytes; offset += kWordSize) {
copy_assume_aligned<kWordSize>(dst + offset, src + offset);
}
} else {
static_assert(false, "Invalid BlockOp");
}
// In the 1, 2, 4 byte copy case, the compiler can fold pointer offsetting
// into the load/store instructions.
Expand All @@ -72,30 +52,19 @@ LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
src += bytes;
}

LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src,
const size_t size) {
template <size_t bytes, BlockOp block_op, BumpSize bump_size = BumpSize::kYes>
LIBC_INLINE void consume_by_aligned_block(Ptr &dst, CPtr &src, size_t &size) {
LIBC_LOOP_NOUNROLL
for (size_t i = 0; i < size; ++i)
*dst++ = *src++;
}

template <size_t block_size, Strategy strategy>
LIBC_INLINE void copy_blocks_and_update_args(Ptr &dst, CPtr &src,
size_t &size) {
LIBC_LOOP_NOUNROLL
for (size_t i = 0; i < size / block_size; ++i)
copy_and_bump_pointers<block_size, strategy>(dst, src);
// Update `size` once at the end instead of once per iteration.
size %= block_size;
}

LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) {
return cpp::bit_cast<CPtr>(cpp::bit_cast<uintptr_t>(a) |
cpp::bit_cast<uintptr_t>(b));
for (size_t i = 0; i < size / bytes; ++i)
copy_block_and_bump_pointers<bytes, block_op>(dst, src);
if constexpr (bump_size == BumpSize::kYes) {
size %= bytes;
}
}

LIBC_INLINE auto misaligned(CPtr a) {
return distance_to_align_down<kWordSize>(a);
// Copies `size` bytes from `src` to `dst` in one-byte blocks, advancing both
// pointers past the copied bytes. `size` is received by value and
// BumpSize::kNo is requested, so no remaining-size bookkeeping is done here.
LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src,
size_t size) {
consume_by_aligned_block<1, BlockOp::kFull, BumpSize::kNo>(dst, src, size);
}

} // namespace
Expand Down Expand Up @@ -125,20 +94,21 @@ LIBC_INLINE auto misaligned(CPtr a) {
if (src_alignment == 0)
LIBC_ATTR_LIKELY {
// Both `src` and `dst` are now word-aligned.
copy_blocks_and_update_args<64, AssumeWordAligned>(dst, src, size);
copy_blocks_and_update_args<16, AssumeWordAligned>(dst, src, size);
copy_blocks_and_update_args<4, AssumeWordAligned>(dst, src, size);
consume_by_aligned_block<64, BlockOp::kFull>(dst, src, size);
consume_by_aligned_block<16, BlockOp::kFull>(dst, src, size);
consume_by_aligned_block<4, BlockOp::kFull>(dst, src, size);
}
else {
// `dst` is aligned but `src` is not.
LIBC_LOOP_NOUNROLL
while (size >= kWordSize) {
// Recompose word from multiple loads depending on the alignment.
// Recompose word from multiple loads depending on the
// alignment.
const uint32_t value =
src_alignment == 2
? load_aligned<uint32_t, uint16_t, uint16_t>(src)
: load_aligned<uint32_t, uint8_t, uint16_t, uint8_t>(src);
memcpy_inline<kWordSize>(assume_aligned<kWordSize>(dst), &value);
copy_assume_aligned<kWordSize>(dst, &value);
dst += kWordSize;
src += kWordSize;
size -= kWordSize;
Expand Down Expand Up @@ -169,31 +139,33 @@ LIBC_INLINE auto misaligned(CPtr a) {
if (size < 8)
LIBC_ATTR_UNLIKELY {
if (size & 1)
copy_and_bump_pointers<1>(dst, src);
copy_block_and_bump_pointers<1>(dst, src);
if (size & 2)
copy_and_bump_pointers<2>(dst, src);
copy_block_and_bump_pointers<2>(dst, src);
if (size & 4)
copy_and_bump_pointers<4>(dst, src);
copy_block_and_bump_pointers<4>(dst, src);
return;
}
if (misaligned(src))
LIBC_ATTR_UNLIKELY {
const size_t offset = distance_to_align_up<kWordSize>(dst);
if (offset & 1)
copy_and_bump_pointers<1>(dst, src);
copy_block_and_bump_pointers<1>(dst, src);
if (offset & 2)
copy_and_bump_pointers<2>(dst, src);
copy_block_and_bump_pointers<2>(dst, src);
size -= offset;
}
}
copy_blocks_and_update_args<64, ForceWordLdStChain>(dst, src, size);
copy_blocks_and_update_args<16, ForceWordLdStChain>(dst, src, size);
copy_blocks_and_update_args<4, AssumeUnaligned>(dst, src, size);
// `dst` and `src` are not necessarily both aligned at this point, but this
// implementation assumes hardware support for unaligned loads and stores.
consume_by_aligned_block<64, BlockOp::kByWord>(dst, src, size);
consume_by_aligned_block<16, BlockOp::kByWord>(dst, src, size);
consume_by_aligned_block<4, BlockOp::kFull>(dst, src, size);
if (size & 1)
copy_and_bump_pointers<1>(dst, src);
copy_block_and_bump_pointers<1>(dst, src);
if (size & 2)
LIBC_ATTR_UNLIKELY
copy_and_bump_pointers<2>(dst, src);
copy_block_and_bump_pointers<2>(dst, src);
}

[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(void *__restrict dst_,
Expand All @@ -210,8 +182,4 @@ LIBC_INLINE auto misaligned(CPtr a) {

} // namespace LIBC_NAMESPACE_DECL

// Cleanup local macros
#undef LIBC_ATTR_LIKELY
#undef LIBC_ATTR_UNLIKELY

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
1 change: 1 addition & 0 deletions utils/bazel/llvm-project-overlay/libc/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -4268,6 +4268,7 @@ libc_support_library(
"src/string/memory_utils/aarch64/inline_memcpy.h",
"src/string/memory_utils/aarch64/inline_memmove.h",
"src/string/memory_utils/aarch64/inline_memset.h",
"src/string/memory_utils/arm/common.h",
"src/string/memory_utils/arm/inline_memcpy.h",
"src/string/memory_utils/generic/aligned_access.h",
"src/string/memory_utils/generic/byte_per_byte.h",
Expand Down
Loading