Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions offload/liboffload/API/Memory.td
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,23 @@ def : Function {
];
let returns = [];
}

// olMemFill: fill a region of memory with repeated copies of a pattern.
//
// The queue is a required handle: the implementation resolves the target
// device through the queue, so a null queue cannot be serviced.
def : Function {
  let name = "olMemFill";
  let desc = "Fill memory with copies of the given pattern";
  let details = [
    "Filling with patterns larger than 4 bytes may be less performant",
    "The destination pointer and queue must be associated with the same device",
    "The fill size must be a multiple of the pattern size",
  ];
  let params = [
    Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
    Param<"void*", "Ptr", "destination pointer to start filling at", PARAM_IN>,
    Param<"size_t", "PatternSize", "the size of the pattern in bytes", PARAM_IN>,
    Param<"const void*", "PatternPtr", "host pointer to the pattern to fill with; `PatternSize` bytes are read from it", PARAM_IN>,
    Param<"size_t", "FillSize", "number of bytes to fill", PARAM_IN>,
  ];
  let returns = [
    Return<"OL_ERRC_INVALID_SIZE", ["`FillSize % PatternSize != 0`"]>
  ];
}
6 changes: 6 additions & 0 deletions offload/liboffload/src/OffloadImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -656,6 +656,12 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
return Error::success();
}

/// Implementation of olMemFill: forwards the request to the plugin device the
/// queue belongs to, passing the queue's async info along.
///
/// Note the argument order differs between the two layers: the API takes
/// (PatternSize, PatternPtr) while the plugin's dataFill takes
/// (PatternPtr, PatternSize).
Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize,
                     const void *PatternPtr, size_t FillSize) {
  // NOTE(review): Queue is dereferenced without a null check here; a non-null
  // handle must be guaranteed before this function is reached.
  auto &TargetDevice = *Queue->Device->Device;
  return TargetDevice.dataFill(Ptr, PatternPtr, PatternSize, FillSize,
                               Queue->AsyncInfo);
}

Error olCreateProgram_impl(ol_device_handle_t Device, const void *ProgData,
size_t ProgDataSize, ol_program_handle_t *Program) {
// Make a copy of the program binary in case it is released by the caller.
Expand Down
25 changes: 25 additions & 0 deletions offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2583,6 +2583,31 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}

/// Fill \p Size bytes starting at \p TgtPtr with repeated copies of the
/// \p PatternSize byte pattern stored at \p PatternPtr.
///
/// Only 4-byte patterns are currently handled; every other pattern size
/// returns an UNSUPPORTED error.
///
/// \returns success, or the error from synchronization / the HSA fill call.
Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
                   int64_t Size,
                   AsyncInfoWrapperTy &AsyncInfoWrapper) override {
  // TODO: Implement for AMDGPU. Most likely by doing the fill in pinned
  // memory and copying to the device in one go.
  if (PatternSize != 4)
    return Plugin::error(ErrorCode::UNSUPPORTED, "Unsupported fill size");

  // We can use hsa_amd_memory_fill for this size, but it's not async so the
  // queue needs to be synchronized first
  if (AsyncInfoWrapper.hasQueue())
    if (auto Err = synchronize(AsyncInfoWrapper))
      return Err;

  // The count argument is in 32-bit elements, not bytes.
  hsa_status_t Status = hsa_amd_memory_fill(
      TgtPtr, *static_cast<const uint32_t *>(PatternPtr), Size / PatternSize);

  // Return the check result directly so that the success path also returns a
  // value; previously control fell off the end of this non-void function.
  return Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n");
}

/// Initialize the async info for interoperability purposes.
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
// TODO: Implement this function.
Expand Down
7 changes: 7 additions & 0 deletions offload/plugins-nextgen/common/include/PluginInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -957,6 +957,13 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
void *DstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;

/// Fill data on the device with a pattern from the host.
///
/// \param TgtPtr      destination pointer to start filling at.
/// \param PatternPtr  pointer to the pattern that is replicated.
/// \param PatternSize size of the pattern in bytes.
/// \param Size        total number of bytes to fill.
/// \param AsyncInfo   async info the operation is issued with; it is wrapped
///                    in an AsyncInfoWrapperTy before reaching the plugin.
Error dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
int64_t Size, __tgt_async_info *AsyncInfo);
/// Plugin-specific implementation of dataFill; every device type must
/// override it. Receives the same arguments as dataFill, with the async info
/// already wrapped.
virtual Error dataFillImpl(void *TgtPtr, const void *PatternPtr,
int64_t PatternSize, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;

/// Run the kernel associated with \p EntryPtr
Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
Expand Down
10 changes: 10 additions & 0 deletions offload/plugins-nextgen/common/src/PluginInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1540,6 +1540,16 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
return Err;
}

/// Generic entry point for pattern fills: wraps \p AsyncInfo, delegates the
/// actual work to the plugin's dataFillImpl, and hands the outcome to the
/// wrapper before returning it to the caller.
Error GenericDeviceTy::dataFill(void *TgtPtr, const void *PatternPtr,
                                int64_t PatternSize, int64_t Size,
                                __tgt_async_info *AsyncInfo) {
  AsyncInfoWrapperTy Wrapper(*this, AsyncInfo);

  Error FillResult =
      dataFillImpl(TgtPtr, PatternPtr, PatternSize, Size, Wrapper);

  // The wrapper must see the result of the operation before it goes out of
  // scope.
  Wrapper.finalize(FillResult);
  return FillResult;
}

Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
ptrdiff_t *ArgOffsets,
KernelArgsTy &KernelArgs,
Expand Down
7 changes: 7 additions & 0 deletions offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,13 @@ DLWRAP(cuMemcpyDtoHAsync, 4)
DLWRAP(cuMemcpyHtoD, 3)
DLWRAP(cuMemcpyHtoDAsync, 4)

// Asynchronous memset entry points used for pattern fills. The second DLWRAP
// argument is the number of parameters of the wrapped function: 4 for the
// linear variants, 6 for the 2D (pitched) variants.
DLWRAP(cuMemsetD8Async, 4)
DLWRAP(cuMemsetD16Async, 4)
DLWRAP(cuMemsetD32Async, 4)
DLWRAP(cuMemsetD2D8Async, 6)
DLWRAP(cuMemsetD2D16Async, 6)
DLWRAP(cuMemsetD2D32Async, 6)

DLWRAP(cuMemFree, 1)
DLWRAP(cuMemFreeHost, 1)
DLWRAP(cuMemFreeAsync, 2)
Expand Down
10 changes: 10 additions & 0 deletions offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,16 @@ CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);

// Asynchronous memset entry points. The value parameter type matches the CUDA
// Driver API for each element width (unsigned char / unsigned short /
// unsigned int) so that calls through the dynamically loaded symbol use the
// same signature as the real function.
//
// Linear variants: (destination, value, element count, stream).
CUresult cuMemsetD8Async(CUdeviceptr, unsigned char, size_t, CUstream);
CUresult cuMemsetD16Async(CUdeviceptr, unsigned short, size_t, CUstream);
CUresult cuMemsetD32Async(CUdeviceptr, unsigned int, size_t, CUstream);
// 2D variants: (destination, pitch in bytes, value, width in elements,
// height in rows, stream).
CUresult cuMemsetD2D8Async(CUdeviceptr, size_t, unsigned char, size_t, size_t,
                           CUstream);
CUresult cuMemsetD2D16Async(CUdeviceptr, size_t, unsigned short, size_t,
                            size_t, CUstream);
CUresult cuMemsetD2D32Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
                            CUstream);

CUresult cuMemFree(CUdeviceptr);
CUresult cuMemFreeHost(void *);
CUresult cuMemFreeAsync(CUdeviceptr, CUstream);
Expand Down
58 changes: 58 additions & 0 deletions offload/plugins-nextgen/cuda/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -844,6 +844,64 @@ struct CUDADeviceTy : public GenericDeviceTy {
void *DstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) override;

/// Fill \p Size bytes starting at \p TgtPtr with repeated copies of the
/// \p PatternSize byte pattern stored at \p PatternPtr, enqueued on the
/// stream obtained from \p AsyncInfoWrapper.
///
/// Patterns of 1, 2 or 4 bytes map to a single cuMemsetD{8,16,32}Async call.
/// Larger patterns are written as a series of strided 2D memsets, one per
/// pattern element, each filling a one-element-wide column whose rows are
/// PatternSize bytes apart.
///
/// \returns the first error reported by any of the issued CUDA calls.
Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
                   int64_t Size,
                   AsyncInfoWrapperTy &AsyncInfoWrapper) override {
  if (auto Err = setContext())
    return Err;

  CUstream Stream;
  if (auto Err = getStream(AsyncInfoWrapper, Stream))
    return Err;

  auto Check = [](CUresult Res) {
    return Plugin::check(Res, "error in cuMemset: %s");
  };

  const CUdeviceptr Dst = (CUdeviceptr)TgtPtr;
  // Number of whole patterns that fit in the destination range.
  const size_t NumPatterns = Size / PatternSize;

  switch (PatternSize) {
  case 1:
    return Check(cuMemsetD8Async(
        Dst, *static_cast<const uint8_t *>(PatternPtr), NumPatterns, Stream));
  case 2:
    return Check(cuMemsetD16Async(
        Dst, *static_cast<const uint16_t *>(PatternPtr), NumPatterns, Stream));
  case 4:
    return Check(cuMemsetD32Async(
        Dst, *static_cast<const uint32_t *>(PatternPtr), NumPatterns, Stream));
  default:
    break;
  }

  // For larger patterns we can do a series of strided fills to copy the
  // pattern efficiently. Use the widest element that divides the pattern.
  const int64_t MemsetSize = PatternSize % 4 == 0   ? 4
                             : PatternSize % 2 == 0 ? 2
                                                    : 1;
  const int64_t NumberOfSteps = PatternSize / MemsetSize;
  // Consecutive rows are exactly one pattern apart.
  const size_t Pitch = NumberOfSteps * MemsetSize;

  for (int64_t Step = 0; Step < NumberOfSteps; ++Step) {
    const CUdeviceptr StepDst = Dst + Step * MemsetSize;
    CUresult Res;
    if (MemsetSize == 4) {
      Res = cuMemsetD2D32Async(
          StepDst, Pitch, *(static_cast<const uint32_t *>(PatternPtr) + Step),
          1u, NumPatterns, Stream);
    } else if (MemsetSize == 2) {
      Res = cuMemsetD2D16Async(
          StepDst, Pitch, *(static_cast<const uint16_t *>(PatternPtr) + Step),
          1u, NumPatterns, Stream);
    } else {
      Res = cuMemsetD2D8Async(
          StepDst, Pitch, *(static_cast<const uint8_t *>(PatternPtr) + Step),
          1u, NumPatterns, Stream);
    }

    // Check every step: previously only the last step's result was inspected,
    // so a failure in an earlier step was silently overwritten.
    if (auto Err = Check(Res))
      return Err;
  }

  return Plugin::success();
}

/// Initialize the async info for interoperability purposes.
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
if (auto Err = setContext())
Expand Down
15 changes: 15 additions & 0 deletions offload/plugins-nextgen/host/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,21 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
return Plugin::success();
}

/// Fill \p Size bytes starting at \p TgtPtr with repeated copies of the
/// \p PatternSize byte pattern stored at \p PatternPtr. The fill is performed
/// immediately with memset/memcpy; \p AsyncInfoWrapper is not used.
///
/// NOTE(review): assumes PatternSize > 0 and Size is a multiple of
/// PatternSize; neither is checked here - confirm the caller validates both.
Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
                   int64_t Size,
                   AsyncInfoWrapperTy &AsyncInfoWrapper) override {
  // Single-byte patterns are exactly what memset does.
  if (PatternSize == 1) {
    std::memset(TgtPtr, *static_cast<const char *>(PatternPtr), Size);
    return Plugin::success();
  }

  // Use a 64-bit offset: a 32-bit unsigned counter wraps around for fills of
  // 4 GiB or more and would never reach Size.
  auto *Dst = static_cast<char *>(TgtPtr);
  for (int64_t Offset = 0; Offset < Size; Offset += PatternSize)
    std::memcpy(Dst + Offset, PatternPtr, PatternSize);

  return Plugin::success();
}

/// All functions are already synchronous. No need to do anything on this
/// synchronization function.
Error synchronizeImpl(__tgt_async_info &AsyncInfo,
Expand Down
1 change: 1 addition & 0 deletions offload/unittests/OffloadAPI/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ add_offload_unittest("kernel"

# Unit tests for the memory entry points (alloc, fill, free, memcpy).
add_offload_unittest("memory"
memory/olMemAlloc.cpp
memory/olMemFill.cpp
memory/olMemFree.cpp
memory/olMemcpy.cpp)

Expand Down
134 changes: 134 additions & 0 deletions offload/unittests/OffloadAPI/memory/olMemFill.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
//===------- Offload API tests - olMemFill --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "../common/Fixtures.hpp"
#include <OffloadAPI.h>
#include <gtest/gtest.h>

using olMemFillTest = OffloadQueueTest;
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemFillTest);

// Fill a managed allocation with a 1-byte pattern and verify every byte.
TEST_P(olMemFillTest, Success8) {
  constexpr size_t Size = 1024;
  void *Alloc = nullptr;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));

  uint8_t Pattern = 0x42;
  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));

  // The fill is enqueued on the queue; a failed sync must fail the test
  // rather than be silently ignored.
  ASSERT_SUCCESS(olSyncQueue(Queue));

  const auto *AllocPtr = reinterpret_cast<const uint8_t *>(Alloc);
  const size_t N = Size / sizeof(Pattern);
  for (size_t i = 0; i < N; i++) {
    ASSERT_EQ(AllocPtr[i], Pattern);
  }

  ASSERT_SUCCESS(olMemFree(Alloc));
}

// Fill a managed allocation with a 2-byte pattern and verify every element.
TEST_P(olMemFillTest, Success16) {
  constexpr size_t Size = 1024;
  void *Alloc = nullptr;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));

  uint16_t Pattern = 0x4242;
  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));

  // The fill is enqueued on the queue; a failed sync must fail the test
  // rather than be silently ignored.
  ASSERT_SUCCESS(olSyncQueue(Queue));

  const auto *AllocPtr = reinterpret_cast<const uint16_t *>(Alloc);
  const size_t N = Size / sizeof(Pattern);
  for (size_t i = 0; i < N; i++) {
    ASSERT_EQ(AllocPtr[i], Pattern);
  }

  ASSERT_SUCCESS(olMemFree(Alloc));
}

// Fill a managed allocation with a 4-byte pattern and verify every element.
TEST_P(olMemFillTest, Success32) {
  constexpr size_t Size = 1024;
  void *Alloc = nullptr;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));

  uint32_t Pattern = 0xDEADBEEF;
  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));

  // The fill is enqueued on the queue; a failed sync must fail the test
  // rather than be silently ignored.
  ASSERT_SUCCESS(olSyncQueue(Queue));

  const auto *AllocPtr = reinterpret_cast<const uint32_t *>(Alloc);
  const size_t N = Size / sizeof(Pattern);
  for (size_t i = 0; i < N; i++) {
    ASSERT_EQ(AllocPtr[i], Pattern);
  }

  ASSERT_SUCCESS(olMemFree(Alloc));
}

// Fill a managed allocation with a 16-byte pattern and verify every element.
TEST_P(olMemFillTest, SuccessLarge) {
  constexpr size_t Size = 1024;
  void *Alloc = nullptr;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));

  // Every byte of the pattern is distinct from its neighbours so that a fill
  // which writes pattern pieces in the wrong order or at the wrong stride is
  // detected; an all-ones pattern cannot tell those apart.
  constexpr uint64_t ExpectedA = 0x0123456789ABCDEFull;
  constexpr uint64_t ExpectedB = 0xFEDCBA9876543210ull;
  struct PatternT {
    uint64_t A;
    uint64_t B;
  } Pattern{ExpectedA, ExpectedB};

  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));

  // The fill is enqueued on the queue; a failed sync must fail the test
  // rather than be silently ignored.
  ASSERT_SUCCESS(olSyncQueue(Queue));

  const auto *AllocPtr = reinterpret_cast<const PatternT *>(Alloc);
  const size_t N = Size / sizeof(Pattern);
  for (size_t i = 0; i < N; i++) {
    ASSERT_EQ(AllocPtr[i].A, ExpectedA);
    ASSERT_EQ(AllocPtr[i].B, ExpectedB);
  }

  ASSERT_SUCCESS(olMemFree(Alloc));
}

// Fill a managed allocation with a 17-byte (odd-sized, packed) pattern and
// verify every element.
TEST_P(olMemFillTest, SuccessLargeByteAligned) {
  constexpr size_t Size = 17 * 64;
  void *Alloc = nullptr;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));

  // Every byte of the pattern is distinct from its neighbours so that a fill
  // which writes pattern pieces in the wrong order or at the wrong stride is
  // detected; an all-ones pattern cannot tell those apart.
  constexpr uint64_t ExpectedA = 0x0123456789ABCDEFull;
  constexpr uint64_t ExpectedB = 0xFEDCBA9876543210ull;
  constexpr uint8_t ExpectedC = 0x5A;
  struct __attribute__((packed)) PatternT {
    uint64_t A;
    uint64_t B;
    uint8_t C;
  } Pattern{ExpectedA, ExpectedB, ExpectedC};

  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));

  // The fill is enqueued on the queue; a failed sync must fail the test
  // rather than be silently ignored.
  ASSERT_SUCCESS(olSyncQueue(Queue));

  const auto *AllocPtr = reinterpret_cast<const PatternT *>(Alloc);
  const size_t N = Size / sizeof(Pattern);
  for (size_t i = 0; i < N; i++) {
    ASSERT_EQ(AllocPtr[i].A, ExpectedA);
    ASSERT_EQ(AllocPtr[i].B, ExpectedB);
    ASSERT_EQ(AllocPtr[i].C, ExpectedC);
  }

  ASSERT_SUCCESS(olMemFree(Alloc));
}

// A fill size that is not a multiple of the pattern size (1025 bytes with a
// 2-byte pattern) must be rejected with OL_ERRC_INVALID_SIZE.
TEST_P(olMemFillTest, InvalidPatternSize) {
  constexpr size_t Size = 1025;
  void *Alloc = nullptr;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));

  uint16_t Pattern = 0x4242;
  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
               olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));

  // The rejected fill must leave the queue and the allocation usable, so the
  // results of syncing and freeing are checked rather than ignored.
  ASSERT_SUCCESS(olSyncQueue(Queue));
  ASSERT_SUCCESS(olMemFree(Alloc));
}
Loading