Skip to content

Commit f96bdec

Browse files
committed
[Offload] Implement olMemFill
1 parent c1e2a9c commit f96bdec

File tree

11 files changed

+287
-0
lines changed

11 files changed

+287
-0
lines changed

offload/liboffload/API/Memory.td

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,3 +63,23 @@ def : Function {
6363
];
6464
let returns = [];
6565
}
66+
67+
// Entry point that fills device memory with repeated copies of a pattern.
// NOTE(review): Queue is declared optional here, but the visible
// olMemFill_impl dereferences it unconditionally - confirm the intended
// behavior for a null queue.
def : Function {
  let name = "olMemFill";
  let desc = "Fill memory with copies of the given pattern";
  let details = [
    "Filling with patterns larger than 4 bytes may be less performant",
    "The destination pointer and queue must be associated with the same device",
    "The fill size must be a multiple of the pattern size",
    "The pattern size must be non-zero",
  ];
  let params = [
    Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>,
    Param<"void*", "Ptr", "destination pointer to start filling at", PARAM_IN>,
    Param<"size_t", "PatternSize", "the size of the pattern in bytes", PARAM_IN>,
    Param<"void*", "PatternPtr", "pointer to the `PatternSize` bytes of pattern data to fill with", PARAM_IN>,
    Param<"size_t", "FillSize", "number of bytes to fill", PARAM_IN>,
  ];
  // The zero check is listed first so that the modulo condition below is
  // never evaluated with a zero divisor.
  let returns = [
    Return<"OL_ERRC_INVALID_SIZE", [
      "`PatternSize == 0`",
      "`FillSize % PatternSize != 0`"
    ]>
  ];
}

offload/liboffload/src/OffloadImpl.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -656,6 +656,12 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
656656
return Error::success();
657657
}
658658

659+
/// Implementation of olMemFill: forwards the fill request to the plugin
/// device that owns \p Queue, using the queue's async info.
///
/// \param Queue       queue whose device performs the fill.
/// \param Ptr         destination pointer to start filling at.
/// \param PatternSize size of the pattern in bytes.
/// \param PatternPtr  pointer to the pattern data.
/// \param FillSize    number of bytes to fill.
/// \returns the Error produced by the device's dataFill.
Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize,
                     void *PatternPtr, size_t FillSize) {
  // NOTE(review): Queue is dereferenced unconditionally although the API
  // definition marks it PARAM_IN_OPTIONAL - confirm null is rejected earlier.
  auto &TargetDevice = *Queue->Device->Device;
  return TargetDevice.dataFill(Ptr, PatternPtr, PatternSize, FillSize,
                               Queue->AsyncInfo);
}
664+
659665
Error olCreateProgram_impl(ol_device_handle_t Device, const void *ProgData,
660666
size_t ProgDataSize, ol_program_handle_t *Program) {
661667
// Make a copy of the program binary in case it is released by the caller.

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2583,6 +2583,30 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
25832583
return Plugin::success();
25842584
}
25852585

2586+
/// Fill \p Size bytes starting at \p TgtPtr with repetitions of the
/// \p PatternSize byte pattern read from \p PatternPtr.
///
/// Only 4-byte patterns are currently supported; any other pattern size
/// yields an ErrorCode::UNSUPPORTED error.
///
/// \returns success once hsa_amd_memory_fill has completed, or the error
/// from queue synchronization / the HSA call.
Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
                   int64_t Size,
                   AsyncInfoWrapperTy &AsyncInfoWrapper) override {
  // TODO: Implement for AMDGPU. Most likely by doing the fill in pinned
  // memory and copying to the device in one go.
  if (PatternSize != 4)
    return Plugin::error(ErrorCode::UNSUPPORTED, "Unsupported fill size");

  // We can use hsa_amd_memory_fill for this size, but it's not async so the
  // queue needs to be synchronized first
  if (AsyncInfoWrapper.hasQueue())
    if (auto Err = synchronize(AsyncInfoWrapper))
      return Err;

  // hsa_amd_memory_fill takes the number of 32-bit elements, not bytes.
  hsa_status_t Status = hsa_amd_memory_fill(
      TgtPtr, *static_cast<const uint32_t *>(PatternPtr), Size / PatternSize);

  // Always return a value: the success path previously fell off the end of
  // the function.
  return Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n");
}
2609+
25862610
/// Initialize the async info for interoperability purposes.
25872611
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
25882612
// TODO: Implement this function.

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -957,6 +957,13 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
957957
void *DstPtr, int64_t Size,
958958
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
959959

960+
/// Fill data on the device with a pattern from the host
961+
Error dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
962+
int64_t Size, __tgt_async_info *AsyncInfo);
963+
virtual Error dataFillImpl(void *TgtPtr, const void *PatternPtr,
964+
int64_t PatternSize, int64_t Size,
965+
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
966+
960967
/// Run the kernel associated with \p EntryPtr
961968
Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
962969
KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1540,6 +1540,16 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
15401540
return Err;
15411541
}
15421542

1543+
/// Generic entry point for pattern fills: wraps \p AsyncInfo, runs the
/// device-specific dataFillImpl, and passes its result through the wrapper's
/// finalize before returning it.
Error GenericDeviceTy::dataFill(void *TgtPtr, const void *PatternPtr,
                                int64_t PatternSize, int64_t Size,
                                __tgt_async_info *AsyncInfo) {
  AsyncInfoWrapperTy Wrapper(*this, AsyncInfo);

  Error FillResult =
      dataFillImpl(TgtPtr, PatternPtr, PatternSize, Size, Wrapper);

  // The wrapper gets to see (and possibly update) the result first.
  Wrapper.finalize(FillResult);
  return FillResult;
}
1552+
15431553
Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
15441554
ptrdiff_t *ArgOffsets,
15451555
KernelArgsTy &KernelArgs,

offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,13 @@ DLWRAP(cuMemcpyDtoHAsync, 4)
5353
DLWRAP(cuMemcpyHtoD, 3)
5454
DLWRAP(cuMemcpyHtoDAsync, 4)
5555

56+
// Asynchronous linear memsets (8/16/32-bit element), 4 arguments each.
DLWRAP(cuMemsetD8Async, 4)
DLWRAP(cuMemsetD16Async, 4)
DLWRAP(cuMemsetD32Async, 4)

// Asynchronous pitched 2D memsets (8/16/32-bit element), 6 arguments each.
DLWRAP(cuMemsetD2D8Async, 6)
DLWRAP(cuMemsetD2D16Async, 6)
DLWRAP(cuMemsetD2D32Async, 6)
62+
5663
DLWRAP(cuMemFree, 1)
5764
DLWRAP(cuMemFreeHost, 1)
5865
DLWRAP(cuMemFreeAsync, 2)

offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,16 @@ CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
321321
CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
322322
CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
323323

324+
// Asynchronous memset entry points. The value parameter type matches the
// element width used by the CUDA driver API (unsigned char / unsigned short /
// unsigned int) so these prototypes agree with the real cuda.h.
CUresult cuMemsetD8Async(CUdeviceptr, unsigned char, size_t, CUstream);
CUresult cuMemsetD16Async(CUdeviceptr, unsigned short, size_t, CUstream);
CUresult cuMemsetD32Async(CUdeviceptr, unsigned int, size_t, CUstream);

// Pitched 2D variants: (dst, pitch in bytes, value, width, height, stream).
CUresult cuMemsetD2D8Async(CUdeviceptr, size_t, unsigned char, size_t, size_t,
                           CUstream);
CUresult cuMemsetD2D16Async(CUdeviceptr, size_t, unsigned short, size_t,
                            size_t, CUstream);
CUresult cuMemsetD2D32Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
                            CUstream);
333+
324334
CUresult cuMemFree(CUdeviceptr);
325335
CUresult cuMemFreeHost(void *);
326336
CUresult cuMemFreeAsync(CUdeviceptr, CUstream);

offload/plugins-nextgen/cuda/src/rtl.cpp

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -844,6 +844,58 @@ struct CUDADeviceTy : public GenericDeviceTy {
844844
void *DstPtr, int64_t Size,
845845
AsyncInfoWrapperTy &AsyncInfoWrapper) override;
846846

847+
/// Fill \p Size bytes starting at \p TgtPtr with repetitions of the
/// \p PatternSize byte pattern read from \p PatternPtr, enqueued on the
/// stream obtained from \p AsyncInfoWrapper.
///
/// Patterns of 1, 2 or 4 bytes map onto a single cuMemsetD{8,16,32}Async
/// call. Larger patterns are written with one strided 2D memset per pattern
/// element.
///
/// NOTE(review): assumes PatternSize > 0 (it is used as a divisor) -
/// presumably validated by the caller; confirm.
///
/// \returns the first error reported by context/stream setup or by any of
/// the memset calls, success otherwise.
Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
                   int64_t Size,
                   AsyncInfoWrapperTy &AsyncInfoWrapper) override {
  if (auto Err = setContext())
    return Err;

  CUstream Stream;
  if (auto Err = getStream(AsyncInfoWrapper, Stream))
    return Err;

  const auto Dst = reinterpret_cast<CUdeviceptr>(TgtPtr);

  // Small patterns: one linear memset over Size / PatternSize elements.
  if (PatternSize == 1 || PatternSize == 2 || PatternSize == 4) {
    const size_t N = Size / PatternSize;
    CUresult Res;
    if (PatternSize == 1)
      Res = cuMemsetD8Async(Dst, *static_cast<const uint8_t *>(PatternPtr), N,
                            Stream);
    else if (PatternSize == 2)
      Res = cuMemsetD16Async(Dst, *static_cast<const uint16_t *>(PatternPtr),
                             N, Stream);
    else
      Res = cuMemsetD32Async(Dst, *static_cast<const uint32_t *>(PatternPtr),
                             N, Stream);
    return Plugin::check(Res, "error in cuMemset: %s");
  }

  // For larger patterns we can do a series of strided fills to copy the
  // pattern efficiently: pick the widest element that divides the pattern,
  // then for each element of the pattern write it into every repetition
  // using a 2D memset of width 1 whose pitch is the pattern size.
  const int64_t MemsetSize = PatternSize % 4 == 0   ? 4
                             : PatternSize % 2 == 0 ? 2
                                                    : 1;
  const int64_t NumberOfSteps = PatternSize / MemsetSize;
  const int64_t Pitch = NumberOfSteps * MemsetSize;
  const int64_t Height = Size / PatternSize;

  for (int64_t Step = 0; Step < NumberOfSteps; ++Step) {
    const CUdeviceptr StepDst = Dst + Step * MemsetSize;
    CUresult Res;
    if (MemsetSize == 4)
      Res = cuMemsetD2D32Async(StepDst, Pitch,
                               static_cast<const uint32_t *>(PatternPtr)[Step],
                               1u, Height, Stream);
    else if (MemsetSize == 2)
      Res = cuMemsetD2D16Async(StepDst, Pitch,
                               static_cast<const uint16_t *>(PatternPtr)[Step],
                               1u, Height, Stream);
    else
      Res = cuMemsetD2D8Async(StepDst, Pitch,
                              static_cast<const uint8_t *>(PatternPtr)[Step],
                              1u, Height, Stream);

    // Check every step: previously only the result of the last step was
    // inspected, so earlier failures were silently dropped.
    if (auto Err = Plugin::check(Res, "error in cuMemset: %s"))
      return Err;
  }

  return Plugin::success();
}
898+
847899
/// Initialize the async info for interoperability purposes.
848900
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
849901
if (auto Err = setContext())

offload/plugins-nextgen/host/src/rtl.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,22 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
302302
return Plugin::success();
303303
}
304304

305+
/// Fill \p Size bytes starting at \p TgtPtr with repetitions of the
/// \p PatternSize byte pattern read from \p PatternPtr. The fill is done
/// synchronously on the host; \p AsyncInfoWrapper is not used.
///
/// NOTE(review): assumes PatternSize > 0 and that Size is a multiple of
/// PatternSize - presumably validated by the caller; confirm. Any trailing
/// partial pattern is left untouched rather than written past Size.
///
/// \returns always success.
Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
                   int64_t Size,
                   AsyncInfoWrapperTy &AsyncInfoWrapper) override {
  // Single-byte patterns are exactly what memset does.
  if (PatternSize == 1) {
    std::memset(TgtPtr, *static_cast<const char *>(PatternPtr), Size);
    return Plugin::success();
  }

  // Copy the pattern into each consecutive slot. The destination must
  // advance with the offset (the copy used to always target TgtPtr), and the
  // offset is 64-bit so fills larger than 4 GiB do not wrap.
  char *Dst = static_cast<char *>(TgtPtr);
  for (int64_t Offset = 0; Offset + PatternSize <= Size; Offset += PatternSize)
    std::memcpy(Dst + Offset, PatternPtr, PatternSize);

  return Plugin::success();
}
320+
305321
/// All functions are already synchronous. No need to do anything on this
306322
/// synchronization function.
307323
Error synchronizeImpl(__tgt_async_info &AsyncInfo,

offload/unittests/OffloadAPI/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ add_offload_unittest("kernel"
2424

2525
# Sources that make up the "memory" offload unit test.
add_offload_unittest("memory"
    memory/olMemAlloc.cpp
    memory/olMemFill.cpp
    memory/olMemFree.cpp
    memory/olMemcpy.cpp
)
2930

0 commit comments

Comments
 (0)