From f96bdecd8c13f0cf4f1beb673c83c8e26e2d2de5 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Thu, 7 Aug 2025 13:12:55 +0100 Subject: [PATCH 1/4] [Offload] Implement olMemFill --- offload/liboffload/API/Memory.td | 20 +++ offload/liboffload/src/OffloadImpl.cpp | 6 + offload/plugins-nextgen/amdgpu/src/rtl.cpp | 24 ++++ .../common/include/PluginInterface.h | 7 + .../common/src/PluginInterface.cpp | 10 ++ .../cuda/dynamic_cuda/cuda.cpp | 7 + .../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 10 ++ offload/plugins-nextgen/cuda/src/rtl.cpp | 52 +++++++ offload/plugins-nextgen/host/src/rtl.cpp | 16 +++ offload/unittests/OffloadAPI/CMakeLists.txt | 1 + .../unittests/OffloadAPI/memory/olMemFill.cpp | 134 ++++++++++++++++++ 11 files changed, 287 insertions(+) create mode 100644 offload/unittests/OffloadAPI/memory/olMemFill.cpp diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td index 5f7158588bc77..82f942b2e06c5 100644 --- a/offload/liboffload/API/Memory.td +++ b/offload/liboffload/API/Memory.td @@ -63,3 +63,23 @@ def : Function { ]; let returns = []; } + +def : Function { + let name = "olMemFill"; + let desc = "Fill memory with copies of the given pattern"; + let details = [ + "Filling with patterns larger than 4 bytes may be less performant", + "The destination pointer and queue must be associated with the same device", + "The fill size must be a multiple of the pattern size", + ]; + let params = [ + Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>, + Param<"void*", "Ptr", "destination pointer to start filling at", PARAM_IN>, + Param<"size_t", "PatternSize", "the size of the pattern in bytes", PARAM_IN>, + Param<"void*", "PatternPtr", "", PARAM_IN>, + Param<"size_t", "FillSize", "number of bytes to fill", PARAM_IN>, + ]; + let returns = [ + Return<"OL_ERRC_INVALID_SIZE", ["`FillSize % PatternSize != 0`"]> + ]; +} diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 
1c9dfc69d445a..0b6363ab6ffbf 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -656,6 +656,12 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr, return Error::success(); } +Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize, + void *PatternPtr, size_t FillSize) { + return Queue->Device->Device->dataFill(Ptr, PatternPtr, PatternSize, FillSize, + Queue->AsyncInfo); +} + Error olCreateProgram_impl(ol_device_handle_t Device, const void *ProgData, size_t ProgDataSize, ol_program_handle_t *Program) { // Make a copy of the program binary in case it is released by the caller. diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 83280fe0a49c9..949f23278277c 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2583,6 +2583,31 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Plugin::success(); } + Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, + int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + hsa_status_t Status; + + // We can use hsa_amd_memory_fill for this size, but it's not async so the + // queue needs to be synchronized first + if (PatternSize == 4) { + if (AsyncInfoWrapper.hasQueue()) + if (auto Err = synchronize(AsyncInfoWrapper)) + return Err; + Status = hsa_amd_memory_fill(TgtPtr, *(uint32_t *)(PatternPtr), + Size / PatternSize); + + if (auto Err = + Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n")) + return Err; + } else { + // TODO: Implement for AMDGPU. Most likely by doing the fill in pinned + // memory and copying to the device in one go. + return Plugin::error(ErrorCode::UNSUPPORTED, "Unsupported fill size"); + } + return Plugin::success(); + } + /// Initialize the async info for interoperability purposes.
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { // TODO: Implement this function. diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index a448721755a6f..b2145979ae599 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -957,6 +957,13 @@ struct GenericDeviceTy : public DeviceAllocatorTy { void *DstPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + /// Fill data on the device with a pattern from the host + Error dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, + int64_t Size, __tgt_async_info *AsyncInfo); + virtual Error dataFillImpl(void *TgtPtr, const void *PatternPtr, + int64_t PatternSize, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + /// Run the kernel associated with \p EntryPtr Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo); diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index c06c35e1e6a5b..fb672c9782a8b 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -1540,6 +1540,16 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, return Err; } +Error GenericDeviceTy::dataFill(void *TgtPtr, const void *PatternPtr, + int64_t PatternSize, int64_t Size, + __tgt_async_info *AsyncInfo) { + AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); + auto Err = + dataFillImpl(TgtPtr, PatternPtr, PatternSize, Size, AsyncInfoWrapper); + AsyncInfoWrapper.finalize(Err); + return Err; +} + Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs, diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp 
b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp index 361a781e8f9b6..e8da25bc1d155 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -53,6 +53,13 @@ DLWRAP(cuMemcpyDtoHAsync, 4) DLWRAP(cuMemcpyHtoD, 3) DLWRAP(cuMemcpyHtoDAsync, 4) +DLWRAP(cuMemsetD8Async, 4) +DLWRAP(cuMemsetD16Async, 4) +DLWRAP(cuMemsetD32Async, 4) +DLWRAP(cuMemsetD2D8Async, 6) +DLWRAP(cuMemsetD2D16Async, 6) +DLWRAP(cuMemsetD2D32Async, 6) + DLWRAP(cuMemFree, 1) DLWRAP(cuMemFreeHost, 1) DLWRAP(cuMemFreeAsync, 2) diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h index b6c022c8e7e8b..93496d95327f3 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -321,6 +321,16 @@ CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream); CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t); CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream); +CUresult cuMemsetD8Async(CUdeviceptr, unsigned int, size_t, CUstream); +CUresult cuMemsetD16Async(CUdeviceptr, unsigned int, size_t, CUstream); +CUresult cuMemsetD32Async(CUdeviceptr, unsigned int, size_t, CUstream); +CUresult cuMemsetD2D8Async(CUdeviceptr, size_t, unsigned int, size_t, size_t, + CUstream); +CUresult cuMemsetD2D16Async(CUdeviceptr, size_t, unsigned int, size_t, size_t, + CUstream); +CUresult cuMemsetD2D32Async(CUdeviceptr, size_t, unsigned int, size_t, size_t, + CUstream); + CUresult cuMemFree(CUdeviceptr); CUresult cuMemFreeHost(void *); CUresult cuMemFreeAsync(CUdeviceptr, CUstream); diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index a99357a3adeaa..70020a6581a5c 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -844,6 +844,58 @@ struct CUDADeviceTy : public GenericDeviceTy { void *DstPtr, int64_t Size, 
AsyncInfoWrapperTy &AsyncInfoWrapper) override; + Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, + int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + if (auto Err = setContext()) + return Err; + + CUstream Stream; + if (auto Err = getStream(AsyncInfoWrapper, Stream)) + return Err; + + CUresult Res; + size_t N = Size / PatternSize; + if (PatternSize == 1) { + Res = cuMemsetD8Async((CUdeviceptr)TgtPtr, *((const uint8_t *)PatternPtr), + N, Stream); + } else if (PatternSize == 2) { + Res = cuMemsetD16Async((CUdeviceptr)TgtPtr, + *((const uint16_t *)PatternPtr), N, Stream); + } else if (PatternSize == 4) { + Res = cuMemsetD32Async((CUdeviceptr)TgtPtr, + *((const uint32_t *)PatternPtr), N, Stream); + } else { + // For larger patterns we can do a series of strided fills to copy the + // pattern efficiently + int64_t MemsetSize = PatternSize % 4u == 0u ? 4u + : PatternSize % 2u == 0u ? 2u + : 1u; + + int64_t NumberOfSteps = PatternSize / MemsetSize; + int64_t Pitch = NumberOfSteps * MemsetSize; + int64_t Height = Size / PatternSize; + + for (auto Step = 0u; Step < NumberOfSteps; ++Step) { + if (MemsetSize == 4) { + Res = cuMemsetD2D32Async( + (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch, + *((const uint32_t *)PatternPtr + Step), 1u, Height, Stream); + } else if (MemsetSize == 2) { + Res = cuMemsetD2D16Async( + (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch, + *((const uint16_t *)PatternPtr + Step), 1u, Height, Stream); + } else { + Res = cuMemsetD2D8Async((CUdeviceptr)TgtPtr + Step * MemsetSize, + Pitch, *((const uint8_t *)PatternPtr + Step), + 1u, Height, Stream); + } + } + } + + return Plugin::check(Res, "error in cuMemset: %s"); + } + /// Initialize the async info for interoperability purposes. 
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { if (auto Err = setContext()) diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index 25443fd1ac0b3..0286fe216b2dc 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -302,6 +302,22 @@ struct GenELF64DeviceTy : public GenericDeviceTy { return Plugin::success(); } + Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize, + int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + if (PatternSize == 1) { + std::memset(TgtPtr, *static_cast(PatternPtr), Size); + } else { + for (unsigned int Step = 0; Step < Size; Step += PatternSize) { + auto *Dst = + reinterpret_cast(reinterpret_cast(TgtPtr) + Step); + std::memcpy(TgtPtr, PatternPtr, PatternSize); + } + } + + return Plugin::success(); + } + /// All functions are already synchronous. No need to do anything on this /// synchronization function. Error synchronizeImpl(__tgt_async_info &AsyncInfo, diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt index b25db7022e9d7..58c9b89d1ed0d 100644 --- a/offload/unittests/OffloadAPI/CMakeLists.txt +++ b/offload/unittests/OffloadAPI/CMakeLists.txt @@ -24,6 +24,7 @@ add_offload_unittest("kernel" add_offload_unittest("memory" memory/olMemAlloc.cpp + memory/olMemFill.cpp memory/olMemFree.cpp memory/olMemcpy.cpp) diff --git a/offload/unittests/OffloadAPI/memory/olMemFill.cpp b/offload/unittests/OffloadAPI/memory/olMemFill.cpp new file mode 100644 index 0000000000000..1b0bafa202080 --- /dev/null +++ b/offload/unittests/OffloadAPI/memory/olMemFill.cpp @@ -0,0 +1,134 @@ +//===------- Offload API tests - olMemFill --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../common/Fixtures.hpp" +#include <OffloadAPI.h> +#include <gtest/gtest.h> + +using olMemFillTest = OffloadQueueTest; +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemFillTest); + +TEST_P(olMemFillTest, Success8) { + constexpr size_t Size = 1024; + void *Alloc; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + uint8_t Pattern = 0x42; + ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + olSyncQueue(Queue); + + size_t N = Size / sizeof(Pattern); + for (size_t i = 0; i < N; i++) { + uint8_t *AllocPtr = reinterpret_cast<uint8_t *>(Alloc); + ASSERT_EQ(AllocPtr[i], Pattern); + } + + olMemFree(Alloc); +} + +TEST_P(olMemFillTest, Success16) { + constexpr size_t Size = 1024; + void *Alloc; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + uint16_t Pattern = 0x4242; + ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + olSyncQueue(Queue); + + size_t N = Size / sizeof(Pattern); + for (size_t i = 0; i < N; i++) { + uint16_t *AllocPtr = reinterpret_cast<uint16_t *>(Alloc); + ASSERT_EQ(AllocPtr[i], Pattern); + } + + olMemFree(Alloc); +} + +TEST_P(olMemFillTest, Success32) { + constexpr size_t Size = 1024; + void *Alloc; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + uint32_t Pattern = 0xDEADBEEF; + ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + olSyncQueue(Queue); + + size_t N = Size / sizeof(Pattern); + for (size_t i = 0; i < N; i++) { + uint32_t *AllocPtr = reinterpret_cast<uint32_t *>(Alloc); + ASSERT_EQ(AllocPtr[i], Pattern); + } + + olMemFree(Alloc); +} + +TEST_P(olMemFillTest, SuccessLarge) { + constexpr size_t Size = 1024; + void *Alloc; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + struct PatternT { + uint64_t A; + uint64_t B; + } Pattern{UINT64_MAX, UINT64_MAX}; + + 
ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + olSyncQueue(Queue); + + size_t N = Size / sizeof(Pattern); + for (size_t i = 0; i < N; i++) { + PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc); + ASSERT_EQ(AllocPtr[i].A, UINT64_MAX); + ASSERT_EQ(AllocPtr[i].B, UINT64_MAX); + } + + olMemFree(Alloc); +} + +TEST_P(olMemFillTest, SuccessLargeByteAligned) { + constexpr size_t Size = 17 * 64; + void *Alloc; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + struct __attribute__((packed)) PatternT { + uint64_t A; + uint64_t B; + uint8_t C; + } Pattern{UINT64_MAX, UINT64_MAX, 255}; + + ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + olSyncQueue(Queue); + + size_t N = Size / sizeof(Pattern); + for (size_t i = 0; i < N; i++) { + PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc); + ASSERT_EQ(AllocPtr[i].A, UINT64_MAX); + ASSERT_EQ(AllocPtr[i].B, UINT64_MAX); + ASSERT_EQ(AllocPtr[i].C, 255); + } + + olMemFree(Alloc); +} + +TEST_P(olMemFillTest, InvalidPatternSize) { + constexpr size_t Size = 1025; + void *Alloc; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc)); + + uint16_t Pattern = 0x4242; + ASSERT_ERROR(OL_ERRC_INVALID_SIZE, + olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size)); + + olSyncQueue(Queue); + olMemFree(Alloc); +} From 32e28c28c6e2ca5aa90fc6f5db7297026b25424b Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Mon, 18 Aug 2025 15:54:06 +0100 Subject: [PATCH 2/4] Fix host memfill --- offload/plugins-nextgen/host/src/rtl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index 0286fe216b2dc..e1d85e172d02b 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -311,7 +311,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy { for (unsigned int Step = 0; Step < Size; Step += PatternSize) { auto *Dst =
reinterpret_cast(reinterpret_cast(TgtPtr) + Step); - std::memcpy(TgtPtr, PatternPtr, PatternSize); + std::memcpy(Dst, PatternPtr, PatternSize); } } From c8f86c7a8880d7ddf0dd7c45ab10a52116dc8bfb Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Mon, 18 Aug 2025 16:09:19 +0100 Subject: [PATCH 3/4] Remove C style casts --- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 3 ++- offload/plugins-nextgen/cuda/src/rtl.cpp | 24 ++++++++++++++-------- offload/plugins-nextgen/host/src/rtl.cpp | 3 +-- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 949f23278277c..964d5e5c263a7 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2594,7 +2594,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (AsyncInfoWrapper.hasQueue()) if (auto Err = synchronize(AsyncInfoWrapper)) return Err; - Status = hsa_amd_memory_fill(TgtPtr, *(uint32_t *)(PatternPtr), + Status = hsa_amd_memory_fill(TgtPtr, + *static_cast<const uint32_t *>(PatternPtr), Size / PatternSize); if (auto Err = diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index 70020a6581a5c..c905cfa9e65d6 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -857,14 +857,17 @@ struct CUDADeviceTy : public GenericDeviceTy { CUresult Res; size_t N = Size / PatternSize; if (PatternSize == 1) { - Res = cuMemsetD8Async((CUdeviceptr)TgtPtr, *((const uint8_t *)PatternPtr), - N, Stream); + Res = cuMemsetD8Async((CUdeviceptr)TgtPtr, + *(static_cast<const uint8_t *>(PatternPtr)), N, + Stream); } else if (PatternSize == 2) { Res = cuMemsetD16Async((CUdeviceptr)TgtPtr, - *((const uint16_t *)PatternPtr), N, Stream); + *(static_cast<const uint16_t *>(PatternPtr)), N, + Stream); } else if (PatternSize == 4) { Res = cuMemsetD32Async((CUdeviceptr)TgtPtr, - *((const uint32_t *)PatternPtr), N, Stream); + 
*(static_cast<const uint32_t *>(PatternPtr)), N, + Stream); } else { // For larger patterns we can do a series of strided fills to copy the // pattern efficiently @@ -880,15 +883,18 @@ struct CUDADeviceTy : public GenericDeviceTy { if (MemsetSize == 4) { Res = cuMemsetD2D32Async( (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch, - *((const uint32_t *)PatternPtr + Step), 1u, Height, Stream); + *(static_cast<const uint32_t *>(PatternPtr) + Step), 1u, Height, + Stream); } else if (MemsetSize == 2) { Res = cuMemsetD2D16Async( (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch, - *((const uint16_t *)PatternPtr + Step), 1u, Height, Stream); + *(static_cast<const uint16_t *>(PatternPtr) + Step), 1u, Height, + Stream); } else { - Res = cuMemsetD2D8Async((CUdeviceptr)TgtPtr + Step * MemsetSize, - Pitch, *((const uint8_t *)PatternPtr + Step), - 1u, Height, Stream); + Res = cuMemsetD2D8Async( + (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch, + *(static_cast<const uint8_t *>(PatternPtr) + Step), 1u, Height, + Stream); } } } diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index e1d85e172d02b..35fd45458f893 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -309,8 +309,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy { std::memset(TgtPtr, *static_cast(PatternPtr), Size); } else { for (unsigned int Step = 0; Step < Size; Step += PatternSize) { - auto *Dst = - reinterpret_cast(reinterpret_cast(TgtPtr) + Step); + auto *Dst = static_cast<char *>(TgtPtr) + Step; std::memcpy(Dst, PatternPtr, PatternSize); } } From 854f1b04d05a595757e436bbd5700ad51eeaf9ca Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 22 Aug 2025 14:11:19 +0100 Subject: [PATCH 4/4] Make PatternPtr const --- offload/liboffload/API/Memory.td | 2 +- offload/liboffload/src/OffloadImpl.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td index 82f942b2e06c5..7f14004b43ba8 100644 ---
a/offload/liboffload/API/Memory.td +++ b/offload/liboffload/API/Memory.td @@ -76,7 +76,7 @@ def : Function { Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>, Param<"void*", "Ptr", "destination pointer to start filling at", PARAM_IN>, Param<"size_t", "PatternSize", "the size of the pattern in bytes", PARAM_IN>, - Param<"void*", "PatternPtr", "", PARAM_IN>, + Param<"const void*", "PatternPtr", "pointer to the pattern to fill the destination with", PARAM_IN>, Param<"size_t", "FillSize", "number of bytes to fill", PARAM_IN>, ]; let returns = [ diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 0b6363ab6ffbf..707bda9bf6f11 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -657,7 +657,7 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr, } Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize, - void *PatternPtr, size_t FillSize) { + const void *PatternPtr, size_t FillSize) { return Queue->Device->Device->dataFill(Ptr, PatternPtr, PatternSize, FillSize, Queue->AsyncInfo); }