Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ is something we are interested in expanding on.
| Recording an in-order queue preserves linear dependencies | Not implemented |
| Using `handler::parallel_for` in a graph node | Implemented |
| Using `handler::single_task` in a graph node | Implemented |
| Using `handler::memcpy` in a graph node | Implemented for USM, not implemented for buffer accessors |
| Using `handler::copy` in a graph node | Not implemented |
| Using `handler::memcpy` in a graph node | Implemented |
| Using `handler::copy` in a graph node | Implemented |
| Using `handler::host_task` in a graph node | Not implemented |
| Using `handler::fill` in a graph node | Implemented for USM, not implemented for buffer accessors |
| Using `handler::memset` in a graph node | Not implemented |
Expand Down
2 changes: 2 additions & 0 deletions sycl/include/sycl/detail/pi.def
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ _PI_API(piextCommandBufferRelease)
_PI_API(piextCommandBufferFinalize)
_PI_API(piextCommandBufferNDRangeKernel)
_PI_API(piextCommandBufferMemcpyUSM)
_PI_API(piextCommandBufferMemBufferCopy)
_PI_API(piextCommandBufferMemBufferCopyRect)
_PI_API(piextEnqueueCommandBuffer)
_PI_API(piPluginGetLastError)

Expand Down
44 changes: 44 additions & 0 deletions sycl/include/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -2183,6 +2183,50 @@ __SYCL_EXPORT pi_result piextCommandBufferMemcpyUSM(
const pi_ext_sync_point *sync_point_wait_list,
pi_ext_sync_point *sync_point);

/// API to append a mem buffer copy command to the command-buffer.
/// \param command_buffer The command-buffer to append onto.
/// \param src_buffer is the data to be copied
/// \param dst_buffer is the location the data will be copied
/// \param src_offset offset into \p src_buffer
/// \param dst_offset offset into \p dst_buffer
/// \param size is number of bytes to copy
/// \param num_sync_points_in_wait_list The number of sync points in the
/// provided wait list.
/// \param sync_point_wait_list A list of sync points that this command must
/// wait on.
/// \param sync_point The sync_point associated with this memory operation.
__SYCL_EXPORT pi_result piextCommandBufferMemBufferCopy(
pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer,
size_t src_offset, size_t dst_offset, size_t size,
pi_uint32 num_sync_points_in_wait_list,
const pi_ext_sync_point *sync_point_wait_list,
pi_ext_sync_point *sync_point);

/// API to append a rectangular mem buffer copy command to the command-buffer.
/// \param command_buffer The command-buffer to append onto.
/// \param src_buffer is the data to be copied
/// \param dst_buffer is the location the data will be copied
/// \param src_origin offset for the start of the region to copy in src_buffer
/// \param dst_origin offset for the start of the region to copy in dst_buffer
/// \param region The size of the region to be copied
/// \param src_row_pitch Row pitch for the src data
/// \param src_slice_pitch Slice pitch for the src data
/// \param dst_row_pitch Row pitch for the dst data
/// \param dst_slice_pitch Slice pitch for the dst data
/// \param num_sync_points_in_wait_list The number of sync points in the
/// provided wait list.
/// \param sync_point_wait_list A list of sync points that this command must
/// wait on.
/// \param sync_point The sync_point associated with this memory operation.
__SYCL_EXPORT pi_result piextCommandBufferMemBufferCopyRect(
pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer,
pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin,
pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch,
size_t dst_row_pitch, size_t dst_slice_pitch,
pi_uint32 num_sync_points_in_wait_list,
const pi_ext_sync_point *sync_point_wait_list,
pi_ext_sync_point *sync_point);

/// API to submit the command-buffer to queue for execution, returns an error if
/// command-buffer not finalized or another instance of same command-buffer
/// currently executing.
Expand Down
25 changes: 25 additions & 0 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5845,6 +5845,28 @@ pi_result cuda_piextCommandBufferMemcpyUSM(
return {};
}

pi_result cuda_piextCommandBufferMemBufferCopy(
pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer,
size_t src_offset, size_t dst_offset, size_t size,
pi_uint32 num_sync_points_in_wait_list,
const pi_ext_sync_point *sync_point_wait_list,
pi_ext_sync_point *sync_point) {
sycl::detail::pi::die("command-buffer API not implemented in CUDA backend");
return {};
}

pi_result cuda_piextCommandBufferMemBufferCopyRect(
pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer,
pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin,
pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch,
size_t dst_row_pitch, size_t dst_slice_pitch,
pi_uint32 num_sync_points_in_wait_list,
const pi_ext_sync_point *sync_point_wait_list,
pi_ext_sync_point *sync_point) {
sycl::detail::pi::die("command-buffer API not implemented in CUDA backend");
return {};
}

pi_result cuda_piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer,
pi_queue queue,
pi_uint32 num_events_in_wait_list,
Expand Down Expand Up @@ -6050,6 +6072,9 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
_PI_CL(piextCommandBufferRelease, cuda_piextCommandBufferRelease)
_PI_CL(piextCommandBufferNDRangeKernel, cuda_piextCommandBufferNDRangeKernel)
_PI_CL(piextCommandBufferMemcpyUSM, cuda_piextCommandBufferMemcpyUSM)
_PI_CL(piextCommandBufferMemBufferCopy, cuda_piextCommandBufferMemBufferCopy)
_PI_CL(piextCommandBufferMemBufferCopyRect,
cuda_piextCommandBufferMemBufferCopyRect)
_PI_CL(piextEnqueueCommandBuffer, cuda_piextEnqueueCommandBuffer)

_PI_CL(piextKernelSetArgMemObj, cuda_piextKernelSetArgMemObj)
Expand Down
20 changes: 20 additions & 0 deletions sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2136,6 +2136,26 @@ piextCommandBufferMemcpyUSM(pi_ext_command_buffer command_buffer, void *dst_ptr,
DIE_NO_IMPLEMENTATION;
}

pi_result piextCommandBufferMemBufferCopy(
pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer,
size_t src_offset, size_t dst_offset, size_t size,
pi_uint32 num_sync_points_in_wait_list,
const pi_ext_sync_point *sync_point_wait_list,
pi_ext_sync_point *sync_point) {
DIE_NO_IMPLEMENTATION;
}

pi_result piextCommandBufferMemBufferCopyRect(
pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer,
pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin,
pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch,
size_t dst_row_pitch, size_t dst_slice_pitch,
pi_uint32 num_sync_points_in_wait_list,
const pi_ext_sync_point *sync_point_wait_list,
pi_ext_sync_point *sync_point) {
DIE_NO_IMPLEMENTATION;
}

pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer,
pi_queue queue,
pi_uint32 num_events_in_wait_list,
Expand Down
25 changes: 25 additions & 0 deletions sycl/plugins/hip/pi_hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5593,6 +5593,28 @@ hip_piextCommandBufferMemcpyUSM(pi_ext_command_buffer command_buffer,
return {};
}

pi_result hip_piextCommandBufferMemBufferCopy(
pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer,
size_t src_offset, size_t dst_offset, size_t size,
pi_uint32 num_sync_points_in_wait_list,
const pi_ext_sync_point *sync_point_wait_list,
pi_ext_sync_point *sync_point) {
sycl::detail::pi::die("command-buffer API not implemented in HIP backend");
return {};
}

pi_result hip_piextCommandBufferMemBufferCopyRect(
pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer,
pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin,
pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch,
size_t dst_row_pitch, size_t dst_slice_pitch,
pi_uint32 num_sync_points_in_wait_list,
const pi_ext_sync_point *sync_point_wait_list,
pi_ext_sync_point *sync_point) {
sycl::detail::pi::die("command-buffer API not implemented in HIP backend");
return {};
}

pi_result hip_piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer,
pi_queue queue,
pi_uint32 num_events_in_wait_list,
Expand Down Expand Up @@ -5798,6 +5820,9 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
_PI_CL(piextCommandBufferRelease, hip_piextCommandBufferRelease)
_PI_CL(piextCommandBufferNDRangeKernel, hip_piextCommandBufferNDRangeKernel)
_PI_CL(piextCommandBufferMemcpyUSM, hip_piextCommandBufferMemcpyUSM)
_PI_CL(piextCommandBufferMemBufferCopy, hip_piextCommandBufferMemBufferCopy)
_PI_CL(piextCommandBufferMemBufferCopyRect,
hip_piextCommandBufferMemBufferCopyRect)
_PI_CL(piextEnqueueCommandBuffer, hip_piextEnqueueCommandBuffer)

_PI_CL(piextKernelSetArgMemObj, hip_piextKernelSetArgMemObj)
Expand Down
151 changes: 140 additions & 11 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9058,30 +9058,25 @@ pi_result piextCommandBufferNDRangeKernel(
return PI_SUCCESS;
}

pi_result piextCommandBufferMemcpyUSM(
pi_ext_command_buffer CommandBuffer, void *DstPtr, const void *SrcPtr,
// Helper function for common code when enqueuing memory operations to a command
// buffer.
static pi_result enqueueCommandBufferMemCopyHelper(
pi_ext_command_buffer CommandBuffer, void *Dst, const void *Src,
size_t Size, pi_uint32 NumSyncPointsInWaitList,
const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) {
if (!DstPtr) {
return PI_ERROR_INVALID_VALUE;
}

std::vector<ze_event_handle_t> ZeEventList;
pi_result Res = getEventsFromSyncPoints(CommandBuffer->SyncPoints,
NumSyncPointsInWaitList,
SyncPointWaitList, ZeEventList);
if (Res) {
return Res;
}

pi_event LaunchEvent;
Res = EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent);
if (Res)
return PI_ERROR_OUT_OF_HOST_MEMORY;

ZE_CALL(zeCommandListAppendMemoryCopy,
(CommandBuffer->ZeCommandList, DstPtr, SrcPtr, Size,
LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data()));
(CommandBuffer->ZeCommandList, Dst, Src, Size, LaunchEvent->ZeEvent,
ZeEventList.size(), ZeEventList.data()));

urPrint("calling zeCommandListAppendMemoryCopy() with"
" ZeEvent %#lx\n",
Expand All @@ -9093,6 +9088,140 @@ pi_result piextCommandBufferMemcpyUSM(
return PI_SUCCESS;
}

// Helper function for common code when enqueuing rectangular memory operations
// to a command buffer.
static pi_result enqueueCommandBufferMemCopyRectHelper(
pi_ext_command_buffer CommandBuffer, void *Dst, const void *Src,
pi_buff_rect_offset SrcOrigin, pi_buff_rect_offset DstOrigin,
pi_buff_rect_region Region, size_t SrcRowPitch, size_t DstRowPitch,
size_t SrcSlicePitch, size_t DstSlicePitch,
pi_uint32 NumSyncPointsInWaitList,
const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) {
PI_ASSERT(Region && SrcOrigin && DstOrigin, PI_ERROR_INVALID_VALUE);

uint32_t SrcOriginX = ur_cast<uint32_t>(SrcOrigin->x_bytes);
uint32_t SrcOriginY = ur_cast<uint32_t>(SrcOrigin->y_scalar);
uint32_t SrcOriginZ = ur_cast<uint32_t>(SrcOrigin->z_scalar);

uint32_t SrcPitch = SrcRowPitch;
if (SrcPitch == 0)
SrcPitch = ur_cast<uint32_t>(Region->width_bytes);

if (SrcSlicePitch == 0)
SrcSlicePitch = ur_cast<uint32_t>(Region->height_scalar) * SrcPitch;

uint32_t DstOriginX = ur_cast<uint32_t>(DstOrigin->x_bytes);
uint32_t DstOriginY = ur_cast<uint32_t>(DstOrigin->y_scalar);
uint32_t DstOriginZ = ur_cast<uint32_t>(DstOrigin->z_scalar);

uint32_t DstPitch = DstRowPitch;
if (DstPitch == 0)
DstPitch = ur_cast<uint32_t>(Region->width_bytes);

if (DstSlicePitch == 0)
DstSlicePitch = ur_cast<uint32_t>(Region->height_scalar) * DstPitch;

uint32_t Width = ur_cast<uint32_t>(Region->width_bytes);
uint32_t Height = ur_cast<uint32_t>(Region->height_scalar);
uint32_t Depth = ur_cast<uint32_t>(Region->depth_scalar);

const ze_copy_region_t ZeSrcRegion = {SrcOriginX, SrcOriginY, SrcOriginZ,
Width, Height, Depth};
const ze_copy_region_t ZeDstRegion = {DstOriginX, DstOriginY, DstOriginZ,
Width, Height, Depth};

std::vector<ze_event_handle_t> ZeEventList;
pi_result Res = getEventsFromSyncPoints(CommandBuffer->SyncPoints,
NumSyncPointsInWaitList,
SyncPointWaitList, ZeEventList);

pi_event LaunchEvent;
Res = EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent);
if (Res)
return PI_ERROR_OUT_OF_HOST_MEMORY;

ZE_CALL(zeCommandListAppendMemoryCopyRegion,
(CommandBuffer->ZeCommandList, Dst, &ZeDstRegion, DstPitch,
DstSlicePitch, Src, &ZeSrcRegion, SrcPitch, SrcSlicePitch,
LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data()));

urPrint("calling zeCommandListAppendMemoryCopyRegion() with"
" ZeEvent %#lx\n",
ur_cast<std::uintptr_t>(LaunchEvent->ZeEvent));

// Get sync point and register the event with it.
*SyncPoint = CommandBuffer->GetNextSyncPoint();
CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent);
return PI_SUCCESS;
}

pi_result piextCommandBufferMemcpyUSM(
pi_ext_command_buffer CommandBuffer, void *DstPtr, const void *SrcPtr,
size_t Size, pi_uint32 NumSyncPointsInWaitList,
const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) {
if (!DstPtr) {
return PI_ERROR_INVALID_VALUE;
}

return enqueueCommandBufferMemCopyHelper(CommandBuffer, DstPtr, SrcPtr, Size,
NumSyncPointsInWaitList,
SyncPointWaitList, SyncPoint);
}

pi_result piextCommandBufferMemBufferCopy(
pi_ext_command_buffer CommandBuffer, pi_mem SrcMem, pi_mem DstMem,
size_t SrcOffset, size_t DstOffset, size_t Size,
pi_uint32 NumSyncPointsInWaitList,
const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) {
PI_ASSERT(SrcMem && DstMem, PI_ERROR_INVALID_MEM_OBJECT);

auto SrcBuffer = ur_cast<pi_buffer>(SrcMem);
auto DstBuffer = ur_cast<pi_buffer>(DstMem);

std::shared_lock<ur_shared_mutex> SrcLock(SrcBuffer->Mutex, std::defer_lock);
std::scoped_lock<std::shared_lock<ur_shared_mutex>, ur_shared_mutex> LockAll(
SrcLock, DstBuffer->Mutex);

char *ZeHandleSrc;
PI_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, _pi_mem::read_only,
CommandBuffer->Device));
char *ZeHandleDst;
PI_CALL(DstBuffer->getZeHandle(ZeHandleDst, _pi_mem::write_only,
CommandBuffer->Device));

return enqueueCommandBufferMemCopyHelper(
CommandBuffer, ZeHandleDst, ZeHandleSrc, Size, NumSyncPointsInWaitList,
SyncPointWaitList, SyncPoint);
}

pi_result piextCommandBufferMemBufferCopyRect(
pi_ext_command_buffer CommandBuffer, pi_mem SrcMem, pi_mem DstMem,
pi_buff_rect_offset SrcOrigin, pi_buff_rect_offset DstOrigin,
pi_buff_rect_region Region, size_t SrcRowPitch, size_t SrcSlicePitch,
size_t DstRowPitch, size_t DstSlicePitch, pi_uint32 NumSyncPointsInWaitList,
const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) {
PI_ASSERT(SrcMem && DstMem, PI_ERROR_INVALID_MEM_OBJECT);

auto SrcBuffer = ur_cast<pi_buffer>(SrcMem);
auto DstBuffer = ur_cast<pi_buffer>(DstMem);

std::shared_lock<ur_shared_mutex> SrcLock(SrcBuffer->Mutex, std::defer_lock);
std::scoped_lock<std::shared_lock<ur_shared_mutex>, ur_shared_mutex> LockAll(
SrcLock, DstBuffer->Mutex);

char *ZeHandleSrc;
PI_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, _pi_mem::read_only,
CommandBuffer->Device));
char *ZeHandleDst;
PI_CALL(DstBuffer->getZeHandle(ZeHandleDst, _pi_mem::write_only,
CommandBuffer->Device));

return enqueueCommandBufferMemCopyRectHelper(
CommandBuffer, ZeHandleDst, ZeHandleSrc, SrcOrigin, DstOrigin, Region,
SrcRowPitch, DstRowPitch, SrcSlicePitch, DstSlicePitch,
NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint);
}

pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer,
pi_queue Queue,
pi_uint32 NumEventsInWaitList,
Expand Down
25 changes: 25 additions & 0 deletions sycl/plugins/opencl/pi_opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2320,6 +2320,28 @@ piextCommandBufferMemcpyUSM(pi_ext_command_buffer command_buffer, void *dst_ptr,
return {};
}

pi_result piextCommandBufferMemBufferCopy(
pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer,
size_t src_offset, size_t dst_offset, size_t size,
pi_uint32 num_sync_points_in_wait_list,
const pi_ext_sync_point *sync_point_wait_list,
pi_ext_sync_point *sync_point) {
// Not implemented
return {};
}

pi_result piextCommandBufferMemBufferCopyRect(
pi_ext_command_buffer command_buffer, pi_mem src_buffer, pi_mem dst_buffer,
pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin,
pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch,
size_t dst_row_pitch, size_t dst_slice_pitch,
pi_uint32 num_sync_points_in_wait_list,
const pi_ext_sync_point *sync_point_wait_list,
pi_ext_sync_point *sync_point) {
// Not implemented
return {};
}

pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer,
pi_queue queue,
pi_uint32 num_events_in_wait_list,
Expand Down Expand Up @@ -2530,6 +2552,9 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
_PI_CL(piextCommandBufferRelease, piextCommandBufferRelease)
_PI_CL(piextCommandBufferNDRangeKernel, piextCommandBufferNDRangeKernel)
_PI_CL(piextCommandBufferMemcpyUSM, piextCommandBufferMemcpyUSM)
_PI_CL(piextCommandBufferMemBufferCopy, piextCommandBufferMemBufferCopy)
_PI_CL(piextCommandBufferMemBufferCopyRect,
piextCommandBufferMemBufferCopyRect)
_PI_CL(piextEnqueueCommandBuffer, piextEnqueueCommandBuffer)

_PI_CL(piextKernelSetArgMemObj, piextKernelSetArgMemObj)
Expand Down
Loading