Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sycl/include/sycl/detail/pi.def
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ _PI_API(piextCommandBufferRetain)
_PI_API(piextCommandBufferRelease)
_PI_API(piextCommandBufferFinalize)
_PI_API(piextCommandBufferNDRangeKernel)
_PI_API(piextCommandBufferMemcpyUSM)
_PI_API(piextEnqueueCommandBuffer)
_PI_API(piPluginGetLastError)

Expand Down
18 changes: 17 additions & 1 deletion sycl/include/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -2157,7 +2157,7 @@ piextCommandBufferFinalize(pi_ext_command_buffer command_buffer);
/// \param local_work_size Local work size to use when executing kernel.
/// \param num_sync_points_in_wait_list The number of sync points in the
/// provided wait list.
/// \param sync_point_wait_list A list of sync points that this executions must
/// \param sync_point_wait_list A list of sync points that this command must
/// wait on.
/// \param sync_point The sync_point associated with this kernel execution.
__SYCL_EXPORT pi_result piextCommandBufferNDRangeKernel(
Expand All @@ -2167,6 +2167,22 @@ __SYCL_EXPORT pi_result piextCommandBufferNDRangeKernel(
const pi_ext_sync_point *sync_point_wait_list,
pi_ext_sync_point *sync_point);

/// API to append a USM memcpy command to the command-buffer.
/// \param command_buffer The command-buffer to append onto.
/// \param dst_ptr is the location the data will be copied
/// \param src_ptr is the data to be copied
/// \param size is number of bytes to copy
/// \param num_sync_points_in_wait_list The number of sync points in the
/// provided wait list.
/// \param sync_point_wait_list A list of sync points that this command must
/// wait on.
/// \param sync_point The sync_point associated with this memory operation.
__SYCL_EXPORT pi_result piextCommandBufferMemcpyUSM(
pi_ext_command_buffer command_buffer, void *dst_ptr, const void *src_ptr,
size_t size, pi_uint32 num_sync_points_in_wait_list,
const pi_ext_sync_point *sync_point_wait_list,
pi_ext_sync_point *sync_point);

/// API to submit the command-buffer to queue for execution, returns an error if
/// command-buffer not finalized or another instance of same command-buffer
/// currently executing.
Expand Down
10 changes: 10 additions & 0 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5836,6 +5836,15 @@ pi_result cuda_piextCommandBufferNDRangeKernel(
return {};
}

pi_result cuda_piextCommandBufferMemcpyUSM(
pi_ext_command_buffer command_buffer, void *dst_ptr, const void *src_ptr,
size_t size, pi_uint32 num_sync_points_in_wait_list,
const pi_ext_sync_point *sync_point_wait_list,
pi_ext_sync_point *sync_point) {
sycl::detail::pi::die("command-buffer API not implemented in CUDA backend");
return {};
}

pi_result cuda_piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer,
pi_queue queue,
pi_uint32 num_events_in_wait_list,
Expand Down Expand Up @@ -6040,6 +6049,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
_PI_CL(piextCommandBufferRetain, cuda_piextCommandBufferRetain)
_PI_CL(piextCommandBufferRelease, cuda_piextCommandBufferRelease)
_PI_CL(piextCommandBufferNDRangeKernel, cuda_piextCommandBufferNDRangeKernel)
_PI_CL(piextCommandBufferMemcpyUSM, cuda_piextCommandBufferMemcpyUSM)
_PI_CL(piextEnqueueCommandBuffer, cuda_piextEnqueueCommandBuffer)

_PI_CL(piextKernelSetArgMemObj, cuda_piextKernelSetArgMemObj)
Expand Down
9 changes: 9 additions & 0 deletions sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2127,6 +2127,15 @@ pi_result piextCommandBufferNDRangeKernel(
DIE_NO_IMPLEMENTATION;
}

pi_result
piextCommandBufferMemcpyUSM(pi_ext_command_buffer command_buffer, void *dst_ptr,
const void *src_ptr, size_t size,
pi_uint32 num_sync_points_in_wait_list,
const pi_ext_sync_point *sync_point_wait_list,
pi_ext_sync_point *sync_point) {
DIE_NO_IMPLEMENTATION;
}

pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer,
pi_queue queue,
pi_uint32 num_events_in_wait_list,
Expand Down
11 changes: 11 additions & 0 deletions sycl/plugins/hip/pi_hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5583,6 +5583,16 @@ pi_result hip_piextCommandBufferNDRangeKernel(
return {};
}

pi_result
hip_piextCommandBufferMemcpyUSM(pi_ext_command_buffer command_buffer,
void *dst_ptr, const void *src_ptr, size_t size,
pi_uint32 num_sync_points_in_wait_list,
const pi_ext_sync_point *sync_point_wait_list,
pi_ext_sync_point *sync_point) {
sycl::detail::pi::die("command-buffer API not implemented in HIP backend");
return {};
}

pi_result hip_piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer,
pi_queue queue,
pi_uint32 num_events_in_wait_list,
Expand Down Expand Up @@ -5787,6 +5797,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
_PI_CL(piextCommandBufferRetain, hip_piextCommandBufferRetain)
_PI_CL(piextCommandBufferRelease, hip_piextCommandBufferRelease)
_PI_CL(piextCommandBufferNDRangeKernel, hip_piextCommandBufferNDRangeKernel)
_PI_CL(piextCommandBufferMemcpyUSM, hip_piextCommandBufferMemcpyUSM)
_PI_CL(piextEnqueueCommandBuffer, hip_piextEnqueueCommandBuffer)

_PI_CL(piextKernelSetArgMemObj, hip_piextKernelSetArgMemObj)
Expand Down
71 changes: 60 additions & 11 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8831,6 +8831,23 @@ pi_result _pi_buffer::free() {

/// command-buffer Extension

/// Helper function to take a list of pi_ext_sync_points and fill the provided
/// vector with the associated ZeEvents
static pi_result getEventsFromSyncPoints(
const std::unordered_map<pi_ext_sync_point, pi_event> &SyncPoints,
size_t NumSyncPointsInWaitList, const pi_ext_sync_point *SyncPointWaitList,
std::vector<ze_event_handle_t> &ZeEventList) {
for (size_t i = 0; i < NumSyncPointsInWaitList; i++) {
if (auto EventHandle = SyncPoints.find(SyncPointWaitList[i]);
EventHandle != SyncPoints.end()) {
ZeEventList.push_back(EventHandle->second->ZeEvent);
} else {
return PI_ERROR_INVALID_VALUE;
}
}
return PI_SUCCESS;
}

pi_result piextCommandBufferCreate(pi_context Context, pi_device Device,
const pi_ext_command_buffer_desc *Desc,
pi_ext_command_buffer *RetCommandBuffer) {
Expand Down Expand Up @@ -8935,19 +8952,16 @@ pi_result piextCommandBufferNDRangeKernel(

ZE_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2]));

std::vector<ze_event_handle_t> ZeEventList(NumSyncPointsInWaitList);
for (size_t i = 0; i < NumSyncPointsInWaitList; i++) {
if (auto EventHandle = CommandBuffer->SyncPoints.find(SyncPointWaitList[i]);
EventHandle != CommandBuffer->SyncPoints.end()) {
ZeEventList[i] = CommandBuffer->SyncPoints[SyncPointWaitList[i]]->ZeEvent;
} else {
return PI_ERROR_INVALID_VALUE;
}
std::vector<ze_event_handle_t> ZeEventList;
pi_result Res = getEventsFromSyncPoints(CommandBuffer->SyncPoints,
NumSyncPointsInWaitList,
SyncPointWaitList, ZeEventList);
if (Res) {
return Res;
}

pi_event LaunchEvent;
auto res = EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent);
if (res)
Res = EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent);
if (Res)
return PI_ERROR_OUT_OF_HOST_MEMORY;

LaunchEvent->CommandData = (void *)Kernel;
Expand All @@ -8972,6 +8986,41 @@ pi_result piextCommandBufferNDRangeKernel(
return PI_SUCCESS;
}

pi_result piextCommandBufferMemcpyUSM(
pi_ext_command_buffer CommandBuffer, void *DstPtr, const void *SrcPtr,
size_t Size, pi_uint32 NumSyncPointsInWaitList,
const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) {
if (!DstPtr) {
return PI_ERROR_INVALID_VALUE;
}

std::vector<ze_event_handle_t> ZeEventList;
pi_result Res = getEventsFromSyncPoints(CommandBuffer->SyncPoints,
NumSyncPointsInWaitList,
SyncPointWaitList, ZeEventList);
if (Res) {
return Res;
}

pi_event LaunchEvent;
Res = EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent);
if (Res)
return PI_ERROR_OUT_OF_HOST_MEMORY;

ZE_CALL(zeCommandListAppendMemoryCopy,
(CommandBuffer->ZeCommandList, DstPtr, SrcPtr, Size,
LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data()));

urPrint("calling zeCommandListAppendMemoryCopy() with"
" ZeEvent %#lx\n",
ur_cast<std::uintptr_t>(LaunchEvent->ZeEvent));

// Get sync point and register the event with it.
*SyncPoint = CommandBuffer->GetNextSyncPoint();
CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent);
return PI_SUCCESS;
}

pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer,
pi_queue Queue,
pi_uint32 NumEventsInWaitList,
Expand Down
11 changes: 11 additions & 0 deletions sycl/plugins/opencl/pi_opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2300,6 +2300,16 @@ pi_result piextCommandBufferNDRangeKernel(
return {};
}

pi_result
piextCommandBufferMemcpyUSM(pi_ext_command_buffer command_buffer, void *dst_ptr,
const void *src_ptr, size_t size,
pi_uint32 num_sync_points_in_wait_list,
const pi_ext_sync_point *sync_point_wait_list,
pi_ext_sync_point *sync_point) {
// Not implemented
return {};
}

pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer,
pi_queue queue,
pi_uint32 num_events_in_wait_list,
Expand Down Expand Up @@ -2509,6 +2519,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
_PI_CL(piextCommandBufferRetain, piextCommandBufferRetain)
_PI_CL(piextCommandBufferRelease, piextCommandBufferRelease)
_PI_CL(piextCommandBufferNDRangeKernel, piextCommandBufferNDRangeKernel)
_PI_CL(piextCommandBufferMemcpyUSM, piextCommandBufferMemcpyUSM)
_PI_CL(piextEnqueueCommandBuffer, piextEnqueueCommandBuffer)

_PI_CL(piextKernelSetArgMemObj, piextKernelSetArgMemObj)
Expand Down
15 changes: 15 additions & 0 deletions sycl/source/detail/memory_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1223,6 +1223,21 @@ void MemoryManager::copy_from_device_global(
DepEvents, OutEvent);
}

// Command buffer methods
void MemoryManager::ext_oneapi_copy_usm_cmd_buffer(
ContextImplPtr Context, const void *SrcMem,
RT::PiExtCommandBuffer CommandBuffer, size_t Len, void *DstMem,
std::vector<RT::PiExtSyncPoint> Deps, RT::PiExtSyncPoint *OutSyncPoint) {
if (!SrcMem || !DstMem)
throw runtime_error("NULL pointer argument in memory copy operation.",
PI_ERROR_INVALID_VALUE);

const PluginPtr &Plugin = Context->getPlugin();
Plugin->call<PiApiKind::piextCommandBufferMemcpyUSM>(
CommandBuffer, DstMem, SrcMem, Len, Deps.size(), Deps.data(),
OutSyncPoint);
}

} // namespace detail
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl
6 changes: 6 additions & 0 deletions sycl/source/detail/memory_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,12 @@ class __SYCL_EXPORT MemoryManager {
const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue,
size_t NumBytes, size_t Offset, void *DstMem, OSModuleHandle M,
const std::vector<RT::PiEvent> &DepEvents, RT::PiEvent *OutEvent);

// Command buffer extension methods
static void ext_oneapi_copy_usm_cmd_buffer(
ContextImplPtr Context, const void *SrcMem,
RT::PiExtCommandBuffer CommandBuffer, size_t Len, void *DstMem,
std::vector<RT::PiExtSyncPoint> Deps, RT::PiExtSyncPoint *OutSyncPoint);
};
} // namespace detail
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
Expand Down
10 changes: 10 additions & 0 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,16 @@ event queue_impl::memcpy(const std::shared_ptr<detail::queue_impl> &Self,
// Emit a begin/end scope for this call
PrepareNotify.scopedNotify((uint16_t)xpti::trace_point_type_t::task_begin);
#endif
// If we have a command graph set we need to capture the copy through normal
// queue submission rather than execute the copy directly.
if (MGraph) {
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvents);
CGH.memcpy(Dest, Src, Count);
},
Self, {});
}
if (MHasDiscardEventsSupport) {
MemoryManager::copy_usm(Src, Self, Count, Dest,
getOrWaitEvents(DepEvents, MContext), nullptr);
Expand Down
10 changes: 9 additions & 1 deletion sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2487,6 +2487,7 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() {
MCommandGroup->MRequirements.size() == 0)
? nullptr
: &MEvent->getHandleRef();
RT::PiExtSyncPoint OutSyncPoint;
switch (MCommandGroup->getType()) {
case CG::CGTYPE::Kernel: {
CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
Expand All @@ -2506,13 +2507,20 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() {
Event = &MEvent->getHandleRef();
}
}
RT::PiExtSyncPoint OutSyncPoint;
auto result = enqueueImpCommandBufferKernel(
MQueue->get_context(), MQueue->getDeviceImplPtr(), MCommandBuffer,
*ExecKernel, MSyncPointDeps, &OutSyncPoint, getMemAllocationFunc);
MEvent->setSyncPoint(OutSyncPoint);
return result;
}
case CG::CGTYPE::CopyUSM: {
CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
MemoryManager::ext_oneapi_copy_usm_cmd_buffer(
MQueue->getContextImplPtr(), Copy->getSrc(), MCommandBuffer,
Copy->getLength(), Copy->getDst(), MSyncPointDeps, &OutSyncPoint);
MEvent->setSyncPoint(OutSyncPoint);
return PI_SUCCESS;
}
default:
throw runtime_error("CG type not implemented for command buffers.",
PI_ERROR_INVALID_OPERATION);
Expand Down
33 changes: 20 additions & 13 deletions sycl/test-e2e/Graph/Explicit/usm_copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// Expected fail as memcopy not implemented yet
// XFAIL: *

// Tests adding a usm memcpy node using the explicit API and submitting
// the graph.

Expand Down Expand Up @@ -45,7 +42,7 @@ int main() {
Queue.copy(DataC.data(), PtrC, Size);
Queue.wait_and_throw();

// memcpy from B to A
// Copy from B to A
auto NodeA = Graph.add([&](handler &CGH) { CGH.copy(PtrB, PtrA, Size); });

// Read & write A
Expand All @@ -58,9 +55,20 @@ int main() {
},
{exp_ext::property::node::depends_on(NodeA)});

// memcpy from B to A
auto NodeC = Graph.add([&](handler &CGH) { CGH.copy(PtrA, PtrB, Size); },
{exp_ext::property::node::depends_on(NodeB)});
// Read & write B
auto NodeModB = Graph.add(
[&](handler &CGH) {
CGH.parallel_for(range<1>(Size), [=](item<1> id) {
auto LinID = id.get_linear_id();
PtrB[LinID] += ModValue;
});
},
{exp_ext::property::node::depends_on(NodeA)});

// memcpy from A to B
auto NodeC =
Graph.add([&](handler &CGH) { CGH.memcpy(PtrB, PtrA, Size * sizeof(T)); },
{exp_ext::property::node::depends_on(NodeB, NodeModB)});

// Read and write B
auto NodeD = Graph.add(
Expand All @@ -72,9 +80,9 @@ int main() {
},
{exp_ext::property::node::depends_on(NodeC)});

// memcpy from B to C
// Copy from B to C
Graph.add([&](handler &CGH) { CGH.copy(PtrB, PtrC, Size); },
{exp_ext::property::node::depends_on(NodeB)});
{exp_ext::property::node::depends_on(NodeD)});

auto GraphExec = Graph.finalize();

Expand All @@ -86,12 +94,11 @@ int main() {
});
}

Queue.copy(PtrA, DataA.data(), Size, Event);
Queue.copy(PtrB, DataB.data(), Size, Event);
Queue.copy(PtrC, DataC.data(), Size, Event);
Queue.wait_and_throw();

Queue.copy(PtrA, DataA.data(), Size);
Queue.copy(PtrB, DataB.data(), Size);
Queue.copy(PtrC, DataC.data(), Size);

free(PtrA, Queue);
free(PtrB, Queue);
free(PtrC, Queue);
Expand Down
Loading