Merged
2 changes: 1 addition & 1 deletion .github/intel-llvm-mirror-base-commit
@@ -1 +1 @@
-edb24728920365ff7aab67922ef22b7ec323b834
+6133d210a53fa455357c97f818de0e15940b05e7
10 changes: 5 additions & 5 deletions source/adapters/cuda/CMakeLists.txt
@@ -63,14 +63,14 @@ add_library(cudadrv SHARED IMPORTED GLOBAL)
 if (WIN32)
   set_target_properties(
     cudadrv PROPERTIES
-      IMPORTED_IMPLIB ${CUDA_cuda_driver_LIBRARY}
-      INTERFACE_INCLUDE_DIRECTORIES ${CUDAToolkit_INCLUDE_DIRS}
+      IMPORTED_IMPLIB "${CUDA_cuda_driver_LIBRARY}"
+      INTERFACE_INCLUDE_DIRECTORIES "${CUDAToolkit_INCLUDE_DIRS}"
   )
 else()
   set_target_properties(
     cudadrv PROPERTIES
-      IMPORTED_LOCATION ${CUDA_cuda_driver_LIBRARY}
-      INTERFACE_INCLUDE_DIRECTORIES ${CUDAToolkit_INCLUDE_DIRS}
+      IMPORTED_LOCATION "${CUDA_cuda_driver_LIBRARY}"
+      INTERFACE_INCLUDE_DIRECTORIES "${CUDAToolkit_INCLUDE_DIRS}"
   )
 endif()
 
@@ -121,4 +121,4 @@ target_link_libraries(${TARGET_NAME} PRIVATE
 
 target_include_directories(${TARGET_NAME} PRIVATE
   "${CMAKE_CURRENT_SOURCE_DIR}/../../"
-)
+)
47 changes: 37 additions & 10 deletions source/adapters/cuda/enqueue.cpp
@@ -46,8 +46,14 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
   }
 }
 
+#if CUDA_VERSION >= 13000
+using CuLocationType = CUmemLocation;
+#else
+using CuLocationType = CUdevice;
+#endif
 void setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
-                    ur_usm_advice_flags_t URAdviceFlags, CUdevice Device) {
+                    ur_usm_advice_flags_t URAdviceFlags,
+                    CuLocationType Location) {
   std::unordered_map<ur_usm_advice_flags_t, CUmem_advise>
       URToCUMemAdviseDeviceFlagsMap = {
           {UR_USM_ADVICE_FLAG_SET_READ_MOSTLY, CU_MEM_ADVISE_SET_READ_MOSTLY},
@@ -64,7 +70,7 @@ void setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
       };
   for (auto &FlagPair : URToCUMemAdviseDeviceFlagsMap) {
     if (URAdviceFlags & FlagPair.first) {
-      UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, Device));
+      UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, Location));
     }
   }
 
@@ -82,7 +88,14 @@ void setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
 
   for (auto &FlagPair : URToCUMemAdviseHostFlagsMap) {
     if (URAdviceFlags & FlagPair.first) {
-      UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, CU_DEVICE_CPU));
+#if CUDA_VERSION >= 13000
+      CUmemLocation LocationHost;
+      LocationHost.id = 0; // ignored with HOST_NUMA_CURRENT
+      LocationHost.type = CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT;
+#else
+      int LocationHost = CU_DEVICE_CPU;
+#endif
+      UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, LocationHost));
     }
   }
 
@@ -1550,8 +1563,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
     return UR_RESULT_SUCCESS;
   }
 
+#if CUDA_VERSION >= 13000
+    CUmemLocation Location;
+    Location.id = Device->get();
+    Location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    unsigned int Flags = 0U;
+    UR_CHECK_ERROR(
+        cuMemPrefetchAsync((CUdeviceptr)pMem, size, Location, Flags, CuStream));
+#else
     UR_CHECK_ERROR(
         cuMemPrefetchAsync((CUdeviceptr)pMem, size, Device->get(), CuStream));
+#endif
   } catch (ur_result_t Err) {
     return Err;
   }
@@ -1619,19 +1641,24 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
     return UR_RESULT_SUCCESS;
   }
 
+#if CUDA_VERSION >= 13000
+    CUmemLocation Location;
+    Location.id = hQueue->getDevice()->get();
+    Location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+#else
+    int Location = hQueue->getDevice()->get();
+#endif
+
     if (advice & UR_USM_ADVICE_FLAG_DEFAULT) {
       UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size,
-                                 CU_MEM_ADVISE_UNSET_READ_MOSTLY,
-                                 hQueue->getDevice()->get()));
+                                 CU_MEM_ADVISE_UNSET_READ_MOSTLY, Location));
       UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size,
                                  CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION,
-                                 hQueue->getDevice()->get()));
+                                 Location));
       UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size,
-                                 CU_MEM_ADVISE_UNSET_ACCESSED_BY,
-                                 hQueue->getDevice()->get()));
+                                 CU_MEM_ADVISE_UNSET_ACCESSED_BY, Location));
     } else {
-      setCuMemAdvise((CUdeviceptr)pMem, size, advice,
-                     hQueue->getDevice()->get());
+      setCuMemAdvise((CUdeviceptr)pMem, size, advice, Location);
     }
   } catch (ur_result_t err) {
     return err;
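For context on the CUDA hunks above: CUDA 13 changed the last parameter of cuMemAdvise and cuMemPrefetchAsync from a bare CUdevice to a CUmemLocation (and cuMemPrefetchAsync gained a flags argument); host memory is now addressed via CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT instead of CU_DEVICE_CPU. A minimal sketch of the same version dispatch in isolation — the helper name is illustrative and not part of this patch:

// Illustrative sketch only, not part of this PR; shows the CUDA 13 vs.
// pre-13 calling conventions that the adapter code above selects between.
#include <cuda.h>

CUresult prefetchToDevice(CUdeviceptr Ptr, size_t Size, CUdevice Dev,
                          CUstream Stream) {
#if CUDA_VERSION >= 13000
  // CUDA 13: the destination is described by a CUmemLocation plus flags.
  CUmemLocation Loc;
  Loc.type = CU_MEM_LOCATION_TYPE_DEVICE;
  Loc.id = Dev;
  return cuMemPrefetchAsync(Ptr, Size, Loc, /*flags=*/0U, Stream);
#else
  // Pre-13: the destination is a bare CUdevice (CU_DEVICE_CPU for host).
  return cuMemPrefetchAsync(Ptr, Size, Dev, Stream);
#endif
}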
4 changes: 2 additions & 2 deletions source/adapters/offload/CMakeLists.txt
@@ -24,8 +24,8 @@ if (NOT TARGET cudadrv)
   add_library(cudadrv SHARED IMPORTED GLOBAL)
   set_target_properties(
     cudadrv PROPERTIES
-      IMPORTED_LOCATION ${CUDA_cuda_driver_LIBRARY}
-      INTERFACE_INCLUDE_DIRECTORIES ${CUDAToolkit_INCLUDE_DIRS}
+      IMPORTED_LOCATION "${CUDA_cuda_driver_LIBRARY}"
+      INTERFACE_INCLUDE_DIRECTORIES "${CUDAToolkit_INCLUDE_DIRS}"
   )
 endif()
 
186 changes: 139 additions & 47 deletions source/adapters/offload/enqueue.cpp
@@ -19,16 +19,130 @@
 #include "queue.hpp"
 #include "ur2offload.hpp"
 
+namespace {
+ol_result_t waitOnEvents(ol_queue_handle_t Queue,
+                         const ur_event_handle_t *UrEvents, size_t NumEvents) {
+  if (NumEvents) {
+    std::vector<ol_event_handle_t> OlEvents;
+    OlEvents.reserve(NumEvents);
+    for (size_t I = 0; I < NumEvents; I++) {
+      OlEvents.push_back(UrEvents[I]->OffloadEvent);
+    }
+
+    return olWaitEvents(Queue, OlEvents.data(), NumEvents);
+  }
+  return OL_SUCCESS;
+}
+
+ol_result_t makeEvent(ur_command_t Type, ol_queue_handle_t OlQueue,
+                      ur_queue_handle_t UrQueue, ur_event_handle_t *UrEvent) {
+  if (UrEvent) {
+    auto *Event = new ur_event_handle_t_(Type, UrQueue);
+    if (auto Res = olCreateEvent(OlQueue, &Event->OffloadEvent)) {
+      delete Event;
+      return Res;
+    };
+    *UrEvent = Event;
+  }
+  return OL_SUCCESS;
+}
+
+template <bool Barrier>
+ur_result_t doWait(ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
+                   const ur_event_handle_t *phEventWaitList,
+                   ur_event_handle_t *phEvent) {
+  std::lock_guard<std::mutex> Lock(hQueue->OooMutex);
+  constexpr ur_command_t TYPE =
+      Barrier ? UR_COMMAND_EVENTS_WAIT_WITH_BARRIER : UR_COMMAND_EVENTS_WAIT;
+  ol_queue_handle_t TargetQueue;
+  if (!numEventsInWaitList && hQueue->isInOrder()) {
+    // In order queue so all work is done in submission order, so it's a
+    // no-op
+    if (phEvent) {
+      OL_RETURN_ON_ERR(hQueue->nextQueueNoLock(TargetQueue));
+      OL_RETURN_ON_ERR(makeEvent(TYPE, TargetQueue, hQueue, phEvent));
+    }
+    return UR_RESULT_SUCCESS;
+  }
+  OL_RETURN_ON_ERR(hQueue->nextQueueNoLock(TargetQueue));
+
+  if (!numEventsInWaitList) {
+    // "If the event list is empty, it waits for all previously enqueued
+    // commands to complete."
+
+    // Create events on each active queue for an arbitrary thread to block on
+    // TODO: Can we efficiently check if each thread is "finished" rather than
+    // creating an event?
+    std::vector<ol_event_handle_t> OffloadHandles{};
+    for (auto *Q : hQueue->OffloadQueues) {
+      if (Q == nullptr) {
+        break;
+      }
+      if (Q == TargetQueue) {
+        continue;
+      }
+      OL_RETURN_ON_ERR(olCreateEvent(Q, &OffloadHandles.emplace_back()));
+    }
+    OL_RETURN_ON_ERR(olWaitEvents(TargetQueue, OffloadHandles.data(),
+                                  OffloadHandles.size()));
+  } else {
+    OL_RETURN_ON_ERR(
+        waitOnEvents(TargetQueue, phEventWaitList, numEventsInWaitList));
+  }
+
+  OL_RETURN_ON_ERR(makeEvent(TYPE, TargetQueue, hQueue, phEvent));
+
+  if constexpr (Barrier) {
+    ol_event_handle_t BarrierEvent;
+    if (phEvent) {
+      BarrierEvent = (*phEvent)->OffloadEvent;
+    } else {
+      OL_RETURN_ON_ERR(olCreateEvent(TargetQueue, &BarrierEvent));
+    }
+
+    // Ensure any newly created work waits on this barrier
+    if (hQueue->Barrier) {
+      OL_RETURN_ON_ERR(olDestroyEvent(hQueue->Barrier));
+    }
+    hQueue->Barrier = BarrierEvent;
+
+    // Block all existing threads on the barrier
+    for (auto *Q : hQueue->OffloadQueues) {
+      if (Q == nullptr) {
+        break;
+      }
+      if (Q == TargetQueue) {
+        continue;
+      }
+      OL_RETURN_ON_ERR(olWaitEvents(Q, &BarrierEvent, 1));
+    }
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+} // namespace
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
+    ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  return doWait<false>(hQueue, numEventsInWaitList, phEventWaitList, phEvent);
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
+    ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  return doWait<true>(hQueue, numEventsInWaitList, phEventWaitList, phEvent);
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
     const size_t *pLocalWorkSize, uint32_t, const ur_kernel_launch_property_t *,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
-  // Ignore wait list for now
-  (void)numEventsInWaitList;
-  (void)phEventWaitList;
-  //
+  ol_queue_handle_t Queue;
+  OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
+  OL_RETURN_ON_ERR(waitOnEvents(Queue, phEventWaitList, numEventsInWaitList));
 
   (void)pGlobalWorkOffset;
 
@@ -67,20 +181,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   LaunchArgs.GroupSize.z = GroupSize[2];
   LaunchArgs.DynSharedMemory = 0;
 
-  ol_queue_handle_t Queue;
-  OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
   OL_RETURN_ON_ERR(olLaunchKernel(
       Queue, hQueue->OffloadDevice, hKernel->OffloadKernel,
      hKernel->Args.getStorage(), hKernel->Args.getStorageSize(), &LaunchArgs));
 
-  if (phEvent) {
-    auto *Event = new ur_event_handle_t_(UR_COMMAND_KERNEL_LAUNCH, hQueue);
-    if (auto Res = olCreateEvent(Queue, &Event->OffloadEvent)) {
-      delete Event;
-      return offloadResultToUR(Res);
-    };
-    *phEvent = Event;
-  }
+  OL_RETURN_ON_ERR(makeEvent(UR_COMMAND_KERNEL_LAUNCH, Queue, hQueue, phEvent));
   return UR_RESULT_SUCCESS;
 }
 
@@ -103,10 +208,9 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
                      size_t size, bool blocking, uint32_t numEventsInWaitList,
                      const ur_event_handle_t *phEventWaitList,
                      ur_event_handle_t *phEvent) {
-  // Ignore wait list for now
-  (void)numEventsInWaitList;
-  (void)phEventWaitList;
-  //
+  ol_queue_handle_t Queue;
+  OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
+  OL_RETURN_ON_ERR(waitOnEvents(Queue, phEventWaitList, numEventsInWaitList));
 
   if (blocking) {
     OL_RETURN_ON_ERR(
@@ -117,8 +221,6 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
     return UR_RESULT_SUCCESS;
   }
 
-  ol_queue_handle_t Queue;
-  OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
   OL_RETURN_ON_ERR(
       olMemcpy(Queue, DestPtr, DestDevice, SrcPtr, SrcDevice, size));
   if (phEvent) {
@@ -192,17 +294,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite(
                                 numEventsInWaitList, phEventWaitList, phEvent);
 }
 
-ur_result_t enqueueNoOp(ur_command_t Type, ur_queue_handle_t hQueue,
-                        ur_event_handle_t *phEvent) {
-  // This path is a no-op, but we can't output a real event because
-  // Offload doesn't currently support creating arbitrary events, and we
-  // don't know the last real event in the queue. Instead we just have to
-  // wait on the whole queue and then return an empty (implicitly
-  // finished) event.
-  *phEvent = ur_event_handle_t_::createEmptyEvent(Type, hQueue);
-  return urQueueFinish(hQueue);
-}
-
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
     ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingMap,
     ur_map_flags_t mapFlags, size_t offset, size_t size,
@@ -226,15 +317,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
     Result = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size,
                                     MapPtr, numEventsInWaitList,
                                     phEventWaitList, phEvent);
-  } else {
-    if (IsPinned) {
-      // TODO: Ignore the event waits list for now. When urEnqueueEventsWait is
-      // implemented we can call it on the wait list.
-    }
-
-    if (phEvent) {
-      enqueueNoOp(UR_COMMAND_MEM_BUFFER_MAP, hQueue, phEvent);
+  } else if (numEventsInWaitList || phEvent) {
+    ol_queue_handle_t Queue;
+    OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
+    if ((!hQueue->isInOrder() && phEvent) || hQueue->isInOrder()) {
+      // Out-of-order queues running no-op work only have side effects if there
+      // is an output event
+      waitOnEvents(Queue, phEventWaitList, numEventsInWaitList);
     }
+    OL_RETURN_ON_ERR(
+        makeEvent(UR_COMMAND_MEM_BUFFER_MAP, Queue, hQueue, phEvent));
   }
   *ppRetMap = MapPtr;
 
@@ -260,15 +352,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
     Result = urEnqueueMemBufferWrite(
         hQueue, hMem, true, Map->MapOffset, Map->MapSize, pMappedPtr,
         numEventsInWaitList, phEventWaitList, phEvent);
-  } else {
-    if (IsPinned) {
-      // TODO: Ignore the event waits list for now. When urEnqueueEventsWait is
-      // implemented we can call it on the wait list.
-    }
-
-    if (phEvent) {
-      enqueueNoOp(UR_COMMAND_MEM_UNMAP, hQueue, phEvent);
+  } else if (numEventsInWaitList || phEvent) {
+    ol_queue_handle_t Queue;
+    OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
+    if ((!hQueue->isInOrder() && phEvent) || hQueue->isInOrder()) {
+      // Out-of-order queues running no-op work only have side effects if there
+      // is an output event
+      waitOnEvents(Queue, phEventWaitList, numEventsInWaitList);
    }
+    OL_RETURN_ON_ERR(makeEvent(UR_COMMAND_MEM_UNMAP, Queue, hQueue, phEvent));
   }
   BufferImpl.unmap(pMappedPtr);
 
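For context on the offload adapter changes: urEnqueueEventsWait and urEnqueueEventsWaitWithBarrier are now real implementations backed by doWait, and kernel launches, memcpys, and map/unmap honour their event wait lists through waitOnEvents/makeEvent instead of ignoring them. A hypothetical caller-side sketch of what this enables on an out-of-order queue — UR_CHECK is a stand-in error macro, and all handles are assumed to be created elsewhere:

#include <ur_api.h>

// Stand-in error-propagation macro, not part of the UR API.
#define UR_CHECK(Call)                                                         \
  do {                                                                         \
    ur_result_t Res = (Call);                                                  \
    if (Res != UR_RESULT_SUCCESS)                                              \
      return Res;                                                              \
  } while (0)

// Launch a kernel, then barrier, so a later copy observes its results even
// if the queue executes commands out of order.
ur_result_t launchThenCopy(ur_queue_handle_t Queue, ur_kernel_handle_t Kernel,
                           size_t GlobalSize, void *Dst, const void *Src,
                           size_t Bytes) {
  ur_event_handle_t KernelDone = nullptr;
  UR_CHECK(urEnqueueKernelLaunch(Queue, Kernel, /*workDim=*/1,
                                 /*pGlobalWorkOffset=*/nullptr, &GlobalSize,
                                 /*pLocalWorkSize=*/nullptr,
                                 /*numPropsInLaunchPropList=*/0,
                                 /*launchPropList=*/nullptr,
                                 /*numEventsInWaitList=*/0, nullptr,
                                 &KernelDone));
  // Everything enqueued after this barrier waits for KernelDone to complete.
  UR_CHECK(urEnqueueEventsWaitWithBarrier(Queue, 1, &KernelDone, nullptr));
  UR_CHECK(urEnqueueUSMMemcpy(Queue, /*blocking=*/false, Dst, Src, Bytes,
                              /*numEventsInWaitList=*/0, nullptr, nullptr));
  return urEventRelease(KernelDone);
}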