diff --git a/.github/intel-llvm-mirror-base-commit b/.github/intel-llvm-mirror-base-commit
index 7a4a3ea40c..4b024d3a9a 100644
--- a/.github/intel-llvm-mirror-base-commit
+++ b/.github/intel-llvm-mirror-base-commit
@@ -1 +1 @@
-edb24728920365ff7aab67922ef22b7ec323b834
+6133d210a53fa455357c97f818de0e15940b05e7
diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt
index 5d7ba70e31..1751b00b61 100644
--- a/source/adapters/cuda/CMakeLists.txt
+++ b/source/adapters/cuda/CMakeLists.txt
@@ -63,14 +63,14 @@ add_library(cudadrv SHARED IMPORTED GLOBAL)
 
 if (WIN32)
     set_target_properties(
         cudadrv PROPERTIES
-            IMPORTED_IMPLIB ${CUDA_cuda_driver_LIBRARY}
-            INTERFACE_INCLUDE_DIRECTORIES ${CUDAToolkit_INCLUDE_DIRS}
+            IMPORTED_IMPLIB "${CUDA_cuda_driver_LIBRARY}"
+            INTERFACE_INCLUDE_DIRECTORIES "${CUDAToolkit_INCLUDE_DIRS}"
     )
 else()
     set_target_properties(
         cudadrv PROPERTIES
-            IMPORTED_LOCATION ${CUDA_cuda_driver_LIBRARY}
-            INTERFACE_INCLUDE_DIRECTORIES ${CUDAToolkit_INCLUDE_DIRS}
+            IMPORTED_LOCATION "${CUDA_cuda_driver_LIBRARY}"
+            INTERFACE_INCLUDE_DIRECTORIES "${CUDAToolkit_INCLUDE_DIRS}"
     )
 endif()
@@ -121,4 +121,4 @@ target_link_libraries(${TARGET_NAME} PRIVATE
 
 target_include_directories(${TARGET_NAME} PRIVATE
     "${CMAKE_CURRENT_SOURCE_DIR}/../../"
-)
\ No newline at end of file
+)
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 8eb00ccab2..6f4a5bce3c 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -46,8 +46,14 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
   }
 }
 
+#if CUDA_VERSION >= 13000
+using CuLocationType = CUmemLocation;
+#else
+using CuLocationType = CUdevice;
+#endif
 void setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
-                    ur_usm_advice_flags_t URAdviceFlags, CUdevice Device) {
+                    ur_usm_advice_flags_t URAdviceFlags,
+                    CuLocationType Location) {
   std::unordered_map<ur_usm_advice_flags_t, CUmem_advise>
       URToCUMemAdviseDeviceFlagsMap = {
          {UR_USM_ADVICE_FLAG_SET_READ_MOSTLY, CU_MEM_ADVISE_SET_READ_MOSTLY},
@@ -64,7 +70,7 @@ void setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
       };
   for (auto &FlagPair : URToCUMemAdviseDeviceFlagsMap) {
     if (URAdviceFlags & FlagPair.first) {
-      UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, Device));
+      UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, Location));
     }
   }
 
@@ -82,7 +88,14 @@ void setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
 
   for (auto &FlagPair : URToCUMemAdviseHostFlagsMap) {
     if (URAdviceFlags & FlagPair.first) {
-      UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, CU_DEVICE_CPU));
+#if CUDA_VERSION >= 13000
+      CUmemLocation LocationHost;
+      LocationHost.id = 0; // ignored with HOST_NUMA_CURRENT
+      LocationHost.type = CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT;
+#else
+      int LocationHost = CU_DEVICE_CPU;
+#endif
+      UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, LocationHost));
     }
   }
 
@@ -1550,8 +1563,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
       return UR_RESULT_SUCCESS;
     }
 
+#if CUDA_VERSION >= 13000
+    CUmemLocation Location;
+    Location.id = Device->get();
+    Location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    unsigned int Flags = 0U;
+    UR_CHECK_ERROR(
+        cuMemPrefetchAsync((CUdeviceptr)pMem, size, Location, Flags, CuStream));
+#else
     UR_CHECK_ERROR(
         cuMemPrefetchAsync((CUdeviceptr)pMem, size, Device->get(), CuStream));
+#endif
   } catch (ur_result_t Err) {
     return Err;
   }
@@ -1619,19 +1641,24 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
       return UR_RESULT_SUCCESS;
     }
 
+#if CUDA_VERSION >= 13000
+    CUmemLocation Location;
+    Location.id = hQueue->getDevice()->get();
+    Location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+#else
+    int Location = hQueue->getDevice()->get();
+#endif
+
     if (advice & UR_USM_ADVICE_FLAG_DEFAULT) {
       UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size,
-                                 CU_MEM_ADVISE_UNSET_READ_MOSTLY,
-                                 hQueue->getDevice()->get()));
+                                 CU_MEM_ADVISE_UNSET_READ_MOSTLY, Location));
       UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size,
                                  CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION,
-                                 hQueue->getDevice()->get()));
+                                 Location));
       UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size,
-                                 CU_MEM_ADVISE_UNSET_ACCESSED_BY,
-                                 hQueue->getDevice()->get()));
+                                 CU_MEM_ADVISE_UNSET_ACCESSED_BY, Location));
     } else {
-      setCuMemAdvise((CUdeviceptr)pMem, size, advice,
-                     hQueue->getDevice()->get());
+      setCuMemAdvise((CUdeviceptr)pMem, size, advice, Location);
     }
   } catch (ur_result_t err) {
     return err;
diff --git a/source/adapters/offload/CMakeLists.txt b/source/adapters/offload/CMakeLists.txt
index 70336c6b24..9411139a3e 100644
--- a/source/adapters/offload/CMakeLists.txt
+++ b/source/adapters/offload/CMakeLists.txt
@@ -24,8 +24,8 @@ if (NOT TARGET cudadrv)
     add_library(cudadrv SHARED IMPORTED GLOBAL)
     set_target_properties(
         cudadrv PROPERTIES
-            IMPORTED_LOCATION ${CUDA_cuda_driver_LIBRARY}
-            INTERFACE_INCLUDE_DIRECTORIES ${CUDAToolkit_INCLUDE_DIRS}
+            IMPORTED_LOCATION "${CUDA_cuda_driver_LIBRARY}"
+            INTERFACE_INCLUDE_DIRECTORIES "${CUDAToolkit_INCLUDE_DIRS}"
     )
 endif()
 
diff --git a/source/adapters/offload/enqueue.cpp b/source/adapters/offload/enqueue.cpp
index b1a1edac52..6b9a013538 100644
--- a/source/adapters/offload/enqueue.cpp
+++ b/source/adapters/offload/enqueue.cpp
@@ -19,16 +19,130 @@
 #include "queue.hpp"
 #include "ur2offload.hpp"
 
+namespace {
+ol_result_t waitOnEvents(ol_queue_handle_t Queue,
+                         const ur_event_handle_t *UrEvents, size_t NumEvents) {
+  if (NumEvents) {
+    std::vector<ol_event_handle_t> OlEvents;
+    OlEvents.reserve(NumEvents);
+    for (size_t I = 0; I < NumEvents; I++) {
+      OlEvents.push_back(UrEvents[I]->OffloadEvent);
+    }
+
+    return olWaitEvents(Queue, OlEvents.data(), NumEvents);
+  }
+  return OL_SUCCESS;
+}
+
+ol_result_t makeEvent(ur_command_t Type, ol_queue_handle_t OlQueue,
+                      ur_queue_handle_t UrQueue, ur_event_handle_t *UrEvent) {
+  if (UrEvent) {
+    auto *Event = new ur_event_handle_t_(Type, UrQueue);
+    if (auto Res = olCreateEvent(OlQueue, &Event->OffloadEvent)) {
+      delete Event;
+      return Res;
+    };
+    *UrEvent = Event;
+  }
+  return OL_SUCCESS;
+}
+
+template <bool Barrier>
+ur_result_t doWait(ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
+                   const ur_event_handle_t *phEventWaitList,
+                   ur_event_handle_t *phEvent) {
+  std::lock_guard<std::mutex> Lock(hQueue->OooMutex);
+  constexpr ur_command_t TYPE =
+      Barrier ? UR_COMMAND_EVENTS_WAIT_WITH_BARRIER : UR_COMMAND_EVENTS_WAIT;
+  ol_queue_handle_t TargetQueue;
+  if (!numEventsInWaitList && hQueue->isInOrder()) {
+    // In order queue so all work is done in submission order, so it's a
+    // no-op
+    if (phEvent) {
+      OL_RETURN_ON_ERR(hQueue->nextQueueNoLock(TargetQueue));
+      OL_RETURN_ON_ERR(makeEvent(TYPE, TargetQueue, hQueue, phEvent));
+    }
+    return UR_RESULT_SUCCESS;
+  }
+  OL_RETURN_ON_ERR(hQueue->nextQueueNoLock(TargetQueue));
+
+  if (!numEventsInWaitList) {
+    // "If the event list is empty, it waits for all previously enqueued
+    // commands to complete."
+
+    // Create events on each active queue for an arbitrary thread to block on
+    // TODO: Can we efficiently check if each thread is "finished" rather than
+    // creating an event?
+    std::vector<ol_event_handle_t> OffloadHandles{};
+    for (auto *Q : hQueue->OffloadQueues) {
+      if (Q == nullptr) {
+        break;
+      }
+      if (Q == TargetQueue) {
+        continue;
+      }
+      OL_RETURN_ON_ERR(olCreateEvent(Q, &OffloadHandles.emplace_back()));
+    }
+    OL_RETURN_ON_ERR(olWaitEvents(TargetQueue, OffloadHandles.data(),
+                                  OffloadHandles.size()));
+  } else {
+    OL_RETURN_ON_ERR(
+        waitOnEvents(TargetQueue, phEventWaitList, numEventsInWaitList));
+  }
+
+  OL_RETURN_ON_ERR(makeEvent(TYPE, TargetQueue, hQueue, phEvent));
+
+  if constexpr (Barrier) {
+    ol_event_handle_t BarrierEvent;
+    if (phEvent) {
+      BarrierEvent = (*phEvent)->OffloadEvent;
+    } else {
+      OL_RETURN_ON_ERR(olCreateEvent(TargetQueue, &BarrierEvent));
+    }
+
+    // Ensure any newly created work waits on this barrier
+    if (hQueue->Barrier) {
+      OL_RETURN_ON_ERR(olDestroyEvent(hQueue->Barrier));
+    }
+    hQueue->Barrier = BarrierEvent;
+
+    // Block all existing threads on the barrier
+    for (auto *Q : hQueue->OffloadQueues) {
+      if (Q == nullptr) {
+        break;
+      }
+      if (Q == TargetQueue) {
+        continue;
+      }
+      OL_RETURN_ON_ERR(olWaitEvents(Q, &BarrierEvent, 1));
+    }
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+} // namespace
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
+    ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  return doWait<false>(hQueue, numEventsInWaitList, phEventWaitList, phEvent);
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
+    ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  return doWait<true>(hQueue, numEventsInWaitList, phEventWaitList, phEvent);
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
     const size_t *pLocalWorkSize, uint32_t,
     const ur_kernel_launch_property_t *, uint32_t numEventsInWaitList,
     const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
-  // Ignore wait list for now
-  (void)numEventsInWaitList;
-  (void)phEventWaitList;
-  //
+  ol_queue_handle_t Queue;
+  OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
+  OL_RETURN_ON_ERR(waitOnEvents(Queue, phEventWaitList, numEventsInWaitList));
 
   (void)pGlobalWorkOffset;
@@ -67,20 +181,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   LaunchArgs.GroupSize.z = GroupSize[2];
   LaunchArgs.DynSharedMemory = 0;
 
-  ol_queue_handle_t Queue;
-  OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
   OL_RETURN_ON_ERR(olLaunchKernel(
       Queue, hQueue->OffloadDevice, hKernel->OffloadKernel,
       hKernel->Args.getStorage(), hKernel->Args.getStorageSize(), &LaunchArgs));
 
-  if (phEvent) {
-    auto *Event = new ur_event_handle_t_(UR_COMMAND_KERNEL_LAUNCH, hQueue);
-    if (auto Res = olCreateEvent(Queue, &Event->OffloadEvent)) {
-      delete Event;
-      return offloadResultToUR(Res);
-    };
-    *phEvent = Event;
-  }
+  OL_RETURN_ON_ERR(makeEvent(UR_COMMAND_KERNEL_LAUNCH, Queue, hQueue, phEvent));
 
   return UR_RESULT_SUCCESS;
 }
@@ -103,10 +208,9 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
                      size_t size, bool blocking, uint32_t numEventsInWaitList,
                      const ur_event_handle_t *phEventWaitList,
                      ur_event_handle_t *phEvent) {
-  // Ignore wait list for now
-  (void)numEventsInWaitList;
-  (void)phEventWaitList;
-  //
+  ol_queue_handle_t Queue;
+  OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
+  OL_RETURN_ON_ERR(waitOnEvents(Queue, phEventWaitList, numEventsInWaitList));
 
   if (blocking) {
     OL_RETURN_ON_ERR(
@@ -117,8 +221,6 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,
     return UR_RESULT_SUCCESS;
   }
 
-  ol_queue_handle_t Queue;
-  OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
   OL_RETURN_ON_ERR(
       olMemcpy(Queue, DestPtr, DestDevice, SrcPtr, SrcDevice, size));
   if (phEvent) {
@@ -192,17 +294,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite(
                    numEventsInWaitList, phEventWaitList, phEvent);
 }
 
-ur_result_t enqueueNoOp(ur_command_t Type, ur_queue_handle_t hQueue,
-                        ur_event_handle_t *phEvent) {
-  // This path is a no-op, but we can't output a real event because
-  // Offload doesn't currently support creating arbitrary events, and we
-  // don't know the last real event in the queue. Instead we just have to
-  // wait on the whole queue and then return an empty (implicitly
-  // finished) event.
-  *phEvent = ur_event_handle_t_::createEmptyEvent(Type, hQueue);
-  return urQueueFinish(hQueue);
-}
-
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
     ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingMap,
     ur_map_flags_t mapFlags, size_t offset, size_t size,
@@ -226,15 +317,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
     Result = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size,
                                     MapPtr, numEventsInWaitList,
                                     phEventWaitList, phEvent);
-  } else {
-    if (IsPinned) {
-      // TODO: Ignore the event waits list for now. When urEnqueueEventsWait is
-      // implemented we can call it on the wait list.
-    }
-
-    if (phEvent) {
-      enqueueNoOp(UR_COMMAND_MEM_BUFFER_MAP, hQueue, phEvent);
+  } else if (numEventsInWaitList || phEvent) {
+    ol_queue_handle_t Queue;
+    OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
+    if ((!hQueue->isInOrder() && phEvent) || hQueue->isInOrder()) {
+      // Out-of-order queues running no-op work only have side effects if there
+      // is an output event
+      waitOnEvents(Queue, phEventWaitList, numEventsInWaitList);
     }
+    OL_RETURN_ON_ERR(
+        makeEvent(UR_COMMAND_MEM_BUFFER_MAP, Queue, hQueue, phEvent));
   }
 
   *ppRetMap = MapPtr;
@@ -260,15 +352,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
     Result = urEnqueueMemBufferWrite(
         hQueue, hMem, true, Map->MapOffset, Map->MapSize, pMappedPtr,
         numEventsInWaitList, phEventWaitList, phEvent);
-  } else {
-    if (IsPinned) {
-      // TODO: Ignore the event waits list for now. When urEnqueueEventsWait is
-      // implemented we can call it on the wait list.
-    }
-
-    if (phEvent) {
-      enqueueNoOp(UR_COMMAND_MEM_UNMAP, hQueue, phEvent);
+  } else if (numEventsInWaitList || phEvent) {
+    ol_queue_handle_t Queue;
+    OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
+    if ((!hQueue->isInOrder() && phEvent) || hQueue->isInOrder()) {
+      // Out-of-order queues running no-op work only have side effects if there
+      // is an output event
+      waitOnEvents(Queue, phEventWaitList, numEventsInWaitList);
     }
+    OL_RETURN_ON_ERR(makeEvent(UR_COMMAND_MEM_UNMAP, Queue, hQueue, phEvent));
   }
 
   BufferImpl.unmap(pMappedPtr);
diff --git a/source/adapters/offload/queue.hpp b/source/adapters/offload/queue.hpp
index 25585db273..8f887a9c3b 100644
--- a/source/adapters/offload/queue.hpp
+++ b/source/adapters/offload/queue.hpp
@@ -23,8 +23,8 @@ struct ur_queue_handle_t_ : RefCounted {
       : OffloadQueues((Flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)
                           ? OOO_QUEUE_POOL_SIZE
                           : 1),
-        QueueOffset(0), OffloadDevice(Device), UrContext(UrContext),
-        Flags(Flags) {}
+        QueueOffset(0), Barrier(nullptr), OffloadDevice(Device),
+        UrContext(UrContext), Flags(Flags) {}
 
   // In-order queues only have one element here, while out of order queues have
   // a bank of queues to use. We rotate through them round robin instead of
@@ -35,22 +35,37 @@ struct ur_queue_handle_t_ : RefCounted {
   // `stream_queue_t`. In the future, if we want more performance or it
   // simplifies the implementation of a feature, we can consider using it.
   std::vector<ol_queue_handle_t> OffloadQueues;
+  // Mutex guarding the offset and barrier for out of order queues
+  std::mutex OooMutex;
   size_t QueueOffset;
+  ol_event_handle_t Barrier;
   ol_device_handle_t OffloadDevice;
   ur_context_handle_t UrContext;
   ur_queue_flags_t Flags;
 
-  ol_result_t nextQueue(ol_queue_handle_t &Handle) {
-    auto &Slot = OffloadQueues[QueueOffset++];
-    QueueOffset %= OffloadQueues.size();
+  bool isInOrder() const { return OffloadQueues.size() == 1; }
+
+  ol_result_t nextQueueNoLock(ol_queue_handle_t &Handle) {
+    auto &Slot = OffloadQueues[(QueueOffset++) % OffloadQueues.size()];
     if (!Slot) {
      if (auto Res = olCreateQueue(OffloadDevice, &Slot)) {
        return Res;
      }
+
+      if (auto Event = Barrier) {
+        if (auto Res = olWaitEvents(Slot, &Event, 1)) {
+          return Res;
+        }
+      }
    }
    Handle = Slot;
    return nullptr;
  }
+
+  ol_result_t nextQueue(ol_queue_handle_t &Handle) {
+    std::lock_guard<std::mutex> Lock(OooMutex);
+    return nextQueueNoLock(Handle);
+  }
 };
diff --git a/source/adapters/offload/ur_interface_loader.cpp b/source/adapters/offload/ur_interface_loader.cpp
index 02de9df99f..498b09d7da 100644
--- a/source/adapters/offload/ur_interface_loader.cpp
+++ b/source/adapters/offload/ur_interface_loader.cpp
@@ -170,8 +170,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable(
   }
   pDdiTable->pfnDeviceGlobalVariableRead = urEnqueueDeviceGlobalVariableRead;
   pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite;
-  pDdiTable->pfnEventsWait = nullptr;
-  pDdiTable->pfnEventsWaitWithBarrier = nullptr;
+  pDdiTable->pfnEventsWait = urEnqueueEventsWait;
+  pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier;
   pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch;
   pDdiTable->pfnMemBufferCopy = nullptr;
   pDdiTable->pfnMemBufferCopyRect = nullptr;