Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions include/ur_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -1601,6 +1601,7 @@ typedef enum ur_device_info_t {
///< command-buffers.
UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP = 0x1001, ///< [::ur_bool_t] Returns true if the device supports updating the kernel
///< commands in a command-buffer.
UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP = 0x1111, ///< [::ur_bool_t] return true if enqueue Cluster Launch is supported
UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP = 0x2000, ///< [::ur_bool_t] returns true if the device supports the creation of
///< bindless images
UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP = 0x2001, ///< [::ur_bool_t] returns true if the device supports the creation of
Expand Down Expand Up @@ -2824,6 +2825,8 @@ urMemBufferPartition(
/// - The application may call this function from simultaneous threads for
/// the same context.
/// - The implementation of this function should be thread-safe.
/// - The implementation may require a valid device handle to return the
/// native mem handle
///
/// @returns
/// - ::UR_RESULT_SUCCESS
Expand All @@ -2832,15 +2835,16 @@ urMemBufferPartition(
/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
/// + `NULL == hMem`
/// + `NULL == hDevice`
/// + If `hDevice == NULL` and the implementation requires a valid device.
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
/// + `NULL == phNativeMem`
/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
/// + If the adapter has no underlying equivalent handle.
UR_APIEXPORT ur_result_t UR_APICALL
urMemGetNativeHandle(
ur_mem_handle_t hMem, ///< [in] handle of the mem.
ur_device_handle_t hDevice, ///< [in] handle of the device that the native handle will be resident on.
ur_device_handle_t hDevice, ///< [in][optional] handle of the device that the native handle will be
///< resident on.
ur_native_handle_t *phNativeMem ///< [out] a pointer to the native handle of the mem.
);

Expand Down
15 changes: 15 additions & 0 deletions include/ur_print.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2530,6 +2530,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) {
case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP:
os << "UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP";
break;
case UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP:
os << "UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP";
break;
case UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP:
os << "UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP";
break;
Expand Down Expand Up @@ -4029,6 +4032,18 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info

os << ")";
} break;
case UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP: {
const ur_bool_t *tptr = (const ur_bool_t *)ptr;
if (sizeof(ur_bool_t) > size) {
os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
return UR_RESULT_ERROR_INVALID_SIZE;
}
os << (const void *)(tptr) << " (";

os << *tptr;

os << ")";
} break;
case UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP: {
const ur_bool_t *tptr = (const ur_bool_t *)ptr;
if (sizeof(ur_bool_t) > size) {
Expand Down
10 changes: 10 additions & 0 deletions scripts/core/exp-launch-properties.yml
Original file line number Diff line number Diff line change
Expand Up @@ -128,4 +128,14 @@ returns:
- $X_RESULT_ERROR_INVALID_VALUE
- $X_RESULT_ERROR_OUT_OF_HOST_MEMORY
- $X_RESULT_ERROR_OUT_OF_RESOURCES
--- #--------------------------------------------------------------------------
# Extends the existing $x_device_info_t enum (extend: true) with one
# experimental enumerator, CLUSTER_LAUNCH_EXP, at the fixed value 0x1111.
# Per its description, the query yields a $x_bool_t reporting whether enqueue
# cluster launch is supported.
type: enum
extend: true
typed_etors: true
desc: "Extension enums to $x_device_info_t to support arch specific launch properties."
name: $x_device_info_t
etors:
- name: CLUSTER_LAUNCH_EXP
value: "0x1111"
desc: "[$x_bool_t] return true if enqueue Cluster Launch is supported"

5 changes: 4 additions & 1 deletion scripts/core/memory.yml
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,7 @@ details:
- "Use interoperability platform extensions to convert native handle to native type."
- "The application may call this function from simultaneous threads for the same context."
- "The implementation of this function should be thread-safe."
- "The implementation may require a valid device handle to return the native mem handle"
params:
- type: $x_mem_handle_t
name: hMem
Expand All @@ -444,14 +445,16 @@ params:
- type: $x_device_handle_t
name: hDevice
desc: |
[in] handle of the device that the native handle will be resident on.
[in][optional] handle of the device that the native handle will be resident on.
- type: $x_native_handle_t*
name: phNativeMem
desc: |
[out] a pointer to the native handle of the mem.
returns:
- $X_RESULT_ERROR_UNSUPPORTED_FEATURE:
- "If the adapter has no underlying equivalent handle."
- $X_RESULT_ERROR_INVALID_NULL_HANDLE:
- "If `hDevice == NULL` and the implementation requires a valid device."
--- #--------------------------------------------------------------------------
type: struct
desc: "Native memory object creation properties"
Expand Down
5 changes: 5 additions & 0 deletions source/adapters/cuda/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1089,6 +1089,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP:
case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP:
return ReturnValue(true);
case UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP: {
int Value = getAttribute(hDevice,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 9;
return ReturnValue(static_cast<bool>(Value));
}

default:
break;
Expand Down
61 changes: 41 additions & 20 deletions source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
}

std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);

// Early exit for zero size kernel
if (*pGlobalWorkSize == 0) {
return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
phEventWaitList, phEvent);
}

// Set the number of threads per block to the number of threads per warp
// by default unless user has provided a better number
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};

uint32_t LocalSize = hKernel->getLocalSize();
CUfunction CuFunc = hKernel->get();

for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
switch (launchPropList[i].id) {
case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
Expand All @@ -540,12 +555,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(

launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
// Note that cuda orders from right to left wrt SYCL dimensional order.
launch_attribute[i].value.clusterDim.x =
launchPropList[i].value.clusterDim[2];
launch_attribute[i].value.clusterDim.y =
launchPropList[i].value.clusterDim[1];
launch_attribute[i].value.clusterDim.z =
launchPropList[i].value.clusterDim[0];
if (workDim == 3) {
launch_attribute[i].value.clusterDim.x =
launchPropList[i].value.clusterDim[2];
launch_attribute[i].value.clusterDim.y =
launchPropList[i].value.clusterDim[1];
launch_attribute[i].value.clusterDim.z =
launchPropList[i].value.clusterDim[0];
} else if (workDim == 2) {
launch_attribute[i].value.clusterDim.x =
launchPropList[i].value.clusterDim[1];
launch_attribute[i].value.clusterDim.y =
launchPropList[i].value.clusterDim[0];
launch_attribute[i].value.clusterDim.z =
launchPropList[i].value.clusterDim[2];
} else {
launch_attribute[i].value.clusterDim.x =
launchPropList[i].value.clusterDim[0];
launch_attribute[i].value.clusterDim.y =
launchPropList[i].value.clusterDim[1];
launch_attribute[i].value.clusterDim.z =
launchPropList[i].value.clusterDim[2];
}

UR_CHECK_ERROR(cuFuncSetAttribute(
CuFunc, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));

break;
}
case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
Expand All @@ -560,20 +595,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
}
}

// Early exit for zero size kernel
if (*pGlobalWorkSize == 0) {
return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
phEventWaitList, phEvent);
}

// Set the number of threads per block to the number of threads per warp
// by default unless user has provided a better number
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};

uint32_t LocalSize = hKernel->getLocalSize();
CUfunction CuFunc = hKernel->get();

// This might return UR_RESULT_ERROR_ADAPTER_SPECIFIC, which cannot be handled
// using the standard UR_CHECK_ERROR
if (ur_result_t Ret =
Expand Down
4 changes: 3 additions & 1 deletion source/adapters/cuda/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t,
UR_APIEXPORT ur_result_t UR_APICALL
urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) {
try {
ScopedContext Active(phEventWaitList[0]->getQueue()->getDevice());
// Interop events don't have an associated queue, so get device through
// context
ScopedContext Active(phEventWaitList[0]->getContext()->getDevices()[0]);

auto WaitFunc = [](ur_event_handle_t Event) -> ur_result_t {
UR_ASSERT(Event, UR_RESULT_ERROR_INVALID_EVENT);
Expand Down
4 changes: 2 additions & 2 deletions source/adapters/cuda/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
size_t GroupSize[3] = {0, 0, 0};
const auto &ReqdWGSizeMDMap =
hKernel->get_program()->KernelReqdWorkGroupSizeMD;
hKernel->getProgram()->KernelReqdWorkGroupSizeMD;
const auto ReqdWGSizeMD = ReqdWGSizeMDMap.find(hKernel->getName());
if (ReqdWGSizeMD != ReqdWGSizeMDMap.end()) {
const auto ReqdWGSize = ReqdWGSizeMD->second;
Expand Down Expand Up @@ -222,7 +222,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel,
case UR_KERNEL_INFO_CONTEXT:
return ReturnValue(hKernel->getContext());
case UR_KERNEL_INFO_PROGRAM:
return ReturnValue(hKernel->get_program());
return ReturnValue(hKernel->getProgram());
case UR_KERNEL_INFO_ATTRIBUTES:
return ReturnValue("");
case UR_KERNEL_INFO_NUM_REGS: {
Expand Down
3 changes: 1 addition & 2 deletions source/adapters/cuda/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,15 +178,14 @@ struct ur_kernel_handle_t_ {
urContextRelease(Context);
}

ur_program_handle_t get_program() const noexcept { return Program; }

uint32_t incrementReferenceCount() noexcept { return ++RefCount; }

uint32_t decrementReferenceCount() noexcept { return --RefCount; }

uint32_t getReferenceCount() const noexcept { return RefCount; }

native_type get() const noexcept { return Function; };

ur_program_handle_t getProgram() const noexcept { return Program; };

native_type get_with_offset_parameter() const noexcept {
Expand Down
1 change: 1 addition & 0 deletions source/adapters/cuda/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) {
UR_APIEXPORT ur_result_t UR_APICALL
urMemGetNativeHandle(ur_mem_handle_t hMem, ur_device_handle_t Device,
ur_native_handle_t *phNativeMem) {
UR_ASSERT(Device != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
try {
*phNativeMem = reinterpret_cast<ur_native_handle_t>(
std::get<BufferMem>(hMem->Mem).getPtr(Device));
Expand Down
28 changes: 22 additions & 6 deletions source/adapters/hip/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1424,8 +1424,6 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
#endif

ur_result_t Result = UR_RESULT_SUCCESS;

try {
ScopedContext Active(Device);
std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};
Expand Down Expand Up @@ -1480,6 +1478,20 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
// UR_USM_MEM_ADVICE_SET/MEM_ADVICE_CLEAR_READ_MOSTLY.
}

// hipMemAdvise only supports managed memory allocated via
// hipMallocManaged. We can't support this API with any other type of
// pointer. We should ignore them and return UR_RESULT_SUCCESS, but instead
// we report a warning.
// FIXME: Fix this up when there's a better warning mechanism.
if (auto ptrAttribs = getPointerAttributes(pMem);
!ptrAttribs || !ptrAttribs->isManaged) {
releaseEvent();
setErrorMessage("mem_advise is ignored as the pointer argument is not "
"a shared USM pointer",
UR_RESULT_SUCCESS);
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}

const auto DeviceID = Device->get();
if (advice & UR_USM_ADVICE_FLAG_DEFAULT) {
UR_CHECK_ERROR(
Expand All @@ -1493,7 +1505,11 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
hipMemAdvise(pMem, size, hipMemAdviseUnsetCoarseGrain, DeviceID));
#endif
} else {
Result = setHipMemAdvise(HIPDevicePtr, size, advice, DeviceID);
ur_result_t Result =
setHipMemAdvise(HIPDevicePtr, size, advice, DeviceID);
assert((Result == UR_RESULT_SUCCESS ||
Result == UR_RESULT_ERROR_INVALID_ENUMERATION) &&
"Unexpected return code");
// UR_RESULT_ERROR_INVALID_ENUMERATION is returned for valid but currently
// unmapped advice arguments that are not supported by this
// platform. Therefore, warn the user instead of throwing and aborting
Expand All @@ -1509,12 +1525,12 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,

releaseEvent();
} catch (ur_result_t err) {
Result = err;
return err;
} catch (...) {
Result = UR_RESULT_ERROR_UNKNOWN;
return UR_RESULT_ERROR_UNKNOWN;
}

return Result;
return UR_RESULT_SUCCESS;
}

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill2D(
Expand Down
1 change: 1 addition & 0 deletions source/adapters/hip/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory,
UR_APIEXPORT ur_result_t UR_APICALL
urMemGetNativeHandle(ur_mem_handle_t hMem, ur_device_handle_t Device,
ur_native_handle_t *phNativeMem) {
UR_ASSERT(Device != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
#if defined(__HIP_PLATFORM_NVIDIA__)
if (sizeof(BufferMem::native_type) > sizeof(ur_native_handle_t)) {
// Check that all the upper bits that cannot be represented by
Expand Down
4 changes: 2 additions & 2 deletions source/adapters/level_zero/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -885,7 +885,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp(
UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
CommandBuffer->Device));

bool PreferCopyEngine = (SrcBuffer->OnHost || SrcBuffer->OnHost);
bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost);

PreferCopyEngine |= UseCopyEngineForD2DCopy;

Expand Down Expand Up @@ -917,7 +917,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp(
UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
CommandBuffer->Device));

bool PreferCopyEngine = (SrcBuffer->OnHost || SrcBuffer->OnHost);
bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost);

PreferCopyEngine |= UseCopyEngineForD2DCopy;

Expand Down
5 changes: 3 additions & 2 deletions source/adapters/level_zero/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1022,6 +1022,7 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {

// Save pointer to the queue before deleting/resetting event.
auto Queue = Legacy(Event->UrQueue);
auto URQueue = Event->UrQueue;

// If the event was a timestamp recording, we try to evict its entry in the
// queue.
Expand Down Expand Up @@ -1053,8 +1054,8 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
// created so that we can prevent ur_queue_handle_t from being released before
// the associated ur_event_handle_t is released. Here we have to decrement it
// so ur_queue_handle_t can be released successfully.
if (Event->UrQueue) {
UR_CALL(urQueueReleaseInternal(Event->UrQueue));
if (URQueue) {
UR_CALL(urQueueReleaseInternal(URQueue));
}

return UR_RESULT_SUCCESS;
Expand Down
3 changes: 2 additions & 1 deletion source/adapters/null/ur_nullddi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -937,7 +937,8 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferPartition(
__urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle(
ur_mem_handle_t hMem, ///< [in] handle of the mem.
ur_device_handle_t
hDevice, ///< [in] handle of the device that the native handle will be resident on.
hDevice, ///< [in][optional] handle of the device that the native handle will be
///< resident on.
ur_native_handle_t
*phNativeMem ///< [out] a pointer to the native handle of the mem.
) try {
Expand Down
3 changes: 2 additions & 1 deletion source/loader/layers/tracing/ur_trcddi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1209,7 +1209,8 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferPartition(
__urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle(
ur_mem_handle_t hMem, ///< [in] handle of the mem.
ur_device_handle_t
hDevice, ///< [in] handle of the device that the native handle will be resident on.
hDevice, ///< [in][optional] handle of the device that the native handle will be
///< resident on.
ur_native_handle_t
*phNativeMem ///< [out] a pointer to the native handle of the mem.
) {
Expand Down
7 changes: 2 additions & 5 deletions source/loader/layers/validation/ur_valddi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1290,7 +1290,8 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferPartition(
__urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle(
ur_mem_handle_t hMem, ///< [in] handle of the mem.
ur_device_handle_t
hDevice, ///< [in] handle of the device that the native handle will be resident on.
hDevice, ///< [in][optional] handle of the device that the native handle will be
///< resident on.
ur_native_handle_t
*phNativeMem ///< [out] a pointer to the native handle of the mem.
) {
Expand All @@ -1305,10 +1306,6 @@ __urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle(
return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
}

if (NULL == hDevice) {
return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
}

if (NULL == phNativeMem) {
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
}
Expand Down
Loading