Skip to content

Commit ef4fa95

Browse files
committed
[UR][Offload] Queue flag and out-of-order queue support
Out-of-order support is similar to CUDA and HIP: there is a pool of queues, and incoming tasks are allocated to them round-robin. For blocking copies, the memcpy helper function now creates and destroys a temporary ol queue, since such queues are cheap to create and we don't need it to hang around. Finally, flags are now parsed by `urQueueCreate` and can be queried with `urQueueGetInfo`; they determine whether the queue is in-order or out-of-order.
1 parent 254482c commit ef4fa95

File tree

4 files changed

+91
-21
lines changed

4 files changed

+91
-21
lines changed

unified-runtime/source/adapters/offload/device.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
101101
case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT:
102102
return ReturnValue(uint32_t{0});
103103
case UR_DEVICE_INFO_QUEUE_PROPERTIES:
104+
case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES:
105+
case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES:
106+
return ReturnValue(
107+
ur_queue_flags_t{UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE});
104108
case UR_DEVICE_INFO_KERNEL_LAUNCH_CAPABILITIES:
105109
return ReturnValue(0);
106110
case UR_DEVICE_INFO_SUPPORTED_PARTITIONS: {

unified-runtime/source/adapters/offload/enqueue.cpp

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
6868
LaunchArgs.DynSharedMemory = 0;
6969

7070
ol_event_handle_t EventOut;
71+
ol_queue_handle_t Queue;
72+
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
7173
OL_RETURN_ON_ERR(
72-
olLaunchKernel(hQueue->OffloadQueue, hQueue->OffloadDevice,
73-
hKernel->OffloadKernel, hKernel->Args.getStorage(),
74-
hKernel->Args.getStorageSize(), &LaunchArgs, &EventOut));
74+
olLaunchKernel(Queue, hQueue->OffloadDevice, hKernel->OffloadKernel,
75+
hKernel->Args.getStorage(), hKernel->Args.getStorageSize(),
76+
&LaunchArgs, &EventOut));
7577

7678
if (phEvent) {
7779
auto *Event = new ur_event_handle_t_();
@@ -107,17 +109,30 @@ ur_result_t doMemcpy(ur_queue_handle_t hQueue, void *DestPtr,
107109

108110
ol_event_handle_t EventOut = nullptr;
109111

110-
OL_RETURN_ON_ERR(olMemcpy(hQueue->OffloadQueue, DestPtr, DestDevice, SrcPtr,
111-
SrcDevice, size, phEvent ? &EventOut : nullptr));
112-
112+
ol_queue_handle_t Queue;
113113
if (blocking) {
114-
OL_RETURN_ON_ERR(olWaitQueue(hQueue->OffloadQueue));
114+
// If we are using a blocking operation, create a temporary queue that lives
115+
// only for this function
116+
OL_RETURN_ON_ERR(olCreateQueue(hQueue->OffloadDevice, &Queue));
117+
} else {
118+
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
115119
}
120+
OL_RETURN_ON_ERR(olMemcpy(Queue, DestPtr, DestDevice, SrcPtr, SrcDevice, size,
121+
(phEvent || blocking) ? &EventOut : nullptr));
116122

117-
if (phEvent) {
118-
auto *Event = new ur_event_handle_t_();
119-
Event->OffloadEvent = EventOut;
120-
*phEvent = Event;
123+
if (blocking) {
124+
OL_RETURN_ON_ERR(olWaitQueue(Queue));
125+
OL_RETURN_ON_ERR(olDestroyQueue(Queue));
126+
127+
if (phEvent) {
128+
*phEvent = ur_event_handle_t_::createEmptyEvent();
129+
}
130+
} else {
131+
if (phEvent) {
132+
auto *Event = new ur_event_handle_t_();
133+
Event->OffloadEvent = EventOut;
134+
*phEvent = Event;
135+
}
121136
}
122137

123138
return UR_RESULT_SUCCESS;

unified-runtime/source/adapters/offload/queue.cpp

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,28 @@
1919

2020
UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(
2121
[[maybe_unused]] ur_context_handle_t hContext, ur_device_handle_t hDevice,
22-
const ur_queue_properties_t *, ur_queue_handle_t *phQueue) {
22+
const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) {
2323

2424
assert(hContext->Device == hDevice);
2525

26-
ur_queue_handle_t Queue = new ur_queue_handle_t_();
27-
auto Res = olCreateQueue(hDevice->OffloadDevice, &Queue->OffloadQueue);
28-
if (Res != OL_SUCCESS) {
29-
delete Queue;
30-
return offloadResultToUR(Res);
26+
ur_queue_flags_t URFlags = 0;
27+
if (pProps && pProps->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) {
28+
URFlags = pProps->flags;
3129
}
3230

33-
Queue->OffloadDevice = hDevice->OffloadDevice;
31+
ur_queue_handle_t Queue =
32+
new ur_queue_handle_t_(hDevice->OffloadDevice, URFlags);
33+
[[maybe_unused]] ol_queue_handle_t InitQueue;
34+
35+
// For in-order queues, create the ol queue on construction so we can report
36+
// any errors earlier
37+
if (!(URFlags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)) {
38+
auto Res = Queue->nextQueue(InitQueue);
39+
if (Res != OL_SUCCESS) {
40+
delete Queue;
41+
return offloadResultToUR(Res);
42+
}
43+
}
3444

3545
*phQueue = Queue;
3646

@@ -45,6 +55,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue,
4555
UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
4656

4757
switch (propName) {
58+
case UR_QUEUE_INFO_FLAGS:
59+
return ReturnValue(hQueue->Flags);
4860
case UR_QUEUE_INFO_REFERENCE_COUNT:
4961
return ReturnValue(hQueue->RefCount.load());
5062
default:
@@ -61,15 +73,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) {
6173

6274
// Drops one reference to hQueue. When the count reaches zero, every ol queue
// that was created for this UR queue is destroyed and the handle is freed.
//
// Returns UR_RESULT_SUCCESS, or the UR translation of the first
// olDestroyQueue failure. The handle is freed even when a destroy fails so
// that an error does not leak the handle and the remaining ol queues.
UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) {
  if (--hQueue->RefCount != 0) {
    return UR_RESULT_SUCCESS;
  }

  ur_result_t Result = UR_RESULT_SUCCESS;
  for (auto *Q : hQueue->OffloadQueues) {
    // Slots are filled lazily, and a failed creation can leave an empty slot
    // in front of live ones, so skip empty slots instead of stopping at the
    // first one.
    if (!Q) {
      continue;
    }
    if (auto Res = olDestroyQueue(Q)) {
      // Keep tearing down the remaining queues; report only the first error.
      if (Result == UR_RESULT_SUCCESS) {
        Result = offloadResultToUR(Res);
      }
    }
  }

  delete hQueue;
  return Result;
}
7087

7188
// Waits for all work submitted through hQueue by calling olWaitQueue on every
// ol queue that has been created for it.
//
// Returns UR_RESULT_SUCCESS, or the error from the first olWaitQueue call
// that fails (remaining queues are then not waited on).
UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) {
  for (auto *Q : hQueue->OffloadQueues) {
    // Slots are filled lazily, and a failed creation can leave an empty slot
    // in front of live ones, so skip empty slots instead of stopping at the
    // first one.
    if (!Q) {
      continue;
    }
    OL_RETURN_ON_ERR(olWaitQueue(Q));
  }
  return UR_RESULT_SUCCESS;
}
7497

7598
UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle(

unified-runtime/source/adapters/offload/queue.hpp

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,35 @@
1515

1616
#include "common.hpp"
1717

18+
// Number of slots in the pool of ol queues that ur_queue_handle_t_ can be
// constructed with.
constexpr size_t OOO_QUEUE_POOL_SIZE = 32;
19+
1820
struct ur_queue_handle_t_ : RefCounted {
19-
ol_queue_handle_t OffloadQueue;
21+
ur_queue_handle_t_(ol_device_handle_t Device, ur_queue_flags_t Flags)
22+
: OffloadQueues((Flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)
23+
? 1
24+
: OOO_QUEUE_POOL_SIZE),
25+
QueueOffset(0), OffloadDevice(Device), Flags(Flags) {}
26+
27+
// In-order queues only have one element here, while out of order queues have
28+
// a bank of queues to use. We rotate through them round robin instead of
29+
// constantly creating new ones in case there is a long-running program that
30+
// never destroys the ur queue. Out-of-order queues create ol queues when
31+
// needed; any queues that are not yet created are nullptr.
32+
std::vector<ol_queue_handle_t> OffloadQueues;
33+
size_t QueueOffset;
2034
ol_device_handle_t OffloadDevice;
35+
ur_queue_flags_t Flags;
36+
37+
ol_result_t nextQueue(ol_queue_handle_t &Handle) {
38+
auto &Slot = OffloadQueues[QueueOffset++];
39+
40+
if (!Slot) {
41+
if (auto Res = olCreateQueue(OffloadDevice, &Slot)) {
42+
return Res;
43+
}
44+
}
45+
46+
Handle = Slot;
47+
return nullptr;
48+
}
2149
};

0 commit comments

Comments
 (0)