From 2c7b6713c9a87b154b688a3e9ee2a9136aea5920 Mon Sep 17 00:00:00 2001 From: Kevin B Smith Date: Tue, 6 Oct 2020 17:07:27 -0700 Subject: [PATCH 1/6] [SYCl][PI][L0] Add support for batching multiple commands into a command list prior to executing that command list. --- sycl/doc/EnvironmentVariables.md | 1 + sycl/plugins/level_zero/pi_level_zero.cpp | 122 ++++++++++++++++++++-- sycl/plugins/level_zero/pi_level_zero.hpp | 39 ++++++- 3 files changed, 149 insertions(+), 13 deletions(-) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 2bbf87371e7f3..7236678536d7b 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -28,6 +28,7 @@ subject to change. Do not rely on these variables in production code. | SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images | | SYCL_PI_LEVEL0_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. | | SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) | +| SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Positive integer | Sets a preferred number of commands to batch into a command list before executing the command list. Values 0 and 1 turn off batching. Default is 4. 
| `(*) Note: Any means this environment variable is effective when set to any non-null value.` diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index d0cd5c3b91e34..2341aeca55f72 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -435,12 +435,45 @@ _pi_queue::resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList, return PI_SUCCESS; } +static const pi_uint32 ZeCommandListBatchSize = [] { + // Default value of 4. This has been seen as a good tradeoff between + // lower overhead of number of enqueue and fence calls, and getting + // commands seen as soon as possible (i.e. lazy vs eager submission). + pi_uint32 BatchSizeVal = 4; + const auto BatchSizeStr = std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE"); + if (BatchSizeStr) { + pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr); + // Level Zero may only support a limited number of commands per command + // list. The actual upper limit is not specified by the Level Zero + // Specification. For now we allow an arbitrary upper limit. + // Negative numbers will be silently ignored. + if (BatchSizeStrVal >= 0) + BatchSizeVal = BatchSizeStrVal; + } + return BatchSizeVal; +}(); + // Retrieve an available command list to be used in a PI call // Caller must hold a lock on the Queue passed in. -pi_result -_pi_device::getAvailableCommandList(pi_queue Queue, - ze_command_list_handle_t *ZeCommandList, - ze_fence_handle_t *ZeFence) { +pi_result _pi_device::getAvailableCommandList( + pi_queue Queue, ze_command_list_handle_t *ZeCommandList, + ze_fence_handle_t *ZeFence, bool AllowBatching) { + // First see if there is a command list open for batching commands + // for this queue. 
+ if (Queue->ZeOpenCommandList) { + if (AllowBatching) { + *ZeCommandList = Queue->ZeOpenCommandList; + *ZeFence = Queue->ZeOpenCommandListFence; + return PI_SUCCESS; + } + + // If this command isn't allowed to be batched, then we need to + // go ahead and execute what is already in the batched list, + // and then go on to process this. On exit from executeOpenCommandList + // ZeOpenCommandList will be nullptr. + Queue->executeOpenCommandList(); + } + // Create/Reuse the command list, because in Level Zero commands are added to // the command lists, and later are then added to the command queue. // Each command list is paired with an associated fence to track when the @@ -525,6 +558,57 @@ pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList, return PI_SUCCESS; } +bool _pi_queue::isBatchingAllowed() { + return (this->QueueBatchSize > 1 && ((ZeSerialize & ZeSerializeBlock) == 0)); +} + +pi_result _pi_queue::batchCommandList(ze_command_list_handle_t ZeCommandList, + ze_fence_handle_t ZeFence) { + if (this->isBatchingAllowed()) { + assert(this->ZeOpenCommandList == nullptr || + this->ZeOpenCommandList == ZeCommandList); + + if (this->ZeOpenCommandListSize + 1 < QueueBatchSize) { + this->ZeOpenCommandList = ZeCommandList; + this->ZeOpenCommandListFence = ZeFence; + + // NOTE: we don't know here how many commands are in the ZeCommandList + // but most PI interfaces translate to a single Level-Zero command. + // Some do translate to multiple commands so we may be undercounting + // a bit here, but this is a heuristic, not an exact measure. 
+ // + this->ZeOpenCommandListSize += 1; + + return PI_SUCCESS; + } + + this->ZeOpenCommandList = nullptr; + this->ZeOpenCommandListFence = nullptr; + this->ZeOpenCommandListSize = 0; + } + + return executeCommandList(ZeCommandList, ZeFence); +} + +pi_result _pi_queue::executeOpenCommandList() { + if (this->RefCount > 0) { + // If there are any commands still in the open command list for this + // queue, then close and execute that command list now. + auto OpenList = this->ZeOpenCommandList; + if (OpenList) { + auto OpenListFence = this->ZeOpenCommandListFence; + + this->ZeOpenCommandList = nullptr; + this->ZeOpenCommandListFence = nullptr; + this->ZeOpenCommandListSize = 0; + + return executeCommandList(OpenList, OpenListFence); + } + } + + return PI_SUCCESS; +} + ze_event_handle_t *_pi_event::createZeEventList(pi_uint32 EventListLength, const pi_event *EventList) { try { @@ -1650,7 +1734,8 @@ pi_result piQueueCreate(pi_context Context, pi_device Device, assert(Queue); try { - *Queue = new _pi_queue(ZeCommandQueue, Context, Device); + *Queue = + new _pi_queue(ZeCommandQueue, Context, Device, ZeCommandListBatchSize); } catch (const std::bad_alloc &) { return PI_OUT_OF_HOST_MEMORY; } catch (...) { @@ -1706,6 +1791,12 @@ pi_result piQueueRelease(pi_queue Queue) { std::lock_guard lock(Queue->PiQueueMutex); if (--(Queue->RefCount) == 0) { + // There should be no open command lists. Those should have been closed + // and executed by piQueueFinish or earlier. + assert(Queue->ZeOpenCommandList == nullptr && + Queue->ZeOpenCommandListFence == nullptr && + Queue->ZeOpenCommandListSize == 0); + // Destroy all the fences created associated with this queue. for (const auto &MapEntry : Queue->ZeCommandListFenceMap) { ZE_CALL(zeFenceDestroy(MapEntry.second)); @@ -1724,6 +1815,9 @@ pi_result piQueueFinish(pi_queue Queue) { // Lock automatically releases when this goes out of scope. std::lock_guard lock(Queue->PiQueueMutex); + // execute any command list that may still be open. 
+ Queue->executeOpenCommandList(); + ZE_CALL(zeCommandQueueSynchronize(Queue->ZeCommandQueue, UINT32_MAX)); return PI_SUCCESS; } @@ -1754,7 +1848,7 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, // Attach the queue to the "0" device. // TODO: see if we need to let user choose the device. pi_device Device = Context->Devices[0]; - *Queue = new _pi_queue(ZeQueue, Context, Device); + *Queue = new _pi_queue(ZeQueue, Context, Device, ZeCommandListBatchSize); return PI_SUCCESS; } @@ -3022,7 +3116,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, ze_command_list_handle_t ZeCommandList = nullptr; ze_fence_handle_t ZeFence = nullptr; if (auto Res = Queue->Device->getAvailableCommandList(Queue, &ZeCommandList, - &ZeFence)) + &ZeFence, true)) return Res; ze_event_handle_t ZeEvent = nullptr; @@ -3059,7 +3153,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, // Execute command list asynchronously, as the event will be used // to track down its completion. - if (auto Res = Queue->executeCommandList(ZeCommandList, ZeFence)) + if (auto Res = Queue->batchCommandList(ZeCommandList, ZeFence)) return Res; _pi_event::deleteZeEventList(ZeEventWaitList); @@ -3194,6 +3288,18 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { return PI_INVALID_EVENT; } + // Submit dependent open command lists for execution, if any + for (uint32_t I = 0; I < NumEvents; I++) { + auto Queue = EventList[I]->Queue; + + // Lock automatically releases when this goes out of scope. 
+ std::lock_guard lock(Queue->PiQueueMutex); + + if (Queue->RefCount > 0) { + Queue->executeOpenCommandList(); + } + } + for (uint32_t I = 0; I < NumEvents; I++) { ze_event_handle_t ZeEvent = EventList[I]->ZeEvent; zePrint("ZeEvent = %lx\n", pi_cast(ZeEvent)); diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 71e34a5b2a834..3577953fa0b44 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -177,9 +177,13 @@ struct _pi_device : _pi_object { // caller must pass a command queue to create a new fence for the new command // list if a command list/fence pair is not available. All Command Lists & // associated fences are destroyed at Device Release. + // If AllowBatching is true, then the command list returned may already have + // commands in it; if AllowBatching is false, any open command lists that + // already exist in Queue will be closed and executed. pi_result getAvailableCommandList(pi_queue Queue, ze_command_list_handle_t *ZeCommandList, - ze_fence_handle_t *ZeFence); + ze_fence_handle_t *ZeFence, + bool AllowBatching = false); // Cache of the immutable device properties. ze_device_properties_t ZeDeviceProperties; @@ -268,8 +272,9 @@ struct _pi_context : _pi_object { struct _pi_queue : _pi_object { _pi_queue(ze_command_queue_handle_t Queue, pi_context Context, - pi_device Device) - : ZeCommandQueue{Queue}, Context{Context}, Device{Device} {} + pi_device Device, pi_uint32 QueueBatchSize) + : ZeCommandQueue{Queue}, Context{Context}, Device{Device}, + QueueBatchSize{QueueBatchSize} {} // Level Zero command queue handle. ze_command_queue_handle_t ZeCommandQueue; @@ -291,10 +296,23 @@ struct _pi_queue : _pi_object { // needed/used for the queue data structures. std::mutex PiQueueMutex; + // Open command list field for batching commands into this queue. 
+ ze_command_list_handle_t ZeOpenCommandList = {nullptr}; + ze_fence_handle_t ZeOpenCommandListFence = {nullptr}; + pi_uint32 ZeOpenCommandListSize = {0}; + + // Approximate number of commands that are allowed to be batched for + // this queue. + pi_uint32 QueueBatchSize = {0}; + // Map of all Command lists created with their associated Fence used for // tracking when the command list is available for use again. std::map ZeCommandListFenceMap; + // Returns true if any commands for this queue are allowed to + // be batched together. + bool isBatchingAllowed(); + // Resets the Command List and Associated fence in the ZeCommandListFenceMap. // If the reset command list should be made available, then MakeAvailable // needs to be set to true. The caller must verify that this command list and @@ -302,14 +320,25 @@ struct _pi_queue : _pi_object { pi_result resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList, bool MakeAvailable); + // Attach a command list to this queue and allow it to remain open + // and used for further batching. It may be executed immediately, + // or it may be left open for other future commands to be batched into. + pi_result batchCommandList(ze_command_list_handle_t ZeCommandList, + ze_fence_handle_t ZeFence); + // Attach a command list to this queue, close, and execute it. // Note that this command list cannot be appended to after this. - // The "is_blocking" tells if the wait for completion is requested. + // The "IsBlocking" tells if the wait for completion is requested. // The "ZeFence" passed is used to track when the command list passed // has completed execution on the device and can be reused. pi_result executeCommandList(ze_command_list_handle_t ZeCommandList, ze_fence_handle_t ZeFence, - bool is_blocking = false); + bool IsBlocking = false); + + // If there is an open command list associated with this queue, + // close it, execute it, and reset ZeOpenCommandList, ZeOpenCommandListFence, + // and ZeOpenCommandListSize. 
+ pi_result executeOpenCommandList(); }; struct _pi_mem : _pi_object { From 579c83e9ba7d371c84dfbd8e04e52db1baa20fc3 Mon Sep 17 00:00:00 2001 From: Kevin B Smith Date: Tue, 6 Oct 2020 23:20:12 -0700 Subject: [PATCH 2/6] Fix for overly aggressive assertion. Needed to executeOpenCommandList in this case. --- sycl/plugins/level_zero/pi_level_zero.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 2341aeca55f72..9760ddfbd1f22 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1791,12 +1791,11 @@ pi_result piQueueRelease(pi_queue Queue) { std::lock_guard lock(Queue->PiQueueMutex); if (--(Queue->RefCount) == 0) { - // There should be no open command lists. Those should have been closed - // and executed by piQueueFinish or earlier. - assert(Queue->ZeOpenCommandList == nullptr && - Queue->ZeOpenCommandListFence == nullptr && - Queue->ZeOpenCommandListSize == 0); - + // It is possible to get to here and still have an open command list + // if no wait or finish ever occurred for this queue. But still need + // to make sure commands get executed. + Queue->executeOpenCommandList(); + // Destroy all the fences created associated with this queue. for (const auto &MapEntry : Queue->ZeCommandListFenceMap) { ZE_CALL(zeFenceDestroy(MapEntry.second)); From f2b8767786af479f56ef4f104cf710f0ca5a1211 Mon Sep 17 00:00:00 2001 From: Kevin B Smith Date: Tue, 6 Oct 2020 23:22:22 -0700 Subject: [PATCH 3/6] Fix for extra white space. 
--- sycl/plugins/level_zero/pi_level_zero.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 9760ddfbd1f22..7f5c551bb3d02 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1795,7 +1795,7 @@ pi_result piQueueRelease(pi_queue Queue) { // if no wait or finish ever occurred for this queue. But still need // to make sure commands get executed. Queue->executeOpenCommandList(); - + // Destroy all the fences created associated with this queue. for (const auto &MapEntry : Queue->ZeCommandListFenceMap) { ZE_CALL(zeFenceDestroy(MapEntry.second)); From 1293456eb34f221a2c93f4f9ff24d5779a514d37 Mon Sep 17 00:00:00 2001 From: Kevin B Smith Date: Tue, 6 Oct 2020 23:30:29 -0700 Subject: [PATCH 4/6] Fix for extra white space. --- sycl/doc/EnvironmentVariables.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 7236678536d7b..dd668d30352b6 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -28,7 +28,7 @@ subject to change. Do not rely on these variables in production code. | SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images | | SYCL_PI_LEVEL0_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. | | SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) | -| SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Positive integer | Sets a preferred number of commands to batch into a command list before executing the command list. Values 0 and 1 turn off batching. 
Default is 4. | +| SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Positive integer | Sets a preferred number of commands to batch into a command list before executing the command list. Values 0 and 1 turn off batching. Default is 4. | `(*) Note: Any means this environment variable is effective when set to any non-null value.` From a7f0a656c3e1069133c84429fff5344c9666d536 Mon Sep 17 00:00:00 2001 From: Kevin B Smith Date: Wed, 7 Oct 2020 11:48:05 -0700 Subject: [PATCH 5/6] Added checking for return value from executeOpenCommandList. --- sycl/plugins/level_zero/pi_level_zero.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 7f5c551bb3d02..10c2bb541252a 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -471,7 +471,8 @@ pi_result _pi_device::getAvailableCommandList( // go ahead and execute what is already in the batched list, // and then go on to process this. On exit from executeOpenCommandList // ZeOpenCommandList will be nullptr. - Queue->executeOpenCommandList(); + if (auto Res = Queue->executeOpenCommandList()) + return Res; } // Create/Reuse the command list, because in Level Zero commands are added to @@ -1794,7 +1795,8 @@ pi_result piQueueRelease(pi_queue Queue) { // It is possible to get to here and still have an open command list // if no wait or finish ever occurred for this queue. But still need // to make sure commands get executed. - Queue->executeOpenCommandList(); + if (auto Res = Queue->executeOpenCommandList()) + return Res; // Destroy all the fences created associated with this queue. for (const auto &MapEntry : Queue->ZeCommandListFenceMap) { @@ -1815,7 +1817,8 @@ pi_result piQueueFinish(pi_queue Queue) { std::lock_guard lock(Queue->PiQueueMutex); // execute any command list that may still be open. 
- Queue->executeOpenCommandList(); + if (auto Res = Queue->executeOpenCommandList()) + return Res; ZE_CALL(zeCommandQueueSynchronize(Queue->ZeCommandQueue, UINT32_MAX)); return PI_SUCCESS; @@ -3295,7 +3298,8 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { std::lock_guard lock(Queue->PiQueueMutex); if (Queue->RefCount > 0) { - Queue->executeOpenCommandList(); + if (auto Res = Queue->executeOpenCommandList()) + return Res; } } From 077e040949c9748ca697de2d635f9d959355d58f Mon Sep 17 00:00:00 2001 From: Kevin B Smith Date: Wed, 7 Oct 2020 12:36:05 -0700 Subject: [PATCH 6/6] Added comment on QueueBatchSize, and removed unneeded check of RefCount in executeOpenCommandList as requested in code review. --- sycl/plugins/level_zero/pi_level_zero.cpp | 24 +++++++++++------------ sycl/plugins/level_zero/pi_level_zero.hpp | 4 ++++ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 10c2bb541252a..47ca5a2f1ccbe 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -592,19 +592,17 @@ pi_result _pi_queue::batchCommandList(ze_command_list_handle_t ZeCommandList, } pi_result _pi_queue::executeOpenCommandList() { - if (this->RefCount > 0) { - // If there are any commands still in the open command list for this - // queue, then close and execute that command list now. - auto OpenList = this->ZeOpenCommandList; - if (OpenList) { - auto OpenListFence = this->ZeOpenCommandListFence; - - this->ZeOpenCommandList = nullptr; - this->ZeOpenCommandListFence = nullptr; - this->ZeOpenCommandListSize = 0; - - return executeCommandList(OpenList, OpenListFence); - } + // If there are any commands still in the open command list for this + // queue, then close and execute that command list now. 
+ auto OpenList = this->ZeOpenCommandList; + if (OpenList) { + auto OpenListFence = this->ZeOpenCommandListFence; + + this->ZeOpenCommandList = nullptr; + this->ZeOpenCommandListFence = nullptr; + this->ZeOpenCommandListSize = 0; + + return executeCommandList(OpenList, OpenListFence); } return PI_SUCCESS; diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 3577953fa0b44..53036b4570df3 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -303,6 +303,10 @@ struct _pi_queue : _pi_object { // Approximate number of commands that are allowed to be batched for // this queue. + // Added this member to the queue rather than using a global variable + // so that future implementation could use heuristics to change this on + // a queue specific basis. And by putting it in the queue itself, this + // is thread safe because of the locking of the queue that occurs. pi_uint32 QueueBatchSize = {0}; // Map of all Command lists created with their associated Fence used for