diff --git a/include/ur_api.h b/include/ur_api.h index 076000308f..bdaee31486 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -9486,13 +9486,17 @@ urEnqueueCooperativeKernelLaunchExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hKernel` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pLocalWorkSize` /// + `NULL == pGroupCountRet` /// - ::UR_RESULT_ERROR_INVALID_KERNEL UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object - size_t localWorkSize, ///< [in] number of local work-items that will form a work-group when the - ///< kernel is launched + uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group + ///< work-items + const size_t *pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< number of local work-items forming a work-group that will execute the + ///< kernel function. size_t dynamicSharedMemorySize, ///< [in] size of dynamic shared memory, for each work-group, in bytes, ///< that will be used when the kernel is launched uint32_t *pGroupCountRet ///< [out] pointer to maximum number of groups @@ -11028,7 +11032,8 @@ typedef struct ur_kernel_set_specialization_constants_params_t { /// allowing the callback the ability to modify the parameter's value typedef struct ur_kernel_suggest_max_cooperative_group_count_exp_params_t { ur_kernel_handle_t *phKernel; - size_t *plocalWorkSize; + uint32_t *pworkDim; + const size_t **ppLocalWorkSize; size_t *pdynamicSharedMemorySize; uint32_t **ppGroupCountRet; } ur_kernel_suggest_max_cooperative_group_count_exp_params_t; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index d4b2595d7f..ce7dd137a9 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -651,7 +651,8 @@ typedef ur_result_t(UR_APICALL *ur_pfnGetKernelProcAddrTable_t)( /// @brief Function-pointer for urKernelSuggestMaxCooperativeGroupCountExp typedef 
ur_result_t(UR_APICALL *ur_pfnKernelSuggestMaxCooperativeGroupCountExp_t)( ur_kernel_handle_t, - size_t, + uint32_t, + const size_t *, size_t, uint32_t *); diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 3aa165fcd0..8284731dc1 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -13074,9 +13074,15 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct *(params->phKernel)); os << ", "; - os << ".localWorkSize = "; + os << ".workDim = "; + + os << *(params->pworkDim); + + os << ", "; + os << ".pLocalWorkSize = "; - os << *(params->plocalWorkSize); + ur::details::printPtr(os, + *(params->ppLocalWorkSize)); os << ", "; os << ".dynamicSharedMemorySize = "; diff --git a/scripts/core/exp-cooperative-kernels.yml b/scripts/core/exp-cooperative-kernels.yml index 941aba29fa..ad3ba0ffba 100644 --- a/scripts/core/exp-cooperative-kernels.yml +++ b/scripts/core/exp-cooperative-kernels.yml @@ -78,9 +78,13 @@ params: - type: $x_kernel_handle_t name: hKernel desc: "[in] handle of the kernel object" - - type: size_t - name: localWorkSize - desc: "[in] number of local work-items that will form a work-group when the kernel is launched" + - type: uint32_t + name: workDim + desc: "[in] number of dimensions, from 1 to 3, to specify the work-group work-items" + - type: "const size_t*" + name: pLocalWorkSize + desc: | + [in] pointer to an array of workDim unsigned values that specify the number of local work-items forming a work-group that will execute the kernel function. 
- type: size_t name: dynamicSharedMemorySize desc: "[in] size of dynamic shared memory, for each work-group, in bytes, that will be used when the kernel is launched" diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp index 5fb097c304..46c4907d4b 100644 --- a/source/adapters/cuda/kernel.cpp +++ b/source/adapters/cuda/kernel.cpp @@ -190,10 +190,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, size_t localWorkSize, + ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_KERNEL); + size_t localWorkSize = pLocalWorkSize[0]; + localWorkSize *= (workDim >= 2 ? pLocalWorkSize[1] : 1); + localWorkSize *= (workDim == 3 ? pLocalWorkSize[2] : 1); + // We need to set the active current device for this kernel explicitly here, // because the occupancy querying API does not take device parameter. 
ur_device_handle_t Device = hKernel->getProgram()->getDevice(); diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp index 60931cd014..176a2a495a 100644 --- a/source/adapters/hip/kernel.cpp +++ b/source/adapters/hip/kernel.cpp @@ -169,10 +169,11 @@ urKernelGetNativeHandle(ur_kernel_handle_t, ur_native_handle_t *) { } UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, size_t localWorkSize, + ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { std::ignore = hKernel; - std::ignore = localWorkSize; + std::ignore = workDim; + std::ignore = pLocalWorkSize; std::ignore = dynamicSharedMemorySize; std::ignore = pGroupCountRet; return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index 153d3861b1..b15b4ce147 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -1054,11 +1054,17 @@ ur_result_t urKernelGetNativeHandle( } ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, size_t localWorkSize, + ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { - (void)localWorkSize; (void)dynamicSharedMemorySize; std::shared_lock Guard(hKernel->Mutex); + + uint32_t WG[3]; + WG[0] = ur_cast(pLocalWorkSize[0]); + WG[1] = workDim >= 2 ? ur_cast(pLocalWorkSize[1]) : 1; + WG[2] = workDim == 3 ? 
ur_cast(pLocalWorkSize[2]) : 1; + ZE2UR_CALL(zeKernelSetGroupSize, (hKernel->ZeKernel, WG[0], WG[1], WG[2])); + uint32_t TotalGroupCount = 0; ZE2UR_CALL(zeKernelSuggestMaxCooperativeGroupCount, (hKernel->ZeKernel, &TotalGroupCount)); diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp index 6097d8ffd8..ed7e97fa3b 100644 --- a/source/adapters/level_zero/ur_interface_loader.hpp +++ b/source/adapters/level_zero/ur_interface_loader.hpp @@ -687,7 +687,7 @@ ur_result_t urEnqueueCooperativeKernelLaunchExp( const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, size_t localWorkSize, + ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet); ur_result_t urEnqueueTimestampRecordingExp( ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index e4a70df811..593115a99f 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -568,7 +568,7 @@ ur_result_t urCommandBufferCommandGetInfoExp( } ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, size_t localWorkSize, + ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { logger::error("{} function not implemented!", __FUNCTION__); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index 79e2b34b5f..baacaacb19 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -10003,9 +10003,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( 
/// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object - size_t - localWorkSize, ///< [in] number of local work-items that will form a work-group when the - ///< kernel is launched + uint32_t + workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group + ///< work-items + const size_t * + pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< number of local work-items forming a work-group that will execute the + ///< kernel function. size_t dynamicSharedMemorySize, ///< [in] size of dynamic shared memory, for each work-group, in bytes, ///< that will be used when the kernel is launched @@ -10014,7 +10018,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_result_t result = UR_RESULT_SUCCESS; ur_kernel_suggest_max_cooperative_group_count_exp_params_t params = { - &hKernel, &localWorkSize, &dynamicSharedMemorySize, &pGroupCountRet}; + &hKernel, &workDim, &pLocalWorkSize, &dynamicSharedMemorySize, + &pGroupCountRet}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( diff --git a/source/adapters/opencl/kernel.cpp b/source/adapters/opencl/kernel.cpp index f60c8a2715..df160b65eb 100644 --- a/source/adapters/opencl/kernel.cpp +++ b/source/adapters/opencl/kernel.cpp @@ -390,7 +390,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( [[maybe_unused]] ur_kernel_handle_t hKernel, - [[maybe_unused]] size_t localWorkSize, + [[maybe_unused]] uint32_t workDim, + [[maybe_unused]] const size_t *pLocalWorkSize, [[maybe_unused]] size_t dynamicSharedMemorySize, [[maybe_unused]] uint32_t *pGroupCountRet) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; diff --git 
a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 7a110099da..bd620a159f 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -8585,9 +8585,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object - size_t - localWorkSize, ///< [in] number of local work-items that will form a work-group when the - ///< kernel is launched + uint32_t + workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group + ///< work-items + const size_t * + pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< number of local work-items forming a work-group that will execute the + ///< kernel function. size_t dynamicSharedMemorySize, ///< [in] size of dynamic shared memory, for each work-group, in bytes, ///< that will be used when the kernel is launched @@ -8602,7 +8606,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( } ur_kernel_suggest_max_cooperative_group_count_exp_params_t params = { - &hKernel, &localWorkSize, &dynamicSharedMemorySize, &pGroupCountRet}; + &hKernel, &workDim, &pLocalWorkSize, &dynamicSharedMemorySize, + &pGroupCountRet}; uint64_t instance = getContext()->notify_begin( UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP, "urKernelSuggestMaxCooperativeGroupCountExp", &params); @@ -8611,7 +8616,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( logger.info(" ---> urKernelSuggestMaxCooperativeGroupCountExp\n"); ur_result_t result = pfnSuggestMaxCooperativeGroupCountExp( - hKernel, localWorkSize, dynamicSharedMemorySize, pGroupCountRet); + hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + 
pGroupCountRet); getContext()->notify_end( UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP, diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 59bdd94de6..e27c5ae3e1 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -9613,9 +9613,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object - size_t - localWorkSize, ///< [in] number of local work-items that will form a work-group when the - ///< kernel is launched + uint32_t + workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group + ///< work-items + const size_t * + pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< number of local work-items forming a work-group that will execute the + ///< kernel function. 
size_t dynamicSharedMemorySize, ///< [in] size of dynamic shared memory, for each work-group, in bytes, ///< that will be used when the kernel is launched @@ -9634,6 +9638,10 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } + if (NULL == pLocalWorkSize) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + if (NULL == pGroupCountRet) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } @@ -9645,7 +9653,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( } ur_result_t result = pfnSuggestMaxCooperativeGroupCountExp( - hKernel, localWorkSize, dynamicSharedMemorySize, pGroupCountRet); + hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + pGroupCountRet); return result; } diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index 8134313a50..c482fbdfcc 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -8760,9 +8760,13 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object - size_t - localWorkSize, ///< [in] number of local work-items that will form a work-group when the - ///< kernel is launched + uint32_t + workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group + ///< work-items + const size_t * + pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< number of local work-items forming a work-group that will execute the + ///< kernel function. 
size_t dynamicSharedMemorySize, ///< [in] size of dynamic shared memory, for each work-group, in bytes, ///< that will be used when the kernel is launched @@ -8785,7 +8789,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( // forward to device-platform result = pfnSuggestMaxCooperativeGroupCountExp( - hKernel, localWorkSize, dynamicSharedMemorySize, pGroupCountRet); + hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + pGroupCountRet); return result; } diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 7a1840c2b9..46d3ca8eac 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -8893,13 +8893,18 @@ ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hKernel` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pLocalWorkSize` /// + `NULL == pGroupCountRet` /// - ::UR_RESULT_ERROR_INVALID_KERNEL ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object - size_t - localWorkSize, ///< [in] number of local work-items that will form a work-group when the - ///< kernel is launched + uint32_t + workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group + ///< work-items + const size_t * + pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< number of local work-items forming a work-group that will execute the + ///< kernel function. 
size_t dynamicSharedMemorySize, ///< [in] size of dynamic shared memory, for each work-group, in bytes, ///< that will be used when the kernel is launched @@ -8913,7 +8918,8 @@ ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( } return pfnSuggestMaxCooperativeGroupCountExp( - hKernel, localWorkSize, dynamicSharedMemorySize, pGroupCountRet); + hKernel, workDim, pLocalWorkSize, dynamicSharedMemorySize, + pGroupCountRet); } catch (...) { return exceptionToResult(std::current_exception()); } diff --git a/source/ur_api.cpp b/source/ur_api.cpp index f342e4efa2..e18cc776a5 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -7543,13 +7543,18 @@ ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hKernel` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pLocalWorkSize` /// + `NULL == pGroupCountRet` /// - ::UR_RESULT_ERROR_INVALID_KERNEL ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object - size_t - localWorkSize, ///< [in] number of local work-items that will form a work-group when the - ///< kernel is launched + uint32_t + workDim, ///< [in] number of dimensions, from 1 to 3, to specify the work-group + ///< work-items + const size_t * + pLocalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< number of local work-items forming a work-group that will execute the + ///< kernel function. size_t dynamicSharedMemorySize, ///< [in] size of dynamic shared memory, for each work-group, in bytes, ///< that will be used when the kernel is launched