Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions sycl/cmake/modules/UnifiedRuntimeTag.cmake
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# commit 27398080349f1d8d21d6a8680e234d29dcd14734 (HEAD, origin/main, origin/HEAD)
# Merge: 572355db942d dc971af72a31
# commit 72e80a42cc8e5b11d43dd9d34b40d470e1476181 (HEAD, origin/main, origin/HEAD)
# Merge: 6e5d0e6b9a47 9c7e56cc765b
# Author: Martin Grant <[email protected]>
# Date: Thu Dec 5 14:57:07 2024 +0000
# Merge pull request #2293 from yingcong-wu/yc-PR/241107-misc-minor-fix
# [DeviceAsan] Serval bug fixes
set(UNIFIED_RUNTIME_TAG 27398080349f1d8d21d6a8680e234d29dcd14734)
# Date: Fri Dec 6 10:11:15 2024 +0000
# Merge pull request #2316 from 0x12CC/coop_kernel_query
# Change `urSuggestMaxCooperativeGroupCountExp` to accept ND size parameter
set(UNIFIED_RUNTIME_TAG 72e80a42cc8e5b11d43dd9d34b40d470e1476181)
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
// TODO: Revisit 'max_num_work_group_sync' and align it with the
// 'sycl_ext_oneapi_forward_progress' extension once #7598 is merged.
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t,)
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t,)
26 changes: 26 additions & 0 deletions sycl/include/sycl/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,32 @@ class __SYCL_EXPORT kernel : public detail::OwnerLessBase<kernel> {
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
ext_oneapi_get_info(queue Queue) const;

/// Query queue/launch-specific information from a kernel using the
/// info::kernel_queue_specific descriptor for a specific Queue and values.
/// max_num_work_groups is the only valid descriptor for this function.
///
/// \param Queue is a valid SYCL queue.
/// \param WorkGroupSize is the work-group size the number of work-groups is
/// requested for.
/// \return depends on information being queried.
template <typename Param>
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
ext_oneapi_get_info(queue Queue, const range<1> &WorkGroupSize,
size_t DynamicLocalMemorySize) const;

/// Query queue/launch-specific information from a kernel using the
/// info::kernel_queue_specific descriptor for a specific Queue and values.
/// max_num_work_groups is the only valid descriptor for this function.
///
/// \param Queue is a valid SYCL queue.
/// \param WorkGroupSize is the work-group size the number of work-groups is
/// requested for.
/// \return depends on information being queried.
template <typename Param>
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
ext_oneapi_get_info(queue Queue, const range<2> &WorkGroupSize,
size_t DynamicLocalMemorySize) const;

/// Query queue/launch-specific information from a kernel using the
/// info::kernel_queue_specific descriptor for a specific Queue and values.
/// max_num_work_groups is the only valid descriptor for this function.
Expand Down
32 changes: 0 additions & 32 deletions sycl/source/detail/kernel_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,38 +106,6 @@ void kernel_impl::checkIfValidForNumArgsInfoQuery() const {
"interoperability function or to query a device built-in kernel");
}

bool kernel_impl::exceedsOccupancyResourceLimits(
const device &Device, const range<3> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
// Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
// Generally, exceeding hardware resource limits will yield in an error when
// the kernel is launched.
const size_t MaxWorkGroupSize =
get_info<info::kernel_device_specific::work_group_size>(Device);
const size_t MaxLocalMemorySizeInBytes =
Device.get_info<info::device::local_mem_size>();

if (WorkGroupSize.size() > MaxWorkGroupSize)
return true;

if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
return true;

// It will be impossible to launch a kernel for Cuda when the hardware limit
// for the 32-bit registers page file size is exceeded.
if (Device.get_backend() == backend::ext_oneapi_cuda) {
const uint32_t RegsPerWorkItem =
get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
const uint32_t MaxRegsPerWorkGroup =
Device.get_info<ext::codeplay::experimental::info::device::
max_registers_per_work_group>();
if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
return true;
}

return false;
}

template <>
typename info::platform::version::return_type
kernel_impl::get_backend_info<info::platform::version>() const {
Expand Down
128 changes: 101 additions & 27 deletions sycl/source/detail/kernel_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,32 @@ class kernel_impl {
template <typename Param>
typename Param::return_type ext_oneapi_get_info(queue Queue) const;

/// Query queue/launch-specific information from a kernel using the
/// info::kernel_queue_specific descriptor for a specific Queue and values.
/// max_num_work_groups is the only valid descriptor for this function.
///
/// \param Queue is a valid SYCL queue.
/// \param WorkGroupSize is the work-group size the number of work-groups is
/// requested for.
/// \return depends on information being queried.
template <typename Param>
typename Param::return_type
ext_oneapi_get_info(queue Queue, const range<1> &MaxWorkGroupSize,
size_t DynamicLocalMemorySize) const;

/// Query queue/launch-specific information from a kernel using the
/// info::kernel_queue_specific descriptor for a specific Queue and values.
/// max_num_work_groups is the only valid descriptor for this function.
///
/// \param Queue is a valid SYCL queue.
/// \param WorkGroupSize is the work-group size the number of work-groups is
/// requested for.
/// \return depends on information being queried.
template <typename Param>
typename Param::return_type
ext_oneapi_get_info(queue Queue, const range<2> &MaxWorkGroupSize,
size_t DynamicLocalMemorySize) const;

/// Query queue/launch-specific information from a kernel using the
/// info::kernel_queue_specific descriptor for a specific Queue and values.
/// max_num_work_groups is the only valid descriptor for this function.
Expand Down Expand Up @@ -193,11 +219,49 @@ class kernel_impl {

/// Check if the occupancy limits are exceeded for the given kernel launch
/// configuration.
template <int Dimensions>
bool exceedsOccupancyResourceLimits(const device &Device,
const range<3> &WorkGroupSize,
const range<Dimensions> &WorkGroupSize,
size_t DynamicLocalMemorySize) const;
template <int Dimensions>
size_t queryMaxNumWorkGroups(queue Queue,
const range<Dimensions> &WorkGroupSize,
size_t DynamicLocalMemorySize) const;
};

template <int Dimensions>
bool kernel_impl::exceedsOccupancyResourceLimits(
const device &Device, const range<Dimensions> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
// Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
// Generally, exceeding hardware resource limits will yield in an error when
// the kernel is launched.
const size_t MaxWorkGroupSize =
get_info<info::kernel_device_specific::work_group_size>(Device);
const size_t MaxLocalMemorySizeInBytes =
Device.get_info<info::device::local_mem_size>();

if (WorkGroupSize.size() > MaxWorkGroupSize)
return true;

if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
return true;

// It will be impossible to launch a kernel for Cuda when the hardware limit
// for the 32-bit registers page file size is exceeded.
if (Device.get_backend() == backend::ext_oneapi_cuda) {
const uint32_t RegsPerWorkItem =
get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
const uint32_t MaxRegsPerWorkGroup =
Device.get_info<ext::codeplay::experimental::info::device::
max_registers_per_work_group>();
if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
return true;
}

return false;
}

template <typename Param>
inline typename Param::return_type kernel_impl::get_info() const {
static_assert(is_kernel_info_desc<Param>::value,
Expand Down Expand Up @@ -244,13 +308,11 @@ kernel_impl::get_info(const device &Device,

namespace syclex = ext::oneapi::experimental;

template <>
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
return_type
kernel_impl::ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_groups>(
queue Queue, const range<3> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
template <int Dimensions>
size_t
kernel_impl::queryMaxNumWorkGroups(queue Queue,
const range<Dimensions> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
if (WorkGroupSize.size() == 0)
throw exception(sycl::make_error_code(errc::invalid),
"The launch work-group size cannot be zero.");
Expand All @@ -259,12 +321,21 @@ inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
const auto &Handle = getHandleRef();
auto Device = Queue.get_device();

size_t WG[Dimensions];
WG[0] = WorkGroupSize[0];
if constexpr (Dimensions >= 2)
WG[1] = WorkGroupSize[1];
if constexpr (Dimensions == 3)
WG[2] = WorkGroupSize[2];

uint32_t GroupCount{0};
if (auto Result = Adapter->call_nocheck<
UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
Handle, WorkGroupSize.size(), DynamicLocalMemorySize, &GroupCount);
Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
// The feature is supported. Check for other errors and throw if any.
Handle, Dimensions, WG, DynamicLocalMemorySize, &GroupCount);
Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE &&
Result != UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE) {
// The feature is supported and the group size is valid. Check for other
// errors and throw if any.
Adapter->checkUrResult(Result);
return GroupCount;
}
Expand All @@ -278,30 +349,33 @@ inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
}

template <>
inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
return_type
kernel_impl::ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
queue Queue, const range<3> &WorkGroupSize,
syclex::info::kernel_queue_specific::max_num_work_groups>(
queue Queue, const range<1> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
return ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_groups>(
Queue, WorkGroupSize, DynamicLocalMemorySize);
return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
}

template <>
inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
return_type
kernel_impl::ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
queue Queue) const {
auto Device = Queue.get_device();
const auto MaxWorkGroupSize =
get_info<info::kernel_device_specific::work_group_size>(Device);
const sycl::range<3> WorkGroupSize{MaxWorkGroupSize, 1, 1};
return ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
Queue, WorkGroupSize, /* DynamicLocalMemorySize */ 0);
syclex::info::kernel_queue_specific::max_num_work_groups>(
queue Queue, const range<2> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
}

template <>
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
return_type
kernel_impl::ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_groups>(
queue Queue, const range<3> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
}

} // namespace detail
Expand Down
77 changes: 70 additions & 7 deletions sycl/source/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,26 +113,41 @@ kernel::ext_oneapi_get_info(queue Queue) const {

template <typename Param>
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
kernel::ext_oneapi_get_info(queue Queue, const range<1> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
DynamicLocalMemorySize);
}

template __SYCL_EXPORT typename ext::oneapi::experimental::info::
kernel_queue_specific::max_num_work_group_sync::return_type
kernel::ext_oneapi_get_info<
ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync>(queue Queue) const;
template <typename Param>
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
kernel::ext_oneapi_get_info(queue Queue, const range<2> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
DynamicLocalMemorySize);
}

template <typename Param>
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
DynamicLocalMemorySize);
}

#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT) \
template __SYCL_EXPORT ReturnT \
kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>( \
queue, const range<1> &, size_t) const; \
template __SYCL_EXPORT ReturnT \
kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>( \
queue, const range<2> &, size_t) const; \
template __SYCL_EXPORT ReturnT \
kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>( \
queue, const range<3> &, size_t) const;
// Not including "ext_oneapi_kernel_queue_specific_traits.def" because not all
// kernel_queue_specific queries require the above-defined get_info interface.
// clang-format off
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t)
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t)
// clang-format on
#undef __SYCL_PARAM_TRAITS_SPEC
Expand All @@ -143,5 +158,53 @@ ur_native_handle_t kernel::getNative() const { return impl->getNative(); }

ur_native_handle_t kernel::getNativeImpl() const { return impl->getNative(); }

#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
// The following query was deprecated since it doesn't include a way to specify
// the invdividual dimensions of the work group. All of the contents of this
// #ifndef block should be removed during the next ABI breaking window.
namespace ext::oneapi::experimental::info::kernel_queue_specific {
struct max_num_work_group_sync {
using return_type = size_t;
};
} // namespace ext::oneapi::experimental::info::kernel_queue_specific
template <>
struct detail::is_kernel_queue_specific_info_desc<
ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync> : std::true_type {
using return_type = ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync::return_type;
};
template <>
__SYCL2020_DEPRECATED(
"The 'max_num_work_group_sync' query is deprecated. See "
"'sycl_ext_oneapi_launch_queries' for the new 'max_num_work_groups' query.")
__SYCL_EXPORT typename ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync::return_type kernel::ext_oneapi_get_info<
ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync>(queue Queue, const range<3> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
return ext_oneapi_get_info<ext::oneapi::experimental::info::
kernel_queue_specific::max_num_work_groups>(
Queue, WorkGroupSize, DynamicLocalMemorySize);
}
template <>
__SYCL2020_DEPRECATED(
"The 'max_num_work_group_sync' query is deprecated. See "
"'sycl_ext_oneapi_launch_queries' for the new 'max_num_work_groups' query.")
__SYCL_EXPORT typename ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync::return_type kernel::ext_oneapi_get_info<
ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync>(queue Queue) const {
auto Device = Queue.get_device();
const auto MaxWorkGroupSize =
get_info<info::kernel_device_specific::work_group_size>(Device);
const sycl::range<3> WorkGroupSize{MaxWorkGroupSize, 1, 1};
return ext_oneapi_get_info<ext::oneapi::experimental::info::
kernel_queue_specific::max_num_work_groups>(
Queue, WorkGroupSize,
/* DynamicLocalMemorySize */ 0);
}
#endif

} // namespace _V1
} // namespace sycl
12 changes: 4 additions & 8 deletions sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,14 +162,10 @@ int test_max_num_work_groups(sycl::queue &q, const sycl::device &dev) {
// It cannot be possible to launch a kernel successfully with a configuration
// that exceeds the available resources as in the above defined workGroupSize.
// workGroupSize is larger than maxWorkGroupSize, hence maxWGs must equal 0.
// Note: Level-Zero currently always returns a non-zero value.
// TODO: Remove the backend condition once the Level-Zero API issue is fixed.
if (dev.get_backend() != sycl::backend::ext_oneapi_level_zero) {
assert(maxWGs == 0 &&
"max_num_work_groups query failed.\n"
"It should return 0 possible groups when the requested resources "
"by the lanuch config exceed those available in the hardware.");
}
assert(maxWGs == 0 &&
"max_num_work_groups query failed.\n"
"It should return 0 possible groups when the requested resources "
"by the lanuch config exceed those available in the hardware.");

// As we ensured that the 'max_num_work_groups' query correctly
// returns 0 possible work-groups, test that the kernel launch will fail.
Expand Down
Loading
Loading