diff --git a/llvm/include/llvm/Support/PropertySetIO.h b/llvm/include/llvm/Support/PropertySetIO.h index d878065809692..cb257ba7e5c77 100644 --- a/llvm/include/llvm/Support/PropertySetIO.h +++ b/llvm/include/llvm/Support/PropertySetIO.h @@ -81,8 +81,8 @@ class PropertyValue { PropertyValue(uint32_t Val) : Ty(UINT32), Val({Val}) {} PropertyValue(const byte *Data, SizeTy DataBitSize); - template - PropertyValue(const std::vector &Data) + template + PropertyValue(const C &Data) : PropertyValue(reinterpret_cast(Data.data()), Data.size() * sizeof(T) * /* bits in one byte */ 8) {} PropertyValue(const llvm::StringRef &Str) diff --git a/llvm/test/tools/sycl-post-link/device-requirements/reqd-work-group-size.ll b/llvm/test/tools/sycl-post-link/device-requirements/reqd-work-group-size.ll index defd6939691c2..4a440332a3634 100644 --- a/llvm/test/tools/sycl-post-link/device-requirements/reqd-work-group-size.ll +++ b/llvm/test/tools/sycl-post-link/device-requirements/reqd-work-group-size.ll @@ -25,13 +25,19 @@ ; RUN: FileCheck %s -input-file=%t_0.prop --check-prefix CHECK-PROP-AUTO-SPLIT-0 ; RUN: FileCheck %s -input-file=%t_1.prop --check-prefix CHECK-PROP-AUTO-SPLIT-1 +; TODO: Before intel/llvm#10620, the reqd_work_group_size attribute +; stores its values as uint32_t, but this needed to be expanded to +; uint64_t. However, this change did not happen in ABI-breaking +; window, so we attach the required work-group size as the +; reqd_work_group_size_uint64_t attribute. At the next ABI-breaking +; window, this can be changed back to reqd_work_group_size. ; CHECK-PROP-AUTO-SPLIT-0: [SYCL/device requirements] ; CHECK-PROP-AUTO-SPLIT-0-NEXT: aspects=2|AAAAAAAAAAA -; CHECK-PROP-AUTO-SPLIT-0-NEXT: reqd_work_group_size=2|gAAAAAAAAAAQAAAA +; CHECK-PROP-AUTO-SPLIT-0-NEXT: reqd_work_group_size_uint64_t=2|ABAAAAAAAAAQAAAAAAAAAA ; CHECK-PROP-AUTO-SPLIT-1: [SYCL/device requirements] ; CHECK-PROP-AUTO-SPLIT-1-NEXT: aspects=2|AAAAAAAAAAA -; CHECK-PROP-AUTO-SPLIT-1-NEXT: reqd_work_group_size=2|gAAAAAAAAAAIAAAA +; CHECK-PROP-AUTO-SPLIT-1-NEXT: reqd_work_group_size_uint64_t=2|ABAAAAAAAAAIAAAAAAAAAA ; ModuleID = '/tmp/source-5f7d0d.bc' source_filename = "llvm-link" diff --git a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp index 93650640f97cc..69ebd6220be74 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp @@ -40,9 +40,7 @@ void llvm::getSYCLDeviceRequirements( // Scan the module and if the metadata is present fill the corresponing // property with metadata's aspects constexpr std::pair ReqdMDs[] = { - {"sycl_used_aspects", "aspects"}, - {"sycl_fixed_targets", "fixed_target"}, - {"reqd_work_group_size", "reqd_work_group_size"}}; + {"sycl_used_aspects", "aspects"}, {"sycl_fixed_targets", "fixed_target"}}; for (const auto &[MDName, MappedName] : ReqdMDs) { std::set Values; @@ -69,6 +67,27 @@ void llvm::getSYCLDeviceRequirements( std::vector(Values.begin(), Values.end()); } + std::optional> ReqdWorkGroupSize; + for (const Function &F : MD.getModule()) { + if (const MDNode *MDN = F.getMetadata("reqd_work_group_size")) { + llvm::SmallVector NewReqdWorkGroupSize; + for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I) + NewReqdWorkGroupSize.push_back( + ExtractUnsignedIntegerFromMDNodeOperand(MDN, I)); + if (!ReqdWorkGroupSize) + ReqdWorkGroupSize = NewReqdWorkGroupSize; + } + } + + // TODO: Before intel/llvm#10620, the reqd_work_group_size attribute + // stores its values as uint32_t, but this needed to be expanded to + // uint64_t. However, this change did not happen in ABI-breaking + // window, so we attach the required work-group size as the + // reqd_work_group_size_uint64_t attribute. At the next ABI-breaking + // window, this can be changed back to reqd_work_group_size. + if (ReqdWorkGroupSize) + Requirements["reqd_work_group_size_uint64_t"] = *ReqdWorkGroupSize; + // There should only be at most one function with // intel_reqd_sub_group_size metadata when considering the entry // points of a module, but not necessarily when considering all the diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index a45a43e6bad94..7fc32f513c982 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -630,61 +630,8 @@ sycl::detail::pi::PiProgram ProgramManager::getBuiltPIProgram( getDeviceImage(KSId, Context, Device, JITCompilationIsRequired); // Check that device supports all aspects used by the kernel - const RTDeviceBinaryImage::PropertyRange &ARange = - Img.getDeviceRequirements(); - -#define __SYCL_ASPECT(ASPECT, ID) \ - case aspect::ASPECT: \ - return #ASPECT; -#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID) -// We don't need "case aspect::usm_allocator" here because it will duplicate -// "case aspect::usm_system_allocations", therefore leave this macro empty -#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE) - auto getAspectNameStr = [](aspect AspectNum) -> std::string { - switch (AspectNum) { -#include -#include - } - throw sycl::exception(errc::kernel_not_supported, - "Unknown aspect " + - std::to_string(static_cast(AspectNum))); - }; -#undef __SYCL_ASPECT_DEPRECATED_ALIAS -#undef __SYCL_ASPECT_DEPRECATED -#undef __SYCL_ASPECT - - for (RTDeviceBinaryImage::PropertyRange::ConstIterator It : ARange) { - using namespace std::literals; - if ((*It)->Name == "aspects"sv) { - ByteArray Aspects = DeviceBinaryProperty(*It).asByteArray(); - // 8 because we need to skip 64-bits of size of the byte array - Aspects.dropBytes(8); - while (!Aspects.empty()) { - auto Aspect = static_cast(Aspects.consume()); - if (!Dev->has(Aspect)) - throw sycl::exception(errc::kernel_not_supported, - "Required aspect " + getAspectNameStr(Aspect) + - " is not supported on the device"); - } - } else if ((*It)->Name == "reqd_sub_group_size"sv) { - auto ReqdSubGroupSize = DeviceBinaryProperty(*It).asUint32(); - auto SupportedSubGroupSizes = - Device.get_info(); - - // !getUint32PropAsBool(Img, "isEsimdImage") is a WA for ESIMD, - // as ESIMD images have a reqd-sub-group-size of 1, but currently - // no backend currently includes 1 as a valid sub-group size. - // This can be removed if backends add 1 as a valid sub-group size. - if (!getUint32PropAsBool(Img, "isEsimdImage") && - std::none_of(SupportedSubGroupSizes.cbegin(), - SupportedSubGroupSizes.cend(), - [=](auto s) { return s == ReqdSubGroupSize; })) - throw sycl::exception(errc::kernel_not_supported, - "Sub-group size " + - std::to_string(ReqdSubGroupSize) + - " is not supported on the device"); - } - } + if (auto exception = checkDevSupportDeviceRequirements(Device, Img)) + throw *exception; auto BuildF = [this, &Img, &Context, &ContextImpl, &Device, Prg, &CompileOpts, &LinkOpts, SpecConsts] { @@ -2456,6 +2403,44 @@ ProgramManager::getOrCreateKernel(const context &Context, bool doesDevSupportDeviceRequirements(const device &Dev, const RTDeviceBinaryImage &Img) { + return !checkDevSupportDeviceRequirements(Dev, Img).has_value(); +} + +static std::string getAspectNameStr(sycl::aspect AspectNum) { +#define __SYCL_ASPECT(ASPECT, ID) \ + case aspect::ASPECT: \ + return #ASPECT; +#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID) +// We don't need "case aspect::usm_allocator" here because it will duplicate +// "case aspect::usm_system_allocations", therefore leave this macro empty +#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE) + switch (AspectNum) { +#include +#include + } + throw sycl::exception(errc::kernel_not_supported, + "Unknown aspect " + + std::to_string(static_cast(AspectNum))); +#undef __SYCL_ASPECT_DEPRECATED_ALIAS +#undef __SYCL_ASPECT_DEPRECATED +#undef __SYCL_ASPECT +} + +// Check if the multiplication over unsigned integers overflows +template +static std::enable_if_t, std::optional> +multiply_with_overflow_check(T x, T y) { + if (y == 0) + return 0; + if (x > std::numeric_limits::max() / y) + return {}; + else + return x * y; +} + +std::optional +checkDevSupportDeviceRequirements(const device &Dev, + const RTDeviceBinaryImage &Img) { auto getPropIt = [&Img](const std::string &PropName) { const RTDeviceBinaryImage::PropertyRange &PropRange = Img.getDeviceRequirements(); @@ -2471,7 +2456,8 @@ bool doesDevSupportDeviceRequirements(const device &Dev, }; auto AspectsPropIt = getPropIt("aspects"); - auto ReqdWGSizePropIt = getPropIt("reqd_work_group_size"); + auto ReqdWGSizeUint32TPropIt = getPropIt("reqd_work_group_size"); + auto ReqdWGSizeUint64TPropIt = getPropIt("reqd_work_group_size_uint64_t"); auto ReqdSubGroupSizePropIt = getPropIt("reqd_sub_group_size"); // Checking if device supports defined aspects @@ -2483,28 +2469,54 @@ bool doesDevSupportDeviceRequirements(const device &Dev, while (!Aspects.empty()) { aspect Aspect = Aspects.consume(); if (!Dev.has(Aspect)) - return false; + return sycl::exception(errc::kernel_not_supported, + "Required aspect " + getAspectNameStr(Aspect) + + " is not supported on the device"); } } // Checking if device supports defined required work group size - if (ReqdWGSizePropIt) { - ByteArray ReqdWGSize = - DeviceBinaryProperty(*(ReqdWGSizePropIt.value())).asByteArray(); + if (ReqdWGSizeUint32TPropIt || ReqdWGSizeUint64TPropIt) { + /// TODO: Before intel/llvm#10620, the reqd_work_group_size attribute + // stores its values as uint32_t, but this needed to be expanded to + // uint64_t. However, this change did not happen in ABI-breaking + // window, so we attach the required work-group size as the + // reqd_work_group_size_uint64_t attribute. At the next ABI-breaking + // window, we can remove the logic for the 32 bit property. + bool usingUint64_t = ReqdWGSizeUint64TPropIt.has_value(); + auto it = usingUint64_t ? ReqdWGSizeUint64TPropIt : ReqdWGSizeUint32TPropIt; + + ByteArray ReqdWGSize = DeviceBinaryProperty(*(it.value())).asByteArray(); // Drop 8 bytes describing the size of the byte array. ReqdWGSize.dropBytes(8); - int ReqdWGSizeAllDimsTotal = 1; - std::vector ReqdWGSizeVec; + uint64_t ReqdWGSizeAllDimsTotal = 1; + std::vector ReqdWGSizeVec; int Dims = 0; while (!ReqdWGSize.empty()) { - int SingleDimSize = ReqdWGSize.consume(); - ReqdWGSizeAllDimsTotal *= SingleDimSize; + uint64_t SingleDimSize = usingUint64_t ? ReqdWGSize.consume() + : ReqdWGSize.consume(); + if (auto res = multiply_with_overflow_check(ReqdWGSizeAllDimsTotal, + SingleDimSize)) + ReqdWGSizeAllDimsTotal = *res; + else + return sycl::exception( + sycl::errc::kernel_not_supported, + "Required work-group size is not supported" + " (total number of work-items requested can't fit into size_t)"); ReqdWGSizeVec.push_back(SingleDimSize); Dims++; } - if (static_cast(ReqdWGSizeAllDimsTotal) > - Dev.get_info()) - return false; + + // The SingleDimSize was computed in an uint64_t; size_t does not + // necessarily have to be the same uint64_t (but should fit in an + // uint64_t). + if (ReqdWGSizeAllDimsTotal > + Dev.get_info() || + ReqdWGSizeAllDimsTotal > std::numeric_limits::max()) + return sycl::exception(sycl::errc::kernel_not_supported, + "Required work-group size " + + std::to_string(ReqdWGSizeAllDimsTotal) + + " is not supported on the device"); // Creating std::variant to call max_work_item_sizes one time to avoid // performance drop std::variant, id<2>, id<3>> MaxWorkItemSizesVariant; @@ -2522,17 +2534,26 @@ bool doesDevSupportDeviceRequirements(const device &Dev, // issues after that if (Dims == 1) { // ReqdWGSizeVec is in reverse order compared to MaxWorkItemSizes - if (static_cast(ReqdWGSizeVec[i]) > + if (ReqdWGSizeVec[i] > std::get>(MaxWorkItemSizesVariant)[Dims - i - 1]) - return false; + return sycl::exception(sycl::errc::kernel_not_supported, + "Required work-group size " + + std::to_string(ReqdWGSizeVec[i]) + + " is not supported"); } else if (Dims == 2) { - if (static_cast(ReqdWGSizeVec[i]) > + if (ReqdWGSizeVec[i] > std::get>(MaxWorkItemSizesVariant)[Dims - i - 1]) - return false; + return sycl::exception(sycl::errc::kernel_not_supported, + "Required work-group size " + + std::to_string(ReqdWGSizeVec[i]) + + " is not supported"); } else // (Dims == 3) - if (static_cast(ReqdWGSizeVec[i]) > + if (ReqdWGSizeVec[i] > std::get>(MaxWorkItemSizesVariant)[Dims - i - 1]) - return false; + return sycl::exception(sycl::errc::kernel_not_supported, + "Required work-group size " + + std::to_string(ReqdWGSizeVec[i]) + + " is not supported"); } } @@ -2541,14 +2562,21 @@ bool doesDevSupportDeviceRequirements(const device &Dev, auto ReqdSubGroupSize = DeviceBinaryProperty(*(ReqdSubGroupSizePropIt.value())).asUint32(); auto SupportedSubGroupSizes = Dev.get_info(); + // !getUint32PropAsBool(Img, "isEsimdImage") is a WA for ESIMD, + // as ESIMD images have a reqd-sub-group-size of 1, but currently + // no backend currently includes 1 as a valid sub-group size. + // This can be removed if backends add 1 as a valid sub-group size. if (!getUint32PropAsBool(Img, "isEsimdImage") && std::none_of(SupportedSubGroupSizes.cbegin(), SupportedSubGroupSizes.cend(), [=](auto s) { return s == ReqdSubGroupSize; })) - return false; + return sycl::exception(sycl::errc::kernel_not_supported, + "Sub-group size " + + std::to_string(ReqdSubGroupSize) + + " is not supported on the device"); } - return true; + return {}; } } // namespace detail diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index b2640cfc065de..39692f2fa7f9d 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -51,6 +51,9 @@ namespace detail { bool doesDevSupportDeviceRequirements(const device &Dev, const RTDeviceBinaryImage &BinImages); +std::optional +checkDevSupportDeviceRequirements(const device &Dev, + const RTDeviceBinaryImage &BinImages); // This value must be the same as in libdevice/device_itt.h. // See sycl/doc/design/ITTAnnotations.md for more info. diff --git a/sycl/test-e2e/OptionalKernelFeatures/large-reqd-work-group-size.cpp b/sycl/test-e2e/OptionalKernelFeatures/large-reqd-work-group-size.cpp new file mode 100644 index 0000000000000..2e610c8e75510 --- /dev/null +++ b/sycl/test-e2e/OptionalKernelFeatures/large-reqd-work-group-size.cpp @@ -0,0 +1,62 @@ +// UNSUPPORTED: hip +// RUN: %{build} -o %t.out -fno-sycl-id-queries-fit-in-int +// RUN: %{run} %t.out + +#include + +using namespace sycl; +queue q; +int n_fail = 0; + +template +void throws_kernel_not_supported(const char *test_name, FunctorT f) { + try { + f(); + } catch (const sycl::exception &e) { + if (e.code() != errc::kernel_not_supported) { + std::cout << "fail: " << test_name << "\n" + << "Caught wrong exception with error code " << e.code() << "\n" + << e.what() << "\n"; + ++n_fail; + return; + } else { + std::cout << "pass: " << test_name << "\n" + << "Caught right exception:\n" + << e.what() << "\n"; + return; + } + } + std::cout << "fail: " << test_name << "\n" + << "No exception thrown\n"; + ++n_fail; + return; +} + +int main(int argc, char *argv[]) { + throws_kernel_not_supported("nd_range<1>", [] { + constexpr uint32_t N = std::numeric_limits::max(); + q.parallel_for(nd_range<1>(N, N), + [=](auto) [[sycl::reqd_work_group_size(N)]] {}); + }); + + throws_kernel_not_supported("nd_range<2>", [] { + constexpr uint32_t N = std::numeric_limits::max(); + q.parallel_for(nd_range<2>({N, N}, {N, N}), + [=](auto) [[sycl::reqd_work_group_size(N, N)]] {}); + }); + + throws_kernel_not_supported("nd_range<3>", [] { + constexpr uint32_t N = std::numeric_limits::max(); + q.parallel_for(nd_range<3>({N, N, N}, {N, N, N}), + [=](auto) + [[sycl::reqd_work_group_size(N, N, N)]] {}); + }); + + throws_kernel_not_supported("uint32_max+2", [] { + constexpr uint64_t N = std::numeric_limits::max() + uint64_t(2); + q.parallel_for(nd_range<1>(N, N), + [=](auto) [[sycl::reqd_work_group_size(N)]] {}); + }); + + return n_fail; +}