diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index d7b1dcf72e8ca..47b9d4e39f1a6 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -911,7 +911,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline( PB.registerPipelineStartEPCallback( [&](ModulePassManager &MPM, OptimizationLevel Level) { MPM.addPass(ESIMDVerifierPass(LangOpts.SYCLESIMDForceStatelessMem)); - MPM.addPass(SYCLPropagateAspectsUsagePass()); + MPM.addPass( + SYCLPropagateAspectsUsagePass(/*ExcludeAspects=*/{"fp64"})); }); // Add the InferAddressSpaces pass for all the SPIR[V] targets @@ -1026,6 +1027,10 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (LangOpts.EnableDAEInSpirKernels) MPM.addPass(DeadArgumentEliminationSYCLPass()); + // Rerun aspect propagation without warning diagnostics. + MPM.addPass(SYCLPropagateAspectsUsagePass(/*ExcludeAspects=*/{}, + /*ValidateAspects=*/false)); + // Add SPIRITTAnnotations pass to the pass manager if // -fsycl-instrument-device-code option was passed. This option can be // used only with spir triple. diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 718f6565ee23f..6ef967dac4c1f 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -591,6 +591,13 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, ToolChain.addFastMathRuntimeIfAvailable(Args, CmdArgs); } + // Performing link for dependency file information, undefined symbols are OK. + // True link time errors for symbols will be captured at host link. + if (JA.getType() == types::TY_Host_Dependencies_Image) { + CmdArgs.push_back("-z"); + CmdArgs.push_back("undefs"); + } + Args.AddAllArgs(CmdArgs, options::OPT_L); Args.AddAllArgs(CmdArgs, options::OPT_u); diff --git a/clang/test/CodeGenSYCL/fpga-attr-do-while-loops.cpp b/clang/test/CodeGenSYCL/fpga-attr-do-while-loops.cpp index ae8dc07ae5006..91414bcfe87ea 100644 --- a/clang/test/CodeGenSYCL/fpga-attr-do-while-loops.cpp +++ b/clang/test/CodeGenSYCL/fpga-attr-do-while-loops.cpp @@ -1,4 +1,4 @@ -// RUN: %clang++ -fsycl-device-only -fintelfpga -S %s -o - | FileCheck %s +// RUN: %clang -fsycl-device-only -fintelfpga -S %s -o - | FileCheck %s #include "Inputs/sycl.hpp" diff --git a/clang/test/Driver/sycl-offload-intelfpga-link.cpp b/clang/test/Driver/sycl-offload-intelfpga-link.cpp index d1fdf5aa89101..ae143a8258a9e 100644 --- a/clang/test/Driver/sycl-offload-intelfpga-link.cpp +++ b/clang/test/Driver/sycl-offload-intelfpga-link.cpp @@ -105,6 +105,14 @@ // RUN: | FileCheck %s --check-prefix=CHK-FPGA-LINK-WARN-AOCR // CHK-FPGA-LINK-WARN-AOCR: warning: FPGA archive '{{.*}}-aocr.a' does not contain matching emulation/hardware expectancy +/// Check deps behaviors with input fat archive and creating aocx archive +// RUN: %clangxx -fsycl -fintelfpga -fsycl-link=image \ +// RUN: -target x86_64-unknown-linux-gnu %S/Inputs/SYCL/liblin64.a \ +// RUN: %s -### 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHK-FPGA-LINK-UNDEFS +// CHK-FPGA-LINK-UNDEFS: ld{{.*}} "-z" "undefs" +// CHK-FPGA-LINK-UNDEFS: clang-offload-deps{{.*}} + /// -fintelfpga -fsycl-link from source // RUN: touch %t.cpp // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl -fno-sycl-instrument-device-code -fno-sycl-device-lib=all -fintelfpga -fsycl-link=early %t.cpp -ccc-print-phases 2>&1 \ diff --git a/clang/test/Driver/sycl-offload-static-lib-2.cpp b/clang/test/Driver/sycl-offload-static-lib-2.cpp index ef9c5043306ac..7332b487e677b 100644 --- a/clang/test/Driver/sycl-offload-static-lib-2.cpp +++ b/clang/test/Driver/sycl-offload-static-lib-2.cpp @@ -150,7 +150,7 @@ // RUN: %clangxx -target x86_64-unknown-linux-gnu -fsycl -fsycl-targets=nvptx64-nvidia-cuda %t_lib.a -o output_name -lOpenCL -### %s 2>&1 \ // RUN: | FileCheck %s -check-prefix=STATIC_LIB_SRC2 -DBUNDLE_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_50 -DDEPS_TRIPLE=sycl-nvptx64-nvidia-cuda-sm_50 // STATIC_LIB_SRC2: clang{{.*}} "-emit-obj" {{.*}} "-o" "[[HOSTOBJ:.+\.o]]" -// STATIC_LIB_SRC2: ld{{(.exe)?}}" {{.*}} "-o" "[[HOSTEXE:.+\.out]]" +// STATIC_LIB_SRC2: ld{{(.exe)?}}" {{.*}} "-o" "[[HOSTEXE:.+\.out]]" {{.*}}"-z" "undefs" // STATIC_LIB_SRC2: clang-offload-deps{{.*}} "-targets=[[DEPS_TRIPLE]]" "-outputs=[[OUTDEPS:.+\.bc]]" "[[HOSTEXE]]" // STATIC_LIB_SRC2_DEF: clang-offload-bundler{{.*}} "-type=aoo" "-targets=[[BUNDLE_TRIPLE]]" {{.*}} "-output=[[OUTLIB:.+\.txt]]" // STATIC_LIB_SRC2_NVPTX: clang-offload-bundler{{.*}} "-type=a" "-targets=[[BUNDLE_TRIPLE]]" {{.*}} "-output=[[OUTLIB:.+\.a]]" diff --git a/libclc/generic/include/spirv/spirv_types.h b/libclc/generic/include/spirv/spirv_types.h index be7101dadae29..d45e089032b2b 100644 --- a/libclc/generic/include/spirv/spirv_types.h +++ b/libclc/generic/include/spirv/spirv_types.h @@ -46,4 +46,16 @@ enum GroupOperation { ExclusiveScan = 2, }; +typedef struct { + float real, imag; +} complex_float; + +typedef struct { + double real, imag; +} complex_double; + +typedef struct { + half real, imag; +} complex_half; + #endif // CLC_SPIRV_TYPES diff --git a/libclc/ptx-nvidiacl/libspirv/group/collectives.cl b/libclc/ptx-nvidiacl/libspirv/group/collectives.cl index cf2bd33e987c4..6449152ecb34d 100644 --- a/libclc/ptx-nvidiacl/libspirv/group/collectives.cl +++ b/libclc/ptx-nvidiacl/libspirv/group/collectives.cl @@ -41,6 +41,12 @@ __local float * __clc__get_group_scratch_float() __asm("__clc__get_group_scratch_float"); __local double * __clc__get_group_scratch_double() __asm("__clc__get_group_scratch_double"); +__local complex_half *__clc__get_group_scratch_complex_half() __asm( + "__clc__get_group_scratch_complex_half"); +__local complex_float *__clc__get_group_scratch_complex_float() __asm( + "__clc__get_group_scratch_complex_float"); +__local complex_double *__clc__get_group_scratch_complex_double() __asm( + "__clc__get_group_scratch_complex_double"); _CLC_DEF uint inline __clc__membermask() { // use a full mask as sync operations are required to be convergent and @@ -89,6 +95,46 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT double __clc__SubgroupShuffle(double x, return as_double(__clc__SubgroupShuffle(as_ulong(x), idx)); } +typedef union { + complex_half h; + int i; +} complex_half_converter; + +typedef union { + complex_float f; + int2 i; +} complex_float_converter; + +typedef union { + complex_double d; + int4 i; +} complex_double_converter; + +_CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT complex_half +__clc__SubgroupShuffle(complex_half x, uint idx) { + complex_half_converter conv = {x}; + conv.i = __clc__SubgroupShuffle(conv.i, idx); + return conv.h; +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT complex_float +__clc__SubgroupShuffle(complex_float x, uint idx) { + complex_float_converter conv = {x}; + conv.i.x = __clc__SubgroupShuffle(conv.i.x, idx); + conv.i.y = __clc__SubgroupShuffle(conv.i.y, idx); + return conv.f; +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT complex_double +__clc__SubgroupShuffle(complex_double x, uint idx) { + complex_double_converter conv = {x}; + conv.i.x = __clc__SubgroupShuffle(conv.i.x, idx); + conv.i.y = __clc__SubgroupShuffle(conv.i.y, idx); + conv.i.z = __clc__SubgroupShuffle(conv.i.z, idx); + conv.i.w = __clc__SubgroupShuffle(conv.i.w, idx); + return conv.d; +} + #define __CLC_SUBGROUP_SHUFFLEUP_I32(TYPE) \ _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE __clc__SubgroupShuffleUp( \ TYPE x, uint delta) { \ @@ -130,6 +176,31 @@ __clc__SubgroupShuffleUp(double x, uint delta) { return as_double(__clc__SubgroupShuffleUp(as_ulong(x), delta)); } +_CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT complex_half +__clc__SubgroupShuffleUp(complex_half x, uint delta) { + complex_half_converter conv = {x}; + conv.i = __clc__SubgroupShuffleUp(conv.i, delta); + return conv.h; +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT complex_float +__clc__SubgroupShuffleUp(complex_float x, uint delta) { + complex_float_converter conv = {x}; + conv.i.x = __clc__SubgroupShuffleUp(conv.i.x, delta); + conv.i.y = __clc__SubgroupShuffleUp(conv.i.y, delta); + return conv.f; +} + +_CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT complex_double +__clc__SubgroupShuffleUp(complex_double x, uint delta) { + complex_double_converter conv = {x}; + conv.i.x = __clc__SubgroupShuffleUp(conv.i.x, delta); + conv.i.y = __clc__SubgroupShuffleUp(conv.i.y, delta); + conv.i.z = __clc__SubgroupShuffleUp(conv.i.z, delta); + conv.i.w = __clc__SubgroupShuffleUp(conv.i.w, delta); + return conv.d; +} + // TODO: Implement InclusiveScan/ExclusiveScan // Currently only Reduce is required (for GroupAny and GroupAll) _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT bool @@ -155,6 +226,75 @@ __clc__SubgroupBitwiseAny(uint op, bool predicate, bool *carry) { #define __CLC_AND(x, y) (x & y) #define __CLC_MUL(x, y) (x * y) +#define __DEFINE_CLC_COMPLEX_MUL(TYPE) \ + _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT complex_##TYPE __clc_complex_mul( \ + complex_##TYPE z, complex_##TYPE w) { \ + TYPE a = z.real; \ + TYPE b = z.imag; \ + TYPE c = w.real; \ + TYPE d = w.imag; \ + TYPE ac = a * c; \ + TYPE bd = b * d; \ + TYPE ad = a * d; \ + TYPE bc = b * c; \ + TYPE x = ac - bd; \ + TYPE y = ad + bc; \ + if (__spirv_IsNan(x) && __spirv_IsNan(y)) { \ + bool __recalc = false; \ + if (__spirv_IsInf(a) || __spirv_IsInf(b)) { \ + a = __spirv_ocl_copysign(__spirv_IsInf(a) ? (TYPE)1 : (TYPE)0, a); \ + b = __spirv_ocl_copysign(__spirv_IsInf(b) ? (TYPE)1 : (TYPE)0, b); \ + if (__spirv_IsNan(c)) \ + c = __spirv_ocl_copysign((TYPE)0, c); \ + if (__spirv_IsNan(d)) \ + d = __spirv_ocl_copysign((TYPE)0, d); \ + __recalc = true; \ + } \ + if (__spirv_IsInf(c) || __spirv_IsInf(d)) { \ + c = __spirv_ocl_copysign(__spirv_IsInf(c) ? (TYPE)1 : (TYPE)0, c); \ + d = __spirv_ocl_copysign(__spirv_IsInf(d) ? (TYPE)1 : (TYPE)0, d); \ + if (__spirv_IsNan(a)) \ + a = __spirv_ocl_copysign((TYPE)0, a); \ + if (__spirv_IsNan(b)) \ + b = __spirv_ocl_copysign((TYPE)0, b); \ + __recalc = true; \ + } \ + if (!__recalc && (__spirv_IsInf(ac) || __spirv_IsInf(bd) || \ + __spirv_IsInf(ad) || __spirv_IsInf(bc))) { \ + if (__spirv_IsNan(a)) \ + a = __spirv_ocl_copysign((TYPE)0, a); \ + if (__spirv_IsNan(b)) \ + b = __spirv_ocl_copysign((TYPE)0, b); \ + if (__spirv_IsNan(c)) \ + c = __spirv_ocl_copysign((TYPE)0, c); \ + if (__spirv_IsNan(d)) \ + d = __spirv_ocl_copysign((TYPE)0, d); \ + __recalc = true; \ + } \ + if (__recalc) { \ + x = (TYPE)INFINITY * (a * c - b * d); \ + y = (TYPE)INFINITY * (a * d + b * c); \ + } \ + } \ + return (complex_##TYPE){x, y}; \ + } + +__DEFINE_CLC_COMPLEX_MUL(half) +__DEFINE_CLC_COMPLEX_MUL(float) +__DEFINE_CLC_COMPLEX_MUL(double) +#undef __DEFINE_CLC_COMPLEX_MUL + +// TODO remove these definitions after we have proper implementation of +// std::complex multiplication in SYCL +complex_float __mulsc3(float a, float b, float c, float d) { + return __clc_complex_mul((complex_float){a, b}, (complex_float){c, d}); +} +complex_double __muldc3(double a, double b, double c, double d) { + return __clc_complex_mul((complex_double){a, b}, (complex_double){c, d}); +} + +#define __CLC_COMPLEX_MUL(x, y) __clc_complex_mul(x, y) + #define __CLC_SUBGROUP_COLLECTIVE_BODY(OP, TYPE, IDENTITY) \ uint sg_lid = __spirv_SubgroupLocalInvocationId(); \ /* Can't use XOR/butterfly shuffles; some lanes may be inactive */ \ @@ -225,6 +365,13 @@ __CLC_SUBGROUP_COLLECTIVE(FMulKHR, __CLC_MUL, half, 1) __CLC_SUBGROUP_COLLECTIVE(FMulKHR, __CLC_MUL, float, 1) __CLC_SUBGROUP_COLLECTIVE(FMulKHR, __CLC_MUL, double, 1) +__CLC_SUBGROUP_COLLECTIVE(CMulINTEL, __CLC_COMPLEX_MUL, complex_half, + ((complex_half){1, 0})) +__CLC_SUBGROUP_COLLECTIVE(CMulINTEL, __CLC_COMPLEX_MUL, complex_float, + ((complex_float){1, 0})) +__CLC_SUBGROUP_COLLECTIVE(CMulINTEL, __CLC_COMPLEX_MUL, complex_double, + ((complex_double){1, 0})) + __CLC_SUBGROUP_COLLECTIVE(SMin, __CLC_MIN, char, CHAR_MAX) __CLC_SUBGROUP_COLLECTIVE(UMin, __CLC_MIN, uchar, UCHAR_MAX) __CLC_SUBGROUP_COLLECTIVE(SMin, __CLC_MIN, short, SHRT_MAX) @@ -281,55 +428,58 @@ __CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, long, 0l) #undef __CLC_SUBGROUP_COLLECTIVE #undef __CLC_SUBGROUP_COLLECTIVE_REDUX -#define __CLC_GROUP_COLLECTIVE_INNER(SPIRV_NAME, CLC_NAME, OP, TYPE, IDENTITY) \ - _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE __CLC_APPEND( \ - __spirv_Group, SPIRV_NAME)(uint scope, uint op, TYPE x) { \ - TYPE carry = IDENTITY; \ - /* Perform GroupOperation within sub-group */ \ - TYPE sg_x = __CLC_APPEND(__clc__Subgroup, CLC_NAME)(op, x, &carry); \ - if (scope == Subgroup) { \ - return sg_x; \ - } \ - __local TYPE *scratch = __CLC_APPEND(__clc__get_group_scratch_, TYPE)(); \ - uint sg_id = __spirv_SubgroupId(); \ - uint num_sg = __spirv_NumSubgroups(); \ - uint sg_lid = __spirv_SubgroupLocalInvocationId(); \ - uint sg_size = __spirv_SubgroupSize(); \ - /* Share carry values across sub-groups */ \ - if (sg_lid == sg_size - 1) { \ - scratch[sg_id] = carry; \ - } \ - __spirv_ControlBarrier(Workgroup, 0, 0); \ - /* Perform InclusiveScan over sub-group results */ \ - TYPE sg_prefix; \ - TYPE sg_aggregate = scratch[0]; \ - _Pragma("unroll") for (int s = 1; s < num_sg; ++s) { \ - if (sg_id == s) { \ - sg_prefix = sg_aggregate; \ - } \ - TYPE addend = scratch[s]; \ - sg_aggregate = OP(sg_aggregate, addend); \ +#define __CLC_GROUP_COLLECTIVE_INNER(CLC_NAME, OP, TYPE, IDENTITY) \ + TYPE carry = IDENTITY; \ + /* Perform GroupOperation within sub-group */ \ + TYPE sg_x = __CLC_APPEND(__clc__Subgroup, CLC_NAME)(op, x, &carry); \ + if (scope == Subgroup) { \ + return sg_x; \ + } \ + __local TYPE *scratch = __CLC_APPEND(__clc__get_group_scratch_, TYPE)(); \ + uint sg_id = __spirv_SubgroupId(); \ + uint num_sg = __spirv_NumSubgroups(); \ + uint sg_lid = __spirv_SubgroupLocalInvocationId(); \ + uint sg_size = __spirv_SubgroupSize(); \ + /* Share carry values across sub-groups */ \ + if (sg_lid == sg_size - 1) { \ + scratch[sg_id] = carry; \ + } \ + __spirv_ControlBarrier(Workgroup, 0, 0); \ + /* Perform InclusiveScan over sub-group results */ \ + TYPE sg_prefix; \ + TYPE sg_aggregate = scratch[0]; \ + _Pragma("unroll") for (int s = 1; s < num_sg; ++s) { \ + if (sg_id == s) { \ + sg_prefix = sg_aggregate; \ } \ - /* For Reduce, broadcast result from final sub-group */ \ - /* For Scan, combine results from previous sub-groups */ \ - TYPE result; \ - if (op == Reduce) { \ - result = sg_aggregate; \ - } else if (op == InclusiveScan || op == ExclusiveScan) { \ - if (sg_id == 0) { \ - result = sg_x; \ - } else { \ - result = OP(sg_x, sg_prefix); \ - } \ + TYPE addend = scratch[s]; \ + sg_aggregate = OP(sg_aggregate, addend); \ + } \ + /* For Reduce, broadcast result from final sub-group */ \ + /* For Scan, combine results from previous sub-groups */ \ + TYPE result; \ + if (op == Reduce) { \ + result = sg_aggregate; \ + } else if (op == InclusiveScan || op == ExclusiveScan) { \ + if (sg_id == 0) { \ + result = sg_x; \ + } else { \ + result = OP(sg_x, sg_prefix); \ } \ - __spirv_ControlBarrier(Workgroup, 0, 0); \ - return result; \ + } \ + __spirv_ControlBarrier(Workgroup, 0, 0); \ + return result; + +#define __CLC_GROUP_COLLECTIVE_OUTER(SPIRV_NAME, CLC_NAME, OP, TYPE, IDENTITY) \ + _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT TYPE __CLC_APPEND( \ + __spirv_Group, SPIRV_NAME)(uint scope, uint op, TYPE x) { \ + __CLC_GROUP_COLLECTIVE_INNER(CLC_NAME, OP, TYPE, IDENTITY) \ } #define __CLC_GROUP_COLLECTIVE_4(NAME, OP, TYPE, IDENTITY) \ - __CLC_GROUP_COLLECTIVE_INNER(NAME, NAME, OP, TYPE, IDENTITY) + __CLC_GROUP_COLLECTIVE_OUTER(NAME, NAME, OP, TYPE, IDENTITY) #define __CLC_GROUP_COLLECTIVE_5(SPIRV_NAME, CLC_NAME, OP, TYPE, IDENTITY) \ - __CLC_GROUP_COLLECTIVE_INNER(SPIRV_NAME, CLC_NAME, OP, TYPE, IDENTITY) + __CLC_GROUP_COLLECTIVE_OUTER(SPIRV_NAME, CLC_NAME, OP, TYPE, IDENTITY) #define DISPATCH_TO_CLC_GROUP_COLLECTIVE_MACRO(_1, _2, _3, _4, _5, NAME, ...) \ NAME @@ -338,6 +488,13 @@ __CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, long, 0l) __VA_ARGS__, __CLC_GROUP_COLLECTIVE_5, __CLC_GROUP_COLLECTIVE_4) \ (__VA_ARGS__) +#define __CLC_GROUP_COLLECTIVE_MANUAL_MANGLE(SPIRV_NAME_MANGLED, CLC_NAME, OP, \ + TYPE, IDENTITY) \ + _CLC_DEF _CLC_CONVERGENT TYPE SPIRV_NAME_MANGLED(uint scope, uint op, \ + TYPE x) { \ + __CLC_GROUP_COLLECTIVE_INNER(CLC_NAME, OP, TYPE, IDENTITY) \ + } + __CLC_GROUP_COLLECTIVE(BitwiseOr, __CLC_OR, bool, false); __CLC_GROUP_COLLECTIVE(BitwiseAny, __CLC_AND, bool, true); _CLC_DEF _CLC_OVERLOAD _CLC_CONVERGENT bool __spirv_GroupAny(uint scope, @@ -373,6 +530,16 @@ __CLC_GROUP_COLLECTIVE(FMulKHR, __CLC_MUL, half, 1) __CLC_GROUP_COLLECTIVE(FMulKHR, __CLC_MUL, float, 1) __CLC_GROUP_COLLECTIVE(FMulKHR, __CLC_MUL, double, 1) +__CLC_GROUP_COLLECTIVE_MANUAL_MANGLE( + _Z22__spirv_GroupCMulINTELjjN5__spv12complex_halfE, CMulINTEL, + __CLC_COMPLEX_MUL, complex_half, ((complex_half){1, 0})) +__CLC_GROUP_COLLECTIVE_MANUAL_MANGLE( + _Z22__spirv_GroupCMulINTELjjN5__spv13complex_floatE, CMulINTEL, + __CLC_COMPLEX_MUL, complex_float, ((complex_float){1, 0})) +__CLC_GROUP_COLLECTIVE_MANUAL_MANGLE( + _Z22__spirv_GroupCMulINTELjjN5__spv14complex_doubleE, CMulINTEL, + __CLC_COMPLEX_MUL, complex_double, ((complex_double){1, 0})) + __CLC_GROUP_COLLECTIVE(SMin, __CLC_MIN, char, CHAR_MAX) __CLC_GROUP_COLLECTIVE(UMin, __CLC_MIN, uchar, UCHAR_MAX) __CLC_GROUP_COLLECTIVE(SMin, __CLC_MIN, short, SHRT_MAX) diff --git a/libclc/ptx-nvidiacl/libspirv/group/collectives_helpers.ll b/libclc/ptx-nvidiacl/libspirv/group/collectives_helpers.ll index 6b2e5d8880386..f2c59b3fddd31 100644 --- a/libclc/ptx-nvidiacl/libspirv/group/collectives_helpers.ll +++ b/libclc/ptx-nvidiacl/libspirv/group/collectives_helpers.ll @@ -1,61 +1,97 @@ -; 64 storage locations is sufficient for all current-generation NVIDIA GPUs -; 64 bits per warp is sufficient for all fundamental data types +; 32 storage locations is sufficient for all current-generation NVIDIA GPUs +; 128 bits per warp is sufficient for all fundamental data types and complex ; Reducing storage for small data types or increasing it for user-defined types ; will likely require an additional pass to track group algorithm usage -@__clc__group_scratch = internal addrspace(3) global [64 x i64] undef, align 1 +@__clc__group_scratch = internal addrspace(3) global [128 x i64] undef, align 1 define i8 addrspace(3)* @__clc__get_group_scratch_bool() nounwind alwaysinline { entry: - %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 + %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 %cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)* ret i8 addrspace(3)* %cast } define i8 addrspace(3)* @__clc__get_group_scratch_char() nounwind alwaysinline { entry: - %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 + %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 %cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)* ret i8 addrspace(3)* %cast } define i16 addrspace(3)* @__clc__get_group_scratch_short() nounwind alwaysinline { entry: - %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 + %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 %cast = bitcast i64 addrspace(3)* %ptr to i16 addrspace(3)* ret i16 addrspace(3)* %cast } define i32 addrspace(3)* @__clc__get_group_scratch_int() nounwind alwaysinline { entry: - %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 + %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 %cast = bitcast i64 addrspace(3)* %ptr to i32 addrspace(3)* ret i32 addrspace(3)* %cast } define i64 addrspace(3)* @__clc__get_group_scratch_long() nounwind alwaysinline { entry: - %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 + %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 %cast = bitcast i64 addrspace(3)* %ptr to i64 addrspace(3)* ret i64 addrspace(3)* %cast } define half addrspace(3)* @__clc__get_group_scratch_half() nounwind alwaysinline { entry: - %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 + %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 %cast = bitcast i64 addrspace(3)* %ptr to half addrspace(3)* ret half addrspace(3)* %cast } define float addrspace(3)* @__clc__get_group_scratch_float() nounwind alwaysinline { entry: - %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 + %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 %cast = bitcast i64 addrspace(3)* %ptr to float addrspace(3)* ret float addrspace(3)* %cast } define double addrspace(3)* @__clc__get_group_scratch_double() nounwind alwaysinline { entry: - %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 + %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 %cast = bitcast i64 addrspace(3)* %ptr to double addrspace(3)* ret double addrspace(3)* %cast } + +%complex_half = type { + half, + half +} + +%complex_float = type { + float, + float +} + +%complex_double = type { + double, + double +} + +define %complex_half addrspace(3)* @__clc__get_group_scratch_complex_half() nounwind alwaysinline { +entry: + %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 + %cast = bitcast i64 addrspace(3)* %ptr to %complex_half addrspace(3)* + ret %complex_half addrspace(3)* %cast +} + +define %complex_float addrspace(3)* @__clc__get_group_scratch_complex_float() nounwind alwaysinline { +entry: + %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 + %cast = bitcast i64 addrspace(3)* %ptr to %complex_float addrspace(3)* + ret %complex_float addrspace(3)* %cast +} + +define %complex_double addrspace(3)* @__clc__get_group_scratch_complex_double() nounwind alwaysinline { +entry: + %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0 + %cast = bitcast i64 addrspace(3)* %ptr to %complex_double addrspace(3)* + ret %complex_double addrspace(3)* %cast +} diff --git a/libdevice/cmake/modules/SYCLLibdevice.cmake b/libdevice/cmake/modules/SYCLLibdevice.cmake index 5bb8fef6074f0..75995cc25e946 100644 --- a/libdevice/cmake/modules/SYCLLibdevice.cmake +++ b/libdevice/cmake/modules/SYCLLibdevice.cmake @@ -111,7 +111,7 @@ set(complex_obj_deps device_complex.h device.h sycl-compiler) set(cmath_obj_deps device_math.h device.h sycl-compiler) set(imf_obj_deps device_imf.hpp imf_half.hpp imf_bf16.hpp device.h sycl-compiler) set(itt_obj_deps device_itt.h spirv_vars.h device.h sycl-compiler) -set(bfloat16_obj_deps sycl-compiler) +set(bfloat16_obj_deps sycl-headers sycl-compiler) add_devicelib_obj(libsycl-itt-stubs SRC itt_stubs.cpp DEP ${itt_obj_deps}) add_devicelib_obj(libsycl-itt-compiler-wrappers SRC itt_compiler_wrappers.cpp DEP ${itt_obj_deps}) diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h b/llvm/include/llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h index 4845ad6e97bf1..f8f1078b0669a 100644 --- a/llvm/include/llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h +++ b/llvm/include/llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h @@ -20,13 +20,19 @@ namespace llvm { class SYCLPropagateAspectsUsagePass : public PassInfoMixin { public: - SYCLPropagateAspectsUsagePass(StringRef OptionsString = {}) { + SYCLPropagateAspectsUsagePass(std::set ExcludeAspects = {}, + bool ValidateAspects = true, + StringRef OptionsString = {}) + : ExcludedAspects{std::move(ExcludeAspects)}, + ValidateAspectUsage{ValidateAspects} { OptionsString.split(this->TargetFixedAspects, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false); }; PreservedAnalyses run(Module &M, ModuleAnalysisManager &); private: + std::set ExcludedAspects; + const bool ValidateAspectUsage; SmallVector TargetFixedAspects; }; diff --git a/llvm/lib/SYCLLowerIR/CompileTimeProperties.def b/llvm/lib/SYCLLowerIR/CompileTimeProperties.def index f1490e3b8e020..54e96c1823988 100644 --- a/llvm/lib/SYCLLowerIR/CompileTimeProperties.def +++ b/llvm/lib/SYCLLowerIR/CompileTimeProperties.def @@ -27,7 +27,7 @@ SYCL_COMPILE_TIME_PROPERTY("sycl-dwidth", 6178, DecorValueTy::uint32) SYCL_COMPILE_TIME_PROPERTY("sycl-latency", 6179, DecorValueTy::uint32) SYCL_COMPILE_TIME_PROPERTY("sycl-read-write-mode", 6180, DecorValueTy::uint32) SYCL_COMPILE_TIME_PROPERTY("sycl-maxburst", 6181, DecorValueTy::uint32) -SYCL_COMPILE_TIME_PROPERTY("sycl-alignment", 6182, DecorValueTy::uint32) -SYCL_COMPILE_TIME_PROPERTY("sycl-wait-request", 6183, DecorValueTy::uint32) -SYCL_COMPILE_TIME_PROPERTY("sycl-stable", 6184, DecorValueTy::boolean) +SYCL_COMPILE_TIME_PROPERTY("sycl-alignment", 44, DecorValueTy::uint32) +SYCL_COMPILE_TIME_PROPERTY("sycl-wait-request", 6182, DecorValueTy::uint32) +SYCL_COMPILE_TIME_PROPERTY("sycl-stable", 6183, DecorValueTy::boolean) SYCL_COMPILE_TIME_PROPERTY("sycl-strict", 19, DecorValueTy::boolean) diff --git a/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp b/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp index 1d7fd512ba868..a04942c8bb447 100644 --- a/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp +++ b/llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp @@ -337,6 +337,7 @@ PreservedAnalyses CompileTimePropertiesPass::run(Module &M, if (F.getCallingConv() != CallingConv::SPIR_KERNEL) continue; + // Compile time properties on kernel arguments { SmallVector MDOps; MDOps.reserve(F.arg_size()); @@ -345,8 +346,12 @@ PreservedAnalyses CompileTimePropertiesPass::run(Module &M, SmallVector MDArgOps; for (auto &Attribute : F.getAttributes().getParamAttrs(I)) { if (MDNode *SPIRVMetadata = - attributeToDecorateMetadata(Ctx, Attribute)) + attributeToDecorateMetadata(Ctx, Attribute)) { + // sycl-alignment is not collected to SPIRV.ParamDecoration + if (Attribute.getKindAsString() == "sycl-alignment") + continue; MDArgOps.push_back(SPIRVMetadata); + } } if (!MDArgOps.empty()) FoundKernelProperties = true; diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp index 9479d0ee3b838..e6bd8ffe8a44d 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp @@ -671,7 +671,8 @@ class ESIMDIntrinDescTable { {"test.src.tmpl.arg", {t(0), t1(1), t8(2), t16(3), t32(4), c8(17)}}}, {"slm_init", {"slm.init", {a(0)}}}, {"bf_cvt", {"bf.cvt", {a(0)}}}, - {"tf32_cvt", {"tf32.cvt", {a(0)}}}}; + {"tf32_cvt", {"tf32.cvt", {a(0)}}}, + {"bfn", {"bfn", {a(0), a(1), a(2), t(0)}}}}; } const IntrinTable &getTable() { return Table; } diff --git a/llvm/lib/SYCLLowerIR/LowerKernelProps.cpp b/llvm/lib/SYCLLowerIR/LowerKernelProps.cpp index c784c7607112f..5d576b7edfd73 100644 --- a/llvm/lib/SYCLLowerIR/LowerKernelProps.cpp +++ b/llvm/lib/SYCLLowerIR/LowerKernelProps.cpp @@ -50,7 +50,8 @@ void processSetKernelPropertiesCall(CallInst &CI) { GraphNode->addFnAttr(::sycl::kernel_props::ATTR_LARGE_GRF); // Add RegisterAllocMode metadata with arg 2 to the kernel to tell // IGC to compile this kernel in large GRF mode. 2 means large. - if (GraphNode->getCallingConv() == CallingConv::SPIR_KERNEL) { + if (GraphNode->getCallingConv() == CallingConv::SPIR_KERNEL && + !GraphNode->hasMetadata("sycl_explicit_simd")) { auto &Ctx = GraphNode->getContext(); Metadata *AttrMDArgs[] = {ConstantAsMetadata::get( Constant::getIntegerValue(Type::getInt32Ty(Ctx), APInt(32, 2)))}; diff --git a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp index b13e480170fec..0f32fd78b68de 100644 --- a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp +++ b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp @@ -53,6 +53,11 @@ static cl::opt ClSyclFixedTargets( "is expected to be runnable on"), cl::Hidden, cl::init("")); +static cl::opt ClSyclExcludeAspects( + "sycl-propagate-aspects-usage-exclude-aspects", + cl::desc("Specify aspects to exclude when propagating aspect usage"), + cl::Hidden, cl::init("")); + namespace { using AspectsSetTy = SmallSet; @@ -293,15 +298,37 @@ getAspectUsageChain(const Function *F, const FunctionToAspectsMapTy &AspectsMap, return CallChain; } -void createUsedAspectsMetadataForFunctions(FunctionToAspectsMapTy &Map) { +void createUsedAspectsMetadataForFunctions( + FunctionToAspectsMapTy &Map, const AspectsSetTy &ExcludeAspectVals) { for (auto &[F, Aspects] : Map) { if (Aspects.empty()) continue; LLVMContext &C = F->getContext(); + // Create a set of unique aspects. First we add the ones from the found + // aspects that have not been excluded. + AspectsSetTy UniqueAspects; + for (const int &A : Aspects) + if (!ExcludeAspectVals.contains(A)) + UniqueAspects.insert(A); + + // If there are no new aspects, we can just keep the old metadata. + if (UniqueAspects.empty()) + continue; + + // If there is new metadata, merge it with the old aspects. We preserve + // the excluded ones. + if (const MDNode *ExistingAspects = F->getMetadata("sycl_used_aspects")) { + for (const MDOperand &MDOp : ExistingAspects->operands()) { + const Constant *C = cast(MDOp)->getValue(); + UniqueAspects.insert(cast(C)->getSExtValue()); + } + } + + // Create new metadata. SmallVector AspectsMetadata; - for (const auto &A : Aspects) + for (const int &A : UniqueAspects) AspectsMetadata.push_back(ConstantAsMetadata::get( ConstantInt::getSigned(Type::getInt32Ty(C), A))); @@ -506,7 +533,8 @@ void setSyclFixedTargetsMD(const std::vector &EntryPoints, FunctionToAspectsMapTy buildFunctionsToAspectsMap(Module &M, TypeToAspectsMapTy &TypesWithAspects, const AspectValueToNameMapTy &AspectValues, - const std::vector &EntryPoints) { + const std::vector &EntryPoints, + bool ValidateAspects) { FunctionToAspectsMapTy FunctionToUsedAspects; FunctionToAspectsMapTy FunctionToDeclaredAspects; CallGraphTy CG; @@ -522,8 +550,9 @@ buildFunctionsToAspectsMap(Module &M, TypeToAspectsMapTy &TypesWithAspects, for (Function *F : EntryPoints) propagateAspectsThroughCG(F, CG, FunctionToUsedAspects, Visited); - validateUsedAspectsForFunctions(FunctionToUsedAspects, AspectValues, - EntryPoints, CG); + if (ValidateAspects) + validateUsedAspectsForFunctions(FunctionToUsedAspects, AspectValues, + EntryPoints, CG); // The set of aspects from FunctionToDeclaredAspects should be merged to the // set of FunctionToUsedAspects after validateUsedAspectsForFunctions call to @@ -558,6 +587,14 @@ SYCLPropagateAspectsUsagePass::run(Module &M, ModuleAnalysisManager &MAM) { StringRef(ClSyclFixedTargets) .split(TargetFixedAspects, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false); + if (ClSyclExcludeAspects.getNumOccurrences() > 0) { + SmallVector ExcludedAspectsVec; + StringRef(ClSyclExcludeAspects) + .split(ExcludedAspectsVec, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false); + ExcludedAspects.insert(ExcludedAspectsVec.begin(), + ExcludedAspectsVec.end()); + } + std::vector EntryPoints; for (Function &F : M.functions()) if (isEntryPoint(F)) @@ -566,9 +603,19 @@ SYCLPropagateAspectsUsagePass::run(Module &M, ModuleAnalysisManager &MAM) { propagateAspectsToOtherTypesInModule(M, TypesWithAspects, AspectValues); FunctionToAspectsMapTy FunctionToUsedAspects = buildFunctionsToAspectsMap( - M, TypesWithAspects, AspectValues, EntryPoints); + M, TypesWithAspects, AspectValues, EntryPoints, ValidateAspectUsage); + + // Create a set of excluded aspect values. + AspectsSetTy ExcludedAspectVals; + for (const StringRef &AspectName : ExcludedAspects) { + const auto AspectValIter = AspectValues.find(AspectName); + assert(AspectValIter != AspectValues.end() && + "Excluded aspect does not have a corresponding value."); + ExcludedAspectVals.insert(AspectValIter->second); + } - createUsedAspectsMetadataForFunctions(FunctionToUsedAspects); + createUsedAspectsMetadataForFunctions(FunctionToUsedAspects, + ExcludedAspectVals); setSyclFixedTargetsMD(EntryPoints, TargetFixedAspects, AspectValues); diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/double-prop-after-exclude.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/double-prop-after-exclude.ll new file mode 100644 index 0000000000000..8eb2512b507d6 --- /dev/null +++ b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/double-prop-after-exclude.ll @@ -0,0 +1,48 @@ +; RUN: opt -passes=sycl-propagate-aspects-usage -sycl-propagate-aspects-usage-exclude-aspects=fp64 < %s -S -o %t_first.ll +; RUN: opt -passes=sycl-propagate-aspects-usage < %t_first.ll -S -o %t_second.ll +; FileCheck %s --input-file %t_first.ll --check-prefix=CHECK-FIRST +; FileCheck %s --input-file %t_second.ll --check-prefix=CHECK-SECOND +; +; Test checks that fp64 usage is correctly propagate in the two-run model. + +%composite = type { double } + +; CHECK-FIRST-NOT: spir_kernel void @kernel() {{.*}} !sycl_used_aspects +; CHECK-SECOND: spir_kernel void @kernel() !sycl_used_aspects ![[MDID:]] +define spir_kernel void @kernel() { + call spir_func void @func() + ret void +} + +; CHECK-FIRST-NOT: spir_func void @func() {{.*}} !sycl_used_aspects +; CHECK-SECOND: spir_func void @func() !sycl_used_aspects ![[MDID]] { +define spir_func void @func() { + %tmp = alloca double + ret void +} + +; CHECK-FIRST-NOT: spir_func void @func.array() {{.*}} !sycl_used_aspects +; CHECK-SECOND: spir_func void @func.array() !sycl_used_aspects ![[MDID]] { +define spir_func void @func.array() { + %tmp = alloca [4 x double] + ret void +} + +; CHECK-FIRST-NOT: spir_func void @func.vector() {{.*}} !sycl_used_aspects +; CHECK-SECOND: spir_func void @func.vector() !sycl_used_aspects ![[MDID]] { +define spir_func void @func.vector() { + %tmp = alloca <4 x double> + ret void +} + +; CHECK-FIRST-NOT: spir_func void @func.composite() {{.*}} !sycl_used_aspects +; CHECK-SECOND: spir_func void @func.composite() !sycl_used_aspects ![[MDID]] { +define spir_func void @func.composite() { + %tmp = alloca %composite + ret void +} + +!sycl_aspects = !{!0} +!0 = !{!"fp64", i32 6} + +; CHECK-SECOND: ![[MDID]] = !{i32 6} diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/exclude-aspect.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/exclude-aspect.ll new file mode 100644 index 0000000000000..59c7964cd60ad --- /dev/null +++ b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/exclude-aspect.ll @@ -0,0 +1,139 @@ +; RUN: opt -passes=sycl-propagate-aspects-usage -sycl-propagate-aspects-usage-exclude-aspects=aspect4,aspect1 -S < %s | FileCheck %s +; +; Test checks that the pass is able to collect all aspects used in a function + +%A = type { i32 } +%B = type { i32 } +%C = type { i32 } +%D = type { i32 } + +; None of funcA's aspects are excluded. +; CHECK: define spir_func void @funcA() !sycl_used_aspects ![[#ID0:]] { +define spir_func void @funcA() { + %tmp = alloca %A + ret void +} + +; funcB uses "aspect1" which is excluded, so the resulting aspects are the same +; as for funcA. +; CHECK: define spir_func void @funcB() !sycl_used_aspects ![[#ID0]] { +define spir_func void @funcB() { + %tmp = alloca %B + call spir_func void @funcA() + ret void +} + +; funcC has an aspect excluded, propagated from funcB. +; CHECK: define spir_func void @funcC() !sycl_used_aspects ![[#ID1:]] { +define spir_func void @funcC() { + %tmp = alloca %C + call spir_func void @funcB() + ret void +} + +; funcD has two aspects excluded; one from the use of D and one from propagated. +; from funcB and funcC. +; CHECK: define spir_func void @funcD() !sycl_used_aspects ![[#ID2:]] { +define spir_func void @funcD() { + %tmp = alloca %D + call spir_func void @funcC() + ret void +} + +; kernel1 has the same aspects as funcD. +; CHECK: define spir_kernel void @kernel1() !sycl_used_aspects ![[#ID2]] +define spir_kernel void @kernel1() { + call spir_func void @funcD() + ret void +} + +; funcE should get none of its explicitly declared aspects in its +; sycl_used_aspects +; CHECK: define spir_func void @funcE() !sycl_declared_aspects ![[#DA1:]] { +define spir_func void @funcE() !sycl_declared_aspects !10 { + ret void +} + +; funcF should have the same aspects as funcE +; CHECK-NOT: define spir_func void @funcF() {{.*}} !sycl_used_aspects +define spir_func void @funcF() { + call spir_func void @funcE() + ret void +} + +; funcG only keeps one aspect, the rest are excluded +; CHECK: define spir_func void @funcG() !sycl_declared_aspects ![[#DA2:]] !sycl_used_aspects ![[#ID3:]] +define spir_func void @funcG() !sycl_declared_aspects !11 { + ret void +} + +; funcH should have the same aspects as funcG +; CHECK: define spir_func void @funcH() !sycl_used_aspects ![[#ID3]] +define spir_func void @funcH() { + call spir_func void @funcG() + ret void +} + +; CHECK: define spir_kernel void @kernel2() !sycl_used_aspects ![[#ID3]] +define spir_kernel void @kernel2() { + call spir_func void @funcF() + call spir_func void @funcH() + ret void +} + +; CHECK: define spir_func void @funcI() !sycl_used_aspects ![[#DA1]] { +define spir_func void @funcI() !sycl_used_aspects !10 { + ret void +} + +; CHECK-NOT: define spir_func void @funcJ() {{.*}} !sycl_used_aspects +define spir_func void @funcJ() { + call spir_func void @funcI() + ret void +} + +; +; Note that the listed aspects can be reordered due to the merging of the +; aspect sets. +; CHECK: define spir_func void @funcK() !sycl_used_aspects ![[#ID4:]] { +define spir_func void @funcK() !sycl_used_aspects !11 { + ret void +} + +; CHECK: define spir_func void @funcL() !sycl_used_aspects ![[#ID3]] +define spir_func void @funcL() { + call spir_func void @funcK() + ret void +} + +; CHECK: define spir_kernel void @kernel3() !sycl_used_aspects ![[#ID3]] +define spir_kernel void @kernel3() { + call spir_func void @funcK() + call spir_func void @funcL() + ret void +} + +!sycl_types_that_use_aspects = !{!0, !1, !2, !3} +!0 = !{!"A", i32 0} +!1 = !{!"B", i32 1} +!2 = !{!"C", i32 2} +!3 = !{!"D", i32 3, i32 4} + +!sycl_aspects = !{!4, !5, !6, !7, !8, !9} +!4 = !{!"aspect0", i32 0} +!5 = !{!"aspect1", i32 1} +!6 = !{!"aspect2", i32 2} +!7 = !{!"aspect3", i32 3} +!8 = !{!"aspect4", i32 4} +!9 = !{!"fp64", i32 5} + +!10 = !{i32 1} +!11 = !{i32 4, i32 2, i32 1} +; CHECK-DAG: ![[#DA1]] = !{i32 1} +; CHECK-DAG: ![[#DA2]] = !{i32 4, i32 2, i32 1} + +; CHECK-DAG: ![[#ID0]] = !{i32 0} +; CHECK-DAG: ![[#ID1]] = !{i32 2, i32 0} +; CHECK-DAG: ![[#ID2]] = !{i32 0, i32 2, i32 3} +; CHECK-DAG: ![[#ID3]] = !{i32 2} +; CHECK-DAG: ![[#ID4]] = !{i32 2, i32 4, i32 1} diff --git a/llvm/test/SYCLLowerIR/lower_kernel_props.ll b/llvm/test/SYCLLowerIR/lower_kernel_props.ll index 4d7793c39a887..f2cef0617739c 100644 --- a/llvm/test/SYCLLowerIR/lower_kernel_props.ll +++ b/llvm/test/SYCLLowerIR/lower_kernel_props.ll @@ -5,7 +5,7 @@ ; - remove the intrinsic call ; - mark the kernel with corresponding attribute (only "large-grf" for now) -; RUN: opt -passes=lower-kernel-props -S %s -o - | FileCheck %s +; RUN: opt -passes=lower-kernel-props -S %s -o - | FileCheck %s --implicit-check-not='RegisterAllocMode' ; ModuleID = 'large_grf.bc' source_filename = "llvm-link" @@ -33,8 +33,7 @@ define weak_odr dso_local spir_kernel void @__large_grf_kernel1() !sycl_explicit ; -- This kernel calls the marker function directly define weak_odr dso_local spir_kernel void @__large_grf_kernel2() #0 !sycl_explicit_simd !0 !intel_reqd_sub_group_size !1 { -; CHECK: {{.*}} spir_kernel void @__large_grf_kernel2() #0 {{.*}} !RegisterAllocMode ![[MetadataArg:[0-9]+]] -; CHECK: ![[MetadataArg]] = !{i32 2} +; CHECK: {{.*}} spir_kernel void @__large_grf_kernel2() #0 {{.*}} call spir_func void @_Z28__sycl_set_kernel_propertiesi(i32 noundef 0) ret void } diff --git a/llvm/test/tools/sycl-post-link/sycl-esimd-large-grf.ll b/llvm/test/tools/sycl-post-link/sycl-esimd-large-grf.ll index 413380ae70432..919eabccfa3c4 100644 --- a/llvm/test/tools/sycl-post-link/sycl-esimd-large-grf.ll +++ b/llvm/test/tools/sycl-post-link/sycl-esimd-large-grf.ll @@ -9,7 +9,7 @@ ; RUN: sycl-post-link -split=source -symbols -split-esimd -lower-esimd -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t.table -; RUN: FileCheck %s -input-file=%t_esimd_large_grf_0.ll --check-prefixes CHECK-ESIMD-LargeGRF-IR +; RUN: FileCheck %s -input-file=%t_esimd_large_grf_0.ll --check-prefixes CHECK-ESIMD-LargeGRF-IR --implicit-check-not='RegisterAllocMode' ; RUN: FileCheck %s -input-file=%t_esimd_large_grf_0.prop --check-prefixes CHECK-ESIMD-LargeGRF-PROP ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYCL-SYM ; RUN: FileCheck %s -input-file=%t_esimd_1.sym --check-prefixes CHECK-ESIMD-SYM @@ -58,8 +58,7 @@ entry: declare dso_local spir_func void @_Z28__sycl_set_kernel_propertiesi(i32 noundef) define weak_odr dso_local spir_kernel void @__ESIMD_large_grf_kernel() #0 !sycl_explicit_simd !0 !intel_reqd_sub_group_size !1 { -; CHECK-ESIMD-LargeGRF-IR: @__ESIMD_large_grf_kernel() {{.*}} !RegisterAllocMode ![[MetadataArg:[0-9]+]] -; CHECK-ESIMD-LargeGRF-IR: ![[MetadataArg]] = !{i32 2} +; CHECK-ESIMD-LargeGRF-IR: @__ESIMD_large_grf_kernel() {{.*}} entry: call spir_func void @_Z17large_grf_markerv() ret void diff --git a/llvm/test/tools/sycl-post-link/sycl-kernel-arg-annotations.ll b/llvm/test/tools/sycl-post-link/sycl-kernel-arg-annotations.ll index f6cec2be141ef..8f04701bfcac8 100644 --- a/llvm/test/tools/sycl-post-link/sycl-kernel-arg-annotations.ll +++ b/llvm/test/tools/sycl-post-link/sycl-kernel-arg-annotations.ll @@ -29,9 +29,8 @@ entry: !1587 = !{i32 -1} !1588 = !{i32 -1, i32 -1, i32 -1} ; CHECK-DAG: ![[PARMDECOR_CASE1]] = !{![[ARG:[0-9]+]]} -; CHECK-DAG: ![[ARG]] = !{![[ALIGN:[0-9]+]], ![[AWIDTH:[0-9]+]], ![[BL:[0-9]+]], ![[CONDUIT:[0-9]+]], ![[DWIDTH:[0-9]+]], ![[LATENCY:[0-9]+]], ![[MAXBURST:[0-9]+]], ![[RWMODE:[0-9]+]], ![[REGMAP:[0-9]+]], ![[STABLE:[0-9]+]], ![[STRICT:[0-9]+]], ![[WAITREQ:[0-9]+]]} +; CHECK-DAG: ![[ARG]] = !{![[AWIDTH:[0-9]+]], ![[BL:[0-9]+]], ![[CONDUIT:[0-9]+]], ![[DWIDTH:[0-9]+]], ![[LATENCY:[0-9]+]], ![[MAXBURST:[0-9]+]], ![[RWMODE:[0-9]+]], ![[REGMAP:[0-9]+]], ![[STABLE:[0-9]+]], ![[STRICT:[0-9]+]], ![[WAITREQ:[0-9]+]]} -; CHECK: ![[ALIGN]] = !{i32 6182, i32 4} ; CHECK: ![[AWIDTH]] = !{i32 6177, i32 32} ; CHECK: ![[BL]] = !{i32 5921, i32 10} ; CHECK: ![[CONDUIT]] = !{i32 6175, i32 1} @@ -40,13 +39,11 @@ entry: ; CHECK: ![[MAXBURST]] = !{i32 6181, i32 3} ; CHECK: ![[RWMODE]] = !{i32 6180, i32 2} ; CHECK: ![[REGMAP]] = !{i32 6176, i32 1} -; CHECK: ![[STABLE]] = !{i32 6184, i32 1} +; CHECK: ![[STABLE]] = !{i32 6183, i32 1} ; CHECK: ![[STRICT]] = !{i32 19, i32 1} -; CHECK: ![[WAITREQ]] = !{i32 6183, i32 5} +; CHECK: ![[WAITREQ]] = !{i32 6182, i32 5} -; CHECK-DAG: ![[PARMDECOR_CASE2]] = !{![[ARG1:[0-9]+]], ![[ARG2:[0-9]+]], ![[ARG3:[0-9]+]]} -; CHECK-DAG: ![[ARG1]] = !{![[ALIGN1:[0-9]+]]} -; CHECK: ![[ALIGN1]] = !{i32 6182, i32 8} -; CHECK-DAG: ![[ARG2]] = !{} +; CHECK-DAG: ![[PARMDECOR_CASE2]] = !{![[ARG1:[0-9]+]], ![[ARG1:[0-9]+]], ![[ARG3:[0-9]+]]} +; CHECK-DAG: ![[ARG1]] = !{} ; CHECK-DAG: ![[ARG3]] = !{![[AWIDTH3:[0-9]+]]} ; CHECK: ![[AWIDTH3]] = !{i32 6177, i32 64} diff --git a/opencl/CMakeLists.txt b/opencl/CMakeLists.txt index 1442a1ac43075..4ed26bd5e719d 100644 --- a/opencl/CMakeLists.txt +++ b/opencl/CMakeLists.txt @@ -44,7 +44,7 @@ FetchContent_GetProperties(ocl-headers) set(OpenCL_INCLUDE_DIR ${ocl-headers_SOURCE_DIR} CACHE PATH "Path to OpenCL Headers") -target_compile_definitions(Headers INTERFACE -DCL_TARGET_OPENCL_VERSION=220) +target_compile_definitions(Headers INTERFACE -DCL_TARGET_OPENCL_VERSION=300) add_library(OpenCL-Headers ALIAS Headers) # OpenCL Library (ICD Loader) diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index bb3cf0476c689..c4ab71454af84 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -162,6 +162,12 @@ endif() if ("esimd_emulator" IN_LIST SYCL_ENABLE_PLUGINS) set(SYCL_BUILD_PI_ESIMD_EMULATOR ON) endif() +if ("opencl" IN_LIST SYCL_ENABLE_PLUGINS) + set(SYCL_BUILD_PI_OPENCL ON) +endif() +if ("level_zero" IN_LIST SYCL_ENABLE_PLUGINS) + set(SYCL_BUILD_PI_LEVEL_ZERO ON) +endif() # Configure SYCL version macro set(sycl_inc_dir ${CMAKE_CURRENT_SOURCE_DIR}/include) diff --git a/sycl/doc/design/KernelProgramCache.md b/sycl/doc/design/KernelProgramCache.md index fa46ab6e70289..7818706314c1e 100644 --- a/sycl/doc/design/KernelProgramCache.md +++ b/sycl/doc/design/KernelProgramCache.md @@ -215,7 +215,7 @@ kernel of this program will be cached also. All requests to build a program or to create a kernel - whether they originate from explicit user API calls or from internal SYCL runtime execution logic - end up with calling the function -[`getOrBuild()`](https://github.com/intel/llvm/blob/sycl/sycl/source/detail/program_manager/program_manager.cpp#L149) +[`getOrBuild()`](https://github.com/intel/llvm/blob/sycl/sycl/source/detail/program_manager/program_manager.cpp) with number of lambda functions passed as arguments: - Acquire function; @@ -225,7 +225,7 @@ with number of lambda functions passed as arguments: *Acquire* function returns a locked version of cache. Locking is employed for thread safety. The threads are blocked only for insert-or-acquire attempt, i.e. when calling to `map::insert` in -[`getOrBuild`](https://github.com/intel/llvm/blob/sycl/sycl/source/detail/program_manager/program_manager.cpp#L149) +[`getOrBuild()`](https://github.com/intel/llvm/blob/sycl/sycl/source/detail/program_manager/program_manager.cpp) function. The rest of operation is done with the help of atomics and condition variables (plus a mutex for proper work of condition variable). diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc index b2dfa639b3f75..fdf3ced15afde 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc @@ -44,7 +44,7 @@ This extension also depends on the following other SYCL extensions: * link:../experimental/sycl_ext_oneapi_properties.asciidoc[ sycl_ext_oneapi_properties] -* link:../proposed/sycl_ext_oneapi_kernel_properties.asciidoc[ +* link:../experimental/sycl_ext_oneapi_kernel_properties.asciidoc[ sycl_ext_oneapi_kernel_properties] diff --git a/sycl/doc/extensions/supported/sycl_ext_oneapi_weak_object.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_weak_object.asciidoc index 77f3b8cf179c8..1b58d93091c47 100644 --- a/sycl/doc/extensions/supported/sycl_ext_oneapi_weak_object.asciidoc +++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_weak_object.asciidoc @@ -330,3 +330,5 @@ they are both empty `weak_object` instances. |=== +The `weak_object` class, the `ext_oneapi_owner_before` member functions and the +`owner_less` function object type must not be used in device code. diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 610a400af48d4..9cd947c21e8d7 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include #include @@ -1061,6 +1062,19 @@ __CLC_BF16_SCAL_VEC(uint32_t) #undef __CLC_BF16_SCAL_VEC #undef __CLC_BF16 +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL + __SYCL_EXPORT __spv::complex_half + __spirv_GroupCMulINTEL(unsigned int, unsigned int, + __spv::complex_half) noexcept; +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL + __SYCL_EXPORT __spv::complex_float + __spirv_GroupCMulINTEL(unsigned int, unsigned int, + __spv::complex_float) noexcept; +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL + __SYCL_EXPORT __spv::complex_double + __spirv_GroupCMulINTEL(unsigned int, unsigned int, + __spv::complex_double) noexcept; + extern __DPCPP_SYCL_EXTERNAL int32_t __spirv_BuiltInGlobalHWThreadIDINTEL(); extern __DPCPP_SYCL_EXTERNAL int32_t __spirv_BuiltInSubDeviceIDINTEL(); diff --git a/sycl/include/CL/__spirv/spirv_types.hpp b/sycl/include/CL/__spirv/spirv_types.hpp index 89cac7a9d2e07..82c5a39c1500d 100644 --- a/sycl/include/CL/__spirv/spirv_types.hpp +++ b/sycl/include/CL/__spirv/spirv_types.hpp @@ -8,9 +8,12 @@ #pragma once +#include "sycl/half_type.hpp" #include #include +#include +#include #include #include @@ -128,6 +131,27 @@ enum class MatrixLayout : uint32_t { enum class MatrixUse : uint32_t { MatrixA = 0, MatrixB = 1, Accumulator = 2 }; +struct complex_float { + complex_float() = default; + complex_float(std::complex x) : real(x.real()), imag(x.imag()) {} + operator std::complex() { return {real, imag}; } + float real, imag; +}; + +struct complex_double { + complex_double() = default; + complex_double(std::complex x) : real(x.real()), imag(x.imag()) {} + operator std::complex() { return {real, imag}; } + double real, imag; +}; + +struct complex_half { + complex_half() = default; + complex_half(std::complex x) : real(x.real()), imag(x.imag()) {} + operator std::complex() { return {real, imag}; } + sycl::half real, imag; +}; + #if (SYCL_EXT_ONEAPI_MATRIX_VERSION > 1) template > + typename = detail::enable_if_t< + (AccessTarget_ == access::target::host_buffer) || + (AccessTarget_ == access::target::host_task)>> #if SYCL_LANGUAGE_VERSION >= 202001 std::add_pointer_t get_pointer() const noexcept #else @@ -2667,10 +2668,6 @@ class __SYCL_SPECIAL_CLASS local_accessor_base : return AccessorSubscript(*this, Index); } - local_ptr get_pointer() const { - return local_ptr(getQualifiedPtr()); - } - bool operator==(const local_accessor_base &Rhs) const { return impl == Rhs.impl; } @@ -2699,6 +2696,11 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS accessor< // Use base classes constructors using local_acc::local_acc; +public: + local_ptr get_pointer() const { + return local_ptr(local_acc::getQualifiedPtr()); + } + #ifdef __SYCL_DEVICE_ONLY__ // __init needs to be defined within the class not through inheritance. @@ -2805,6 +2807,10 @@ class __SYCL_EBO __SYCL_SPECIAL_CLASS __SYCL_TYPE(local_accessor) local_accessor return const_reverse_iterator(begin()); } + std::add_pointer_t get_pointer() const noexcept { + return std::add_pointer_t(local_acc::getQualifiedPtr()); + } + template accessor_ptr get_multi_ptr() const noexcept { return accessor_ptr(local_acc::getQualifiedPtr()); diff --git a/sycl/include/sycl/builtins.hpp b/sycl/include/sycl/builtins.hpp index fe6eeaaec2e4e..9e808e60ddebc 100644 --- a/sycl/include/sycl/builtins.hpp +++ b/sycl/include/sycl/builtins.hpp @@ -122,10 +122,27 @@ __SYCL_MATH_FUNCTION_OVERLOAD_FM(log2) __SYCL_MATH_FUNCTION_OVERLOAD_FM(log10) __SYCL_MATH_FUNCTION_OVERLOAD_FM(sqrt) __SYCL_MATH_FUNCTION_OVERLOAD_FM(rsqrt) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(fabs) #undef __SYCL_MATH_FUNCTION_OVERLOAD_FM #undef __SYCL_MATH_FUNCTION_OVERLOAD_IMPL +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t::value, marray> + ilogb(marray x) __NOEXC { + marray res; + for (size_t i = 0; i < N / 2; i++) { + vec partial_res = + __sycl_std::__invoke_ilogb>(detail::to_vec2(x, i * 2)); + std::memcpy(&res[i * 2], &partial_res, sizeof(vec)); + } + if (N % 2) { + res[N - 1] = __sycl_std::__invoke_ilogb(x[N - 1]); + } + return res; +} + #define __SYCL_MATH_FUNCTION_2_OVERLOAD_IMPL(NAME) \ marray res; \ for (size_t i = 0; i < N / 2; i++) { \ @@ -170,6 +187,98 @@ inline __SYCL_ALWAYS_INLINE #undef __SYCL_MATH_FUNCTION_2_OVERLOAD_IMPL +#define __SYCL_MATH_FUNCTION_2_SGENFLOAT_Y_OVERLOAD(NAME) \ + template \ + inline __SYCL_ALWAYS_INLINE \ + std::enable_if_t::value, marray> \ + NAME(marray x, T y) __NOEXC { \ + marray res; \ + sycl::vec y_vec{y, y}; \ + for (size_t i = 0; i < N / 2; i++) { \ + auto partial_res = __sycl_std::__invoke_##NAME>( \ + detail::to_vec2(x, i * 2), y_vec); \ + std::memcpy(&res[i * 2], &partial_res, sizeof(vec)); \ + } \ + if (N % 2) { \ + res[N - 1] = __sycl_std::__invoke_##NAME(x[N - 1], y_vec[0]); \ + } \ + return res; \ + } + +__SYCL_MATH_FUNCTION_2_SGENFLOAT_Y_OVERLOAD(fmax) + // clang-format off +__SYCL_MATH_FUNCTION_2_SGENFLOAT_Y_OVERLOAD(fmin) + +#undef __SYCL_MATH_FUNCTION_2_SGENFLOAT_Y_OVERLOAD + +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t::value, marray> + ldexp(marray x, marray k) __NOEXC { + // clang-format on + marray res; + for (size_t i = 0; i < N; i++) { + res[i] = __sycl_std::__invoke_ldexp(x[i], k[i]); + } + return res; +} + +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t::value, marray> + ldexp(marray x, int k) __NOEXC { + marray res; + for (size_t i = 0; i < N; i++) { + res[i] = __sycl_std::__invoke_ldexp(x[i], k); + } + return res; +} + +#define __SYCL_MATH_FUNCTION_2_GENINT_Y_OVERLOAD_IMPL(NAME) \ + marray res; \ + for (size_t i = 0; i < N; i++) { \ + res[i] = __sycl_std::__invoke_##NAME(x[i], y[i]); \ + } \ + return res; + +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t::value, marray> + pown(marray x, marray y) __NOEXC { + __SYCL_MATH_FUNCTION_2_GENINT_Y_OVERLOAD_IMPL(pown) +} + +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t::value, marray> + rootn(marray x, marray y) __NOEXC { + __SYCL_MATH_FUNCTION_2_GENINT_Y_OVERLOAD_IMPL(rootn) +} + +#undef __SYCL_MATH_FUNCTION_2_GENINT_Y_OVERLOAD_IMPL + +#define __SYCL_MATH_FUNCTION_2_INT_Y_OVERLOAD_IMPL(NAME) \ + marray res; \ + for (size_t i = 0; i < N; i++) { \ + res[i] = __sycl_std::__invoke_##NAME(x[i], y); \ + } \ + return res; + +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t::value, marray> + pown(marray x, int y) __NOEXC { + __SYCL_MATH_FUNCTION_2_INT_Y_OVERLOAD_IMPL(pown) +} + +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t::value, marray> + rootn(marray x, + int y) __NOEXC{__SYCL_MATH_FUNCTION_2_INT_Y_OVERLOAD_IMPL(rootn)} + +#undef __SYCL_MATH_FUNCTION_2_INT_Y_OVERLOAD_IMPL + #define __SYCL_MATH_FUNCTION_3_OVERLOAD(NAME) \ template \ inline __SYCL_ALWAYS_INLINE \ @@ -789,6 +898,80 @@ detail::enable_if_t::value, T> sign(T x) __NOEXC { return __sycl_std::__invoke_sign(x); } +// marray common functions + +// TODO: can be optimized in the way math functions are optimized (usage of +// vec) +#define __SYCL_MARRAY_COMMON_FUNCTION_OVERLOAD_IMPL(NAME, ...) \ + T res; \ + for (int i = 0; i < T::size(); i++) { \ + res[i] = NAME(__VA_ARGS__); \ + } \ + return res; + +#define __SYCL_MARRAY_COMMON_FUNCTION_UNOP_OVERLOAD(NAME, ARG, ...) \ + template ::value>> \ + T NAME(ARG) __NOEXC { \ + __SYCL_MARRAY_COMMON_FUNCTION_OVERLOAD_IMPL(NAME, __VA_ARGS__) \ + } + +__SYCL_MARRAY_COMMON_FUNCTION_UNOP_OVERLOAD(degrees, T radians, radians[i]) +__SYCL_MARRAY_COMMON_FUNCTION_UNOP_OVERLOAD(radians, T degrees, degrees[i]) +__SYCL_MARRAY_COMMON_FUNCTION_UNOP_OVERLOAD(sign, T x, x[i]) + +#undef __SYCL_MARRAY_COMMON_FUNCTION_UNOP_OVERLOAD + +#define __SYCL_MARRAY_COMMON_FUNCTION_BINOP_OVERLOAD(NAME, ARG1, ARG2, ...) \ + template ::value>> \ + T NAME(ARG1, ARG2) __NOEXC { \ + __SYCL_MARRAY_COMMON_FUNCTION_OVERLOAD_IMPL(NAME, __VA_ARGS__) \ + } + +// min and max may be defined as macros, so we wrap them in parentheses to avoid +// errors. +__SYCL_MARRAY_COMMON_FUNCTION_BINOP_OVERLOAD((min), T x, T y, x[i], y[i]) +__SYCL_MARRAY_COMMON_FUNCTION_BINOP_OVERLOAD((min), T x, + detail::marray_element_type y, + x[i], y) +__SYCL_MARRAY_COMMON_FUNCTION_BINOP_OVERLOAD((max), T x, T y, x[i], y[i]) +__SYCL_MARRAY_COMMON_FUNCTION_BINOP_OVERLOAD((max), T x, + detail::marray_element_type y, + x[i], y) +__SYCL_MARRAY_COMMON_FUNCTION_BINOP_OVERLOAD(step, T edge, T x, edge[i], x[i]) +__SYCL_MARRAY_COMMON_FUNCTION_BINOP_OVERLOAD( + step, detail::marray_element_type edge, T x, edge, x[i]) + +#undef __SYCL_MARRAY_COMMON_FUNCTION_BINOP_OVERLOAD + +#define __SYCL_MARRAY_COMMON_FUNCTION_TEROP_OVERLOAD(NAME, ARG1, ARG2, ARG3, \ + ...) \ + template ::value>> \ + T NAME(ARG1, ARG2, ARG3) __NOEXC { \ + __SYCL_MARRAY_COMMON_FUNCTION_OVERLOAD_IMPL(NAME, __VA_ARGS__) \ + } + +__SYCL_MARRAY_COMMON_FUNCTION_TEROP_OVERLOAD(clamp, T x, T minval, T maxval, + x[i], minval[i], maxval[i]) +__SYCL_MARRAY_COMMON_FUNCTION_TEROP_OVERLOAD( + clamp, T x, detail::marray_element_type minval, + detail::marray_element_type maxval, x[i], minval, maxval) +__SYCL_MARRAY_COMMON_FUNCTION_TEROP_OVERLOAD(mix, T x, T y, T a, x[i], y[i], + a[i]) +__SYCL_MARRAY_COMMON_FUNCTION_TEROP_OVERLOAD(mix, T x, T y, + detail::marray_element_type a, + x[i], y[i], a) +__SYCL_MARRAY_COMMON_FUNCTION_TEROP_OVERLOAD(smoothstep, T edge0, T edge1, T x, + edge0[i], edge1[i], x[i]) +__SYCL_MARRAY_COMMON_FUNCTION_TEROP_OVERLOAD( + smoothstep, detail::marray_element_type edge0, + detail::marray_element_type edge1, T x, edge0, edge1, x[i]) + +#undef __SYCL_MARRAY_COMMON_FUNCTION_TEROP_OVERLOAD +#undef __SYCL_MARRAY_COMMON_FUNCTION_OVERLOAD_IMPL + /* --------------- 4.13.4 Integer functions. --------------------------------*/ // ugeninteger abs (geninteger x) template @@ -1433,8 +1616,7 @@ any(T x) __NOEXC { // int any (vigeninteger x) template -detail::enable_if_t::value, detail::anyall_ret_t> -any(T x) __NOEXC { +detail::enable_if_t::value, int> any(T x) __NOEXC { return detail::rel_sign_bit_test_ret_t( __sycl_std::__invoke_Any>( detail::rel_sign_bit_test_arg_t(x))); @@ -1449,8 +1631,7 @@ all(T x) __NOEXC { // int all (vigeninteger x) template -detail::enable_if_t::value, detail::anyall_ret_t> -all(T x) __NOEXC { +detail::enable_if_t::value, int> all(T x) __NOEXC { return detail::rel_sign_bit_test_ret_t( __sycl_std::__invoke_All>( detail::rel_sign_bit_test_arg_t(x))); @@ -1724,6 +1905,7 @@ __SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(cos) __SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(tan) __SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(exp) __SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(exp2) +__SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(exp10) __SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(log) __SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(log2) __SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(log10) diff --git a/sycl/include/sycl/detail/cl.h b/sycl/include/sycl/detail/cl.h index 7e90fe126e40d..20d640bcff59f 100644 --- a/sycl/include/sycl/detail/cl.h +++ b/sycl/include/sycl/detail/cl.h @@ -9,9 +9,9 @@ #pragma once // Suppress a compiler message about undefined CL_TARGET_OPENCL_VERSION -// and define all symbols up to OpenCL 2.2 +// and define all symbols up to OpenCL 3.0 #ifndef CL_TARGET_OPENCL_VERSION -#define CL_TARGET_OPENCL_VERSION 220 +#define CL_TARGET_OPENCL_VERSION 300 #endif #include diff --git a/sycl/include/sycl/detail/generic_type_traits.hpp b/sycl/include/sycl/detail/generic_type_traits.hpp index 09e1bb4850139..a8c479b816562 100644 --- a/sycl/include/sycl/detail/generic_type_traits.hpp +++ b/sycl/include/sycl/detail/generic_type_traits.hpp @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -16,6 +17,7 @@ #include #include +#include #include namespace sycl { @@ -446,6 +448,14 @@ using select_cl_scalar_float_t = select_apply_cl_scalar_t; +template +using select_cl_scalar_complex_or_T_t = std::conditional_t< + std::is_same>::value, __spv::complex_float, + std::conditional_t< + std::is_same>::value, __spv::complex_double, + std::conditional_t>::value, + __spv::complex_half, T>>>; + template using select_cl_scalar_integral_t = conditional_t::value, @@ -457,12 +467,13 @@ using select_cl_scalar_integral_t = template using select_cl_scalar_t = conditional_t< std::is_integral::value, select_cl_scalar_integral_t, - conditional_t< - std::is_floating_point::value, select_cl_scalar_float_t, - // half is a special case: it is implemented differently on host and - // device and therefore, might lower to different types - conditional_t::value, - sycl::detail::half_impl::BIsRepresentationT, T>>>; + conditional_t::value, select_cl_scalar_float_t, + // half is a special case: it is implemented differently on + // host and device and therefore, might lower to different + // types + conditional_t::value, + sycl::detail::half_impl::BIsRepresentationT, + select_cl_scalar_complex_or_T_t>>>; // select_cl_vector_or_scalar_or_ptr does cl_* type selection for element type // of a vector type T, pointer type substitution, and scalar type substitution. diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp index f5563c28ad12e..474ced1a5fe91 100644 --- a/sycl/include/sycl/detail/type_traits.hpp +++ b/sycl/include/sycl/detail/type_traits.hpp @@ -65,8 +65,10 @@ template -inline constexpr bool is_group_v = - detail::is_group::value || detail::is_sub_group::value; +struct is_group : std::bool_constant::value || + detail::is_sub_group::value> {}; + +template inline constexpr bool is_group_v = is_group::value; namespace ext::oneapi::experimental { template diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index d6da54db85118..bed0f2dcd1d14 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -714,6 +714,19 @@ __esimd_dpasw_nosrc0(__ESIMD_DNS::vector_type_t src1, } #endif // !__SYCL_DEVICE_ONLY__ +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N) + __esimd_bfn(__ESIMD_raw_vec_t(T, N) src0, __ESIMD_raw_vec_t(T, N) src1, + __ESIMD_raw_vec_t(T, N) src2) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else // !__SYCL_DEVICE_ONLY__ +{ + __ESIMD_UNSUPPORTED_ON_HOST; + return __ESIMD_DNS::vector_type_t(); +} +#endif // !__SYCL_DEVICE_ONLY__ + #undef __ESIMD_raw_vec_t #undef __ESIMD_cpp_vec_t diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp index 7783b13a3d064..e337e12c66ebf 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp @@ -1727,6 +1727,106 @@ __ESIMD_API __ESIMD_NS::simd dpasw2( } /// @} sycl_esimd_systolic_array_api +/// @addtogroup sycl_esimd_logical +/// @{ + +/// This enum is used to encode all possible logical operations performed +/// on the 3 input operands. It is used as a template argument of the bfn() +/// function. +/// Example: d = bfn<~bfn_t::x & ~bfn_t::y & ~bfn_t::z>(s0, s1, s2); +enum class bfn_t : uint8_t { x = 0xAA, y = 0xCC, z = 0xF0 }; + +static constexpr bfn_t operator~(bfn_t x) { + uint8_t val = static_cast(x); + uint8_t res = ~val; + return static_cast(res); +} + +static constexpr bfn_t operator|(bfn_t x, bfn_t y) { + uint8_t arg0 = static_cast(x); + uint8_t arg1 = static_cast(y); + uint8_t res = arg0 | arg1; + return static_cast(res); +} + +static constexpr bfn_t operator&(bfn_t x, bfn_t y) { + uint8_t arg0 = static_cast(x); + uint8_t arg1 = static_cast(y); + uint8_t res = arg0 & arg1; + return static_cast(res); +} + +static constexpr bfn_t operator^(bfn_t x, bfn_t y) { + uint8_t arg0 = static_cast(x); + uint8_t arg1 = static_cast(y); + uint8_t res = arg0 ^ arg1; + return static_cast(res); +} + +/// Performs binary function computation with three vector operands. +/// @tparam FuncControl boolean function control expressed with bfn_t +/// enum values. +/// @tparam T type of the input vector element. +/// @tparam N size of the input vector. +/// @param s0 First boolean function argument. +/// @param s1 Second boolean function argument. +/// @param s2 Third boolean function argument. +template +__ESIMD_API std::enable_if_t, __ESIMD_NS::simd> +bfn(__ESIMD_NS::simd src0, __ESIMD_NS::simd src1, + __ESIMD_NS::simd src2) { + if constexpr ((sizeof(T) == 8) || ((sizeof(T) == 1) && (N % 4 == 0)) || + ((sizeof(T) == 2) && (N % 2 == 0))) { + // Bitcast Nx8-byte vectors to 2xN vectors of 4-byte integers. + // Bitcast Nx1-byte vectors to N/4 vectors of 4-byte integers. + // Bitcast Nx2-byte vectors to N/2 vectors of 4-byte integers. + auto Result = __ESIMD_ENS::bfn( + src0.template bit_cast_view().read(), + src1.template bit_cast_view().read(), + src2.template bit_cast_view().read()); + return Result.template bit_cast_view(); + } else if constexpr (sizeof(T) == 2 || sizeof(T) == 4) { + constexpr uint8_t FC = static_cast(FuncControl); + return __esimd_bfn(src0.data(), src1.data(), src2.data()); + } else if constexpr (N % 2 == 0) { + // Bitcast Nx1-byte vectors (N is even) to N/2 vectors of 2-byte integers. + auto Result = __ESIMD_ENS::bfn( + src0.template bit_cast_view().read(), + src1.template bit_cast_view().read(), + src2.template bit_cast_view().read()); + return Result.template bit_cast_view(); + } else { + // Odd number of 1-byte elements. + __ESIMD_NS::simd Src0, Src1, Src2; + Src0.template select() = src0; + Src1.template select() = src1; + Src2.template select() = src2; + auto Result = __ESIMD_ENS::bfn(Src0, Src1, Src2); + return Result.template select(); + } +} + +/// Performs binary function computation with three scalar operands. +/// @tparam FuncControl boolean function control expressed with bfn_t enum +/// values. +/// @tparam T type of the input vector element. +/// @param s0 First boolean function argument. +/// @param s1 Second boolean function argument. +/// @param s2 Third boolean function argument. +template +ESIMD_NODEBUG ESIMD_INLINE std::enable_if_t< + __ESIMD_DNS::is_esimd_scalar::value && std::is_integral_v, T> +bfn(T src0, T src1, T src2) { + __ESIMD_NS::simd Src0 = src0; + __ESIMD_NS::simd Src1 = src1; + __ESIMD_NS::simd Src2 = src2; + __ESIMD_NS::simd Result = + esimd::bfn(Src0, Src1, Src2); + return Result[0]; +} + +/// @} sycl_esimd_logical + } // namespace ext::intel::experimental::esimd } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index c604bf249ca1b..201a92ab53ec4 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -2665,7 +2665,9 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, typename Toffset> -__ESIMD_API __ESIMD_NS::simd +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op()>() == 0, + __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd_mask pred) { static_assert(std::is_integral_v, "Unsupported offset type"); @@ -2697,7 +2699,9 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N, cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, typename Toffset, typename RegionTy = __ESIMD_NS::region1d_t> -__ESIMD_API __ESIMD_NS::simd +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op()>() == 0, + __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd_view offsets, __ESIMD_NS::simd_mask pred = 1) { using Ty = typename __ESIMD_NS::simd_view::element_type; @@ -2709,8 +2713,10 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, typename Toffset> -__ESIMD_API - std::enable_if_t, __ESIMD_NS::simd> +__ESIMD_API std::enable_if_t< + std::is_integral_v && + __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op()>() == 0, + __ESIMD_NS::simd> lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd_mask pred = 1) { return lsc_atomic_update( p, __ESIMD_NS::simd(offset), pred); @@ -2735,7 +2741,9 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, typename Toffset> -__ESIMD_API __ESIMD_NS::simd +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op()>() == 1, + __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd_mask pred) { static_assert(std::is_integral_v, "Unsupported offset type"); @@ -2768,7 +2776,9 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N, cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, typename Toffset, typename RegionTy = __ESIMD_NS::region1d_t> -__ESIMD_API __ESIMD_NS::simd +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op()>() == 1, + __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd_view offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd_mask pred = 1) { @@ -2781,11 +2791,13 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, typename Toffset> -__ESIMD_API std::enable_if_t && - ((Op != __ESIMD_NS::atomic_op::store && - Op != __ESIMD_NS::atomic_op::xchg) || - N == 1), - __ESIMD_NS::simd> +__ESIMD_API std::enable_if_t< + std::is_integral_v && + __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op()>() == 1 && + ((Op != __ESIMD_NS::atomic_op::store && + Op != __ESIMD_NS::atomic_op::xchg) || + N == 1), + __ESIMD_NS::simd> lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd src0, __ESIMD_NS::simd_mask pred = 1) { return lsc_atomic_update( @@ -2811,7 +2823,9 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, typename Toffset> -__ESIMD_API __ESIMD_NS::simd +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op()>() == 2, + __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd_mask pred) { @@ -2846,7 +2860,9 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N, cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, typename Toffset, typename RegionTy = __ESIMD_NS::region1d_t> -__ESIMD_API __ESIMD_NS::simd +__ESIMD_API std::enable_if_t< + __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op()>() == 2, + __ESIMD_NS::simd> lsc_atomic_update(T *p, __ESIMD_NS::simd_view offsets, __ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd_mask pred = 1) { @@ -2859,8 +2875,10 @@ template <__ESIMD_NS::atomic_op Op, typename T, int N, lsc_data_size DS = lsc_data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none, typename Toffset> -__ESIMD_API - std::enable_if_t, __ESIMD_NS::simd> +__ESIMD_API std::enable_if_t< + std::is_integral_v && + __ESIMD_DNS::get_num_args<__ESIMD_DNS::to_lsc_atomic_op()>() == 2, + __ESIMD_NS::simd> lsc_atomic_update(T *p, Toffset offset, __ESIMD_NS::simd src0, __ESIMD_NS::simd src1, __ESIMD_NS::simd_mask pred = 1) { @@ -3082,58 +3100,64 @@ namespace esimd { /// native::lsc::atomic_op instead of atomic_op as atomic /// operation template argument. template -__ESIMD_API simd atomic_update(T *p, simd offset, - simd_mask mask) { +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 0, simd> +atomic_update(T *p, simd offset, simd_mask mask) { return __ESIMD_ENS::lsc_atomic_update(), T, N>( p, offset, mask); } template > -__ESIMD_API simd atomic_update(T *p, simd_view offsets, - simd_mask mask = 1) { +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 0, simd> +atomic_update(T *p, simd_view offsets, + simd_mask mask = 1) { return __ESIMD_ENS::lsc_atomic_update(), T, N>( p, offsets, mask); } template -__ESIMD_API - std::enable_if_t, __ESIMD_NS::simd> - atomic_update(T *p, Toffset offset, simd_mask mask = 1) { +__ESIMD_API std::enable_if_t && + __ESIMD_DNS::get_num_args() == 0, + simd> +atomic_update(T *p, Toffset offset, simd_mask mask = 1) { return __ESIMD_ENS::lsc_atomic_update(), T, N>( p, offset, mask); } /// LSC version of the single-argument atomic update. template -__ESIMD_API simd atomic_update(T *p, simd offset, - simd src0, simd_mask mask) { +__ESIMD_API __ESIMD_API + std::enable_if_t<__ESIMD_DNS::get_num_args() == 1, simd> + atomic_update(T *p, simd offset, simd src0, + simd_mask mask) { return __ESIMD_ENS::lsc_atomic_update(), T, N>( p, offset, src0, mask); } template > -__ESIMD_API simd atomic_update(T *p, simd_view offsets, - simd src0, simd_mask mask = 1) { +__ESIMD_API __ESIMD_API + std::enable_if_t<__ESIMD_DNS::get_num_args() == 1, simd> + atomic_update(T *p, simd_view offsets, simd src0, + simd_mask mask = 1) { return __ESIMD_ENS::lsc_atomic_update(), T, N>( p, offsets, src0, mask); } template -__ESIMD_API - std::enable_if_t, __ESIMD_NS::simd> - atomic_update(T *p, Toffset offset, simd src0, - simd_mask mask = 1) { +__ESIMD_API std::enable_if_t && + __ESIMD_DNS::get_num_args() == 1, + simd> +atomic_update(T *p, Toffset offset, simd src0, simd_mask mask = 1) { return __ESIMD_ENS::lsc_atomic_update(), T, N>( p, offset, src0, mask); } /// LSC version of the two-argument atomic update. template -__ESIMD_API simd atomic_update(T *p, simd offset, - simd src0, simd src1, - simd_mask mask) { +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2, simd> +atomic_update(T *p, simd offset, simd src0, simd src1, + simd_mask mask) { // 2-argument lsc_atomic_update arguments order matches the standard one - // expected value first, then new value. But atomic_update uses reverse // order, hence the src1/src0 swap. @@ -3143,18 +3167,19 @@ __ESIMD_API simd atomic_update(T *p, simd offset, template > -__ESIMD_API simd atomic_update(T *p, simd_view offsets, - simd src0, simd src1, - simd_mask mask = 1) { +__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2, simd> +atomic_update(T *p, simd_view offsets, simd src0, + simd src1, simd_mask mask = 1) { return __ESIMD_ENS::lsc_atomic_update(), T, N>( p, offsets, src1, src0, mask); } template -__ESIMD_API - std::enable_if_t, __ESIMD_NS::simd> - atomic_update(T *p, Toffset offset, simd src0, simd src1, - simd_mask mask = 1) { +__ESIMD_API std::enable_if_t && + __ESIMD_DNS::get_num_args() == 2, + __ESIMD_NS::simd> +atomic_update(T *p, Toffset offset, simd src0, simd src1, + simd_mask mask = 1) { return __ESIMD_ENS::lsc_atomic_update(), T, N>( p, offset, src1, src0, mask); } diff --git a/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp b/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp index 018eb56612ea3..0853ff55274c3 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp @@ -194,6 +194,19 @@ template struct unwrap_uniform> { static T impl(uniform val) { return val; } }; +// Verify the callee return type matches the subgroup size as is required by the +// spec. For example: simd foo(simd); The return type vector +// length (8) does not match the subgroup size (16). +template +constexpr void verify_return_type_matches_sg_size() { + if constexpr (is_simd_or_mask_type::value) { + constexpr auto RetVecLength = SimdRet::size(); + static_assert(RetVecLength == SgSize, + "invoke_simd callee return type vector length must match " + "kernel subgroup size"); + } +} + // Deduces subgroup size of the caller based on given SIMD callable and // corresponding SPMD arguments it is being invoke with via invoke_simd. // Basically, for each supported subgroup size, this meta-function finds out if @@ -349,6 +362,8 @@ __attribute__((always_inline)) auto invoke_simd(sycl::sub_group sg, // is fine in this case. constexpr int N = detail::get_sg_size(); using RetSpmd = detail::SpmdRetType; + detail::verify_return_type_matches_sg_size< + N, detail::SimdRetType>(); constexpr bool is_function = detail::is_function_ptr_or_ref_v; if constexpr (is_function) { diff --git a/sycl/include/sycl/ext/oneapi/functional.hpp b/sycl/include/sycl/ext/oneapi/functional.hpp index e9f21be8ce16f..b8be0c6573620 100644 --- a/sycl/include/sycl/ext/oneapi/functional.hpp +++ b/sycl/include/sycl/ext/oneapi/functional.hpp @@ -8,7 +8,9 @@ #pragma once #include +#include +#include #include namespace sycl { @@ -31,6 +33,7 @@ namespace detail { struct GroupOpISigned {}; struct GroupOpIUnsigned {}; struct GroupOpFP {}; +struct GroupOpC {}; template struct GroupOpTag; @@ -49,6 +52,14 @@ struct GroupOpTag::value>> { using type = GroupOpFP; }; +template +struct GroupOpTag< + T, detail::enable_if_t>::value || + std::is_same>::value || + std::is_same>::value>> { + using type = GroupOpC; +}; + #define __SYCL_CALC_OVERLOAD(GroupTag, SPIRVOperation, BinaryOperation) \ template \ static T calc(GroupTag, T x, BinaryOperation) { \ @@ -83,6 +94,7 @@ __SYCL_CALC_OVERLOAD(GroupOpFP, FAdd, sycl::plus) __SYCL_CALC_OVERLOAD(GroupOpISigned, IMulKHR, sycl::multiplies) __SYCL_CALC_OVERLOAD(GroupOpIUnsigned, IMulKHR, sycl::multiplies) __SYCL_CALC_OVERLOAD(GroupOpFP, FMulKHR, sycl::multiplies) +__SYCL_CALC_OVERLOAD(GroupOpC, CMulINTEL, sycl::multiplies) __SYCL_CALC_OVERLOAD(GroupOpISigned, BitwiseOrKHR, sycl::bit_or) __SYCL_CALC_OVERLOAD(GroupOpIUnsigned, BitwiseOrKHR, sycl::bit_or) diff --git a/sycl/include/sycl/ext/oneapi/weak_object_base.hpp b/sycl/include/sycl/ext/oneapi/weak_object_base.hpp index 52877d9d64c9f..7dc10e7e86e1a 100644 --- a/sycl/include/sycl/ext/oneapi/weak_object_base.hpp +++ b/sycl/include/sycl/ext/oneapi/weak_object_base.hpp @@ -33,6 +33,9 @@ template class weak_object_base { weak_object_base(const weak_object_base &Other) noexcept = default; weak_object_base(weak_object_base &&Other) noexcept = default; + weak_object_base &operator=(const weak_object_base &Other) noexcept = default; + weak_object_base &operator=(weak_object_base &&Other) noexcept = default; + void reset() noexcept { MObjWeakPtr.reset(); } void swap(weak_object_base &Other) noexcept { MObjWeakPtr.swap(Other.MObjWeakPtr); diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp index b2b801986cbc6..1fa39d5ba3b5c 100644 --- a/sycl/include/sycl/group_algorithm.hpp +++ b/sycl/include/sycl/group_algorithm.hpp @@ -106,12 +106,19 @@ using is_plus = std::integral_constant< bool, std::is_same>::value || std::is_same>::value>; +// ---- is_multiplies +template +using is_multiplies = std::integral_constant< + bool, std::is_same>::value || + std::is_same>::value>; + // ---- is_complex // NOTE: std::complex not yet supported by group algorithms. template struct is_complex : std::integral_constant>::value || + std::is_same>::value || + std::is_same>::value || std::is_same>::value> { }; @@ -120,12 +127,19 @@ template using is_arithmetic_or_complex = std::integral_constant::value || sycl::detail::is_arithmetic::value>; -// ---- is_plus_if_complex + +template +struct is_vector_arithmetic_or_complex + : bool_constant::value && + (is_arithmetic::value || + is_complex>::value)> {}; + +// ---- is_plus_or_multiplies_if_complex template -using is_plus_if_complex = - std::integral_constant::value - ? is_plus::value - : std::true_type::value)>; +using is_plus_or_multiplies_if_complex = std::integral_constant< + bool, (is_complex::value ? (is_plus::value || + is_multiplies::value) + : std::true_type::value)>; // ---- identity_for_ga_op // the group algorithms support std::complex, limited to sycl::plus operation @@ -139,6 +153,13 @@ identity_for_ga_op() { return {0, 0}; } +template +constexpr detail::enable_if_t< + (is_complex::value && is_multiplies::value), T> +identity_for_ga_op() { + return {1, 0}; +} + template constexpr detail::enable_if_t::value, T> identity_for_ga_op() { return sycl::known_identity_v; @@ -171,7 +192,9 @@ Function for_each(Group g, Ptr first, Ptr last, Function f) { template detail::enable_if_t<(is_group_v> && - detail::is_scalar_arithmetic::value && + (detail::is_scalar_arithmetic::value || + (detail::is_complex::value && + detail::is_multiplies::value)) && detail::is_native_op::value), T> reduce_over_group(Group, T x, BinaryOperation binary_op) { @@ -217,7 +240,7 @@ reduce_over_group(Group g, T x, BinaryOperation binary_op) { template detail::enable_if_t< (is_group_v> && - detail::is_vector_arithmetic>::value && + detail::is_vector_arithmetic_or_complex>::value && detail::is_native_op, BinaryOperation>::value), sycl::vec> reduce_over_group(Group g, sycl::vec x, BinaryOperation binary_op) { @@ -244,8 +267,8 @@ detail::enable_if_t< (detail::is_scalar_arithmetic::value || detail::is_complex::value) && detail::is_native_op::value && detail::is_native_op::value && - detail::is_plus_if_complex::value && - detail::is_plus_if_complex::value), + detail::is_plus_or_multiplies_if_complex::value && + detail::is_plus_or_multiplies_if_complex::value), T> reduce_over_group(Group g, V x, T init, BinaryOperation binary_op) { // FIXME: Do not special-case for half precision @@ -265,8 +288,8 @@ reduce_over_group(Group g, V x, T init, BinaryOperation binary_op) { template detail::enable_if_t<(is_group_v> && - detail::is_vector_arithmetic::value && - detail::is_vector_arithmetic::value && + detail::is_vector_arithmetic_or_complex::value && + detail::is_vector_arithmetic_or_complex::value && detail::is_native_op::value && detail::is_native_op::value), T> @@ -297,8 +320,8 @@ detail::enable_if_t< (is_group_v> && detail::is_pointer::value && detail::is_arithmetic_or_complex< typename detail::remove_pointer::type>::value && - detail::is_plus_if_complex::type, - BinaryOperation>::value), + detail::is_plus_or_multiplies_if_complex< + typename detail::remove_pointer::type, BinaryOperation>::value), typename detail::remove_pointer::type> joint_reduce(Group g, Ptr first, Ptr last, BinaryOperation binary_op) { #ifdef __SYCL_DEVICE_ONLY__ @@ -323,9 +346,9 @@ detail::enable_if_t< detail::is_arithmetic_or_complex::value && detail::is_native_op::type, BinaryOperation>::value && - detail::is_plus_if_complex::type, - BinaryOperation>::value && - detail::is_plus_if_complex::value && + detail::is_plus_or_multiplies_if_complex< + typename detail::remove_pointer::type, BinaryOperation>::value && + detail::is_plus_or_multiplies_if_complex::value && detail::is_native_op::value), T> joint_reduce(Group g, Ptr first, Ptr last, T init, BinaryOperation binary_op) { @@ -600,7 +623,9 @@ group_broadcast(Group g, T x) { // vector template detail::enable_if_t<(is_group_v> && - detail::is_scalar_arithmetic::value && + (detail::is_scalar_arithmetic::value || + (detail::is_complex::value && + detail::is_multiplies::value)) && detail::is_native_op::value), T> exclusive_scan_over_group(Group, T x, BinaryOperation binary_op) { @@ -644,7 +669,7 @@ exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { template detail::enable_if_t<(is_group_v> && - detail::is_vector_arithmetic::value && + detail::is_vector_arithmetic_or_complex::value && detail::is_native_op::value), T> exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { @@ -666,8 +691,8 @@ exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { // once for vector_arithmetic, once for (scalar_arithmetic || complex) template detail::enable_if_t<(is_group_v> && - detail::is_vector_arithmetic::value && - detail::is_vector_arithmetic::value && + detail::is_vector_arithmetic_or_complex::value && + detail::is_vector_arithmetic_or_complex::value && detail::is_native_op::value && detail::is_native_op::value), T> @@ -693,8 +718,8 @@ detail::enable_if_t< (detail::is_scalar_arithmetic::value || detail::is_complex::value) && detail::is_native_op::value && detail::is_native_op::value && - detail::is_plus_if_complex::value && - detail::is_plus_if_complex::value), + detail::is_plus_or_multiplies_if_complex::value && + detail::is_plus_or_multiplies_if_complex::value), T> exclusive_scan_over_group(Group g, V x, T init, BinaryOperation binary_op) { // FIXME: Do not special-case for half precision @@ -732,9 +757,10 @@ detail::enable_if_t< detail::is_native_op::type, BinaryOperation>::value && detail::is_native_op::value && - detail::is_plus_if_complex::type, - BinaryOperation>::value && - detail::is_plus_if_complex::value), + detail::is_plus_or_multiplies_if_complex< + typename detail::remove_pointer::type, + BinaryOperation>::value && + detail::is_plus_or_multiplies_if_complex::value), OutPtr> joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, T init, BinaryOperation binary_op) { @@ -787,8 +813,8 @@ detail::enable_if_t< typename detail::remove_pointer::type>::value && detail::is_native_op::type, BinaryOperation>::value && - detail::is_plus_if_complex::type, - BinaryOperation>::value), + detail::is_plus_or_multiplies_if_complex< + typename detail::remove_pointer::type, BinaryOperation>::value), OutPtr> joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, BinaryOperation binary_op) { @@ -811,7 +837,7 @@ joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, // complex template detail::enable_if_t<(is_group_v> && - detail::is_vector_arithmetic::value && + detail::is_vector_arithmetic_or_complex::value && detail::is_native_op::value), T> inclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { @@ -831,7 +857,9 @@ inclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { template detail::enable_if_t<(is_group_v> && - detail::is_scalar_arithmetic::value && + (detail::is_scalar_arithmetic::value || + (detail::is_complex::value && + detail::is_multiplies::value)) && detail::is_native_op::value), T> inclusive_scan_over_group(Group, T x, BinaryOperation binary_op) { @@ -881,8 +909,8 @@ detail::enable_if_t< (detail::is_scalar_arithmetic::value || detail::is_complex::value) && detail::is_native_op::value && detail::is_native_op::value && - detail::is_plus_if_complex::value && - detail::is_plus_if_complex::value), + detail::is_plus_or_multiplies_if_complex::value && + detail::is_plus_or_multiplies_if_complex::value), T> inclusive_scan_over_group(Group g, V x, BinaryOperation binary_op, T init) { // FIXME: Do not special-case for half precision @@ -904,8 +932,8 @@ inclusive_scan_over_group(Group g, V x, BinaryOperation binary_op, T init) { template detail::enable_if_t<(is_group_v> && - detail::is_vector_arithmetic::value && - detail::is_vector_arithmetic::value && + detail::is_vector_arithmetic_or_complex::value && + detail::is_vector_arithmetic_or_complex::value && detail::is_native_op::value && detail::is_native_op::value), T> @@ -935,9 +963,10 @@ detail::enable_if_t< detail::is_native_op::type, BinaryOperation>::value && detail::is_native_op::value && - detail::is_plus_if_complex::type, - BinaryOperation>::value && - detail::is_plus_if_complex::value), + detail::is_plus_or_multiplies_if_complex< + typename detail::remove_pointer::type, + BinaryOperation>::value && + detail::is_plus_or_multiplies_if_complex::value), OutPtr> joint_inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, BinaryOperation binary_op, T init) { @@ -989,8 +1018,8 @@ detail::enable_if_t< typename detail::remove_pointer::type>::value && detail::is_native_op::type, BinaryOperation>::value && - detail::is_plus_if_complex::type, - BinaryOperation>::value), + detail::is_plus_or_multiplies_if_complex< + typename detail::remove_pointer::type, BinaryOperation>::value), OutPtr> joint_inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, BinaryOperation binary_op) { diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 0cf58142e43a5..b98567b3cf94b 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -2507,8 +2507,8 @@ class __SYCL_EXPORT handler { // Do the following: // 1. If both are host, use host_task to copy. - // 2. If either pointer is host or of the backend supports native memcpy2d, - // use special command. + // 2. If either pointer is host or the backend supports native memcpy2d, use + // special command. // 3. Otherwise, launch a kernel for copying. if (SrcIsHost && DestIsHost) { commonUSMCopy2DFallbackHostTask(Src, SrcPitch, Dest, DestPitch, Width, diff --git a/sycl/include/sycl/kernel_bundle.hpp b/sycl/include/sycl/kernel_bundle.hpp index 6eed7b270edca..12651407a537f 100644 --- a/sycl/include/sycl/kernel_bundle.hpp +++ b/sycl/include/sycl/kernel_bundle.hpp @@ -308,17 +308,19 @@ class kernel_bundle : public detail::kernel_bundle_plain, template typename std::remove_reference_t::value_type get_specialization_constant() const { - const char *SpecSymName = detail::get_spec_constant_symbolic_ID(); - if (!is_specialization_constant_set(SpecSymName)) - return SpecName.getDefaultValue(); - using SCType = typename std::remove_reference_t::value_type; + const char *SpecSymName = detail::get_spec_constant_symbolic_ID(); + SCType Res{SpecName.getDefaultValue()}; + if (!is_specialization_constant_set(SpecSymName)) + return Res; + std::array RetValue; get_specialization_constant_impl(SpecSymName, RetValue.data()); + std::memcpy(&Res, RetValue.data(), sizeof(SCType)); - return *reinterpret_cast(RetValue.data()); + return Res; } /// \returns an iterator to the first device image kernel_bundle contains diff --git a/sycl/include/sycl/multi_ptr.hpp b/sycl/include/sycl/multi_ptr.hpp index ecfacec93b7c9..fecbfbfed620a 100644 --- a/sycl/include/sycl/multi_ptr.hpp +++ b/sycl/include/sycl/multi_ptr.hpp @@ -149,7 +149,7 @@ class multi_ptr { (Space == access::address_space::generic_space || Space == access::address_space::local_space)>> multi_ptr(local_accessor Accessor) - : multi_ptr(Accessor.get_pointer().get()) {} + : m_Pointer(detail::cast_AS(Accessor.get_pointer())) {} // The following constructors are necessary to create multi_ptr from accessor. @@ -210,7 +210,7 @@ class multi_ptr { multi_ptr(local_accessor, Dimensions> Accessor) - : multi_ptr(Accessor.get_pointer().get()) {} + : m_Pointer(detail::cast_AS(Accessor.get_pointer())) {} // Assignment and access operators multi_ptr &operator=(const multi_ptr &) = default; @@ -465,7 +465,7 @@ class multi_ptr { typename = typename detail::enable_if_t< RelaySpace == Space && Space == access::address_space::local_space>> multi_ptr(local_accessor Accessor) - : multi_ptr(Accessor.get_pointer().get()) {} + : m_Pointer(detail::cast_AS(Accessor.get_pointer())) {} // Assignment operators multi_ptr &operator=(const multi_ptr &) = default; @@ -591,7 +591,7 @@ class multi_ptr { typename = typename detail::enable_if_t< RelaySpace == Space && Space == access::address_space::local_space>> multi_ptr(local_accessor Accessor) - : multi_ptr(Accessor.get_pointer().get()) {} + : m_Pointer(detail::cast_AS(Accessor.get_pointer())) {} // Assignment operators multi_ptr &operator=(const multi_ptr &) = default; @@ -848,7 +848,7 @@ class multi_ptr { std::is_const::value && std::is_same::value>> multi_ptr( local_accessor, dimensions> Accessor) - : multi_ptr(Accessor.get_pointer()) {} + : m_Pointer(detail::cast_AS(Accessor.get_pointer())) {} // Only if Space == constant_space and element type is const template < @@ -1089,7 +1089,7 @@ class multi_ptr { _Space == Space && (Space == access::address_space::generic_space || Space == access::address_space::local_space)>> multi_ptr(local_accessor Accessor) - : multi_ptr(Accessor.get_pointer()) {} + : m_Pointer(detail::cast_AS(Accessor.get_pointer())) {} // Only if Space == constant_space template < @@ -1232,7 +1232,7 @@ class multi_ptr { _Space == Space && (Space == access::address_space::generic_space || Space == access::address_space::local_space)>> multi_ptr(local_accessor Accessor) - : multi_ptr(Accessor.get_pointer()) {} + : m_Pointer(detail::cast_AS(Accessor.get_pointer())) {} // Only if Space == constant_space template < diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp index 94fa6fadae41a..47dc36c3e0bc6 100644 --- a/sycl/include/sycl/reduction.hpp +++ b/sycl/include/sycl/reduction.hpp @@ -422,6 +422,13 @@ class ReductionIdentityContainer< static constexpr bool has_identity = false; }; +// Token class to help with the in-place construction of reducers. +template +struct ReducerToken { + const IdentityContainerT &IdentityContainer; + const BinaryOperation BOp; +}; + } // namespace detail /// Specialization of the generic class 'reducer'. It is used for reductions @@ -458,6 +465,14 @@ class reducer< reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp) : MValue(GetInitialValue(IdentityContainer)), MIdentity(IdentityContainer), MBinaryOp(BOp) {} + reducer( + const detail::ReducerToken &Token) + : reducer(Token.IdentityContainer, Token.BOp) {} + + reducer(const reducer &) = delete; + reducer(reducer &&) = delete; + reducer &operator=(const reducer &) = delete; + reducer &operator=(reducer &&) = delete; reducer &combine(const T &Partial) { if constexpr (has_identity) @@ -515,6 +530,14 @@ class reducer< reducer() : MValue(getIdentity()) {} reducer(const IdentityContainerT & /* Identity */, BinaryOperation) : MValue(getIdentity()) {} + reducer( + const detail::ReducerToken &Token) + : reducer(Token.IdentityContainer, Token.BOp) {} + + reducer(const reducer &) = delete; + reducer(reducer &&) = delete; + reducer &operator=(const reducer &) = delete; + reducer &operator=(reducer &&) = delete; reducer &combine(const T &Partial) { BinaryOperation BOp; @@ -553,6 +576,14 @@ class reducer &Token) + : reducer(Token.IdentityContainer, Token.BOp) {} + + reducer(const reducer &) = delete; + reducer(reducer &&) = delete; + reducer &operator=(const reducer &) = delete; + reducer &operator=(reducer &&) = delete; reducer &combine(const T &Partial) { if constexpr (has_identity) @@ -599,6 +630,14 @@ class reducer< reducer(const IdentityContainerT &IdentityContainer, BinaryOperation BOp) : MValue(GetInitialValue(IdentityContainer)), MIdentity(IdentityContainer), MBinaryOp(BOp) {} + reducer( + const detail::ReducerToken &Token) + : reducer(Token.IdentityContainer, Token.BOp) {} + + reducer(const reducer &) = delete; + reducer(reducer &&) = delete; + reducer &operator=(const reducer &) = delete; + reducer &operator=(reducer &&) = delete; reducer operator[](size_t Index) { @@ -650,6 +689,14 @@ class reducer< reducer() : MValue(getIdentity()) {} reducer(const IdentityContainerT & /* Identity */, BinaryOperation) : MValue(getIdentity()) {} + reducer( + const detail::ReducerToken &Token) + : reducer(Token.IdentityContainer, Token.BOp) {} + + reducer(const reducer &) = delete; + reducer(reducer &&) = delete; + reducer &operator=(const reducer &) = delete; + reducer &operator=(reducer &&) = delete; // SYCL 2020 revision 4 says this should be const, but this is a bug // see https://github.com/KhronosGroup/SYCL-Docs/pull/252 @@ -746,6 +793,8 @@ class reduction_impl_algo { using identity_container_type = ReductionIdentityContainer; + using reducer_token_type = + detail::ReducerToken; using reducer_type = reducer; using result_type = T; @@ -2062,8 +2111,11 @@ void reduCGFuncMulti(handler &CGH, KernelType KernelFunc, // Pass all reductions to user's lambda in the same order as supplied // Each reducer initializes its own storage auto ReduIndices = std::index_sequence_for(); - auto ReducersTuple = std::tuple{typename Reductions::reducer_type{ - std::get(IdentitiesTuple), std::get(BOPsTuple)}...}; + auto ReducerTokensTuple = + std::tuple{typename Reductions::reducer_token_type{ + std::get(IdentitiesTuple), std::get(BOPsTuple)}...}; + auto ReducersTuple = std::tuple{ + std::get(ReducerTokensTuple)...}; std::apply([&](auto &...Reducers) { KernelFunc(NDIt, Reducers...); }, ReducersTuple); diff --git a/sycl/include/sycl/types.hpp b/sycl/include/sycl/types.hpp index 05cea2cf0f043..ae1b2655d7320 100644 --- a/sycl/include/sycl/types.hpp +++ b/sycl/include/sycl/types.hpp @@ -729,9 +729,6 @@ template class vec { std::index_sequence) : m_Data{Arr[Is]...} {} - constexpr vec(const std::array, NumElements> &Arr) - : vec{Arr, std::make_index_sequence()} {} - public: using element_type = DataT; using rel_t = detail::rel_t; @@ -796,7 +793,8 @@ template class vec { template explicit constexpr vec(const EnableIfHostHalf &arg) : vec{detail::RepeatValue( - static_cast>(arg))} {} + static_cast>(arg)), + std::make_index_sequence()} {} template typename detail::enable_if_t< @@ -812,7 +810,8 @@ template class vec { #else explicit constexpr vec(const DataT &arg) : vec{detail::RepeatValue( - static_cast>(arg))} {} + static_cast>(arg)), + std::make_index_sequence()} {} template typename detail::enable_if_t< @@ -883,7 +882,8 @@ template class vec { template , typename = EnableIfSuitableNumElements> constexpr vec(const argTN &...args) - : vec{VecArgArrayCreator, argTN...>::Create(args...)} {} + : vec{VecArgArrayCreator, argTN...>::Create(args...), + std::make_index_sequence()} {} // TODO: Remove, for debug purposes only. void dump() { diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 2d88978d87780..9e493502645a9 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1948,18 +1948,14 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, } case PI_DEVICE_INFO_UUID: { - int driver_version = 0; - cuDriverGetVersion(&driver_version); - int major = driver_version / 1000; - int minor = driver_version % 1000 / 10; CUuuid uuid; - if ((major > 11) || (major == 11 && minor >= 4)) { - sycl::detail::pi::assertion(cuDeviceGetUuid_v2(&uuid, device->get()) == - CUDA_SUCCESS); - } else { - sycl::detail::pi::assertion(cuDeviceGetUuid(&uuid, device->get()) == - CUDA_SUCCESS); - } +#if (CUDA_VERSION >= 11040) + sycl::detail::pi::assertion(cuDeviceGetUuid_v2(&uuid, device->get()) == + CUDA_SUCCESS); +#else + sycl::detail::pi::assertion(cuDeviceGetUuid(&uuid, device->get()) == + CUDA_SUCCESS); +#endif std::array name; std::copy(uuid.bytes, uuid.bytes + 16, name.begin()); return getInfoArray(16, param_value_size, param_value, param_value_size_ret, diff --git a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp index 0fc2a5a10f4f9..5bb0ce881e79f 100644 --- a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp +++ b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp @@ -805,6 +805,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, CASE_PI_UNSUPPORTED(PI_DEVICE_INFO_IMAGE_SRGB) CASE_PI_UNSUPPORTED(PI_DEVICE_INFO_ATOMIC_64) CASE_PI_UNSUPPORTED(PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES) + CASE_PI_UNSUPPORTED(PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES) CASE_PI_UNSUPPORTED(PI_EXT_ONEAPI_DEVICE_INFO_MAX_GLOBAL_WORK_GROUPS) CASE_PI_UNSUPPORTED(PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_1D) CASE_PI_UNSUPPORTED(PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_2D) diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index ddca2a872adfc..b3b2276fdfa5a 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -1863,9 +1863,33 @@ pi_result hip_piDeviceGetInfo(pi_device device, pi_device_info param_name, return getInfo(param_value_size, param_value, param_value_size_ret, capabilities); } + + case PI_DEVICE_INFO_DEVICE_ID: { + int value = 0; + sycl::detail::pi::assertion( + hipDeviceGetAttribute(&value, hipDeviceAttributePciDeviceId, + device->get()) == hipSuccess); + sycl::detail::pi::assertion(value >= 0); + return getInfo(param_value_size, param_value, param_value_size_ret, value); + } + + case PI_DEVICE_INFO_UUID: { +#if ((HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR >= 2) || \ + HIP_VERSION_MAJOR > 5) + hipUUID uuid = {}; + // Supported since 5.2+ + sycl::detail::pi::assertion(hipDeviceGetUuid(&uuid, device->get()) == + hipSuccess); + std::array name; + std::copy(uuid.bytes, uuid.bytes + 16, name.begin()); + return getInfoArray(16, param_value_size, param_value, param_value_size_ret, + name.data()); +#endif + return PI_ERROR_INVALID_VALUE; + } + // TODO: Investigate if this information is available on HIP. case PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: - case PI_DEVICE_INFO_DEVICE_ID: case PI_DEVICE_INFO_PCI_ADDRESS: case PI_DEVICE_INFO_GPU_EU_COUNT: case PI_DEVICE_INFO_GPU_EU_SIMD_WIDTH: diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index c6658245cc03e..43b8d936ed88e 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -2308,6 +2309,13 @@ pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName, case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT: // 2D USM fill and memset is not supported. return ReturnValue(pi_bool{false}); + case PI_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + pi_memory_order_capabilities capabilities = + PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE | + PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL | + PI_MEMORY_ORDER_SEQ_CST; + return ReturnValue(capabilities); + } case PI_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: default: // TODO: implement other parameters @@ -3057,8 +3065,7 @@ pi_result piMemRetain(pi_mem Mem) { // If indirect access tracking is not enabled then this functions just performs // zeMemFree. If indirect access tracking is enabled then reference counting is // performed. -static pi_result ZeMemFreeHelper(pi_context Context, void *Ptr, - bool OwnZeMemHandle = true) { +static pi_result ZeMemFreeHelper(pi_context Context, void *Ptr) { pi_platform Plt = Context->getPlatform(); std::unique_lock ContextsLock(Plt->ContextsMutex, std::defer_lock); @@ -3078,8 +3085,7 @@ static pi_result ZeMemFreeHelper(pi_context Context, void *Ptr, Context->MemAllocs.erase(It); } - if (OwnZeMemHandle) - ZE_CALL(zeMemFree, (Context->ZeContext, Ptr)); + ZE_CALL(zeMemFree, (Context->ZeContext, Ptr)); if (IndirectAccessTrackingEnabled) PI_CALL(ContextReleaseHelper(Context)); @@ -3088,7 +3094,7 @@ static pi_result ZeMemFreeHelper(pi_context Context, void *Ptr, } static pi_result USMFreeHelper(pi_context Context, void *Ptr, - bool OwnZeMemHandle); + bool OwnZeMemHandle = true); pi_result piMemRelease(pi_mem Mem) { PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); @@ -7042,10 +7048,8 @@ static pi_result USMHostAllocImpl(void **ResultPtr, pi_context Context, return PI_SUCCESS; } -static pi_result USMFreeImpl(pi_context Context, void *Ptr, - bool OwnZeMemHandle) { - if (OwnZeMemHandle) - ZE_CALL(zeMemFree, (Context->ZeContext, Ptr)); +static pi_result USMFreeImpl(pi_context Context, void *Ptr) { + ZE_CALL(zeMemFree, (Context->ZeContext, Ptr)); return PI_SUCCESS; } @@ -7104,8 +7108,8 @@ void *USMMemoryAllocBase::allocate(size_t Size, size_t Alignment) { return Ptr; } -void USMMemoryAllocBase::deallocate(void *Ptr, bool OwnZeMemHandle) { - auto Res = USMFreeImpl(Context, Ptr, OwnZeMemHandle); +void USMMemoryAllocBase::deallocate(void *Ptr) { + auto Res = USMFreeImpl(Context, Ptr); if (Res != PI_SUCCESS) { throw UsmAllocationException(Res); } @@ -7353,8 +7357,13 @@ static pi_result USMFreeHelper(pi_context Context, void *Ptr, Context->MemAllocs.erase(It); } + if (!OwnZeMemHandle) { + // Memory should not be freed + return PI_SUCCESS; + } + if (!UseUSMAllocator) { - pi_result Res = USMFreeImpl(Context, Ptr, OwnZeMemHandle); + pi_result Res = USMFreeImpl(Context, Ptr); if (IndirectAccessTrackingEnabled) PI_CALL(ContextReleaseHelper(Context)); return Res; @@ -7373,7 +7382,7 @@ static pi_result USMFreeHelper(pi_context Context, void *Ptr, // If memory type is host release from host pool if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_HOST) { try { - Context->HostMemAllocContext->deallocate(Ptr, OwnZeMemHandle); + Context->HostMemAllocContext->deallocate(Ptr); } catch (const UsmAllocationException &Ex) { return Ex.getError(); } catch (...) { @@ -7401,16 +7410,16 @@ static pi_result USMFreeHelper(pi_context Context, void *Ptr, PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); auto DeallocationHelper = - [Context, Device, Ptr, - OwnZeMemHandle](std::unordered_map - &AllocContextMap) { + [Context, Device, + Ptr](std::unordered_map + &AllocContextMap) { try { auto It = AllocContextMap.find(Device->ZeDevice); if (It == AllocContextMap.end()) return PI_ERROR_INVALID_VALUE; // The right context is found, deallocate the pointer - It->second.deallocate(Ptr, OwnZeMemHandle); + It->second.deallocate(Ptr); } catch (const UsmAllocationException &Ex) { return Ex.getError(); } @@ -7436,7 +7445,7 @@ static pi_result USMFreeHelper(pi_context Context, void *Ptr, } } - pi_result Res = USMFreeImpl(Context, Ptr, OwnZeMemHandle); + pi_result Res = USMFreeImpl(Context, Ptr); if (SharedReadOnlyAllocsIterator != Context->SharedReadOnlyAllocs.end()) { Context->SharedReadOnlyAllocs.erase(SharedReadOnlyAllocsIterator); } @@ -7451,7 +7460,7 @@ pi_result piextUSMFree(pi_context Context, void *Ptr) { std::scoped_lock Lock( IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); - return USMFreeHelper(Context, Ptr, true /* OwnZeMemHandle */); + return USMFreeHelper(Context, Ptr); } pi_result piextKernelSetArgPointer(pi_kernel Kernel, pi_uint32 ArgIndex, @@ -8371,11 +8380,11 @@ pi_result _pi_buffer::free() { std::scoped_lock Lock( IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); - PI_CALL(USMFreeHelper(Context, ZeHandle, true)); + PI_CALL(USMFreeHelper(Context, ZeHandle)); break; } case allocation_t::free_native: - PI_CALL(ZeMemFreeHelper(Context, ZeHandle, true)); + PI_CALL(ZeMemFreeHelper(Context, ZeHandle)); break; case allocation_t::unimport: ZeUSMImport.doZeUSMRelease(Context->getPlatform()->ZeDriver, ZeHandle); diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index f4b7d1098efe5..a7979a6413508 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -112,7 +112,7 @@ class USMMemoryAllocBase : public SystemMemory { : Context{Ctx}, Device{Dev} {} void *allocate(size_t Size) override final; void *allocate(size_t Size, size_t Alignment) override final; - void deallocate(void *Ptr, bool OwnZeMemHandle) override final; + void deallocate(void *Ptr) override final; }; // Allocation routines for shared memory type diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 2c44f0cfe9eb3..f2ea816b023f1 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -282,9 +282,126 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, // For details about Intel UUID extension, see // sycl/doc/extensions/supported/sycl_ext_intel_device_info.md case PI_DEVICE_INFO_UUID: - case PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: - case PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: return PI_ERROR_INVALID_VALUE; + case PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + // This query is missing beore OpenCL 3.0 + // Check version and handle appropriately + OCLV::OpenCLVersion devVer; + cl_device_id deviceID = cast(device); + cl_int ret_err = getDeviceVersion(deviceID, devVer); + if (ret_err != CL_SUCCESS) { + return cast(ret_err); + } + + // Minimum required capability to be returned + // For OpenCL 1.2, this is all that is required + pi_memory_order_capabilities capabilities = PI_MEMORY_ORDER_RELAXED; + + if (devVer >= OCLV::V3_0) { + // For OpenCL >=3.0, the query should be implemented + cl_device_atomic_capabilities cl_capabilities = 0; + cl_int ret_err = clGetDeviceInfo( + deviceID, CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES, + sizeof(cl_device_atomic_capabilities), &cl_capabilities, nullptr); + if (ret_err != CL_SUCCESS) + return cast(ret_err); + + // Mask operation to only consider atomic_memory_order* capabilities + cl_int mask = CL_DEVICE_ATOMIC_ORDER_RELAXED | + CL_DEVICE_ATOMIC_ORDER_ACQ_REL | + CL_DEVICE_ATOMIC_ORDER_SEQ_CST; + cl_capabilities &= mask; + + // The memory order capabilities are hierarchical, if one is implied, all + // preceding capbilities are implied as well. Especially in the case of + // ACQ_REL. + if (cl_capabilities & CL_DEVICE_ATOMIC_ORDER_SEQ_CST) { + capabilities |= PI_MEMORY_ORDER_SEQ_CST; + } + if (cl_capabilities & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) { + capabilities |= PI_MEMORY_ORDER_ACQ_REL | PI_MEMORY_ORDER_ACQUIRE | + PI_MEMORY_ORDER_RELEASE; + } + } else if (devVer >= OCLV::V2_0) { + // For OpenCL 2.x, return all capabilities + // (https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_API.html#_memory_consistency_model) + capabilities |= PI_MEMORY_ORDER_ACQUIRE | PI_MEMORY_ORDER_RELEASE | + PI_MEMORY_ORDER_ACQ_REL | PI_MEMORY_ORDER_SEQ_CST; + } + + if (paramValue) { + if (paramValueSize < sizeof(pi_memory_order_capabilities)) + return static_cast(CL_INVALID_VALUE); + + std::memcpy(paramValue, &capabilities, sizeof(capabilities)); + } + + if (paramValueSizeRet) + *paramValueSizeRet = sizeof(capabilities); + + return static_cast(CL_SUCCESS); + } + case PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { + // Initialize result to minimum mandated capabilities according to + // SYCL2020 4.6.3.2 + // Because scopes are hierarchical, wider scopes support all narrower + // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and + // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382) + pi_memory_scope_capabilities result = PI_MEMORY_SCOPE_WORK_ITEM | + PI_MEMORY_SCOPE_SUB_GROUP | + PI_MEMORY_SCOPE_WORK_GROUP; + + OCLV::OpenCLVersion devVer; + + cl_device_id deviceID = cast(device); + cl_int ret_err = getDeviceVersion(deviceID, devVer); + if (ret_err != CL_SUCCESS) + return static_cast(ret_err); + + cl_device_atomic_capabilities devCapabilities = 0; + if (devVer >= OCLV::V3_0) { + ret_err = clGetDeviceInfo(deviceID, CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES, + sizeof(cl_device_atomic_capabilities), + &devCapabilities, nullptr); + if (ret_err != CL_SUCCESS) + return static_cast(ret_err); + assert((devCapabilities & CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP) && + "Violates minimum mandated guarantee"); + + // Because scopes are hierarchical, wider scopes support all narrower + // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and + // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382) + // We already initialized to these minimum mandated capabilities. Just + // check wider scopes. + if (devCapabilities & CL_DEVICE_ATOMIC_SCOPE_DEVICE) { + result |= PI_MEMORY_SCOPE_DEVICE; + } + + if (devCapabilities & CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES) { + result |= PI_MEMORY_SCOPE_SYSTEM; + } + + } else { + // This info is only available in OpenCL version >= 3.0 + // Just return minimum mandated capabilities for older versions. + // OpenCL 1.x minimum mandated capabilities are WORK_GROUP, we + // already initialized using it. + if (devVer >= OCLV::V2_0) { + // OpenCL 2.x minimum mandated capabilities are WORK_GROUP | DEVICE | + // ALL_DEVICES + result |= PI_MEMORY_SCOPE_DEVICE | PI_MEMORY_SCOPE_SYSTEM; + } + } + if (paramValue) { + if (paramValueSize < sizeof(cl_device_atomic_capabilities)) + return PI_ERROR_INVALID_VALUE; + + std::memcpy(paramValue, &result, sizeof(result)); + } + if (paramValueSizeRet) + *paramValueSizeRet = sizeof(result); + return PI_SUCCESS; + } case PI_DEVICE_INFO_ATOMIC_64: { cl_int ret_err = CL_SUCCESS; cl_bool result = CL_FALSE; diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index ffe3bb6555e06..fef607f3ef185 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -485,6 +485,8 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, (ur_device_info_t)UR_DEVICE_INFO_BFLOAT16}, {PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES, (ur_device_info_t)UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES}, + {PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES, + (ur_device_info_t)UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES}, }; auto InfoType = InfoMapping.find(ParamName); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp index 934084ae67736..c859c166c7735 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp @@ -1173,9 +1173,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( // bfloat16 math functions are not yet supported on Intel GPUs. return ReturnValue(bool{false}); } + case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { + // There are no explicit restrictions in L0 programming guide, so assume all + // are supported + ur_memory_scope_capability_flags_t result = + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM; + + return ReturnValue(result); + } + + case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + ur_memory_order_capability_flags_t capabilities = + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL | + UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; + return ReturnValue(capabilities); + } // TODO: Implement. - case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: default: zePrint("Unsupported ParamName in piGetDeviceInfo\n"); zePrint("ParamName=%d(0x%x)\n", ParamName, ParamName); @@ -1706,7 +1727,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDevicePartition( // Currently supported partitioning (by affinity domain/numa) would always // partition to all sub-devices. // - if (NumDevices !=0) + if (NumDevices != 0) PI_ASSERT(NumDevices == EffectiveNumDevices, UR_RESULT_ERROR_INVALID_VALUE); for (uint32_t I = 0; I < NumDevices; I++) { diff --git a/sycl/plugins/unified_runtime/ur/usm_allocator.cpp b/sycl/plugins/unified_runtime/ur/usm_allocator.cpp index 1716e7d546246..31490a6ce8513 100644 --- a/sycl/plugins/unified_runtime/ur/usm_allocator.cpp +++ b/sycl/plugins/unified_runtime/ur/usm_allocator.cpp @@ -290,7 +290,7 @@ class USMAllocContext::USMAllocImpl { void *allocate(size_t Size, size_t Alignment, bool &FromPool); void *allocate(size_t Size, bool &FromPool); - void deallocate(void *Ptr, bool &ToPool, bool OwnZeMemHandle); + void deallocate(void *Ptr, bool &ToPool); SystemMemory &getMemHandle() { return *MemHandle; } @@ -332,7 +332,7 @@ Slab::Slab(Bucket &Bkt) Slab::~Slab() { unregSlab(*this); - bucket.getMemHandle().deallocate(MemPtr, true /* OwnZeMemHandle */); + bucket.getMemHandle().deallocate(MemPtr); } // Return the index of the first available chunk, -1 otherwize @@ -737,8 +737,7 @@ Bucket &USMAllocContext::USMAllocImpl::findBucket(size_t Size) { return *(*It); } -void USMAllocContext::USMAllocImpl::deallocate(void *Ptr, bool &ToPool, - bool OwnZeMemHandle) { +void USMAllocContext::USMAllocImpl::deallocate(void *Ptr, bool &ToPool) { auto *SlabPtr = AlignPtrDown(Ptr, SlabMinSize()); // Lock the map on read @@ -748,7 +747,7 @@ void USMAllocContext::USMAllocImpl::deallocate(void *Ptr, bool &ToPool, auto Slabs = getKnownSlabs().equal_range(SlabPtr); if (Slabs.first == Slabs.second) { Lk.unlock(); - getMemHandle().deallocate(Ptr, OwnZeMemHandle); + getMemHandle().deallocate(Ptr); return; } @@ -779,7 +778,7 @@ void USMAllocContext::USMAllocImpl::deallocate(void *Ptr, bool &ToPool, // There is a rare case when we have a pointer from system allocation next // to some slab with an entry in the map. So we find a slab // but the range checks fail. - getMemHandle().deallocate(Ptr, OwnZeMemHandle); + getMemHandle().deallocate(Ptr); } USMAllocContext::USMAllocContext(std::unique_ptr MemHandle, @@ -813,9 +812,9 @@ void *USMAllocContext::allocate(size_t size, size_t alignment) { return Ptr; } -void USMAllocContext::deallocate(void *ptr, bool OwnZeMemHandle) { +void USMAllocContext::deallocate(void *ptr) { bool ToPool; - pImpl->deallocate(ptr, ToPool, OwnZeMemHandle); + pImpl->deallocate(ptr, ToPool); if (pImpl->getParams().PoolTrace > 2) { auto MT = pImpl->getParams().memoryTypeName; diff --git a/sycl/plugins/unified_runtime/ur/usm_allocator.hpp b/sycl/plugins/unified_runtime/ur/usm_allocator.hpp index 6d3fa279bde1d..58a3187b00d3e 100755 --- a/sycl/plugins/unified_runtime/ur/usm_allocator.hpp +++ b/sycl/plugins/unified_runtime/ur/usm_allocator.hpp @@ -17,7 +17,7 @@ class SystemMemory { public: virtual void *allocate(size_t size) = 0; virtual void *allocate(size_t size, size_t aligned) = 0; - virtual void deallocate(void *ptr, bool OwnZeMemHandle) = 0; + virtual void deallocate(void *ptr) = 0; virtual ~SystemMemory() = default; }; @@ -68,7 +68,7 @@ class USMAllocContext { void *allocate(size_t size); void *allocate(size_t size, size_t alignment); - void deallocate(void *ptr, bool OwnZeMemHandle); + void deallocate(void *ptr); private: std::unique_ptr pImpl; diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index 22818eeed302c..198b4f1cc2d30 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -23,6 +23,8 @@ #include #include +#include + namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { @@ -166,17 +168,26 @@ template <> std::vector context_impl::get_info() const { + std::vector CapabilityList{ + sycl::memory_order::relaxed, sycl::memory_order::acquire, + sycl::memory_order::release, sycl::memory_order::acq_rel, + sycl::memory_order::seq_cst}; if (is_host()) - return {sycl::memory_order::relaxed, sycl::memory_order::acquire, - sycl::memory_order::release, sycl::memory_order::acq_rel, - sycl::memory_order::seq_cst}; + return CapabilityList; + + for (const sycl::device &Device : MDevices) { + std::vector NewCapabilityList(CapabilityList.size()); + std::vector DeviceCapabilities = + Device.get_info(); + std::set_intersection( + CapabilityList.begin(), CapabilityList.end(), + DeviceCapabilities.begin(), DeviceCapabilities.end(), + std::inserter(NewCapabilityList, NewCapabilityList.begin())); + CapabilityList = NewCapabilityList; + } + CapabilityList.shrink_to_fit(); - pi_memory_order_capabilities Result; - getPlugin().call( - MContext, - PiInfoCode::value, - sizeof(Result), &Result, nullptr); - return readMemoryOrderBitfield(Result); + return CapabilityList; } template <> std::vector diff --git a/sycl/source/detail/context_info.hpp b/sycl/source/detail/context_info.hpp index 1056557ec2bad..d30113dc6d08e 100644 --- a/sycl/source/detail/context_info.hpp +++ b/sycl/source/detail/context_info.hpp @@ -29,17 +29,6 @@ typename Param::return_type get_context_info(RT::PiContext Ctx, return Result; } -// Specialization for atomic_memory_order_capabilities, PI returns a bitfield -template <> -std::vector -get_context_info( - RT::PiContext Ctx, const plugin &Plugin) { - pi_memory_order_capabilities Result; - Plugin.call( - Ctx, PiInfoCode::value, - sizeof(Result), &Result, nullptr); - return readMemoryOrderBitfield(Result); -} } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/detail/platform_impl.cpp b/sycl/source/detail/platform_impl.cpp index 1efa49a200f2a..007acae7e13d2 100644 --- a/sycl/source/detail/platform_impl.cpp +++ b/sycl/source/detail/platform_impl.cpp @@ -75,23 +75,28 @@ static bool IsBannedPlatform(platform Platform) { // To avoid problems on default users and deployment of DPC++ on platforms // where CUDA is available, the OpenCL support is disabled. // - auto IsNVIDIAOpenCL = [](platform Platform) { + // There is also no support for the AMD HSA backend for OpenCL consumption, + // as well as reported problems with device queries, so AMD OpenCL support + // is disabled as well. + // + auto IsMatchingOpenCL = [](platform Platform, const std::string_view name) { if (getSyclObjImpl(Platform)->is_host()) return false; - const bool HasCUDA = Platform.get_info().find( - "NVIDIA CUDA") != std::string::npos; + const bool HasNameMatch = Platform.get_info().find( + name) != std::string::npos; const auto Backend = detail::getSyclObjImpl(Platform)->getPlugin().getBackend(); - const bool IsCUDAOCL = (HasCUDA && Backend == backend::opencl); - if (detail::pi::trace(detail::pi::TraceLevel::PI_TRACE_ALL) && IsCUDAOCL) { - std::cout << "SYCL_PI_TRACE[all]: " - << "NVIDIA CUDA OpenCL platform found but is not compatible." - << std::endl; + const bool IsMatchingOCL = (HasNameMatch && Backend == backend::opencl); + if (detail::pi::trace(detail::pi::TraceLevel::PI_TRACE_ALL) && + IsMatchingOCL) { + std::cout << "SYCL_PI_TRACE[all]: " << name + << " OpenCL platform found but is not compatible." << std::endl; } - return IsCUDAOCL; + return IsMatchingOCL; }; - return IsNVIDIAOpenCL(Platform); + return IsMatchingOpenCL(Platform, "NVIDIA CUDA") || + IsMatchingOpenCL(Platform, "AMD Accelerated Parallel Processing"); } // This routine has the side effect of registering each platform's last device diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index a197193d35432..d6bc161428708 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -567,7 +567,7 @@ RT::PiProgram ProgramManager::getBuiltPIProgram( while (AIt != AEnd) { auto Aspect = static_cast(*AIt); // Strict check for fp64 is disabled temporarily to avoid confusion. - if (Aspect != aspect::fp64 && !Dev->has(Aspect)) + if (!Dev->has(Aspect)) throw sycl::exception(errc::kernel_not_supported, "Required aspect " + getAspectNameStr(Aspect) + " is not supported on the device"); @@ -1184,6 +1184,7 @@ void ProgramManager::addImages(pi_device_binaries DeviceBinary) { const _pi_offload_entry EntriesB = RawImg->EntriesBegin; const _pi_offload_entry EntriesE = RawImg->EntriesEnd; auto Img = make_unique_ptr(RawImg, M); + static uint32_t SequenceID = 0; // Fill the kernel argument mask map const RTDeviceBinaryImage::PropertyRange &KPOIRange = @@ -1257,6 +1258,13 @@ void ProgramManager::addImages(pi_device_binaries DeviceBinary) { if (KSIdIt != KSIdMap.end()) { auto &Imgs = m_DeviceImages[KSIdIt->second]; assert(Imgs && "Device image vector should have been already created"); + if (DumpImages) { + const bool NeedsSequenceID = + std::any_of(Imgs->begin(), Imgs->end(), [&](auto &I) { + return I->getFormat() == Img->getFormat(); + }); + dumpImage(*Img, KSIdIt->second, NeedsSequenceID ? ++SequenceID : 0); + } cacheKernelUsesAssertInfo(M, *Img); @@ -1379,12 +1387,14 @@ ProgramManager::getKernelSetId(OSModuleHandle M, PI_ERROR_INVALID_KERNEL_NAME); } -void ProgramManager::dumpImage(const RTDeviceBinaryImage &Img, - KernelSetId KSId) const { +void ProgramManager::dumpImage(const RTDeviceBinaryImage &Img, KernelSetId KSId, + uint32_t SequenceID) const { std::string Fname("sycl_"); const pi_device_binary_struct &RawImg = Img.getRawData(); Fname += RawImg.DeviceTargetSpec; Fname += std::to_string(KSId); + if (SequenceID) + Fname += '_' + std::to_string(SequenceID); std::string Ext; RT::PiDeviceBinaryType Format = Img.getFormat(); @@ -1683,46 +1693,120 @@ ProgramManager::getSYCLDeviceImagesWithCompatibleState( } assert(BinImages.size() > 0 && "Expected to find at least one device image"); + // Ignore images with incompatible state. Image is considered compatible + // with a target state if an image is already in the target state or can + // be brought to target state by compiling/linking/building. + // + // Example: an image in "executable" state is not compatible with + // "input" target state - there is no operation to convert the image it + // to "input" state. An image in "input" state is compatible with + // "executable" target state because it can be built to get into + // "executable" state. + for (auto It = BinImages.begin(); It != BinImages.end();) { + if (getBinImageState(*It) > TargetState) + It = BinImages.erase(It); + else + ++It; + } + std::vector SYCLDeviceImages; - for (RTDeviceBinaryImage *BinImage : BinImages) { - const bundle_state ImgState = getBinImageState(BinImage); - - // Ignore images with incompatible state. Image is considered compatible - // with a target state if an image is already in the target state or can - // be brought to target state by compiling/linking/building. - // - // Example: an image in "executable" state is not compatible with - // "input" target state - there is no operation to convert the image it - // to "input" state. An image in "input" state is compatible with - // "executable" target state because it can be built to get into - // "executable" state. - if (ImgState > TargetState) - continue; - for (const sycl::device &Dev : Devs) { + // If a non-input state is requested, we can filter out some compatible + // images and return only those with the highest compatible state for each + // device-kernel pair. This map tracks how many kernel-device pairs need each + // image, so that any unneeded ones are skipped. + // TODO this has no effect if the requested state is input, consider having + // a separate branch for that case to avoid unnecessary tracking work. + struct DeviceBinaryImageInfo { + std::shared_ptr> KernelIDs; + bundle_state State = bundle_state::input; + int RequirementCounter = 0; + }; + std::unordered_map ImageInfoMap; + + for (const sycl::device &Dev : Devs) { + // Track the highest image state for each requested kernel. + using StateImagesPairT = + std::pair>; + using KernelImageMapT = + std::map; + KernelImageMapT KernelImageMap; + if (!KernelIDs.empty()) + for (const kernel_id &KernelID : KernelIDs) + KernelImageMap.insert({KernelID, {}}); + + for (RTDeviceBinaryImage *BinImage : BinImages) { if (!compatibleWithDevice(BinImage, Dev) || !doesDevSupportDeviceRequirements(Dev, *BinImage)) continue; - std::shared_ptr> KernelIDs; - // Collect kernel names for the image - { - std::lock_guard KernelIDsGuard(m_KernelIDsMutex); - KernelIDs = m_BinImg2KernelIDs[BinImage]; - // If the image does not contain any non-service kernels we can skip it. - if (!KernelIDs || KernelIDs->empty()) - continue; + auto InsertRes = ImageInfoMap.insert({BinImage, {}}); + DeviceBinaryImageInfo &ImgInfo = InsertRes.first->second; + if (InsertRes.second) { + ImgInfo.State = getBinImageState(BinImage); + // Collect kernel names for the image + { + std::lock_guard KernelIDsGuard(m_KernelIDsMutex); + ImgInfo.KernelIDs = m_BinImg2KernelIDs[BinImage]; + } } + const bundle_state ImgState = ImgInfo.State; + const std::shared_ptr> &ImageKernelIDs = + ImgInfo.KernelIDs; + int &ImgRequirementCounter = ImgInfo.RequirementCounter; - DeviceImageImplPtr Impl = std::make_shared( - BinImage, Ctx, Devs, ImgState, KernelIDs, /*PIProgram=*/nullptr); + // If the image does not contain any non-service kernels we can skip it. + if (!ImageKernelIDs || ImageKernelIDs->empty()) + continue; - SYCLDeviceImages.push_back( - createSyclObjFromImpl(Impl)); - break; + // Update tracked information. + for (kernel_id &KernelID : *ImageKernelIDs) { + StateImagesPairT *StateImagesPair; + // If only specific kernels are requested, ignore the rest. + if (!KernelIDs.empty()) { + auto It = KernelImageMap.find(KernelID); + if (It == KernelImageMap.end()) + continue; + StateImagesPair = &It->second; + } else + StateImagesPair = &KernelImageMap[KernelID]; + + auto &[KernelImagesState, KernelImages] = *StateImagesPair; + + if (KernelImages.empty()) { + KernelImagesState = ImgState; + KernelImages.push_back(BinImage); + ++ImgRequirementCounter; + } else if (KernelImagesState < ImgState) { + for (RTDeviceBinaryImage *Img : KernelImages) { + auto It = ImageInfoMap.find(Img); + assert(It != ImageInfoMap.end()); + assert(It->second.RequirementCounter > 0); + --(It->second.RequirementCounter); + } + KernelImages.clear(); + KernelImages.push_back(BinImage); + KernelImagesState = ImgState; + ++ImgRequirementCounter; + } else if (KernelImagesState == ImgState) { + KernelImages.push_back(BinImage); + ++ImgRequirementCounter; + } + } } } + for (const auto &ImgInfoPair : ImageInfoMap) { + if (ImgInfoPair.second.RequirementCounter == 0) + continue; + + DeviceImageImplPtr Impl = std::make_shared( + ImgInfoPair.first, Ctx, Devs, ImgInfoPair.second.State, + ImgInfoPair.second.KernelIDs, /*PIProgram=*/nullptr); + + SYCLDeviceImages.push_back(createSyclObjFromImpl(Impl)); + } + return SYCLDeviceImages; } @@ -2235,7 +2319,7 @@ bool doesDevSupportDeviceRequirements(const device &Dev, while (!Aspects.empty()) { aspect Aspect = Aspects.consume(); // Strict check for fp64 is disabled temporarily to avoid confusion. - if (Aspect != aspect::fp64 && !Dev.has(Aspect)) + if (!Dev.has(Aspect)) return false; } } diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index e85976eb621f4..1bd81491462fa 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -299,7 +299,8 @@ class ProgramManager { KernelSetId getKernelSetId(OSModuleHandle M, const std::string &KernelName) const; /// Dumps image to current directory - void dumpImage(const RTDeviceBinaryImage &Img, KernelSetId KSId) const; + void dumpImage(const RTDeviceBinaryImage &Img, KernelSetId KSId, + uint32_t SequenceID = 0) const; /// Add info on kernels using assert into cache void cacheKernelUsesAssertInfo(OSModuleHandle M, RTDeviceBinaryImage &Img); diff --git a/sycl/test/basic_tests/accessor/accessor_get_pointer.cpp b/sycl/test/basic_tests/accessor/accessor_get_pointer.cpp new file mode 100644 index 0000000000000..acb63f9ea33d8 --- /dev/null +++ b/sycl/test/basic_tests/accessor/accessor_get_pointer.cpp @@ -0,0 +1,27 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -fsyntax-only + +#include +#include +#include + +using namespace sycl; + +constexpr static int size = 1; + +void test_get_multi_ptr(handler &cgh, buffer &buffer) { + using target_local_accessor_t = + accessor; + using local_accessor_t = local_accessor; + + auto acc = buffer.get_access(cgh); + auto target_local_acc = target_local_accessor_t({size}, cgh); + auto local_acc = local_accessor_t({size}, cgh); + + auto acc_ptr = acc.get_pointer(); + auto target_local_ptr = target_local_acc.get_pointer(); + auto local_pointer = local_acc.get_pointer(); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert( + std::is_same_v>); +} \ No newline at end of file diff --git a/sycl/test/basic_tests/interop-backend-traits-cuda.cpp b/sycl/test/basic_tests/interop-backend-traits-cuda.cpp new file mode 100644 index 0000000000000..5ae25a02de048 --- /dev/null +++ b/sycl/test/basic_tests/interop-backend-traits-cuda.cpp @@ -0,0 +1,30 @@ +// REQUIRES: cuda_be +// RUN: %clangxx -fsycl -fsyntax-only %s +// RUN: %clangxx -fsycl -fsyntax-only -DUSE_CUDA_EXPERIMENTAL %s + +#ifdef USE_CUDA_EXPERIMENTAL +#define SYCL_EXT_ONEAPI_BACKEND_CUDA 1 +#define SYCL_EXT_ONEAPI_BACKEND_CUDA_EXPERIMENTAL 1 +#include +#endif + +#include + +constexpr auto Backend = sycl::backend::ext_oneapi_cuda; + +int main() { + static_assert( + std::is_same_v::return_type, + sycl::detail::interop::type>); +#ifndef USE_CUDA_EXPERIMENTAL + // CUDA experimental return type is different to input type + static_assert( + std::is_same_v::return_type, + sycl::detail::interop::type>); +#endif + static_assert( + std::is_same_v::return_type, + sycl::detail::interop::type>); + + return 0; +} diff --git a/sycl/test/basic_tests/interop-backend-traits-hip.cpp b/sycl/test/basic_tests/interop-backend-traits-hip.cpp new file mode 100644 index 0000000000000..d7a9b2ee8f399 --- /dev/null +++ b/sycl/test/basic_tests/interop-backend-traits-hip.cpp @@ -0,0 +1,17 @@ +// REQUIRES: hip_be +// RUN: %clangxx -fsycl -fsyntax-only %s + +#include + +constexpr auto Backend = sycl::backend::ext_oneapi_hip; + +int main() { + static_assert( + std::is_same_v::return_type, + sycl::detail::interop::type>); + static_assert( + std::is_same_v::return_type, + sycl::detail::interop::type>); + + return 0; +} diff --git a/sycl/test/basic_tests/interop-backend-traits-level-zero.cpp b/sycl/test/basic_tests/interop-backend-traits-level-zero.cpp new file mode 100644 index 0000000000000..97dff0d123eb8 --- /dev/null +++ b/sycl/test/basic_tests/interop-backend-traits-level-zero.cpp @@ -0,0 +1,24 @@ +// REQUIRES: level_zero_be +// RUN: %clangxx %fsycl-host-only -fsyntax-only %s + +#include +#include + +constexpr auto Backend = sycl::backend::ext_oneapi_level_zero; + +int main() { + static_assert( + std::is_same_v::input_type, + sycl::detail::interop::type>); + static_assert( + std::is_same_v::return_type, + sycl::detail::interop::type>); + static_assert( + std::is_same_v::return_type, + sycl::detail::interop::type>); + static_assert( + std::is_same_v::return_type, + sycl::detail::interop::type>); + + return 0; +} diff --git a/sycl/test/basic_tests/interop-backend-traits-opencl.cpp b/sycl/test/basic_tests/interop-backend-traits-opencl.cpp new file mode 100644 index 0000000000000..e2c0c4d856b7d --- /dev/null +++ b/sycl/test/basic_tests/interop-backend-traits-opencl.cpp @@ -0,0 +1,36 @@ +// REQUIRES: opencl_be +// RUN: %clangxx -fsycl -fsyntax-only %s + +#include +#include + +constexpr auto Backend = sycl::backend::opencl; + +int main() { + static_assert( + std::is_same_v::input_type, + sycl::detail::interop::type>); + static_assert( + std::is_same_v::input_type, + sycl::detail::interop::type>); + static_assert( + std::is_same_v::input_type, + sycl::detail::interop::type>); + static_assert( + std::is_same_v::input_type, + sycl::detail::interop::type>); + static_assert( + std::is_same_v::input_type, + sycl::detail::interop::type>); + static_assert( + std::is_same_v::return_type, + sycl::detail::interop::type>); + static_assert( + std::is_same_v::return_type, + sycl::detail::interop::type>); + static_assert( + std::is_same_v::return_type, + sycl::detail::interop::type>); + + return 0; +} diff --git a/sycl/test/basic_tests/interop-backend-traits.cpp b/sycl/test/basic_tests/interop-backend-traits.cpp deleted file mode 100644 index b76b5ca1dc8e4..0000000000000 --- a/sycl/test/basic_tests/interop-backend-traits.cpp +++ /dev/null @@ -1,87 +0,0 @@ -// RUN: %clangxx -fsycl -fsyntax-only -DUSE_OPENCL %s -// RUN: %clangxx %fsycl-host-only -fsyntax-only -DUSE_L0 %s -// RUN: %clangxx -fsycl -fsyntax-only -DUSE_CUDA %s -// RUN: %clangxx -fsycl -fsyntax-only -DUSE_HIP %s -// RUN: %clangxx -fsycl -fsyntax-only -DUSE_CUDA_EXPERIMENTAL %s - -#ifdef USE_OPENCL -#include - -#include - -constexpr auto Backend = sycl::backend::opencl; -#endif - -#ifdef USE_L0 -#include - -#include - -constexpr auto Backend = sycl::backend::ext_oneapi_level_zero; -#endif - -#ifdef USE_CUDA -#include - -constexpr auto Backend = sycl::backend::ext_oneapi_cuda; -#endif - -#ifdef USE_HIP -#include - -constexpr auto Backend = sycl::backend::ext_oneapi_hip; -#endif - -#ifdef USE_CUDA_EXPERIMENTAL -#define SYCL_EXT_ONEAPI_BACKEND_CUDA 1 -#define SYCL_EXT_ONEAPI_BACKEND_CUDA_EXPERIMENTAL 1 -#include -#include - -constexpr auto Backend = sycl::backend::ext_oneapi_cuda; -#endif - -#include - -int main() { -#ifdef USE_OPENCL - static_assert( - std::is_same_v::input_type, - sycl::detail::interop::type>); - static_assert( - std::is_same_v::input_type, - sycl::detail::interop::type>); - static_assert( - std::is_same_v::input_type, - sycl::detail::interop::type>); - static_assert( - std::is_same_v::input_type, - sycl::detail::interop::type>); -#endif - -// CUDA does not have a native type for platforms -// HIP also should follow the same behavior - need confirmation -#if !(defined(USE_CUDA) || defined(USE_HIP)) - static_assert( - std::is_same_v::input_type, - sycl::detail::interop::type>); - static_assert( - std::is_same_v::return_type, - sycl::detail::interop::type>); -#endif - - static_assert( - std::is_same_v::return_type, - sycl::detail::interop::type>); -// CUDA experimental return type is different to inpt type -#ifndef USE_CUDA_EXPERIMENTAL - static_assert( - std::is_same_v::return_type, - sycl::detail::interop::type>); -#endif - static_assert( - std::is_same_v::return_type, - sycl::detail::interop::type>); - - return 0; -} diff --git a/sycl/test/basic_tests/is_group_trait.cpp b/sycl/test/basic_tests/is_group_trait.cpp new file mode 100644 index 0000000000000..60e7dd349ac8f --- /dev/null +++ b/sycl/test/basic_tests/is_group_trait.cpp @@ -0,0 +1,22 @@ +// RUN: %clangxx -fsycl %s + +#include + +template void Check() { + static_assert(std::is_base_of_v>); + static_assert(sycl::is_group::value == ExpectedBaseType::value); + static_assert(sycl::is_group_v == ExpectedBaseType::value); +} + +int main() { + Check, std::true_type>(); + Check, std::true_type>(); + Check, std::true_type>(); + Check(); + + Check(); + Check(); + Check(); + + return 0; +} diff --git a/sycl/test/basic_tests/reduction/reducer_copy_move.cpp b/sycl/test/basic_tests/reduction/reducer_copy_move.cpp new file mode 100644 index 0000000000000..707a880f51979 --- /dev/null +++ b/sycl/test/basic_tests/reduction/reducer_copy_move.cpp @@ -0,0 +1,79 @@ +// RUN: %clangxx -fsycl -fsyntax-only %s + +// Tests that the reducer class is neither movable nor copyable. + +#include + +#include + +template struct PlusWithoutIdentity { + T operator()(const T &A, const T &B) const { return A + B; } +}; + +template static constexpr void checkReducer() { + static_assert(!std::is_copy_constructible_v); + static_assert(!std::is_move_constructible_v); + static_assert(!std::is_copy_assignable_v); + static_assert(!std::is_move_assignable_v); +} + +int main() { + sycl::queue Q; + + int *ScalarMem = sycl::malloc_shared(1, Q); + int *SpanMem = sycl::malloc_shared(8, Q); + auto ScalarRed1 = sycl::reduction(ScalarMem, std::plus{}); + auto ScalarRed2 = sycl::reduction(ScalarMem, PlusWithoutIdentity{}); + auto SpanRed1 = + sycl::reduction(sycl::span{SpanMem, 8}, std::plus{}); + auto SpanRed2 = sycl::reduction(sycl::span{SpanMem, 8}, + PlusWithoutIdentity{}); + + Q.parallel_for(sycl::range<1>{1024}, ScalarRed1, + [=](sycl::item<1>, auto &Reducer) { + checkReducer>(); + }); + + Q.parallel_for(sycl::nd_range<1>{1024, 1024}, ScalarRed1, + [=](sycl::nd_item<1>, auto &Reducer) { + checkReducer>(); + }); + + Q.parallel_for(sycl::range<1>{1024}, ScalarRed2, + [=](sycl::item<1>, auto &Reducer) { + checkReducer>(); + }); + + Q.parallel_for(sycl::nd_range<1>{1024, 1024}, ScalarRed2, + [=](sycl::nd_item<1>, auto &Reducer) { + checkReducer>(); + }); + + Q.parallel_for( + sycl::range<1>{1024}, SpanRed1, [=](sycl::item<1>, auto &Reducer) { + checkReducer>(); + checkReducer>(); + }); + + Q.parallel_for( + sycl::nd_range<1>{1024, 1024}, SpanRed1, + [=](sycl::nd_item<1>, auto &Reducer) { + checkReducer>(); + checkReducer>(); + }); + + Q.parallel_for( + sycl::range<1>{1024}, SpanRed2, [=](sycl::item<1>, auto &Reducer) { + checkReducer>(); + checkReducer>(); + }); + + Q.parallel_for( + sycl::nd_range<1>{1024, 1024}, SpanRed2, + [=](sycl::nd_item<1>, auto &Reducer) { + checkReducer>(); + checkReducer>(); + }); + + return 0; +} diff --git a/sycl/test/basic_tests/reduction/reducer_copy_move_negative.cpp b/sycl/test/basic_tests/reduction/reducer_copy_move_negative.cpp new file mode 100644 index 0000000000000..32b1d08e50f53 --- /dev/null +++ b/sycl/test/basic_tests/reduction/reducer_copy_move_negative.cpp @@ -0,0 +1,221 @@ +// RUN: %clangxx -fsycl -fsyntax-only -ferror-limit=0 -Xclang -verify %s -Xclang -verify-ignore-unexpected=note + +// Tests the errors emitted from using the deleted copy and move assignment +// operators and constructors. + +#include + +#include + +template struct PlusWithoutIdentity { + T operator()(const T &A, const T &B) const { return A + B; } +}; + +int main() { + sycl::queue Q; + + int *ScalarMem = sycl::malloc_shared(1, Q); + int *SpanMem = sycl::malloc_shared(8, Q); + auto ScalarRed1 = sycl::reduction(ScalarMem, std::plus{}); + auto ScalarRed2 = sycl::reduction(ScalarMem, PlusWithoutIdentity{}); + auto SpanRed1 = + sycl::reduction(sycl::span{SpanMem, 8}, std::plus{}); + auto SpanRed2 = sycl::reduction(sycl::span{SpanMem, 8}, + PlusWithoutIdentity{}); + + // expected-error-re@sycl/reduction.hpp:* {{call to deleted constructor of 'sycl::reducer, 0{{.*}}>'}} + Q.parallel_for(sycl::range<1>{1024}, ScalarRed1, + [=](sycl::item<1>, auto Reducer) {}); + + // expected-error-re@sycl/reduction.hpp:* {{call to deleted constructor of 'sycl::reducer, 0{{.*}}>'}} + Q.parallel_for(sycl::nd_range<1>{1024, 1024}, ScalarRed1, + [=](sycl::nd_item<1>, auto Reducer) {}); + + // expected-error-re@sycl/reduction.hpp:* {{call to deleted constructor of 'sycl::reducer, 0{{.*}}>'}} + Q.parallel_for(sycl::range<1>{1024}, ScalarRed2, + [=](sycl::item<1>, auto Reducer) {}); + + // expected-error-re@sycl/reduction.hpp:* {{call to deleted constructor of 'sycl::reducer, 0{{.*}}>'}} + Q.parallel_for(sycl::nd_range<1>{1024, 1024}, ScalarRed2, + [=](sycl::nd_item<1>, auto Reducer) {}); + + // expected-error-re@sycl/reduction.hpp:* {{call to deleted constructor of 'sycl::reducer, 1{{.*}}>'}} + Q.parallel_for(sycl::range<1>{1024}, SpanRed1, + [=](sycl::item<1>, auto Reducer) {}); + + // expected-error-re@sycl/reduction.hpp:* {{call to deleted constructor of 'sycl::reducer, 1{{.*}}>'}} + Q.parallel_for(sycl::nd_range<1>{1024, 1024}, SpanRed1, + [=](sycl::nd_item<1>, auto Reducer) {}); + + // expected-error-re@sycl/reduction.hpp:* {{call to deleted constructor of 'sycl::reducer, 1{{.*}}>'}} + Q.parallel_for(sycl::range<1>{1024}, SpanRed2, + [=](sycl::item<1>, auto Reducer) {}); + + // expected-error-re@sycl/reduction.hpp:* {{call to deleted constructor of 'sycl::reducer, 1{{.*}}>'}} + Q.parallel_for(sycl::nd_range<1>{1024, 1024}, SpanRed2, + [=](sycl::nd_item<1>, auto Reducer) {}); + + Q.parallel_for(sycl::range<1>{1024}, ScalarRed1, + [=](sycl::item<1>, auto &Reducer) { + using reducer_t = std::remove_reference_t; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_t ReducerCopyAssign = Reducer; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_t ReducerMoveAssign = std::move(Reducer); + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_t ReducerCopyCtor{Reducer}; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_t ReducerMoveCtor{std::move(Reducer)}; + }); + + Q.parallel_for(sycl::nd_range<1>{1024, 1024}, ScalarRed1, + [=](sycl::nd_item<1>, auto &Reducer) { + using reducer_t = std::remove_reference_t; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_t ReducerCopyAssign = Reducer; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_t ReducerMoveAssign = std::move(Reducer); + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_t ReducerCopyCtor{Reducer}; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_t ReducerMoveCtor{std::move(Reducer)}; + }); + + Q.parallel_for(sycl::range<1>{1024}, ScalarRed2, + [=](sycl::item<1>, auto &Reducer) { + using reducer_t = std::remove_reference_t; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_t ReducerCopyAssign = Reducer; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_t ReducerMoveAssign = std::move(Reducer); + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_t ReducerCopyCtor{Reducer}; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_t ReducerMoveCtor{std::move(Reducer)}; + }); + + Q.parallel_for(sycl::nd_range<1>{1024, 1024}, ScalarRed2, + [=](sycl::nd_item<1>, auto &Reducer) { + using reducer_t = std::remove_reference_t; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_t ReducerCopyAssign = Reducer; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_t ReducerMoveAssign = std::move(Reducer); + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_t ReducerCopyCtor{Reducer}; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_t ReducerMoveCtor{std::move(Reducer)}; + }); + + Q.parallel_for( + sycl::range<1>{1024}, SpanRed1, [=](sycl::item<1>, auto &Reducer) { + using reducer_t = std::remove_reference_t; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 1{{.*}}>')}} + reducer_t ReducerCopyAssign = Reducer; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 1{{.*}}>')}} + reducer_t ReducerMoveAssign = std::move(Reducer); + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 1{{.*}}>')}} + reducer_t ReducerCopyCtor{Reducer}; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 1{{.*}}>')}} + reducer_t ReducerMoveCtor{std::move(Reducer)}; + + using reducer_subscript_t = + std::remove_reference_t; + reducer_subscript_t ReducerSubscript = Reducer[0]; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_subscript_t ReducerSubscriptCopyAssign = ReducerSubscript; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_subscript_t ReducerSubscriptMoveAssign = + std::move(ReducerSubscript); + // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_subscript_t ReducerSubscriptCopyCtor{ReducerSubscript}; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_subscript_t ReducerSubscriptMoveCtor{ + std::move(ReducerSubscript)}; + }); + + Q.parallel_for( + sycl::nd_range<1>{1024, 1024}, SpanRed1, + [=](sycl::nd_item<1>, auto &Reducer) { + using reducer_t = std::remove_reference_t; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 1{{.*}}>')}} + reducer_t ReducerCopyAssign = Reducer; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 1{{.*}}>')}} + reducer_t ReducerMoveAssign = std::move(Reducer); + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 1{{.*}}>')}} + reducer_t ReducerCopyCtor{Reducer}; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 1{{.*}}>')}} + reducer_t ReducerMoveCtor{std::move(Reducer)}; + + using reducer_subscript_t = + std::remove_reference_t; + reducer_subscript_t ReducerSubscript = Reducer[0]; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_subscript_t ReducerSubscriptCopyAssign = ReducerSubscript; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_subscript_t ReducerSubscriptMoveAssign = + std::move(ReducerSubscript); + // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_subscript_t ReducerSubscriptCopyCtor{ReducerSubscript}; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_subscript_t ReducerSubscriptMoveCtor{ + std::move(ReducerSubscript)}; + }); + + Q.parallel_for( + sycl::range<1>{1024}, SpanRed2, [=](sycl::item<1>, auto &Reducer) { + using reducer_t = std::remove_reference_t; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 1{{.*}}>')}} + reducer_t ReducerCopyAssign = Reducer; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 1{{.*}}>')}} + reducer_t ReducerMoveAssign = std::move(Reducer); + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 1{{.*}}>')}} + reducer_t ReducerCopyCtor{Reducer}; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 1{{.*}}>')}} + reducer_t ReducerMoveCtor{std::move(Reducer)}; + + using reducer_subscript_t = + std::remove_reference_t; + reducer_subscript_t ReducerSubscript = Reducer[0]; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_subscript_t ReducerSubscriptCopyAssign = ReducerSubscript; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_subscript_t ReducerSubscriptMoveAssign = + std::move(ReducerSubscript); + // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_subscript_t ReducerSubscriptCopyCtor{ReducerSubscript}; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_subscript_t ReducerSubscriptMoveCtor{ + std::move(ReducerSubscript)}; + }); + + Q.parallel_for( + sycl::nd_range<1>{1024, 1024}, SpanRed2, + [=](sycl::nd_item<1>, auto &Reducer) { + using reducer_t = std::remove_reference_t; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 1{{.*}}>')}} + reducer_t ReducerCopyAssign = Reducer; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 1{{.*}}>')}} + reducer_t ReducerMoveAssign = std::move(Reducer); + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 1{{.*}}>')}} + reducer_t ReducerCopyCtor{Reducer}; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_t' (aka 'sycl::reducer, 1{{.*}}>')}} + reducer_t ReducerMoveCtor{std::move(Reducer)}; + + using reducer_subscript_t = + std::remove_reference_t; + reducer_subscript_t ReducerSubscript = Reducer[0]; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_subscript_t ReducerSubscriptCopyAssign = ReducerSubscript; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_subscript_t ReducerSubscriptMoveAssign = + std::move(ReducerSubscript); + // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_subscript_t ReducerSubscriptCopyCtor{ReducerSubscript}; + // expected-error-re@+1 {{call to deleted constructor of 'reducer_subscript_t' (aka 'sycl::reducer, 0{{.*}}>')}} + reducer_subscript_t ReducerSubscriptMoveCtor{ + std::move(ReducerSubscript)}; + }); + + return 0; +} diff --git a/sycl/test/basic_tests/relational_builtins.cpp b/sycl/test/basic_tests/relational_builtins.cpp index 88b97d3980412..33f3ddb671d54 100644 --- a/sycl/test/basic_tests/relational_builtins.cpp +++ b/sycl/test/basic_tests/relational_builtins.cpp @@ -280,28 +280,28 @@ void foo() { // any CHECK(int, bool, any, int16_t) - CHECK(int, bool, any, int16v) + CHECK(int, int, any, int16v) CHECK2020(_, bool, any, int16m) CHECK(int, bool, any, int32_t) - CHECK(int, bool, any, int32v) + CHECK(int, int, any, int32v) CHECK2020(_, bool, any, int32m) CHECK(int, bool, any, int64_t) - CHECK(int, bool, any, int64v) + CHECK(int, int, any, int64v) CHECK2020(_, bool, any, int64m) // all CHECK(int, bool, all, int16_t) - CHECK(int, bool, all, int16v) + CHECK(int, int, all, int16v) CHECK2020(_, bool, all, int16m) CHECK(int, bool, all, int32_t) - CHECK(int, bool, all, int32v) + CHECK(int, int, all, int32v) CHECK2020(_, bool, all, int32m) CHECK(int, bool, all, int64_t) - CHECK(int, bool, all, int64v) + CHECK(int, int, all, int64v) CHECK2020(_, bool, all, int64m) // bitselect diff --git a/sycl/test/check_device_code/accessor_index.cpp b/sycl/test/check_device_code/accessor_index.cpp new file mode 100644 index 0000000000000..04d248e27ccc3 --- /dev/null +++ b/sycl/test/check_device_code/accessor_index.cpp @@ -0,0 +1,17 @@ +// RUN: %clangxx -fsycl-device-only -fno-sycl-early-optimizations -S -emit-llvm -D__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ -o - %s | FileCheck %s +#include + +// Check that accessor index calculation is unrolled in headers. +// CHECK-NOT: llvm.loop +// CHECK-NOT: br i1 +using namespace sycl; +int main() { + queue Q; + range<3> Range{8, 8, 8}; + buffer Buf(Range); + Q.submit([&](handler &Cgh) { + auto Acc = Buf.get_access(Cgh); + local_accessor LocAcc(Range, Cgh); + Cgh.parallel_for(Range, [=](item<3> It) { LocAcc[It] = Acc[It]; }); + }); +} diff --git a/sycl/test/esimd/intrins_trans.cpp b/sycl/test/esimd/intrins_trans.cpp index 7f5b4224c9e92..2ac31f54b058e 100644 --- a/sycl/test/esimd/intrins_trans.cpp +++ b/sycl/test/esimd/intrins_trans.cpp @@ -302,4 +302,12 @@ SYCL_EXTERNAL void test_math_intrins() SYCL_ESIMD_FUNCTION { // CHECK-LABEL: %{{[a-zA-Z0-9.]+}} = call <8 x float> @llvm.genx.ieee.sqrt.v8f32(<8 x float> %{{[a-zA-Z0-9.]+}}) use(y); } + { + vec x0 = get8i(); + vec x1 = get8i(); + vec x2 = get8i(); + auto res = __esimd_bfn<0xff, int, 8>(x0, x1, x2); + // CHECK-LABEL: %{{[a-zA-Z0-9.]+}} = call <8 x i32> @llvm.genx.bfn.v8i32.v8i32(<8 x i32> %{{[a-zA-Z0-9.]+}}, <8 x i32> %{{[a-zA-Z0-9.]+}}, <8 x i32> %{{[a-zA-Z0-9.]+}}, i8 -1) + use(res); + } } diff --git a/sycl/test/esimd/math_impl.cpp b/sycl/test/esimd/math_impl.cpp index 44901f7d04195..637462ae6dbbb 100644 --- a/sycl/test/esimd/math_impl.cpp +++ b/sycl/test/esimd/math_impl.cpp @@ -10,6 +10,7 @@ using namespace sycl; using namespace sycl::ext::intel; using namespace sycl::ext::intel::esimd; +using namespace sycl::ext::intel::experimental::esimd; // Math sin,cos,log,exp functions are translated into scalar __spirv_ocl_ calls SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd sycl_math(simd x) { @@ -52,3 +53,13 @@ esimd_math_emu(simd x) { v = esimd::exp(v); return v; } + +// Logical BNF function from esimd namespace is translated into __esimd_ calls, +// which later translate into GenX intrinsics. +SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd +esimd_bfn(simd x, simd y, simd z) { + simd v = + experimental::esimd::bfn<~bfn_t::x & ~bfn_t::y & ~bfn_t::z>(x, y, z); + //CHECK: call spir_func noundef <16 x i32> @_Z11__esimd_bfn + return v; +} diff --git a/sycl/test/invoke_simd/return-type-mismatch-error.cpp b/sycl/test/invoke_simd/return-type-mismatch-error.cpp new file mode 100644 index 0000000000000..2e8907d27f912 --- /dev/null +++ b/sycl/test/invoke_simd/return-type-mismatch-error.cpp @@ -0,0 +1,30 @@ +// RUN: not %clangxx -fsycl -fsycl-device-only -S %s -o /dev/null 2>&1 | FileCheck %s +#include +#include + +using namespace sycl::ext::oneapi::experimental; +using namespace sycl; +namespace esimd = sycl::ext::intel::esimd; + +[[intel::device_indirectly_callable]] simd callee(simd) { + return simd(); +} + +void foo() { + constexpr unsigned Size = 1024; + constexpr unsigned GroupSize = 64; + sycl::range<1> GlobalRange{Size}; + sycl::range<1> LocalRange{GroupSize}; + sycl::nd_range<1> Range(GlobalRange, LocalRange); + queue q; + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for(Range, [=](nd_item<1> ndi) { + invoke_simd(ndi.get_sub_group(), callee, 0); + }); + }); +} + +int main() { + foo(); + // CHECK: {{.*}}error:{{.*}}static assertion failed due to requirement 'RetVecLength == 8': invoke_simd callee return type vector length must match kernel subgroup size{{.*}} +} diff --git a/sycl/test/lit.cfg.py b/sycl/test/lit.cfg.py index b2fffb3c273b5..3ca752c46b182 100644 --- a/sycl/test/lit.cfg.py +++ b/sycl/test/lit.cfg.py @@ -96,7 +96,12 @@ llvm_symbolizer = os.path.join(config.llvm_build_bin_dir, 'llvm-symbolizer') llvm_config.with_environment('LLVM_SYMBOLIZER_PATH', llvm_symbolizer) -config.substitutions.append( ('%fsycl-host-only', '-std=c++17 -Xclang -fsycl-is-host -isystem %s -isystem %s -isystem %s -isystem %s' % (config.sycl_include, config.level_zero_include_dir, config.opencl_include_dir, config.sycl_include + '/sycl/') ) ) +sycl_host_only_options = '-std=c++17 -Xclang -fsycl-is-host' +for include_dir in [config.sycl_include, config.level_zero_include_dir, config.opencl_include_dir, config.sycl_include + '/sycl/']: + if include_dir: + sycl_host_only_options += ' -isystem %s' % include_dir +config.substitutions.append( ('%fsycl-host-only', sycl_host_only_options) ) + config.substitutions.append( ('%sycl_lib', ' -lsycl6' if platform.system() == "Windows" else '-lsycl') ) llvm_config.add_tool_substitutions(['llvm-spirv'], [config.sycl_tools_dir]) @@ -116,6 +121,12 @@ if config.esimd_emulator_be == "ON": config.available_features.add('esimd_emulator_be') +if config.opencl_be == "ON": + config.available_features.add('opencl_be') + +if config.level_zero_be == "ON": + config.available_features.add('level_zero_be') + if triple == 'nvptx64-nvidia-cuda': llvm_config.with_system_environment('CUDA_PATH') config.available_features.add('cuda') diff --git a/sycl/test/lit.site.cfg.py.in b/sycl/test/lit.site.cfg.py.in index d5a6317e2e23c..0e0f031df2ed4 100644 --- a/sycl/test/lit.site.cfg.py.in +++ b/sycl/test/lit.site.cfg.py.in @@ -28,6 +28,8 @@ config.extra_environment = lit_config.params.get("extra_environment", "@LIT_EXTR config.cuda_be = '@SYCL_BUILD_PI_CUDA@' config.esimd_emulator_be = '@SYCL_BUILD_PI_ESIMD_EMULATOR@' config.hip_be = '@SYCL_BUILD_PI_HIP@' +config.opencl_be = '@SYCL_BUILD_PI_OPENCL@' +config.level_zero_be = '@SYCL_BUILD_PI_LEVEL_ZERO@' import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/sycl/test/optional_kernel_features/relaxed_fp64_propagation.cpp b/sycl/test/optional_kernel_features/relaxed_fp64_propagation.cpp new file mode 100644 index 0000000000000..b354702ca47de --- /dev/null +++ b/sycl/test/optional_kernel_features/relaxed_fp64_propagation.cpp @@ -0,0 +1,24 @@ +// RUN: %clangxx %s -S -o %t_opt.ll -fsycl-device-only -Xclang -verify -Xclang -verify-ignore-unexpected=note +// RUN: FileCheck %s --input-file %t_opt.ll --check-prefix=CHECK-OPT +// RUN: %clangxx %s -S -fno-sycl-early-optimizations -o %t_noopt.ll -fsycl-device-only -Xclang -verify -Xclang -verify-ignore-unexpected=note +// RUN: FileCheck %s --input-file %t_noopt.ll --check-prefix=CHECK-NOOPT + +// Tests that an optimization that removes the use of double still produces a +// warning. + +// CHECK-OPT-NOT: double +// CHECK-NOOPT: double + +#include + +int main() { + sycl::queue Q; + // expected-warning-re@+1 {{function '{{.*}}' uses aspect 'fp64' not listed in its 'sycl::device_has' attribute}} + Q.single_task([=]() [[sycl::device_has()]] { + // Double will be optimized out as LoweredFloat can be set directly to a + // lowered value. + double Double = 3.14; + volatile float LoweredFloat = Double; + }); + return 0; +} diff --git a/sycl/test/regression/vec_init_list_ctor.cpp b/sycl/test/regression/vec_init_list_ctor.cpp new file mode 100644 index 0000000000000..740075bc0fe80 --- /dev/null +++ b/sycl/test/regression/vec_init_list_ctor.cpp @@ -0,0 +1,12 @@ +// RUN: %clang -fsycl -fsyntax-only -Xclang -verify %s +// expected-no-diagnostics + +// Regression test checking that the vector ctor taking an initializer list +// doesn't cause warnings or errors. + +#include + +int main() { + sycl::vec V({1, 2}); + return 0; +} diff --git a/sycl/unittests/Extensions/DeviceGlobal.cpp b/sycl/unittests/Extensions/DeviceGlobal.cpp index 4d47500507502..dc34b822a1013 100644 --- a/sycl/unittests/Extensions/DeviceGlobal.cpp +++ b/sycl/unittests/Extensions/DeviceGlobal.cpp @@ -12,6 +12,7 @@ #include "detail/context_impl.hpp" #include "detail/kernel_program_cache.hpp" +#include #include #include @@ -42,31 +43,17 @@ sycl::ext::oneapi::experimental::device_global< namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { -template <> struct KernelInfo { - static constexpr unsigned getNumParams() { return 0; } - static const kernel_param_desc_t &getParamDesc(int) { - static kernel_param_desc_t Dummy; - return Dummy; - } +template <> +struct KernelInfo + : public unittest::MockKernelInfoBase { static constexpr const char *getName() { return DeviceGlobalTestKernelName; } - static constexpr bool isESIMD() { return false; } - static constexpr bool callsThisItem() { return false; } - static constexpr bool callsAnyThisFreeFunction() { return false; } - static constexpr int64_t getKernelSize() { return 1; } }; -template <> struct KernelInfo { - static constexpr unsigned getNumParams() { return 0; } - static const kernel_param_desc_t &getParamDesc(int) { - static kernel_param_desc_t Dummy; - return Dummy; - } +template <> +struct KernelInfo + : public unittest::MockKernelInfoBase { static constexpr const char *getName() { return DeviceGlobalImgScopeTestKernelName; } - static constexpr bool isESIMD() { return false; } - static constexpr bool callsThisItem() { return false; } - static constexpr bool callsAnyThisFreeFunction() { return false; } - static constexpr int64_t getKernelSize() { return 1; } }; } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) diff --git a/sycl/unittests/Extensions/USMMemcpy2D.cpp b/sycl/unittests/Extensions/USMMemcpy2D.cpp index 8139f6ae24edd..ffdaa86ff95a6 100644 --- a/sycl/unittests/Extensions/USMMemcpy2D.cpp +++ b/sycl/unittests/Extensions/USMMemcpy2D.cpp @@ -10,6 +10,7 @@ #include +#include #include #include @@ -23,7 +24,9 @@ constexpr const char *USMMemcpyHelperKernelNameChar = "__usmmemcpy2d_char"; namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { -template <> struct KernelInfo> { +template <> +struct KernelInfo> + : public unittest::MockKernelInfoBase { static constexpr const char *getName() { return USMFillHelperKernelNameLong; } static constexpr unsigned getNumParams() { return 7; } static const kernel_param_desc_t &getParamDesc(int Idx) { @@ -39,15 +42,14 @@ template <> struct KernelInfo> { }; return DummySignature[Idx]; } - static constexpr bool isESIMD() { return false; } - static constexpr bool callsThisItem() { return false; } - static constexpr bool callsAnyThisFreeFunction() { return false; } static constexpr int64_t getKernelSize() { return 2 * sizeof(void *) + 2 * sizeof(sycl::id<2>) + 3 * sizeof(size_t); } }; -template <> struct KernelInfo> { +template <> +struct KernelInfo> + : public unittest::MockKernelInfoBase { static constexpr const char *getName() { return USMFillHelperKernelNameChar; } static constexpr unsigned getNumParams() { return 7; } static const kernel_param_desc_t &getParamDesc(int Idx) { @@ -63,15 +65,14 @@ template <> struct KernelInfo> { }; return DummySignature[Idx]; } - static constexpr bool isESIMD() { return false; } - static constexpr bool callsThisItem() { return false; } - static constexpr bool callsAnyThisFreeFunction() { return false; } static constexpr int64_t getKernelSize() { return 2 * sizeof(void *) + 2 * sizeof(sycl::id<2>) + 3 * sizeof(size_t); } }; -template <> struct KernelInfo> { +template <> +struct KernelInfo> + : public unittest::MockKernelInfoBase { static constexpr const char *getName() { return USMMemcpyHelperKernelNameLong; } @@ -90,15 +91,14 @@ template <> struct KernelInfo> { }; return DummySignature[Idx]; } - static constexpr bool isESIMD() { return false; } - static constexpr bool callsThisItem() { return false; } - static constexpr bool callsAnyThisFreeFunction() { return false; } static constexpr int64_t getKernelSize() { return 2 * sizeof(void *) + 2 * sizeof(sycl::id<2>) + 4 * sizeof(size_t); } }; -template <> struct KernelInfo> { +template <> +struct KernelInfo> + : public unittest::MockKernelInfoBase { static constexpr const char *getName() { return USMMemcpyHelperKernelNameChar; } @@ -117,9 +117,6 @@ template <> struct KernelInfo> { }; return DummySignature[Idx]; } - static constexpr bool isESIMD() { return false; } - static constexpr bool callsThisItem() { return false; } - static constexpr bool callsAnyThisFreeFunction() { return false; } static constexpr int64_t getKernelSize() { return 2 * sizeof(void *) + 2 * sizeof(sycl::id<2>) + 4 * sizeof(size_t); } diff --git a/sycl/unittests/Extensions/WeakObject.cpp b/sycl/unittests/Extensions/WeakObject.cpp index 2357ab5febf8a..8af9974bdba4a 100644 --- a/sycl/unittests/Extensions/WeakObject.cpp +++ b/sycl/unittests/Extensions/WeakObject.cpp @@ -198,6 +198,39 @@ template struct WeakObjectCheckOwnerLessMap { } }; +template struct WeakObjectCheckCopy { + void operator()(SyclObjT Obj) { + sycl::ext::oneapi::weak_object WeakObj{Obj}; + + sycl::ext::oneapi::weak_object WeakObjCopyCtor{WeakObj}; + sycl::ext::oneapi::weak_object WeakObjCopyAssign = WeakObj; + + EXPECT_FALSE(WeakObjCopyCtor.expired()); + EXPECT_FALSE(WeakObjCopyAssign.expired()); + + EXPECT_TRUE(WeakObjCopyCtor.lock() == Obj); + EXPECT_TRUE(WeakObjCopyAssign.lock() == Obj); + } +}; + +template struct WeakObjectCheckMove { + void operator()(SyclObjT Obj) { + sycl::ext::oneapi::weak_object WeakObj1{Obj}; + sycl::ext::oneapi::weak_object WeakObj2{Obj}; + + sycl::ext::oneapi::weak_object WeakObjMoveCtor{ + std::move(WeakObj1)}; + sycl::ext::oneapi::weak_object WeakObjMoveAssign = + std::move(WeakObj2); + + EXPECT_FALSE(WeakObjMoveCtor.expired()); + EXPECT_FALSE(WeakObjMoveAssign.expired()); + + EXPECT_TRUE(WeakObjMoveCtor.lock() == Obj); + EXPECT_TRUE(WeakObjMoveAssign.lock() == Obj); + } +}; + template