diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll new file mode 100644 index 0000000000000..cd890e158c734 --- /dev/null +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll @@ -0,0 +1,133 @@ +; This test emulates two translation units with 3 kernels: +; TU0_kernel0 - 1st translation unit, no reqd_sub_group_size attribute used +; TU0_kernel1 - 1st translation unit, reqd_sub_group_size attribute is used +; TU1_kernel2 - 2nd translation unit, no reqd_sub_group_size attribute used + +; The test is intended to check that sycl-post-link correctly separates kernels +; that use reqd_sub_group_size attributes from kernels which doesn't use them +; regardless of device code split mode + +; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + +; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + +; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + +; Regardless of device code split mode, each kernel should go into a separate +; device image + +; CHECK-M2-IR: define {{.*}} @TU0_kernel0 +; CHECK-M2-SYMS: TU0_kernel0 + +; CHECK-M1-IR: define {{.*}} @TU0_kernel1 +; CHECK-M1-SYMS: TU0_kernel1 + +; CHECK-M0-IR: define {{.*}} @TU1_kernel2 +; CHECK-M0-SYMS: TU1_kernel2 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +; FIXME: device globals should also be properly distributed across device images +; if they are of optional type +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +define dso_local spir_kernel void @TU0_kernel0() #0 { +entry: + call spir_func void @foo() + ret void +} + +define dso_local spir_func void @foo() { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @bar(i32 1) + %add = add nsw i32 2, %call + store i32 %add, i32* %a, align 4 + ret void +} + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @bar(i32 %arg) { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, i32* %arg.addr, align 4 + %0 = load i32, i32* %arg.addr, align 4 + ret i32 %0 +} + +define dso_local spir_kernel void @TU0_kernel1() #0 !intel_reqd_sub_group_size !2 { +entry: + call spir_func void @foo1() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @foo1() { +entry: + %a = alloca i32, align 4 + store i32 2, i32* %a, align 4 + ret void +} + +define dso_local spir_kernel void @TU1_kernel2() #1 { +entry: + call spir_func void @foo2() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @foo2() { +entry: + %a = alloca i32, align 4 + %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, i32* %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} +!2 = !{i32 32} \ No newline at end of file diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll new file mode 100644 index 0000000000000..155b843c390a5 --- /dev/null +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll @@ -0,0 +1,60 @@ +; The test is intended to check that sycl-post-link correctly groups kernels +; by unique reqd_sub_group_size values used in them + +; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table +; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE +; +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel3 +; +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 \ +; RUN: --implicit-check-not kernel3 + +; +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 \ +; RUN: --implicit-check-not kernel2 + +; CHECK-TABLE: Code +; CHECK-TABLE-NEXT: _0.sym +; CHECK-TABLE-NEXT: _1.sym +; CHECK-TABLE-NEXT: _2.sym +; CHECK-TABLE-EMPTY: + +; CHECK-M0-SYMS: kernel1 +; CHECK-M0-SYMS: kernel2 + +; CHECK-M1-SYMS: kernel0 + +; CHECK-M2-SYMS: kernel3 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +define dso_local spir_kernel void @kernel0() #0 !intel_reqd_sub_group_size !1 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel1() #0 !intel_reqd_sub_group_size !2 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel2() #0 !intel_reqd_sub_group_size !3 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel3() #0 !intel_reqd_sub_group_size !4 { +entry: + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } + +!1 = !{i32 32} +!2 = !{i32 64} +!3 = !{i32 64} +!4 = !{i32 16} \ No newline at end of file diff --git a/llvm/test/tools/sycl-post-link/device-requirements/reqd-sub-group-size.ll b/llvm/test/tools/sycl-post-link/device-requirements/reqd-sub-group-size.ll new file mode 100644 index 0000000000000..df4d5682c0623 --- /dev/null +++ b/llvm/test/tools/sycl-post-link/device-requirements/reqd-sub-group-size.ll @@ -0,0 +1,125 @@ +; Original code: +; Compile with: clang++ -fsycl -fsycl-device-only -fno-sycl-instrument-device-code -D__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ -S reqd-sub-group-size.cpp +; #include + +; int main() { +; sycl::queue q; +; q.submit([&](sycl::handler &h) { +; h.parallel_for( +; sycl::range<1>(32), +; [=](sycl::item<1> it) [[sycl::reqd_sub_group_size(16)]] {}); +; }); +; q.submit([&](sycl::handler &h) { +; h.parallel_for( +; sycl::range<1>(32), +; [=](sycl::item<1> it) [[sycl::reqd_sub_group_size(32)]] {}); +; }); +; q.submit([&](sycl::handler &h) { +; h.parallel_for( +; sycl::range<1>(32), +; [=](sycl::item<1> it) [[sycl::reqd_sub_group_size(16)]] {}); +; }); +; return 0; +; } + +; RUN: sycl-post-link -split=auto %s -o %t.table +; RUN: FileCheck %s -input-file=%t_0.prop --check-prefix CHECK-PROP-AUTO-SPLIT-0 +; RUN: FileCheck %s -input-file=%t_1.prop --check-prefix CHECK-PROP-AUTO-SPLIT-1 + +; CHECK-PROP-AUTO-SPLIT-0: [SYCL/device requirements] +; CHECK-PROP-AUTO-SPLIT-0: reqd_sub_group_size=1|32 + +; CHECK-PROP-AUTO-SPLIT-1: [SYCL/device requirements] +; CHECK-PROP-AUTO-SPLIT-1: reqd_sub_group_size=1|16 + +; ModuleID = 'foo.cpp' +source_filename = "foo.cpp" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +$_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E7KernelA = comdat any + +$_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E7KernelB = comdat any + +$_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE1_clES2_E7KernelC = comdat any + +; Function Attrs: norecurse nounwind +define weak_odr dso_local spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E7KernelA() local_unnamed_addr #0 comdat !srcloc !48 !kernel_arg_buffer_location !49 !intel_reqd_sub_group_size !50 !sycl_fixed_targets !49 !sycl_kernel_omit_args !49 { +entry: + ret void +} + +; Function Attrs: norecurse nounwind +define weak_odr dso_local spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E7KernelB() local_unnamed_addr #0 comdat !srcloc !51 !kernel_arg_buffer_location !49 !intel_reqd_sub_group_size !52 !sycl_fixed_targets !49 !sycl_kernel_omit_args !49 { +entry: + ret void +} + +; Function Attrs: norecurse nounwind +define weak_odr dso_local spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE1_clES2_E7KernelC() local_unnamed_addr #0 comdat !srcloc !53 !kernel_arg_buffer_location !49 !intel_reqd_sub_group_size !50 !sycl_fixed_targets !49 !sycl_kernel_omit_args !49 { +entry: + ret void +} + +attributes #0 = { norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="foo.cpp" "sycl-optlevel"="2" "uniform-work-group-size"="true" } + +!llvm.module.flags = !{!0, !1} +!opencl.spir.version = !{!2} +!spirv.Source = !{!3} +!sycl_aspects = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46} +!llvm.ident = !{!47} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"frame-pointer", i32 2} +!2 = !{i32 1, i32 2} +!3 = !{i32 4, i32 100000} +!4 = !{!"cpu", i32 1} +!5 = !{!"gpu", i32 2} +!6 = !{!"accelerator", i32 3} +!7 = !{!"custom", i32 4} +!8 = !{!"fp16", i32 5} +!9 = !{!"fp64", i32 6} +!10 = !{!"image", i32 9} +!11 = !{!"online_compiler", i32 10} +!12 = !{!"online_linker", i32 11} +!13 = !{!"queue_profiling", i32 12} +!14 = !{!"usm_device_allocations", i32 13} +!15 = !{!"usm_host_allocations", i32 14} +!16 = !{!"usm_shared_allocations", i32 15} +!17 = !{!"usm_system_allocations", i32 17} +!18 = !{!"ext_intel_pci_address", i32 18} +!19 = !{!"ext_intel_gpu_eu_count", i32 19} +!20 = !{!"ext_intel_gpu_eu_simd_width", i32 20} +!21 = !{!"ext_intel_gpu_slices", i32 21} +!22 = !{!"ext_intel_gpu_subslices_per_slice", i32 22} +!23 = !{!"ext_intel_gpu_eu_count_per_subslice", i32 23} +!24 = !{!"ext_intel_max_mem_bandwidth", i32 24} +!25 = !{!"ext_intel_mem_channel", i32 25} +!26 = !{!"usm_atomic_host_allocations", i32 26} +!27 = !{!"usm_atomic_shared_allocations", i32 27} +!28 = !{!"atomic64", i32 28} +!29 = !{!"ext_intel_device_info_uuid", i32 29} +!30 = !{!"ext_oneapi_srgb", i32 30} +!31 = !{!"ext_oneapi_native_assert", i32 31} +!32 = !{!"host_debuggable", i32 32} +!33 = !{!"ext_intel_gpu_hw_threads_per_eu", i32 33} +!34 = !{!"ext_oneapi_cuda_async_barrier", i32 34} +!35 = !{!"ext_oneapi_bfloat16_math_functions", i32 35} +!36 = !{!"ext_intel_free_memory", i32 36} +!37 = !{!"ext_intel_device_id", i32 37} +!38 = !{!"ext_intel_memory_clock_rate", i32 38} +!39 = !{!"ext_intel_memory_bus_width", i32 39} +!40 = !{!"emulated", i32 40} +!41 = !{!"ext_intel_legacy_image", i32 41} +!42 = !{!"int64_base_atomics", i32 7} +!43 = !{!"int64_extended_atomics", i32 8} +!44 = !{!"usm_system_allocator", i32 17} +!45 = !{!"usm_restricted_shared_allocations", i32 16} +!46 = !{!"host", i32 0} +!47 = !{!"clang version 17.0.0 (https://github.com/jzc/llvm eed5b5576bef314433e8ae7313620dae399c9d22)"} +!48 = !{i32 170} +!49 = !{} +!50 = !{i32 16} +!51 = !{i32 351} +!52 = !{i32 32} +!53 = !{i32 532} diff --git a/llvm/test/tools/sycl-post-link/registerallocmode.ll b/llvm/test/tools/sycl-post-link/registerallocmode.ll index 751fe6de2667a..a008d2593dd83 100644 --- a/llvm/test/tools/sycl-post-link/registerallocmode.ll +++ b/llvm/test/tools/sycl-post-link/registerallocmode.ll @@ -2,41 +2,49 @@ ; RUN: sycl-post-link -split=source -symbols -split-esimd -lower-esimd -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t.table -; RUN: FileCheck %s -input-file=%t_esimd_0.ll --check-prefixes CHECK-ESIMD-LargeGRF-IR -; RUN: FileCheck %s -input-file=%t_esimd_0.prop --check-prefixes CHECK-ESIMD-LargeGRF-PROP -; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-SYCL-LargeGRF-IR -; RUN: FileCheck %s -input-file=%t_0.prop --check-prefixes CHECK-SYCL-LargeGRF-PROP -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-SYCL-LargeGRF-SYM -; RUN: FileCheck %s -input-file=%t_1.prop --check-prefixes CHECK-SYCL-PROP -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYCL-SYM -; RUN: FileCheck %s -input-file=%t_esimd_1.prop --check-prefixes CHECK-ESIMD-PROP -; RUN: FileCheck %s -input-file=%t_esimd_0.sym --check-prefixes CHECK-ESIMD-LargeGRF-SYM ; CHECK: [Code|Properties|Symbols] -; CHECK: {{.*}}_esimd_0.ll|{{.*}}_esimd_0.prop|{{.*}}_esimd_0.sym -; CHECK: {{.*}}_esimd_1.ll|{{.*}}_esimd_1.prop|{{.*}}_esimd_1.sym -; CHECK: {{.*}}_1.ll|{{.*}}_1.prop|{{.*}}_1.sym +; CHECK-NEXT: {{.*}}_esimd_0.ll|{{.*}}_esimd_0.prop|{{.*}}_esimd_0.sym +; CHECK-NEXT: {{.*}}_1.ll|{{.*}}_1.prop|{{.*}}_1.sym +; CHECK-NEXT: {{.*}}_esimd_2.ll|{{.*}}_esimd_2.prop|{{.*}}_esimd_2.sym +; CHECK-NEXT: {{.*}}_3.ll|{{.*}}_3.prop|{{.*}}_3.sym -; CHECK-ESIMD-LargeGRF-PROP: isEsimdImage=1|1 -; CHECK-ESIMD-LargeGRF-PROP: sycl-register-alloc-mode=1|2 +; RUN: FileCheck %s -input-file=%t_esimd_0.ll --check-prefixes CHECK-ESIMD-LargeGRF-IR +; RUN: FileCheck %s -input-file=%t_esimd_0.prop --check-prefixes CHECK-ESIMD-LargeGRF-PROP +; RUN: FileCheck %s -input-file=%t_esimd_0.sym --check-prefixes CHECK-ESIMD-LargeGRF-SYM -; CHECK-SYCL-LargeGRF-PROP: sycl-register-alloc-mode=1|2 +; CHECK-ESIMD-LargeGRF-SYM: __ESIMD_large_grf_kernel +; CHECK-ESIMD-LargeGRF-SYM-EMPTY: -; CHECK-SYCL-PROP-NOT: sycl-register-alloc-mode +; CHECK-ESIMD-LargeGRF-PROP: isEsimdImage=1|1 +; CHECK-ESIMD-LargeGRF-PROP: sycl-register-alloc-mode=1|2 -; CHECK-SYCL-SYM: __SYCL_kernel -; CHECK-SYCL-SYM-EMPTY: +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-SYCL-LargeGRF-IR +; RUN: FileCheck %s -input-file=%t_1.prop --check-prefixes CHECK-SYCL-LargeGRF-PROP +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYCL-LargeGRF-SYM ; CHECK-SYCL-LargeGRF-SYM: __SYCL_kernel_large_grf ; CHECK-SYCL-LargeGRF-SYM-EMPTY: +; CHECK-SYCL-LargeGRF-PROP: sycl-register-alloc-mode=1|2 + +; RUN: FileCheck %s -input-file=%t_esimd_2.prop --check-prefixes CHECK-ESIMD-PROP +; RUN: FileCheck %s -input-file=%t_esimd_2.sym --check-prefixes CHECK-ESIMD-SYM + ; CHECK-ESIMD-SYM: __ESIMD_kernel ; CHECK-ESIMD-SYM-EMPTY: ; CHECK-ESIMD-PROP-NOT: sycl-register-alloc-mode -; CHECK-ESIMD-LargeGRF-SYM: __ESIMD_large_grf_kernel -; CHECK-ESIMD-LargeGRF-SYM-EMPTY: +; RUN: FileCheck %s -input-file=%t_3.prop --check-prefixes CHECK-SYCL-PROP +; RUN: FileCheck %s -input-file=%t_3.sym --check-prefixes CHECK-SYCL-SYM + +; CHECK-SYCL-SYM: __SYCL_kernel +; CHECK-SYCL-SYM-EMPTY: + +; CHECK-SYCL-PROP-NOT: sycl-register-alloc-mode + + ; ModuleID = 'large_grf.bc' source_filename = "llvm-link" diff --git a/llvm/test/tools/sycl-post-link/sycl-esimd/invoke-esimd-double.ll b/llvm/test/tools/sycl-post-link/sycl-esimd/invoke-esimd-double.ll index c4f8733a11f19..a614184d2a83c 100644 --- a/llvm/test/tools/sycl-post-link/sycl-esimd/invoke-esimd-double.ll +++ b/llvm/test/tools/sycl-post-link/sycl-esimd/invoke-esimd-double.ll @@ -1,4 +1,4 @@ -; RUN: sycl-post-link -symbols -split=auto -S < %s -o %t.table +; RUN: sycl-post-link --emit-only-kernels-as-entry-points -symbols -split=auto -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t.table --check-prefixes CHECK-TABLE ; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS @@ -9,14 +9,10 @@ ; CHECK-TABLE: {{.*}}_1.ll|{{.*}}_1.prop|{{.*}}_1.sym ; CHECK-TABLE-EMPTY: -; CHECK-M0-SYMS: simd_func_double -; CHECK-M0-SYMS-NEXT: helper_double_[[#]] -; CHECK-M0-SYMS-NEXT: double_kernel +; CHECK-M0-SYMS: double_kernel ; CHECK-M0-SYMS-EMPTY: -; CHECK-M1-SYMS: simd_func_float -; CHECK-M1-SYMS-NEXT: helper_float_[[#]] -; CHECK-M1-SYMS-NEXT: float_kernel +; CHECK-M1-SYMS: float_kernel ; CHECK-M1-SYMS-EMPTY: target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" diff --git a/llvm/test/tools/sycl-post-link/sycl-esimd/lower-with-no-esimd-entry.ll b/llvm/test/tools/sycl-post-link/sycl-esimd/lower-with-no-esimd-entry.ll new file mode 100644 index 0000000000000..63a5ffb1fe626 --- /dev/null +++ b/llvm/test/tools/sycl-post-link/sycl-esimd/lower-with-no-esimd-entry.ll @@ -0,0 +1,44 @@ +; This test checks to see if ESIMD lowering is performed even without the +; the presence of ESIMD entry points. + +; RUN: sycl-post-link -symbols -lower-esimd -split=auto -S < %s -o %t.table +; RUN: FileCheck %s -input-file=%t.table --check-prefixes CHECK-TABLE +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-SYMS +; RUN: FileCheck %s -input-file=%t_0.ll + +; CHECK-TABLE: {{.*}}_0.ll|{{.*}}_0.prop|{{.*}}_0.sym +; CHECK-TABLE-EMPTY: + +; CHECK-SYMS: _ZTSZ4mainE3Foo +; CHECK-SYMS-EMPTY: + +define weak_odr dso_local spir_kernel void @_ZTSZ4mainE3Foo(ptr addrspace(1) noundef align 4 %_arg_p) #0 { +entry: + %0 = load i32, ptr addrspace(1) %_arg_p, align 4 + %call1.i.i = tail call x86_regcallcc noundef i32 @_Z33__regcall3____builtin_invoke_simd1(ptr noundef nonnull @helper, ptr noundef nonnull @ESIMD_function, i32 noundef %0) #5 + store i32 %call1.i.i, ptr addrspace(1) %_arg_p, align 4 + ret void +} + +define linkonce_odr dso_local x86_regcallcc <16 x i32> @ESIMD_function(<16 x i32> %x) #1 !sycl_explicit_simd !0 !intel_reqd_sub_group_size !1 { +entry: + ret <16 x i32> zeroinitializer +} + +declare dso_local x86_regcallcc noundef i32 @_Z33__regcall3____builtin_invoke_simd1(ptr noundef, ptr noundef, i32 noundef) + +; The generated helper should be inlined with the call to @ESIMD_function. +; CHECK: @helper_{{[0-9]+}}(<16 x i32> %simd_args.coerce) +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <16 x i32> zeroinitializer +define linkonce_odr dso_local x86_regcallcc <16 x i32> @helper(ptr noundef nonnull %f, <16 x i32> %simd_args.coerce) #1 { +entry: + %call = tail call x86_regcallcc <16 x i32> %f(<16 x i32> %simd_args.coerce) + ret <16 x i32> %call +} + +attributes #0 = { "sycl-module-id"="test.cpp" } +attributes #1 = { "referenced-indirectly" } + +!0 = !{} +!1 = !{i32 1} \ No newline at end of file diff --git a/llvm/test/tools/sycl-post-link/sycl-esimd/no-sycl-esimd-split-shared-func.ll b/llvm/test/tools/sycl-post-link/sycl-esimd/no-sycl-esimd-split-shared-func.ll index aae4d6278fda6..f2f9568141e6e 100644 --- a/llvm/test/tools/sycl-post-link/sycl-esimd/no-sycl-esimd-split-shared-func.ll +++ b/llvm/test/tools/sycl-post-link/sycl-esimd/no-sycl-esimd-split-shared-func.ll @@ -7,9 +7,10 @@ ; RUN: sycl-post-link -lower-esimd -symbols -split=auto -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t.table --check-prefixes CHECK-TABLE -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-HELPERS-SYM -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-ESIMD-SYM -; RUN: FileCheck %s -input-file=%t_1.ll +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-HELPERS-SYM-1 +; RUN: FileCheck %s -input-file=%t_esimd_1.sym --check-prefixes CHECK-HELPERS-SYM-2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-ESIMD-SYM +; RUN: FileCheck %s -input-file=%t_2.ll ;---------------- Verify generated table file. ; CHECK-TABLE: [Code|Properties|Symbols] @@ -19,13 +20,15 @@ ; SIMD_CALL_HELPER_* functions generated by the compiler, because we don't ; expect them to be referenced externally. ; CHECK-TABLE: {{.*}}_0.ll|{{.*}}_0.prop|{{.*}}_0.sym -; CHECK-TABLE: {{.*}}_1.ll|{{.*}}_1.prop|{{.*}}_1.sym +; CHECK-TABLE: {{.*}}_esimd_1.ll|{{.*}}_esimd_1.prop|{{.*}}_esimd_1.sym +; CHECK-TABLE: {{.*}}_2.ll|{{.*}}_2.prop|{{.*}}_2.sym ; CHECK-TABLE-EMPTY: ;---------------- Verify generated symbol file. -; CHECK-HELPERS-SYM: SIMD_CALL_HELPER_[[#]] -; CHECK-HELPERS-SYM: SPMD_CALLER -; CHECK-HELPERS-SYM-EMPTY: +; CHECK-HELPERS-SYM-1: SPMD_CALLER +; CHECK-HELPERS-SYM-1-EMPTY: +; CHECK-HELPERS-SYM-2: SIMD_CALL_HELPER_[[#]] +; CHECK-HELPERS-SYM-2-EMPTY: ; ; CHECK-ESIMD-SYM: ESIMD_kernel ; CHECK-ESIMD-SYM: SYCL_kernel diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.cpp b/llvm/tools/sycl-post-link/ModuleSplitter.cpp index 36c731a2cceb3..9c77509a4a85d 100644 --- a/llvm/tools/sycl-post-link/ModuleSplitter.cpp +++ b/llvm/tools/sycl-post-link/ModuleSplitter.cpp @@ -129,48 +129,6 @@ bool isESIMDFunction(const Function &F) { return F.getMetadata(ESIMD_MARKER_MD) != nullptr; } -// This function makes one or two groups depending on kernel types (SYCL, ESIMD) -EntryPointGroupVec -groupEntryPointsByKernelType(ModuleDesc &MD, - bool EmitOnlyKernelsAsEntryPoints) { - Module &M = MD.getModule(); - EntryPointGroupVec EntryPointGroups{}; - std::map EntryPointMap; - - // Only process module entry points: - for (Function &F : M.functions()) { - if (!isEntryPoint(F, EmitOnlyKernelsAsEntryPoints) || - !MD.isEntryPointCandidate(F)) - continue; - if (isESIMDFunction(F)) - EntryPointMap[ESIMD_SCOPE_NAME].insert(&F); - else - EntryPointMap[SYCL_SCOPE_NAME].insert(&F); - } - - if (!EntryPointMap.empty()) { - for (auto &EPG : EntryPointMap) { - EntryPointGroups.emplace_back(EPG.first, std::move(EPG.second), - MD.getEntryPointGroup().Props); - EntryPointGroup &G = EntryPointGroups.back(); - - if (G.GroupId == ESIMD_SCOPE_NAME) { - G.Props.HasESIMD = SyclEsimdSplitStatus::ESIMD_ONLY; - } else { - assert(G.GroupId == SYCL_SCOPE_NAME); - G.Props.HasESIMD = SyclEsimdSplitStatus::SYCL_ONLY; - } - } - } else { - // No entry points met, record this. - EntryPointGroups.emplace_back(SYCL_SCOPE_NAME, EntryPointSet{}); - EntryPointGroup &G = EntryPointGroups.back(); - G.Props.HasESIMD = SyclEsimdSplitStatus::SYCL_ONLY; - } - - return EntryPointGroups; -} - // Represents "dependency" or "use" graph of global objects (functions and // global variables) in a module. It is used during device code split to // understand which global variables and functions (other than entry points) @@ -898,6 +856,8 @@ getDeviceCodeSplitter(ModuleDesc &&MD, IRSplitMode Mode, bool IROutputOnly, Categorizer.registerSimpleStringAttributeRule("sycl-register-alloc-mode"); Categorizer.registerListOfIntegersInMetadataSortedRule("sycl_used_aspects"); Categorizer.registerListOfIntegersInMetadataRule("reqd_work_group_size"); + Categorizer.registerListOfIntegersInMetadataRule( + "intel_reqd_sub_group_size"); Categorizer.registerSimpleStringAttributeRule( sycl::utils::ATTR_SYCL_OPTLEVEL); break; @@ -953,17 +913,47 @@ getDeviceCodeSplitter(ModuleDesc &&MD, IRSplitMode Mode, bool IROutputOnly, // Functions, which are used from both ESIMD and non-ESIMD code will be // duplicated into each module. // -// If there are dependenceis between ESIMD and non-ESIMD code (produced by -// inoke_simd, for example), the modules has to be linked back together to avoid -// undefined behavior at later stages. That is done at higher level, outside of -// this function. +// If there are dependencies between ESIMD and non-ESIMD code (produced by +// invoke_simd, for example), the modules has to be linked back together to +// avoid undefined behavior at later stages. That is done at higher level, +// outside of this function. SmallVector splitByESIMD(ModuleDesc &&MD, bool EmitOnlyKernelsAsEntryPoints) { SmallVector Result; + EntryPointGroupVec EntryPointGroups{}; + EntryPointSet SYCLEntryPoints, ESIMDEntryPoints; + bool hasESIMDFunctions = false; - EntryPointGroupVec EntryPointGroups = - groupEntryPointsByKernelType(MD, EmitOnlyKernelsAsEntryPoints); + // Only process module entry points: + for (Function &F : MD.getModule().functions()) { + if (isESIMDFunction(F)) + hasESIMDFunctions = true; + if (!isEntryPoint(F, EmitOnlyKernelsAsEntryPoints) || + !MD.isEntryPointCandidate(F)) + continue; + if (isESIMDFunction(F)) + ESIMDEntryPoints.insert(&F); + else + SYCLEntryPoints.insert(&F); + } + + // If there are no ESIMD entry points but there are ESIMD functions, + // we still need to create an (empty) entry point group so that we + // can lower the ESIMD functions. + if (!ESIMDEntryPoints.empty() || hasESIMDFunctions) { + EntryPointGroups.emplace_back(ESIMD_SCOPE_NAME, std::move(ESIMDEntryPoints), + MD.getEntryPointGroup().Props); + EntryPointGroup &G = EntryPointGroups.back(); + G.Props.HasESIMD = SyclEsimdSplitStatus::ESIMD_ONLY; + } + + if (!SYCLEntryPoints.empty() || EntryPointGroups.empty()) { + EntryPointGroups.emplace_back(SYCL_SCOPE_NAME, std::move(SYCLEntryPoints), + MD.getEntryPointGroup().Props); + EntryPointGroup &G = EntryPointGroups.back(); + G.Props.HasESIMD = SyclEsimdSplitStatus::SYCL_ONLY; + } if (EntryPointGroups.size() == 1) { Result.emplace_back(std::move(MD.releaseModulePtr()), diff --git a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp index 8d30db1a2522e..4aa28dc4ff643 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp @@ -7,9 +7,11 @@ //===----------------------------------------------------------------------===// #include "SYCLDeviceRequirements.h" +#include "ModuleSplitter.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Module.h" +#include "llvm/Support/PropertySetIO.h" #include #include @@ -17,7 +19,8 @@ using namespace llvm; void llvm::getSYCLDeviceRequirements( - const Module &M, std::map> &Requirements) { + const module_split::ModuleDesc &MD, + std::map &Requirements) { auto ExtractIntegerFromMDNodeOperand = [=](const MDNode *N, unsigned OpNo) -> unsigned { Constant *C = @@ -34,18 +37,40 @@ void llvm::getSYCLDeviceRequirements( {"sycl_fixed_targets", "fixed_target"}, {"reqd_work_group_size", "reqd_work_group_size"}}; - for (const auto &MD : ReqdMDs) { + for (const auto &[MDName, MappedName] : ReqdMDs) { std::set Values; - for (const Function &F : M) { - if (const MDNode *MDN = F.getMetadata(MD.first)) { + for (const Function &F : MD.getModule()) { + if (const MDNode *MDN = F.getMetadata(MDName)) { for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I) Values.insert(ExtractIntegerFromMDNodeOperand(MDN, I)); } } // We don't need the "fixed_target" property if it's empty - if (std::string(MD.first) == "sycl_fixed_targets" && Values.empty()) + if (std::string(MDName) == "sycl_fixed_targets" && Values.empty()) continue; - Requirements[MD.second] = + Requirements[MappedName] = std::vector(Values.begin(), Values.end()); } + + // There should only be at most one function with + // intel_reqd_sub_group_size metadata when considering the entry + // points of a module, but not necessarily when considering all the + // functions of a module: an entry point with a + // intel_reqd_sub_group_size can call an ESIMD function through + // invoke_esimd, and that function has intel_reqd_sub_group_size=1, + // which is valid. + std::optional SubGroupSize; + for (const Function *F : MD.entries()) { + if (auto *MDN = F->getMetadata("intel_reqd_sub_group_size")) { + assert(MDN->getNumOperands() == 1); + auto MDValue = ExtractIntegerFromMDNodeOperand(MDN, 0); + if (!SubGroupSize) + SubGroupSize = MDValue; + else + assert(*SubGroupSize == MDValue); + } + } + // Do not attach reqd_sub_group_size if there is no attached metadata + if (SubGroupSize) + Requirements["reqd_sub_group_size"] = *SubGroupSize; } diff --git a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.h b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.h index 7aaccba4a33bb..5ef5c9aea847e 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.h +++ b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.h @@ -14,10 +14,17 @@ namespace llvm { -class Module; class StringRef; +namespace module_split { +class ModuleDesc; +} +namespace util { +class PropertyValue; +} + void getSYCLDeviceRequirements( - const Module &M, std::map> &Requirements); + const module_split::ModuleDesc &M, + std::map &Requirements); } // namespace llvm diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index b35d1cdf7022f..1428fd7551756 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -366,8 +366,8 @@ std::string saveModuleProperties(module_split::ModuleDesc &MD, PropSet.add(PropSetRegTy::SYCL_DEVICELIB_REQ_MASK, RMEntry); } { - std::map> Requirements; - getSYCLDeviceRequirements(M, Requirements); + std::map Requirements; + getSYCLDeviceRequirements(MD, Requirements); PropSet.add(PropSetRegTy::SYCL_DEVICE_REQUIREMENTS, Requirements); } if (MD.Props.SpecConstsMet) { diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index bcb3f0fbc69e5..398a11ae34a5b 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -634,20 +634,34 @@ sycl::detail::pi::PiProgram ProgramManager::getBuiltPIProgram( for (RTDeviceBinaryImage::PropertyRange::ConstIterator It : ARange) { using namespace std::literals; - if ((*It)->Name != "aspects"sv) - continue; - ByteArray Aspects = DeviceBinaryProperty(*It).asByteArray(); - // 8 because we need to skip 64-bits of size of the byte array - auto *AIt = reinterpret_cast(&Aspects[8]); - auto *AEnd = - reinterpret_cast(&Aspects[0] + Aspects.size()); - while (AIt != AEnd) { - auto Aspect = static_cast(*AIt); - if (!Dev->has(Aspect)) + if ((*It)->Name == "aspects"sv) { + ByteArray Aspects = DeviceBinaryProperty(*It).asByteArray(); + // 8 because we need to skip 64-bits of size of the byte array + Aspects.dropBytes(8); + while (!Aspects.empty()) { + auto Aspect = static_cast(Aspects.consume()); + if (!Dev->has(Aspect)) + throw sycl::exception(errc::kernel_not_supported, + "Required aspect " + getAspectNameStr(Aspect) + + " is not supported on the device"); + } + } else if ((*It)->Name == "reqd_sub_group_size"sv) { + auto ReqdSubGroupSize = DeviceBinaryProperty(*It).asUint32(); + auto SupportedSubGroupSizes = + Device.get_info(); + + // !getUint32PropAsBool(Img, "isEsimdImage") is a WA for ESIMD, + // as ESIMD images have a reqd-sub-group-size of 1, but currently + // no backend currently includes 1 as a valid sub-group size. + // This can be removed if backends add 1 as a valid sub-group size. + if (!getUint32PropAsBool(Img, "isEsimdImage") && + std::none_of(SupportedSubGroupSizes.cbegin(), + SupportedSubGroupSizes.cend(), + [=](auto s) { return s == ReqdSubGroupSize; })) throw sycl::exception(errc::kernel_not_supported, - "Required aspect " + getAspectNameStr(Aspect) + + "Sub-group size " + + std::to_string(ReqdSubGroupSize) + " is not supported on the device"); - ++AIt; } } @@ -2437,9 +2451,7 @@ bool doesDevSupportDeviceRequirements(const device &Dev, auto AspectsPropIt = getPropIt("aspects"); auto ReqdWGSizePropIt = getPropIt("reqd_work_group_size"); - - if (!AspectsPropIt && !ReqdWGSizePropIt) - return true; + auto ReqdSubGroupSizePropIt = getPropIt("reqd_sub_group_size"); // Checking if device supports defined aspects if (AspectsPropIt) { @@ -2502,6 +2514,19 @@ bool doesDevSupportDeviceRequirements(const device &Dev, return false; } } + + // Check if device supports required sub-group size. + if (ReqdSubGroupSizePropIt) { + auto ReqdSubGroupSize = + DeviceBinaryProperty(*(ReqdSubGroupSizePropIt.value())).asUint32(); + auto SupportedSubGroupSizes = Dev.get_info(); + if (!getUint32PropAsBool(Img, "isEsimdImage") && + std::none_of(SupportedSubGroupSizes.cbegin(), + SupportedSubGroupSizes.cend(), + [=](auto s) { return s == ReqdSubGroupSize; })) + return false; + } + return true; } diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp index de0900b762e79..a0cd2e5a4447d 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp @@ -72,6 +72,14 @@ int main() { [=](sycl::item<1> it) [[sycl::reqd_work_group_size(INT_MAX)]] {}); }); } + if (sycl::is_compatible(Dev)) { + assert(false && "sycl::is_compatible must be false"); + Q.submit([&](sycl::handler &h) { + h.parallel_for( + sycl::range<1>(2), + [=](sycl::item<1> it) [[sycl::reqd_sub_group_size(INT_MAX)]] {}); + }); + } return (Compatible && Called) ? 0 : 1; }