diff --git a/llvm/test/CodeGen/AMDGPU/andorbitset.ll b/llvm/test/CodeGen/AMDGPU/andorbitset.ll index a189ba9b10342..0fa58f3c444a5 100644 --- a/llvm/test/CodeGen/AMDGPU/andorbitset.ll +++ b/llvm/test/CodeGen/AMDGPU/andorbitset.ll @@ -1,48 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; SI-LABEL: {{^}}s_clear_msb: -; SI: s_bitset0_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @s_clear_msb(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_clear_msb: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s4, 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = and i32 %in, 2147483647 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_set_msb: -; SI: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @s_set_msb(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_set_msb: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = or i32 %in, 2147483648 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_clear_lsb: -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, -2 define amdgpu_kernel void @s_clear_lsb(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_clear_lsb: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s4, s4, -2 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = and i32 %in, 4294967294 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_set_lsb: -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 define amdgpu_kernel void @s_set_lsb(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_set_lsb: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_or_b32 s4, s4, 1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = or i32 %in, 1 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_clear_midbit: -; SI: s_bitset0_b32 s{{[0-9]+}}, 8 define amdgpu_kernel void @s_clear_midbit(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_clear_midbit: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s4, 8 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = and i32 %in, 4294967039 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_set_midbit: -; SI: s_bitset1_b32 s{{[0-9]+}}, 8 define amdgpu_kernel void @s_set_midbit(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_set_midbit: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset1_b32 s4, 8 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = or i32 %in, 256 store i32 %x, ptr addrspace(1) %out ret void @@ -51,10 +106,27 @@ define amdgpu_kernel void @s_set_midbit(ptr addrspace(1) %out, i32 %in) { @gv = external addrspace(1) global i32 ; Make sure there's no verifier error with an undef source. -; SI-LABEL: {{^}}bitset_verifier_error: -; SI-NOT: %bb.1: -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff define void @bitset_verifier_error() local_unnamed_addr #0 { +; SI-LABEL: bitset_verifier_error: +; SI: ; %bb.0: ; %bb +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_getpc_b64 s[4:5] +; SI-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s8, s4, 0x7fffffff +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, 0x3f7fbe77 +; SI-NEXT: v_cmp_ge_f32_e64 s[4:5], |s4|, v0 +; SI-NEXT: s_and_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %bb5 +; SI-NEXT: .LBB6_2: ; %bb6 bb: %i = call float @llvm.fabs.f32(float undef) #0 %i1 = bitcast float %i to i32 diff --git a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll index dc158028bd7b0..4b56b5e9d24f5 100644 --- a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll +++ b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll @@ -1,48 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; SI-LABEL: {{^}}s_or_to_orn2: -; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define amdgpu_kernel void @s_or_to_orn2(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_or_to_orn2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_orn2_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = or i32 %in, -51 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_or_to_orn2_imm0: -; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define amdgpu_kernel void @s_or_to_orn2_imm0(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_or_to_orn2_imm0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_orn2_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = or i32 -51, %in store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_and_to_andn2: -; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define amdgpu_kernel void @s_and_to_andn2(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_and_to_andn2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_andn2_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = and i32 %in, -51 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_and_to_andn2_imm0: -; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define amdgpu_kernel void @s_and_to_andn2_imm0(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_and_to_andn2_imm0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_andn2_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = and i32 -51, %in store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_xor_to_xnor: -; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define amdgpu_kernel void @s_xor_to_xnor(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_xor_to_xnor: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_xnor_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = xor i32 %in, -51 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_xor_to_xnor_imm0: -; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define amdgpu_kernel void @s_xor_to_xnor_imm0(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_xor_to_xnor_imm0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_xnor_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = xor i32 -51, %in store i32 %x, ptr addrspace(1) %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll index 32d5fa6e72d79..f98124fe2ed73 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -7,10 +8,25 @@ declare double @llvm.fabs.f64(double) readnone declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone -; FUNC-LABEL: {{^}}v_fabs_f64: -; SI: v_and_b32 -; SI: s_endpgm define amdgpu_kernel void @v_fabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: v_fabs_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %tidext = sext i32 %tid to i64 %gep = getelementptr double, ptr addrspace(1) %in, i64 %tidext @@ -20,75 +36,148 @@ define amdgpu_kernel void @v_fabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } -; FUNC-LABEL: {{^}}fabs_f64: -; SI: s_bitset0_b32 -; SI: s_endpgm define amdgpu_kernel void @fabs_f64(ptr addrspace(1) %out, double %in) { +; SI-LABEL: fabs_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s3, 31 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm %fabs = call double @llvm.fabs.f64(double %in) store double %fabs, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}fabs_v2f64: -; SI: s_bitset0_b32 -; SI: s_bitset0_b32 -; SI: s_endpgm define amdgpu_kernel void @fabs_v2f64(ptr addrspace(1) %out, <2 x double> %in) { +; SI-LABEL: fabs_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s7, 31 +; SI-NEXT: s_bitset0_b32 s5, 31 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) store <2 x double> %fabs, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}fabs_v4f64: -; SI: s_bitset0_b32 -; SI: s_bitset0_b32 -; SI: s_bitset0_b32 -; SI: s_bitset0_b32 -; SI: s_endpgm define amdgpu_kernel void @fabs_v4f64(ptr addrspace(1) %out, <4 x double> %in) { +; SI-LABEL: fabs_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s7, 31 +; SI-NEXT: s_bitset0_b32 s11, 31 +; SI-NEXT: s_bitset0_b32 s9, 31 +; SI-NEXT: s_bitset0_b32 s5, 31 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; SI-NEXT: s_endpgm %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) store <4 x double> %fabs, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}fabs_fold_f64: -; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-NOT: and -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} -; SI: s_endpgm define amdgpu_kernel void @fabs_fold_f64(ptr addrspace(1) %out, [8 x i32], double %in0, [8 x i32], double %in1) { +; SI-LABEL: fabs_fold_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d +; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mul_f64 v[0:1], |s[6:7]|, v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm %fabs = call double @llvm.fabs.f64(double %in0) %fmul = fmul double %fabs, %in1 store double %fmul, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}fabs_fn_fold_f64: -; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-NOT: and -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} -; SI: s_endpgm define amdgpu_kernel void @fabs_fn_fold_f64(ptr addrspace(1) %out, [8 x i32], double %in0, [8 x i32], double %in1) { +; SI-LABEL: fabs_fn_fold_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d +; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mul_f64 v[0:1], |s[6:7]|, v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm %fabs = call double @fabs(double %in0) %fmul = fmul double %fabs, %in1 store double %fmul, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}fabs_free_f64: -; SI: s_bitset0_b32 -; SI: s_endpgm define amdgpu_kernel void @fabs_free_f64(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: fabs_free_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s3, 31 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm %bc= bitcast i64 %in to double %fabs = call double @llvm.fabs.f64(double %bc) store double %fabs, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}fabs_fn_free_f64: -; SI: s_bitset0_b32 -; SI: s_endpgm define amdgpu_kernel void @fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: fabs_fn_free_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s3, 31 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm %bc= bitcast i64 %in to double %fabs = call double @fabs(double %bc) store double %fabs, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll index d9227724c22a1..855ca390aabdc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll @@ -1,12 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -;CHECK-LABEL: {{^}}buffer_store: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc -;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { +; VERDE-LABEL: buffer_store: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VERDE-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc +; VERDE-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CHECK-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc +; CHECK-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 1) @@ -14,34 +23,54 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_immoffs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 define amdgpu_ps void @buffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { +; VERDE-LABEL: buffer_store_immoffs: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_immoffs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 42, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_ofs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float>, i32) { +; VERDE-LABEL: buffer_store_ofs: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_ofs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0) ret void } ; Ideally, the register allocator would avoid the wait here -; -;CHECK-LABEL: {{^}}buffer_store_wait: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen -;VERDE: s_waitcnt expcnt(0) -;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float>, i32, i32, i32) { +; VERDE-LABEL: buffer_store_wait: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; VERDE-NEXT: s_waitcnt expcnt(0) +; VERDE-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen +; VERDE-NEXT: s_waitcnt vmcnt(0) +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_wait: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0) %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %3, i32 0, i32 0) @@ -49,29 +78,48 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_x1: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %offset) { +; VERDE-LABEL: buffer_store_x1: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x1: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x2: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_x2: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x2: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_and: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 define amdgpu_ps void @buffer_store_x1_offen_merged_and(ptr addrspace(8) inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: buffer_store_x1_offen_merged_and: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; VERDE-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x1_offen_merged_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 @@ -87,11 +135,20 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_and(ptr addrspace(8) inreg % ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_or: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28 define amdgpu_ps void @buffer_store_x1_offen_merged_or(ptr addrspace(8) inreg %rsrc, i32 %inp, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: buffer_store_x1_offen_merged_or: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; VERDE-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; VERDE-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x1_offen_merged_or: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; CHECK-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_endpgm %a = shl i32 %inp, 6 %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 @@ -109,12 +166,20 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_or(ptr addrspace(8) inreg %r } -;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_glc_slc: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(ptr addrspace(8) inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: buffer_store_x1_offen_merged_glc_slc: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4 +; VERDE-NEXT: buffer_store_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:12 glc +; VERDE-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x1_offen_merged_glc_slc: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:12 glc +; CHECK-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc +; CHECK-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 @@ -130,10 +195,16 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(ptr addrspace(8) inr ret void } -;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_and: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 define amdgpu_ps void @buffer_store_x2_offen_merged_and(ptr addrspace(8) inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) { +; VERDE-LABEL: buffer_store_x2_offen_merged_and: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x2_offen_merged_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %v1, ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0) @@ -141,10 +212,18 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_and(ptr addrspace(8) inreg % ret void } -;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_or: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 define amdgpu_ps void @buffer_store_x2_offen_merged_or(ptr addrspace(8) inreg %rsrc, i32 %inp, <2 x float> %v1, <2 x float> %v2) { +; VERDE-LABEL: buffer_store_x2_offen_merged_or: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; VERDE-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x2_offen_merged_or: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; CHECK-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_endpgm %a = shl i32 %inp, 4 %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 @@ -153,11 +232,18 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_or(ptr addrspace(8) inreg %r ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 define amdgpu_ps void @buffer_store_x1_offset_merged(ptr addrspace(8) inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: buffer_store_x1_offset_merged: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; VERDE-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x1_offset_merged: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; CHECK-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 8, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0) @@ -167,21 +253,35 @@ define amdgpu_ps void @buffer_store_x1_offset_merged(ptr addrspace(8) inreg %rsr ret void } -;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 define amdgpu_ps void @buffer_store_x2_offset_merged(ptr addrspace(8) inreg %rsrc, <2 x float> %v1,<2 x float> %v2) { +; VERDE-LABEL: buffer_store_x2_offset_merged: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x2_offset_merged: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %v2, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_int: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -;CHECK: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc -;CHECK: buffer_store_dword v6, off, s[0:3], 0 slc define amdgpu_ps void @buffer_store_int(ptr addrspace(8) inreg, <4 x i32>, <2 x i32>, i32) { +; VERDE-LABEL: buffer_store_int: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VERDE-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc +; VERDE-NEXT: buffer_store_dword v6, off, s[0:3], 0 slc +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_int: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CHECK-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], 0 slc +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 1) @@ -189,12 +289,18 @@ main_body: ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_byte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_ptr_buffer_store_byte(ptr addrspace(8) inreg %rsrc, float %v1) { +; VERDE-LABEL: raw_ptr_buffer_store_byte: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_cvt_u32_f32_e32 v0, v0 +; VERDE-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_byte: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i8 @@ -202,12 +308,18 @@ main_body: ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_short: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_ptr_buffer_store_short(ptr addrspace(8) inreg %rsrc, float %v1) { +; VERDE-LABEL: raw_ptr_buffer_store_short: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_cvt_u32_f32_e32 v0, v0 +; VERDE-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_short: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i16 @@ -215,12 +327,16 @@ main_body: ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_f16: -;CHECK-NEXT: %bb. -;CHECK-NOT: v0 -;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_ptr_buffer_store_f16(ptr addrspace(8) inreg %rsrc, i32 %v1) { +; VERDE-LABEL: raw_ptr_buffer_store_f16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm main_body: %trunc = trunc i32 %v1 to i16 %cast = bitcast i16 %trunc to half @@ -228,74 +344,169 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_v2f16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v2f16(ptr addrspace(8) inreg %rsrc, <2 x half> %data, i32 %offset) { +; VERDE-LABEL: buffer_store_v2f16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 +; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v2f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v4f16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v4f16(ptr addrspace(8) inreg %rsrc, <4 x half> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v4f16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; VERDE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; VERDE-NEXT: v_cvt_f16_f32_e32 v5, v1 +; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VERDE-NEXT: v_or_b32_e32 v1, v2, v1 +; VERDE-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v2 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v4f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v8f16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v8f16(ptr addrspace(8) inreg %rsrc, <8 x half> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v8f16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_cvt_f16_f32_e32 v7, v7 +; VERDE-NEXT: v_cvt_f16_f32_e32 v6, v6 +; VERDE-NEXT: v_cvt_f16_f32_e32 v9, v5 +; VERDE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; VERDE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; VERDE-NEXT: v_cvt_f16_f32_e32 v4, v4 +; VERDE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VERDE-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; VERDE-NEXT: v_or_b32_e32 v5, v6, v5 +; VERDE-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_or_b32_e32 v4, v4, v6 +; VERDE-NEXT: v_or_b32_e32 v3, v2, v3 +; VERDE-NEXT: v_or_b32_e32 v2, v0, v1 +; VERDE-NEXT: buffer_store_dwordx4 v[2:5], v8, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v8f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v8f16(<8 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v2bf16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bfloat> %data, i32 %offset) { +; VERDE-LABEL: buffer_store_v2bf16: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; VERDE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VERDE-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v2bf16: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.v2bf16(<2 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v4bf16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bfloat> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v4bf16: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VERDE-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; VERDE-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VERDE-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VERDE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VERDE-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; VERDE-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v4bf16: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_i16: -;CHECK-NEXT: %bb. -;CHECK-NOT: v0 -;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_ptr_buffer_store_i16(ptr addrspace(8) inreg %rsrc, i32 %v1) { +; VERDE-LABEL: raw_ptr_buffer_store_i16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm main_body: %trunc = trunc i32 %v1 to i16 call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 %trunc, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v2i16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v2i16(ptr addrspace(8) inreg %rsrc, <2 x i16> %data, i32 %offset) { +; VERDE-LABEL: buffer_store_v2i16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 +; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v2i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v4i16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v4i16(ptr addrspace(8) inreg %rsrc, <4 x i16> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v4i16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VERDE-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VERDE-NEXT: v_or_b32_e32 v2, v2, v3 +; VERDE-NEXT: v_or_b32_e32 v1, v0, v1 +; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v4i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i16(<4 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void @@ -307,21 +518,45 @@ main_body: ; call void @llvm.amdgcn.raw.ptr.buffer.store.v6i16(<6 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ; ret void ; } - -;CHECK-LABEL: {{^}}buffer_store_v8i16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v8i16(ptr addrspace(8) inreg %rsrc, <8 x i16> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v8i16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VERDE-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; VERDE-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; VERDE-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VERDE-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VERDE-NEXT: v_or_b32_e32 v6, v6, v7 +; VERDE-NEXT: v_or_b32_e32 v5, v4, v5 +; VERDE-NEXT: v_or_b32_e32 v4, v2, v3 +; VERDE-NEXT: v_or_b32_e32 v3, v0, v1 +; VERDE-NEXT: buffer_store_dwordx4 v[3:6], v8, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v8i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_x1_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 define amdgpu_ps void @raw_ptr_buffer_store_x1_offset_merged(ptr addrspace(8) inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: raw_ptr_buffer_store_x1_offset_merged: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; VERDE-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_x1_offset_merged: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; CHECK-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 8, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0) @@ -331,14 +566,26 @@ define amdgpu_ps void @raw_ptr_buffer_store_x1_offset_merged(ptr addrspace(8) in ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_x1_offset_swizzled_not_merged: -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:4 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:8 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:12 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:16 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:28 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:32 define amdgpu_ps void @raw_ptr_buffer_store_x1_offset_swizzled_not_merged(ptr addrspace(8) inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: raw_ptr_buffer_store_x1_offset_swizzled_not_merged: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; VERDE-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 +; VERDE-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:12 +; VERDE-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:16 +; VERDE-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:28 +; VERDE-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:32 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_x1_offset_swizzled_not_merged: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:12 +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:16 +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:28 +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:32 +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 8) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 8, i32 0, i32 8) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 8) diff --git a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll index c7987d3d00917..02641f5b6ae8c 100644 --- a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll @@ -1,40 +1,118 @@ -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}s_mulk_i32_k0: -; SI: s_load_dword [[VAL:s[0-9]+]] -; SI: s_mulk_i32 [[VAL]], 0x41 -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[VRESULT]] -; SI: s_endpgm +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s + define amdgpu_kernel void @s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) { +; GFX6-LABEL: s_mulk_i32_k0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mulk_i32 s4, 0x41 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: s_mulk_i32_k0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mulk_i32 s4, 0x41 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm %mul = mul i32 %b, 65 store i32 %mul, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_mulk_i32_k1: -; SI: s_mulk_i32 {{s[0-9]+}}, 0x7fff{{$}} -; SI: s_endpgm define amdgpu_kernel void @s_mulk_i32_k1(ptr addrspace(1) %out, i32 %b) { +; GFX6-LABEL: s_mulk_i32_k1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mulk_i32 s4, 0x7fff +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: s_mulk_i32_k1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mulk_i32 s4, 0x7fff +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm %mul = mul i32 %b, 32767 ; (1 << 15) - 1 store i32 %mul, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_mulk_i32_k2: -; SI: s_mulk_i32 {{s[0-9]+}}, 0xffef{{$}} -; SI: s_endpgm define amdgpu_kernel void @s_mulk_i32_k2(ptr addrspace(1) %out, i32 %b) { +; GFX6-LABEL: s_mulk_i32_k2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mulk_i32 s4, 0xffef +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: s_mulk_i32_k2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mulk_i32 s4, 0xffef +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm %mul = mul i32 %b, -17 store i32 %mul, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}no_s_mulk_i32_k0: -; SI: s_mul_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8001{{$}} -; SI: s_endpgm define amdgpu_kernel void @no_s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) { +; GFX6-LABEL: no_s_mulk_i32_k0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mul_i32 s4, s4, 0x8001 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: no_s_mulk_i32_k0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mul_i32 s4, s4, 0x8001 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm %mul = mul i32 %b, 32769 ; 1 << 15 + 1 store i32 %mul, ptr addrspace(1) %out ret void @@ -42,9 +120,28 @@ define amdgpu_kernel void @no_s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) { @lds = addrspace(3) global [512 x i32] undef, align 4 -; SI-LABEL: {{^}}commute_s_mulk_i32: -; SI: s_mulk_i32 s{{[0-9]+}}, 0x800{{$}} define amdgpu_kernel void @commute_s_mulk_i32(ptr addrspace(1) %out, i32 %b) #0 { +; GFX6-LABEL: commute_s_mulk_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s0, s[2:3], 0x2 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mulk_i32 s0, 0x800 +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ; foo v0, s0 +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: commute_s_mulk_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mulk_i32 s0, 0x800 +; GFX8-NEXT: ;;#ASMSTART +; GFX8-NEXT: ; foo v0, s0 +; GFX8-NEXT: ;;#ASMEND +; GFX8-NEXT: s_endpgm %size = call i32 @llvm.amdgcn.groupstaticsize() %add = mul i32 %size, %b call void asm sideeffect "; foo $0, $1", "v,s"(ptr addrspace(3) @lds, i32 %add)