diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 8b93ed342c64a..7ed055e8da2b6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1042,6 +1042,10 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT, case ISD::MUL: case ISD::SETCC: case ISD::SELECT: + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: if (Subtarget->has16BitInsts() && (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) { // Don't narrow back down to i16 if promoted to i32 already. diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index d2f4f54cefe78..201b85c745c18 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -649,36 +649,35 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; ; GFX9-LABEL: s_test_imin_sle_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28 +; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: s_lshr_b32 s8, s3, 16 -; GFX9-NEXT: s_ashr_i32 s9, s3, 24 -; GFX9-NEXT: s_ashr_i32 s6, s2, 24 -; GFX9-NEXT: s_bfe_i32 s8, s8, 0x80000 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_bfe_i32 s5, s5, 0x80000 +; GFX9-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NEXT: s_sext_i32_i16 s7, s3 -; GFX9-NEXT: v_min_i16_e32 v1, s6, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: s_sext_i32_i16 s4, s2 -; GFX9-NEXT: s_lshr_b32 s7, s7, 8 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_min_i16_e32 v2, s5, v2 -; GFX9-NEXT: s_lshr_b32 s4, s4, 8 -; GFX9-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-NEXT: s_bfe_i32 s2, s2, 0x80000 -; GFX9-NEXT: v_min_i16_e32 v2, s4, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX9-NEXT: v_min_i16_e32 v3, s2, v3 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_ashr_i32 s7, s7, 8 +; GFX9-NEXT: s_ashr_i32 s5, s5, 8 +; GFX9-NEXT: s_ashr_i32 s4, s2, 24 +; GFX9-NEXT: s_ashr_i32 s6, s3, 24 +; GFX9-NEXT: s_min_i32 s5, s5, s7 +; GFX9-NEXT: s_sext_i32_i8 s7, s3 +; GFX9-NEXT: s_sext_i32_i8 s8, s2 +; GFX9-NEXT: s_bfe_i32 s3, s3, 0x80010 +; GFX9-NEXT: s_bfe_i32 s2, s2, 0x80010 +; GFX9-NEXT: s_min_i32 s7, s8, s7 +; GFX9-NEXT: s_min_i32 s4, s4, s6 +; GFX9-NEXT: s_min_i32 s2, s2, s3 +; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff +; GFX9-NEXT: s_or_b32 s5, s7, s5 +; GFX9-NEXT: s_or_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s2, s5, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -688,111 +687,70 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28 ; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s4, s2 +; GFX10-NEXT: s_sext_i32_i16 s5, s2 ; GFX10-NEXT: s_sext_i32_i16 s7, s3 -; GFX10-NEXT: s_ashr_i32 s6, s2, 24 -; GFX10-NEXT: s_ashr_i32 s9, s3, 24 -; GFX10-NEXT: s_lshr_b32 s4, s4, 8 -; GFX10-NEXT: s_lshr_b32 s7, s7, 8 -; GFX10-NEXT: v_min_i16 v0, s6, s9 -; GFX10-NEXT: v_min_i16 v1, s4, s7 -; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: s_lshr_b32 s8, s3, 16 -; GFX10-NEXT: s_bfe_i32 s2, s2, 0x80000 -; GFX10-NEXT: s_bfe_i32 s5, s5, 0x80000 -; GFX10-NEXT: s_bfe_i32 s4, s8, 0x80000 -; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX10-NEXT: v_min_i16 v2, s5, s4 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX10-NEXT: v_min_i16 v3, s2, s3 -; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-NEXT: s_ashr_i32 s4, s2, 24 +; GFX10-NEXT: s_ashr_i32 s6, s3, 24 +; GFX10-NEXT: s_sext_i32_i8 s8, s3 +; GFX10-NEXT: s_sext_i32_i8 s9, s2 +; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80010 +; GFX10-NEXT: s_bfe_i32 s2, s2, 0x80010 +; GFX10-NEXT: s_ashr_i32 s7, s7, 8 +; GFX10-NEXT: s_ashr_i32 s5, s5, 8 +; GFX10-NEXT: s_min_i32 s8, s9, s8 +; GFX10-NEXT: s_min_i32 s4, s4, s6 +; GFX10-NEXT: s_min_i32 s2, s2, s3 +; GFX10-NEXT: s_min_i32 s3, s5, s7 +; GFX10-NEXT: s_and_b32 s5, s8, 0xff +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: s_or_b32 s3, s5, s3 +; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: s_test_imin_sle_v4i8: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x28 -; GFX11-TRUE16-NEXT: s_load_b32 s1, s[4:5], 0x4c -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_sext_i32_i16 s2, s0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s0, 16 -; GFX11-TRUE16-NEXT: s_sext_i32_i16 s7, s1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s1, 16 -; GFX11-TRUE16-NEXT: s_ashr_i32 s6, s0, 24 -; GFX11-TRUE16-NEXT: s_ashr_i32 s9, s1, 24 -; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 8 -; GFX11-TRUE16-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX11-TRUE16-NEXT: s_bfe_i32 s0, s0, 0x80000 -; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s7, 8 -; GFX11-TRUE16-NEXT: s_bfe_i32 s8, s8, 0x80000 -; GFX11-TRUE16-NEXT: s_bfe_i32 s1, s1, 0x80000 -; GFX11-TRUE16-NEXT: v_min_i16 v0.l, s6, s9 -; GFX11-TRUE16-NEXT: v_min_i16 v1.l, s3, s8 -; GFX11-TRUE16-NEXT: v_min_i16 v2.l, s2, s7 -; GFX11-TRUE16-NEXT: v_min_i16 v3.l, s0, s1 -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v2 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: s_test_imin_sle_v4i8: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x28 -; GFX11-FAKE16-NEXT: s_load_b32 s1, s[4:5], 0x4c -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_sext_i32_i16 s2, s0 -; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s0, 16 -; GFX11-FAKE16-NEXT: s_sext_i32_i16 s7, s1 -; GFX11-FAKE16-NEXT: s_lshr_b32 s8, s1, 16 -; GFX11-FAKE16-NEXT: s_ashr_i32 s6, s0, 24 -; GFX11-FAKE16-NEXT: s_bfe_i32 s0, s0, 0x80000 -; GFX11-FAKE16-NEXT: s_ashr_i32 s9, s1, 24 -; GFX11-FAKE16-NEXT: s_bfe_i32 s1, s1, 0x80000 -; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 8 -; GFX11-FAKE16-NEXT: s_bfe_i32 s3, s3, 0x80000 -; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s7, 8 -; GFX11-FAKE16-NEXT: s_bfe_i32 s8, s8, 0x80000 -; GFX11-FAKE16-NEXT: v_min_i16 v0, s6, s9 -; GFX11-FAKE16-NEXT: v_min_i16 v1, s0, s1 -; GFX11-FAKE16-NEXT: v_min_i16 v2, s3, s8 -; GFX11-FAKE16-NEXT: v_min_i16 v3, s2, s7 -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: s_test_imin_sle_v4i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sext_i32_i16 s5, s2 +; GFX11-NEXT: s_sext_i32_i16 s7, s3 +; GFX11-NEXT: s_ashr_i32 s4, s2, 24 +; GFX11-NEXT: s_ashr_i32 s6, s3, 24 +; GFX11-NEXT: s_sext_i32_i8 s8, s3 +; GFX11-NEXT: s_sext_i32_i8 s9, s2 +; GFX11-NEXT: s_bfe_i32 s3, s3, 0x80010 +; GFX11-NEXT: s_bfe_i32 s2, s2, 0x80010 +; GFX11-NEXT: s_ashr_i32 s7, s7, 8 +; GFX11-NEXT: s_ashr_i32 s5, s5, 8 +; GFX11-NEXT: s_min_i32 s8, s9, s8 +; GFX11-NEXT: s_min_i32 s4, s4, s6 +; GFX11-NEXT: s_min_i32 s2, s2, s3 +; GFX11-NEXT: s_min_i32 s3, s5, s7 +; GFX11-NEXT: s_and_b32 s5, s8, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_or_b32 s3, s5, s3 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm %cmp = icmp sle <4 x i8> %a, %b %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b store <4 x i8> %val, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 86fc0ace2c43f..6ab3022a91cd7 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -162,10 +162,11 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 -; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0 -; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 +; SDAG-VI-NEXT: s_sext_i32_i16 s2, s2 +; SDAG-VI-NEXT: s_sext_i32_i16 s3, s3 +; SDAG-VI-NEXT: v_med3_i32 v1, s2, 0, v0 +; SDAG-VI-NEXT: v_med3_i32 v0, s3, 0, v0 +; SDAG-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 @@ -689,12 +690,12 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i ; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: s_lshr_b32 s3, s2, 16 -; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 -; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0 -; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 -; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0 +; SDAG-VI-NEXT: s_ashr_i32 s3, s2, 16 +; SDAG-VI-NEXT: s_sext_i32_i16 s2, s2 +; SDAG-VI-NEXT: v_med3_i32 v1, s2, 0, v0 +; SDAG-VI-NEXT: v_med3_i32 v0, s3, 0, v0 +; SDAG-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 ; SDAG-VI-NEXT: flat_store_dword v[0:1], v2