diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index cc15dd7cb495c..aedad12f61e44 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -131,6 +131,7 @@ class SIFoldOperandsImpl {
   std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
   bool tryConstantFoldOp(MachineInstr *MI) const;
   bool tryFoldCndMask(MachineInstr &MI) const;
+  bool tryFoldBitMask(MachineInstr &MI) const;
   bool tryFoldZeroHighBits(MachineInstr &MI) const;
   bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
 
@@ -1447,6 +1448,108 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
   return true;
 }
 
+// Returns the number of low bits of the shift-amount operand that \p Opc
+// actually reads, together with the index of that operand.
+static bool getBitsReadByInst(unsigned Opc, unsigned &NumBitsRead,
+                              unsigned &OpIdx) {
+  switch (Opc) {
+  case AMDGPU::V_ASHR_I32_e64:
+  case AMDGPU::V_ASHR_I32_e32:
+  case AMDGPU::V_LSHR_B32_e64:
+  case AMDGPU::V_LSHR_B32_e32:
+  case AMDGPU::V_LSHL_B32_e64:
+  case AMDGPU::V_LSHL_B32_e32:
+  case AMDGPU::S_LSHL_B32:
+  case AMDGPU::S_LSHR_B32:
+  case AMDGPU::S_ASHR_I32:
+    NumBitsRead = 5;
+    OpIdx = 2;
+    return true;
+  case AMDGPU::S_LSHL_B64:
+  case AMDGPU::S_LSHR_B64:
+  case AMDGPU::S_ASHR_I64:
+    NumBitsRead = 6;
+    OpIdx = 2;
+    return true;
+  case AMDGPU::V_LSHLREV_B32_e64:
+  case AMDGPU::V_LSHLREV_B32_e32:
+  case AMDGPU::V_LSHRREV_B32_e64:
+  case AMDGPU::V_LSHRREV_B32_e32:
+  case AMDGPU::V_ASHRREV_I32_e64:
+  case AMDGPU::V_ASHRREV_I32_e32:
+    NumBitsRead = 5;
+    OpIdx = 1;
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Checks if the AND \p MI is redundant for a user that only reads the low
+// \p BitsNeeded bits, i.e. whether the immediate mask covers all of them.
+// On success, \p SrcOp is set to the index of the AND's register operand.
+static bool isAndBitMaskRedundant(MachineInstr &MI, unsigned BitsNeeded,
+                                  unsigned &SrcOp) {
+  MachineOperand *RegOp = &MI.getOperand(1);
+  MachineOperand *ImmOp = &MI.getOperand(2);
+
+  if (!RegOp->isReg() || !ImmOp->isImm()) {
+    if (ImmOp->isReg() && RegOp->isImm())
+      std::swap(RegOp, ImmOp);
+    else
+      return false;
+  }
+
+  SrcOp = RegOp->getOperandNo();
+
+  const unsigned BitMask = maskTrailingOnes<unsigned>(BitsNeeded);
+  return (ImmOp->getImm() & BitMask) == BitMask;
+}
+
+bool SIFoldOperandsImpl::tryFoldBitMask(MachineInstr &MI) const {
+  unsigned NumBitsRead = 0;
+  unsigned OpIdx = 0;
+  if (!getBitsReadByInst(MI.getOpcode(), NumBitsRead, OpIdx))
+    return false;
+
+  MachineOperand &Op = MI.getOperand(OpIdx);
+  if (!Op.isReg())
+    return false;
+
+  Register OpReg = Op.getReg();
+  if (OpReg.isPhysical())
+    return false;
+
+  MachineInstr *OpDef = MRI->getVRegDef(OpReg);
+  if (!OpDef)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "tryFoldBitMask: " << MI << "\tOpIdx:" << OpIdx
+                    << ", NumBitsRead:" << NumBitsRead << "\n");
+
+  unsigned ReplaceWith;
+  switch (OpDef->getOpcode()) {
+  // TODO: add more opcodes?
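+  // An AND is redundant here when its immediate mask covers every bit the
+  // user reads, e.g. (ashr %x, (and %y, 31)) only reads %y[4:0], so any
+  // mask with the low 5 bits set can be looked through.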
+  case AMDGPU::S_AND_B32:
+  case AMDGPU::V_AND_B32_e32:
+  case AMDGPU::V_AND_B32_e64:
+    if (!isAndBitMaskRedundant(*OpDef, NumBitsRead, ReplaceWith))
+      return false;
+    break;
+  default:
+    return false;
+  }
+
+  MachineOperand &ReplaceWithOp = OpDef->getOperand(ReplaceWith);
+  LLVM_DEBUG(dbgs() << "\treplacing operand with:" << ReplaceWithOp << "\n");
+
+  MI.getOperand(OpIdx).setReg(ReplaceWithOp.getReg());
+  return true;
+}
+
 bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
   if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
       MI.getOpcode() != AMDGPU::V_AND_B32_e32)
@@ -1458,7 +1553,7 @@ bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
 
   Register Src1 = MI.getOperand(2).getReg();
   MachineInstr *SrcDef = MRI->getVRegDef(Src1);
-  if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
+  if (!SrcDef || !ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
     return false;
 
   Register Dst = MI.getOperand(0).getReg();
@@ -2451,6 +2546,7 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
     MachineOperand *CurrentKnownM0Val = nullptr;
     for (auto &MI : make_early_inc_range(*MBB)) {
       Changed |= tryFoldCndMask(MI);
+      Changed |= tryFoldBitMask(MI);
 
       if (tryFoldZeroHighBits(MI)) {
         Changed = true;
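The fold in action, as exercised by the si-fold-bitmasks.mir test at the end of
this patch (a minimal before/after sketch; note that the fold only rewrites the
use, it does not erase the AND itself):

    ; before SIFoldOperands
    %shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit $exec
    %ret:vgpr_32 = V_ASHR_I32_e64 %src, %shiftmask, implicit $exec

    ; after: the shift reads only %shift[4:0], which the mask covers
    %ret:vgpr_32 = V_ASHR_I32_e64 %src, %shift, implicit $exec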
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index f81d7f1c300b8..886d15a93acb4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -129,7 +129,6 @@ define i24 @v_ashr_i24(i24 %value, i24 %amount) {
 ; GFX10PLUS-LABEL: v_ashr_i24:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, v1, v0
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
@@ -718,7 +717,6 @@ define amdgpu_ps half @ashr_i16_sv(i16 inreg %value, i16 %amount) {
 define amdgpu_ps half @ashr_i16_vs(i16 %value, i16 inreg %amount) {
 ; GFX6-LABEL: ashr_i16_vs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, s0, v0
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -904,12 +902,10 @@ define amdgpu_ps float @ashr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
 define amdgpu_ps float @ashr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
 ; GFX6-LABEL: ashr_v2i16_vs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, s0, v0
-; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
-; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index d2b600b04f9fc..25d76c9e8f7d8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -1475,7 +1475,6 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 23, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v1, v1, s1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, s0, v0, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
@@ -1511,11 +1510,9 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 23, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e64 v1, v1, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, s0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i24 @llvm.fshl.i24(i24 %lhs, i24 %rhs, i24 %amt)
@@ -1635,7 +1632,6 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1671,9 +1667,8 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 23, v2
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, v2, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i24 @llvm.fshl.i24(i24 %lhs, i24 %rhs, i24 %amt)
@@ -2010,74 +2005,74 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-LABEL: s_fshl_v2i24:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
-; GFX10-NEXT:    s_lshr_b32 s10, s4, 8
-; GFX10-NEXT:    s_lshr_b32 s11, s4, 16
-; GFX10-NEXT:    s_and_b32 s10, s10, 0xff
-; GFX10-NEXT:    s_lshr_b32 s12, s4, 24
+; GFX10-NEXT:    s_lshr_b32 s12, s4, 8
+; GFX10-NEXT:    s_lshr_b32 s13, s4, 16
+; GFX10-NEXT:    s_and_b32 s12, s12, 0xff
+; GFX10-NEXT:    s_lshr_b32 s14, s4, 24
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX10-NEXT:    s_and_b32 s11, s11, 0xff
-; GFX10-NEXT:    s_lshl_b32 s10, s10, 8
-; GFX10-NEXT:    s_lshl_b32 s11, s11, 16
-; GFX10-NEXT:    s_or_b32 s4, s4, s10
-; GFX10-NEXT:    s_lshr_b32 s13, s5, 8
+; GFX10-NEXT:    s_and_b32 s13, s13, 0xff
+; GFX10-NEXT:    s_lshl_b32 s12, s12, 8
+; GFX10-NEXT:    s_lshl_b32 s13, s13, 16
+; GFX10-NEXT:    s_or_b32 s4, s4, s12
+; GFX10-NEXT:    s_lshr_b32 s15, s5, 8
 ; GFX10-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX10-NEXT:    s_or_b32 s4, s4, s11
+; GFX10-NEXT:    s_or_b32 s4, s4, s13
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX10-NEXT:    s_and_b32 s13, s13, 0xff
-; GFX10-NEXT:    s_or_b32 s5, s12, s5
-; GFX10-NEXT:    s_lshl_b32 s10, s13, 16
-; GFX10-NEXT:    s_lshr_b32 s9, s1, 8
+; GFX10-NEXT:    s_and_b32 s15, s15, 0xff
+; GFX10-NEXT:    s_or_b32 s5, s14, s5
+; GFX10-NEXT:    s_lshl_b32 s12, s15, 16
+; GFX10-NEXT:    s_lshr_b32 s6, s0, 8
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    s_or_b32 s5, s5, s10
-; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10-NEXT:    s_or_b32 s5, s5, s12
 ; GFX10-NEXT:    s_lshr_b32 s10, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX10-NEXT:    v_mul_lo_u32 v2, 0xffffffe8, v0
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX10-NEXT:    s_lshr_b32 s11, s2, 16
 ; GFX10-NEXT:    s_lshr_b32 s13, s3, 8
 ; GFX10-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX10-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX10-NEXT:    s_and_b32 s10, s10, 0xff
+; GFX10-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    s_lshr_b32 s12, s2, 24
 ; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX10-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX10-NEXT:    s_and_b32 s11, s11, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX10-NEXT:    s_and_b32 s13, s13, 0xff
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    s_or_b32 s1, s8, s1
-; GFX10-NEXT:    s_lshl_b32 s8, s10, 8
 ; GFX10-NEXT:    s_or_b32 s3, s12, s3
-; GFX10-NEXT:    s_or_b32 s2, s2, s8
-; GFX10-NEXT:    s_lshr_b32 s6, s0, 8
-; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
-; GFX10-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX10-NEXT:    s_lshl_b32 s8, s13, 16
 ; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX10-NEXT:    s_and_b32 s7, s7, 0xff
-; GFX10-NEXT:    s_and_b32 s9, s9, 0xff
 ; GFX10-NEXT:    v_mul_hi_u32 v2, s4, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX10-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX10-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX10-NEXT:    s_or_b32 s0, s0, s6
+; GFX10-NEXT:    s_or_b32 s3, s3, s8
+; GFX10-NEXT:    s_and_b32 s9, s9, 0xff
+; GFX10-NEXT:    s_lshr_b32 s3, s3, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 8
-; GFX10-NEXT:    s_or_b32 s0, s0, s7
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v2, 24
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s4, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s5, v0
-; GFX10-NEXT:    s_and_b32 s4, s11, 0xff
-; GFX10-NEXT:    s_and_b32 s5, s13, 0xff
-; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-NEXT:    s_lshl_b32 s4, s6, 8
+; GFX10-NEXT:    s_lshl_b32 s6, s10, 8
+; GFX10-NEXT:    s_lshl_b32 s5, s7, 16
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0xffffffe8, v0
-; GFX10-NEXT:    s_lshl_b32 s5, s5, 16
-; GFX10-NEXT:    s_or_b32 s2, s2, s4
-; GFX10-NEXT:    s_or_b32 s3, s3, s5
+; GFX10-NEXT:    s_lshl_b32 s7, s11, 16
+; GFX10-NEXT:    s_or_b32 s2, s2, s6
+; GFX10-NEXT:    s_or_b32 s0, s0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX10-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX10-NEXT:    s_or_b32 s2, s2, s7
+; GFX10-NEXT:    s_or_b32 s0, s0, s5
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
@@ -2088,22 +2083,20 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
+; GFX10-NEXT:    v_lshrrev_b32_e64 v3, v3, s2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 23, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GFX10-NEXT:    v_lshrrev_b32_e64 v3, v3, s2
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX10-NEXT:    s_lshl_b32 s2, s9, 16
 ; GFX10-NEXT:    s_or_b32 s1, s1, s2
-; GFX10-NEXT:    v_lshl_or_b32 v2, s0, v2, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v4, v4, s3
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, v2, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 16
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, s1, v0, v4
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v2, v1
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX10-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v2, v1
 ; GFX10-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 8, v5
@@ -2115,102 +2108,97 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX11-LABEL: s_fshl_v2i24:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
-; GFX11-NEXT:    s_lshr_b32 s10, s4, 8
-; GFX11-NEXT:    s_lshr_b32 s11, s4, 16
-; GFX11-NEXT:    s_and_b32 s10, s10, 0xff
-; GFX11-NEXT:    s_lshr_b32 s12, s4, 24
+; GFX11-NEXT:    s_lshr_b32 s13, s4, 8
+; GFX11-NEXT:    s_lshr_b32 s14, s4, 16
+; GFX11-NEXT:    s_and_b32 s13, s13, 0xff
+; GFX11-NEXT:    s_lshr_b32 s15, s4, 24
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX11-NEXT:    s_and_b32 s11, s11, 0xff
-; GFX11-NEXT:    s_lshl_b32 s10, s10, 8
-; GFX11-NEXT:    s_lshl_b32 s11, s11, 16
-; GFX11-NEXT:    s_or_b32 s4, s4, s10
-; GFX11-NEXT:    s_lshr_b32 s13, s5, 8
+; GFX11-NEXT:    s_and_b32 s14, s14, 0xff
+; GFX11-NEXT:    s_lshl_b32 s13, s13, 8
+; GFX11-NEXT:    s_lshl_b32 s14, s14, 16
+; GFX11-NEXT:    s_or_b32 s4, s4, s13
+; GFX11-NEXT:    s_lshr_b32 s16, s5, 8
 ; GFX11-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX11-NEXT:    s_or_b32 s4, s4, s11
+; GFX11-NEXT:    s_or_b32 s4, s4, s14
 ; GFX11-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX11-NEXT:    s_and_b32 s13, s13, 0xff
-; GFX11-NEXT:    s_or_b32 s5, s12, s5
-; GFX11-NEXT:    s_lshl_b32 s10, s13, 16
-; GFX11-NEXT:    s_lshr_b32 s9, s1, 8
+; GFX11-NEXT:    s_and_b32 s16, s16, 0xff
+; GFX11-NEXT:    s_or_b32 s5, s15, s5
+; GFX11-NEXT:    s_lshl_b32 s13, s16, 16
+; GFX11-NEXT:    s_lshr_b32 s6, s0, 8
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT:    s_or_b32 s5, s5, s10
-; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX11-NEXT:    s_or_b32 s5, s5, s13
+; GFX11-NEXT:    s_lshr_b32 s7, s0, 16
 ; GFX11-NEXT:    s_lshr_b32 s10, s2, 8
-; GFX11-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX11-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX11-NEXT:    v_mul_lo_u32 v1, 0xffffffe8, v0
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX11-NEXT:    s_lshr_b32 s11, s2, 16
-; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-NEXT:    s_and_b32 s9, s9, 0xff
+; GFX11-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xff
 ; GFX11-NEXT:    s_and_b32 s10, s10, 0xff
+; GFX11-NEXT:    s_lshr_b32 s8, s0, 24
 ; GFX11-NEXT:    s_lshr_b32 s12, s2, 24
+; GFX11-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_and_b32 s11, s11, 0xff
 ; GFX11-NEXT:    s_or_b32 s1, s8, s1
-; GFX11-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX11-NEXT:    s_lshl_b32 s8, s9, 16
-; GFX11-NEXT:    s_lshl_b32 s9, s10, 8
-; GFX11-NEXT:    s_lshr_b32 s6, s0, 8
-; GFX11-NEXT:    s_or_b32 s2, s2, s9
+; GFX11-NEXT:    s_lshl_b32 s8, s11, 16
 ; GFX11-NEXT:    s_lshr_b32 s13, s3, 8
 ; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX11-NEXT:    s_lshr_b32 s7, s0, 16
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v1
-; GFX11-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX11-NEXT:    s_and_b32 s9, s9, 0xff
 ; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_and_b32 s13, s13, 0xff
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-NEXT:    s_and_b32 s7, s7, 0xff
 ; GFX11-NEXT:    v_mul_hi_u32 v1, s4, v0
 ; GFX11-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX11-NEXT:    s_lshl_b32 s6, s6, 8
 ; GFX11-NEXT:    s_or_b32 s3, s12, s3
-; GFX11-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX11-NEXT:    s_or_b32 s0, s0, s6
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    s_or_b32 s0, s0, s7
-; GFX11-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, s4, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v0, s5, v0
-; GFX11-NEXT:    s_and_b32 s4, s11, 0xff
-; GFX11-NEXT:    s_and_b32 s5, s13, 0xff
-; GFX11-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX11-NEXT:    s_lshl_b32 s4, s6, 8
+; GFX11-NEXT:    s_lshl_b32 s6, s7, 16
+; GFX11-NEXT:    s_lshl_b32 s7, s10, 8
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0xffffffe8, v1
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
 ; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v0
-; GFX11-NEXT:    s_or_b32 s2, s2, s4
-; GFX11-NEXT:    s_lshl_b32 s5, s5, 16
-; GFX11-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX11-NEXT:    s_or_b32 s2, s2, s7
+; GFX11-NEXT:    s_lshl_b32 s5, s9, 16
+; GFX11-NEXT:    s_or_b32 s2, s2, s8
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX11-NEXT:    s_or_b32 s3, s3, s5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX11-NEXT:    s_lshl_b32 s9, s13, 16
+; GFX11-NEXT:    s_or_b32 s0, s0, s4
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0xffffffe8, v1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
+; GFX11-NEXT:    s_or_b32 s3, s3, s9
+; GFX11-NEXT:    s_or_b32 s0, s0, s6
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 23, v1
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v0
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX11-NEXT:    v_lshrrev_b32_e64 v2, v2, s2
 ; GFX11-NEXT:    s_lshr_b32 s2, s3, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-NEXT:    v_lshl_or_b32 v1, s0, v1, v2
+; GFX11-NEXT:    s_or_b32 s0, s1, s5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 23, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GFX11-NEXT:    v_lshl_or_b32 v1, s0, v1, v2
-; GFX11-NEXT:    s_or_b32 s0, s1, s8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX11-NEXT:    v_bfe_u32 v2, v1, 8, 8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshrrev_b32_e64 v3, v3, s2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -2413,11 +2401,9 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 23, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v7, 23, v5
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xffffff, v7
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v4, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v7, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v1, v5, v3
@@ -2466,15 +2452,12 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 23, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v5, v5, v7 :: v_dual_and_b32 v4, 0xffffff, v4
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v7, 23, v5
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffffff, v7
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, v4, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v7, v3
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v1, v5, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 5886b5d98f771..3b1c5b2d4db8b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -1510,9 +1510,8 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 23, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, s1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, s0, v1, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -1547,12 +1546,10 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 23, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e64 v0, v0, s1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, s0, v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i24 @llvm.fshr.i24(i24 %lhs, i24 %rhs, i24 %amt)
@@ -1675,9 +1672,8 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v3, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1712,10 +1708,9 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 23, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, v3, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i24 @llvm.fshr.i24(i24 %lhs, i24 %rhs, i24 %amt)
@@ -2052,77 +2047,77 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-LABEL: s_fshr_v2i24:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
-; GFX10-NEXT:    s_lshr_b32 s13, s4, 8
-; GFX10-NEXT:    s_lshr_b32 s14, s4, 16
-; GFX10-NEXT:    s_and_b32 s13, s13, 0xff
-; GFX10-NEXT:    s_lshr_b32 s15, s4, 24
+; GFX10-NEXT:    s_lshr_b32 s12, s4, 8
+; GFX10-NEXT:    s_lshr_b32 s13, s4, 16
+; GFX10-NEXT:    s_and_b32 s12, s12, 0xff
+; GFX10-NEXT:    s_lshr_b32 s14, s4, 24
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX10-NEXT:    s_and_b32 s14, s14, 0xff
-; GFX10-NEXT:    s_lshl_b32 s13, s13, 8
-; GFX10-NEXT:    s_lshl_b32 s14, s14, 16
-; GFX10-NEXT:    s_or_b32 s4, s4, s13
-; GFX10-NEXT:    s_lshr_b32 s16, s5, 8
+; GFX10-NEXT:    s_and_b32 s13, s13, 0xff
+; GFX10-NEXT:    s_lshl_b32 s12, s12, 8
+; GFX10-NEXT:    s_lshl_b32 s13, s13, 16
+; GFX10-NEXT:    s_or_b32 s4, s4, s12
+; GFX10-NEXT:    s_lshr_b32 s15, s5, 8
 ; GFX10-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX10-NEXT:    s_or_b32 s4, s4, s14
+; GFX10-NEXT:    s_or_b32 s4, s4, s13
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX10-NEXT:    s_and_b32 s16, s16, 0xff
-; GFX10-NEXT:    s_or_b32 s5, s15, s5
-; GFX10-NEXT:    s_lshl_b32 s13, s16, 16
+; GFX10-NEXT:    s_and_b32 s15, s15, 0xff
+; GFX10-NEXT:    s_or_b32 s5, s14, s5
+; GFX10-NEXT:    s_lshl_b32 s12, s15, 16
 ; GFX10-NEXT:    s_lshr_b32 s10, s2, 8
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    s_or_b32 s5, s5, s13
+; GFX10-NEXT:    s_or_b32 s5, s5, s12
+; GFX10-NEXT:    s_lshr_b32 s6, s0, 8
 ; GFX10-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX10-NEXT:    s_lshr_b32 s11, s2, 16
 ; GFX10-NEXT:    v_mul_lo_u32 v2, 0xffffffe8, v0
+; GFX10-NEXT:    s_lshr_b32 s11, s2, 16
 ; GFX10-NEXT:    s_lshr_b32 s13, s3, 8
 ; GFX10-NEXT:    s_and_b32 s3, s3, 0xff
 ; GFX10-NEXT:    s_and_b32 s10, s10, 0xff
-; GFX10-NEXT:    s_lshr_b32 s6, s0, 8
 ; GFX10-NEXT:    s_lshr_b32 s8, s0, 24
 ; GFX10-NEXT:    s_lshr_b32 s12, s2, 24
 ; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX10-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX10-NEXT:    s_and_b32 s11, s11, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX10-NEXT:    s_and_b32 s13, s13, 0xff
-; GFX10-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX10-NEXT:    s_or_b32 s1, s8, s1
 ; GFX10-NEXT:    s_or_b32 s3, s12, s3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    s_lshl_b32 s8, s13, 16
 ; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
 ; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX10-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX10-NEXT:    s_or_b32 s3, s3, s8
 ; GFX10-NEXT:    v_mul_hi_u32 v2, s4, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX10-NEXT:    s_or_b32 s3, s3, s8
 ; GFX10-NEXT:    s_and_b32 s7, s7, 0xff
 ; GFX10-NEXT:    s_and_b32 s9, s9, 0xff
-; GFX10-NEXT:    s_or_b32 s0, s0, s6
 ; GFX10-NEXT:    s_lshl_b32 s7, s7, 17
 ; GFX10-NEXT:    s_lshl_b32 s9, s9, 17
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v2, 24
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 1
-; GFX10-NEXT:    s_or_b32 s0, s7, s0
 ; GFX10-NEXT:    s_or_b32 s1, s9, s1
-; GFX10-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s4, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s5, v0
-; GFX10-NEXT:    s_lshl_b32 s4, s10, 8
-; GFX10-NEXT:    s_lshl_b32 s5, s11, 16
-; GFX10-NEXT:    s_or_b32 s2, s2, s4
+; GFX10-NEXT:    s_lshl_b32 s5, s10, 8
+; GFX10-NEXT:    s_lshl_b32 s4, s6, 8
+; GFX10-NEXT:    s_lshl_b32 s6, s11, 16
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 0xffffffe8, v0
 ; GFX10-NEXT:    s_or_b32 s2, s2, s5
+; GFX10-NEXT:    s_or_b32 s0, s0, s4
+; GFX10-NEXT:    s_or_b32 s2, s2, s6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX10-NEXT:    s_or_b32 s0, s7, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
@@ -2131,21 +2126,19 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX10-NEXT:    v_lshrrev_b32_e64 v2, v2, s2
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 23, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GFX10-NEXT:    v_lshrrev_b32_e64 v2, v2, s2
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, s3
 ; GFX10-NEXT:    v_lshl_or_b32 v2, s0, v3, v2
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 16
-; GFX10-NEXT:    v_lshl_or_b32 v0, s1, v4, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshl_or_b32 v0, s1, v4, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v2, v1
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX10-NEXT:    v_bfe_u32 v5, v0, 8, 8
-; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v2, v1
 ; GFX10-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 8, v5
@@ -2157,36 +2150,36 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX11-LABEL: s_fshr_v2i24:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
-; GFX11-NEXT:    s_lshr_b32 s14, s4, 8
-; GFX11-NEXT:    s_lshr_b32 s15, s4, 16
-; GFX11-NEXT:    s_and_b32 s14, s14, 0xff
-; GFX11-NEXT:    s_lshr_b32 s16, s4, 24
+; GFX11-NEXT:    s_lshr_b32 s13, s4, 8
+; GFX11-NEXT:    s_lshr_b32 s14, s4, 16
+; GFX11-NEXT:    s_and_b32 s13, s13, 0xff
+; GFX11-NEXT:    s_lshr_b32 s15, s4, 24
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX11-NEXT:    s_and_b32 s15, s15, 0xff
-; GFX11-NEXT:    s_lshl_b32 s14, s14, 8
-; GFX11-NEXT:    s_lshl_b32 s15, s15, 16
-; GFX11-NEXT:    s_or_b32 s4, s4, s14
-; GFX11-NEXT:    s_lshr_b32 s17, s5, 8
+; GFX11-NEXT:    s_and_b32 s14, s14, 0xff
+; GFX11-NEXT:    s_lshl_b32 s13, s13, 8
+; GFX11-NEXT:    s_lshl_b32 s14, s14, 16
+; GFX11-NEXT:    s_or_b32 s4, s4, s13
+; GFX11-NEXT:    s_lshr_b32 s16, s5, 8
 ; GFX11-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX11-NEXT:    s_or_b32 s4, s4, s15
+; GFX11-NEXT:    s_or_b32 s4, s4, s14
 ; GFX11-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX11-NEXT:    s_and_b32 s17, s17, 0xff
-; GFX11-NEXT:    s_or_b32 s5, s16, s5
-; GFX11-NEXT:    s_lshl_b32 s14, s17, 16
+; GFX11-NEXT:    s_and_b32 s16, s16, 0xff
+; GFX11-NEXT:    s_or_b32 s5, s15, s5
+; GFX11-NEXT:    s_lshl_b32 s13, s16, 16
 ; GFX11-NEXT:    s_lshr_b32 s10, s2, 8
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT:    s_or_b32 s5, s5, s14
+; GFX11-NEXT:    s_or_b32 s5, s5, s13
+; GFX11-NEXT:    s_lshr_b32 s6, s0, 8
 ; GFX11-NEXT:    s_lshr_b32 s11, s2, 16
 ; GFX11-NEXT:    s_and_b32 s10, s10, 0xff
-; GFX11-NEXT:    s_lshr_b32 s6, s0, 8
 ; GFX11-NEXT:    v_mul_lo_u32 v1, 0xffffffe8, v0
 ; GFX11-NEXT:    s_lshr_b32 s12, s2, 24
 ; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-NEXT:    s_and_b32 s11, s11, 0xff
 ; GFX11-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX11-NEXT:    s_and_b32 s11, s11, 0xff
 ; GFX11-NEXT:    s_lshr_b32 s7, s0, 16
 ; GFX11-NEXT:    s_lshr_b32 s8, s0, 24
 ; GFX11-NEXT:    s_lshr_b32 s9, s1, 8
@@ -2195,80 +2188,78 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX11-NEXT:    s_lshr_b32 s13, s3, 8
 ; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX11-NEXT:    s_lshl_b32 s6, s6, 8
 ; GFX11-NEXT:    s_and_b32 s7, s7, 0xff
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_and_b32 s13, s13, 0xff
-; GFX11-NEXT:    s_or_b32 s0, s0, s6
 ; GFX11-NEXT:    s_or_b32 s1, s8, s1
-; GFX11-NEXT:    v_mul_hi_u32 v1, s4, v0
-; GFX11-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; GFX11-NEXT:    s_or_b32 s3, s12, s3
 ; GFX11-NEXT:    s_lshl_b32 s8, s13, 16
+; GFX11-NEXT:    v_mul_hi_u32 v1, s4, v0
+; GFX11-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; GFX11-NEXT:    s_lshl_b32 s7, s7, 17
-; GFX11-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX11-NEXT:    s_or_b32 s3, s3, s8
-; GFX11-NEXT:    s_or_b32 s0, s7, s0
-; GFX11-NEXT:    v_mul_lo_u32 v1, v1, 24
-; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX11-NEXT:    s_and_b32 s9, s9, 0xff
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX11-NEXT:    s_lshl_b32 s9, s9, 17
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, s4, v1
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v0, s5, v0
-; GFX11-NEXT:    s_lshl_b32 s4, s10, 8
-; GFX11-NEXT:    s_lshl_b32 s5, s11, 16
-; GFX11-NEXT:    s_or_b32 s2, s2, s4
+; GFX11-NEXT:    s_lshl_b32 s5, s10, 8
+; GFX11-NEXT:    s_lshl_b32 s4, s6, 8
+; GFX11-NEXT:    s_lshl_b32 s6, s11, 16
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0xffffffe8, v1
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
 ; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v0
 ; GFX11-NEXT:    s_or_b32 s2, s2, s5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_or_b32 s0, s0, s4
+; GFX11-NEXT:    s_or_b32 s2, s2, s6
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_or_b32 s0, s7, s0
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0xffffffe8, v1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 23, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xffffffe8, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX11-NEXT:    v_lshrrev_b32_e64 v1, v1, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 23, v1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT:    v_lshl_or_b32 v1, s0, v2, v1
-; GFX11-NEXT:    s_or_b32 s0, s9, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b32_e64 v1, v1, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 23, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GFX11-NEXT:    v_bfe_u32 v2, v1, 8, 8
+; GFX11-NEXT:    v_lshrrev_b32_e64 v0, v0, s3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshl_or_b32 v1, s0, v2, v1
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
-; GFX11-NEXT:    v_lshrrev_b32_e64 v0, v0, s3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-NEXT:    s_or_b32 s0, s9, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v2, v1, 8, 8
 ; GFX11-NEXT:    v_lshl_or_b32 v0, s0, v3, v0
 ; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX11-NEXT:    v_bfe_u32 v5, v0, 8, 8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX11-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 8, v5
-; GFX11-NEXT:    v_or3_b32 v1, v1, v3, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or3_b32 v1, v1, v3, v4
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %lhs = bitcast i48 %lhs.arg to <2 x i24>
@@ -2460,15 +2451,13 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 23, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
-; GFX10-NEXT:    v_sub_nc_u32_e32 v7, 23, v5
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v7
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v6
+; GFX10-NEXT:    v_sub_nc_u32_e32 v7, 23, v5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v3
-; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v6, v2
-; GFX10-NEXT:    v_lshl_or_b32 v1, v1, v4, v3
+; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v4, v2
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffffff, v7
+; GFX10-NEXT:    v_lshl_or_b32 v1, v1, v5, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v2i24:
@@ -2513,19 +2502,17 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 23, v4
-; GFX11-NEXT:    v_dual_cndmask_b32 v5, v5, v7 :: v_dual_and_b32 v4, 0xffffff, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
-; GFX11-NEXT:    v_sub_nc_u32_e32 v7, 23, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffffff, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffffff, v6
+; GFX11-NEXT:    v_sub_nc_u32_e32 v7, 23, v5
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v5, v3
-; GFX11-NEXT:    v_lshl_or_b32 v0, v0, v6, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshl_or_b32 v1, v1, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshl_or_b32 v0, v0, v4, v2
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffffff, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshl_or_b32 v1, v1, v5, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
   ret <2 x i24> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 7d7dd85dc2451..deccf408fb6c4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -96,7 +96,6 @@ define i24 @v_lshr_i24(i24 %value, i24 %amount) {
 ; GFX10PLUS-LABEL: v_lshr_i24:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
@@ -663,7 +662,6 @@ define amdgpu_ps half @lshr_i16_sv(i16 inreg %value, i16 %amount) {
 define amdgpu_ps half @lshr_i16_vs(i16 %value, i16 inreg %amount) {
 ; GFX6-LABEL: lshr_i16_vs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -840,12 +838,10 @@ define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
 define amdgpu_ps float @lshr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
 ; GFX6-LABEL: lshr_v2i16_vs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
-; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index b12e915c7d21b..7fdc7c5405518 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -116,7 +116,6 @@ define i24 @v_shl_i24(i24 %value, i24 %amount) {
 ; GFX10PLUS-LABEL: v_shl_i24:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl i24 %value, %amount
@@ -689,7 +688,6 @@ define amdgpu_ps half @shl_i16_sv(i16 inreg %value, i16 %amount) {
 define amdgpu_ps half @shl_i16_vs(i16 %value, i16 inreg %amount) {
 ; GFX6-LABEL: shl_i16_vs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -861,10 +859,8 @@ define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
 define amdgpu_ps float @shl_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
 ; GFX6-LABEL: shl_v2i16_vs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s0, v0
-; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
index 661af021e8a84..cefa1059e6b3b 100644
--- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
+++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
@@ -45,7 +45,6 @@ define i32 @csh_32(i32 %a, i32 %b) {
 ; GISEL-LABEL: csh_32:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_and_b32_e32 v1, 31, v1
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, v1, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v3, v1, v0
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v0, v1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir b/llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir
index 1edf970591179..c9946b3f9d0ef 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-bitmasks.mir
@@ -16,7 +16,7 @@ body: |
     ; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
     ; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
     ; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit $exec
-    ; GCN-NEXT: %ret:vgpr_32 = V_ASHR_I32_e64 %src, %shiftmask, implicit $exec
+    ; GCN-NEXT: %ret:vgpr_32 = V_ASHR_I32_e64 %src, %shift, implicit $exec
     ; GCN-NEXT: $vgpr0 = COPY %ret
     %src:vgpr_32 = COPY $vgpr0
     %shift:vgpr_32 = COPY $vgpr1
@@ -38,7 +38,7 @@ body: |
     ; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
     ; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
     ; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit $exec
-    ; GCN-NEXT: %ret:vgpr_32 = V_LSHR_B32_e64 %src, %shiftmask, implicit $exec
+    ; GCN-NEXT: %ret:vgpr_32 = V_LSHR_B32_e64 %src, %shift, implicit $exec
     ; GCN-NEXT: $vgpr0 = COPY %ret
     %src:vgpr_32 = COPY $vgpr0
     %shift:vgpr_32 = COPY $vgpr1
@@ -60,7 +60,7 @@ body: |
    ; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
    ; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
    ; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e64 65535, %shift, implicit $exec
-    ; GCN-NEXT: %ret:vgpr_32 = V_LSHR_B32_e32 %src, %shiftmask, implicit $exec
+    ; GCN-NEXT: %ret:vgpr_32 = V_LSHR_B32_e32 %src, %shift, implicit $exec
    ; GCN-NEXT: $vgpr0 = COPY %ret
    %src:vgpr_32 = COPY $vgpr0
    %shift:vgpr_32 = COPY $vgpr1
@@ -82,7 +82,7 @@ body: |
    ; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
    ; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
    ; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e64 65535, %shift, implicit $exec
-    ; GCN-NEXT: %ret:vgpr_32 = V_LSHL_B32_e64 %src, %shiftmask, implicit $exec
+    ; GCN-NEXT: %ret:vgpr_32 = V_LSHL_B32_e64 %src, %shift, implicit $exec
    ; GCN-NEXT: $vgpr0 = COPY %ret
    %src:vgpr_32 = COPY $vgpr0
    %shift:vgpr_32 = COPY $vgpr1
@@ -104,7 +104,7 @@ body: |
    ; GCN-NEXT: %src:sgpr_32 = COPY $sgpr0
    ; GCN-NEXT: %shift:sgpr_32 = COPY $sgpr1
    ; GCN-NEXT: %shiftmask:sgpr_32 = S_AND_B32 65535, %shift, implicit-def $scc
-    ; GCN-NEXT: %ret:sgpr_32 = S_LSHL_B32 %src, %shiftmask, implicit-def $scc
+    ; GCN-NEXT: %ret:sgpr_32 = S_LSHL_B32 %src, %shift, implicit-def $scc
    ; GCN-NEXT: $sgpr0 = COPY %ret
    %src:sgpr_32 = COPY $sgpr0
    %shift:sgpr_32 = COPY $sgpr1
@@ -126,7 +126,7 @@ body: |
    ; GCN-NEXT: %src:sgpr_32 = COPY $sgpr0
    ; GCN-NEXT: %shift:sgpr_32 = COPY $sgpr1
    ; GCN-NEXT: %shiftmask:sgpr_32 = S_AND_B32 65535, %shift, implicit-def $scc
-    ; GCN-NEXT: %ret:sgpr_32 = S_LSHR_B32 %src, %shiftmask, implicit-def $scc
+    ; GCN-NEXT: %ret:sgpr_32 = S_LSHR_B32 %src, %shift, implicit-def $scc
    ; GCN-NEXT: $sgpr0 = COPY %ret
    %src:sgpr_32 = COPY $sgpr0
    %shift:sgpr_32 = COPY $sgpr1
@@ -148,7 +148,7 @@ body: |
    ; GCN-NEXT: %src:sgpr_32 = COPY $sgpr0
    ; GCN-NEXT: %shift:sgpr_32 = COPY $sgpr1
    ; GCN-NEXT: %shiftmask:sgpr_32 = S_AND_B32 65535, %shift, implicit-def $scc
-    ; GCN-NEXT: %ret:sgpr_32 = S_ASHR_I32 %src, %shiftmask, implicit-def $scc
+    ; GCN-NEXT: %ret:sgpr_32 = S_ASHR_I32 %src, %shift, implicit-def $scc
    ; GCN-NEXT: $sgpr0 = COPY %ret
    %src:sgpr_32 = COPY $sgpr0
    %shift:sgpr_32 = COPY $sgpr1
@@ -170,7 +170,7 @@ body: |
    ; GCN-NEXT: %src:sgpr_64 = COPY $sgpr0_sgpr1
    ; GCN-NEXT: %shift:sgpr_32 = COPY $sgpr2
    ; GCN-NEXT: %shiftmask:sgpr_32 = S_AND_B32 63, %shift, implicit-def $scc
-    ; GCN-NEXT: %ret:sgpr_64 = S_LSHL_B64 %src, %shiftmask, implicit-def $scc
+    ; GCN-NEXT: %ret:sgpr_64 = S_LSHL_B64 %src, %shift, implicit-def $scc
    ; GCN-NEXT: $sgpr0_sgpr1 = COPY %ret
    %src:sgpr_64 = COPY $sgpr0_sgpr1
    %shift:sgpr_32 = COPY $sgpr2
@@ -192,7 +192,7 @@ body: |
    ; GCN-NEXT: %src:sgpr_64 = COPY $sgpr0_sgpr1
    ; GCN-NEXT: %shift:sgpr_32 = COPY $sgpr2
    ; GCN-NEXT: %shiftmask:sgpr_32 = S_AND_B32 63, %shift, implicit-def $scc
-    ; GCN-NEXT: %ret:sgpr_64 = S_LSHR_B64 %src, %shiftmask, implicit-def $scc
+    ; GCN-NEXT: %ret:sgpr_64 = S_LSHR_B64 %src, %shift, implicit-def $scc
    ; GCN-NEXT: $sgpr0_sgpr1 = COPY %ret
    %src:sgpr_64 = COPY $sgpr0_sgpr1
    %shift:sgpr_32 = COPY $sgpr2
@@ -214,7 +214,7 @@ body: |
    ; GCN-NEXT: %src:sgpr_64 = COPY $sgpr0_sgpr1
    ; GCN-NEXT: %shift:sgpr_32 = COPY $sgpr2
    ; GCN-NEXT: %shiftmask:sgpr_32 = S_AND_B32 63, %shift, implicit-def $scc
-    ; GCN-NEXT: %ret:sgpr_64 = S_ASHR_I64 %src, %shiftmask, implicit-def $scc
+    ; GCN-NEXT: %ret:sgpr_64 = S_ASHR_I64 %src, %shift, implicit-def $scc
    ; GCN-NEXT: $sgpr0_sgpr1 = COPY %ret
    %src:sgpr_64 = COPY $sgpr0_sgpr1
    %shift:sgpr_32 = COPY $sgpr2
@@ -236,7 +236,7 @@ body: |
    ; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
    ; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
    ; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit $exec
-    ; GCN-NEXT: %ret:vgpr_32 = V_LSHLREV_B32_e64 %shiftmask, %src, implicit $exec
+    ; GCN-NEXT: %ret:vgpr_32 = V_LSHLREV_B32_e64 %shift, %src, implicit $exec
    ; GCN-NEXT: $vgpr0 = COPY %ret
    %src:vgpr_32 = COPY $vgpr0
    %shift:vgpr_32 = COPY $vgpr1
@@ -258,7 +258,7 @@ body: |
    ; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
    ; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
    ; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit $exec
-    ; GCN-NEXT: %ret:vgpr_32 = V_LSHRREV_B32_e64 %shiftmask, %src, implicit $exec
+    ; GCN-NEXT: %ret:vgpr_32 = V_LSHRREV_B32_e64 %shift, %src, implicit $exec
    ; GCN-NEXT: $vgpr0 = COPY %ret
    %src:vgpr_32 = COPY $vgpr0
    %shift:vgpr_32 = COPY $vgpr1
@@ -280,7 +280,7 @@ body: |
    ; GCN-NEXT: %src:vgpr_32 = COPY $vgpr0
    ; GCN-NEXT: %shift:vgpr_32 = COPY $vgpr1
    ; GCN-NEXT: %shiftmask:vgpr_32 = V_AND_B32_e32 65535, %shift, implicit $exec
-    ; GCN-NEXT: %ret:vgpr_32 = V_ASHRREV_I32_e64 %shiftmask, %src, implicit $exec
+    ; GCN-NEXT: %ret:vgpr_32 = V_ASHRREV_I32_e64 %shift, %src, implicit $exec
    ; GCN-NEXT: $vgpr0 = COPY %ret
    %src:vgpr_32 = COPY $vgpr0
    %shift:vgpr_32 = COPY $vgpr1