diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 22f23e4c94e2d..a320e6769a115 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -62,6 +62,7 @@ class SIPeepholeSDWA { std::unique_ptr matchSDWAOperand(MachineInstr &MI); void pseudoOpConvertToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const; + void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const; MachineInstr *createSDWAVersion(MachineInstr &MI); bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; @@ -1037,7 +1038,8 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, return; // Make sure VCC or its subregs are dead before MI. MachineBasicBlock &MBB = *MI.getParent(); - auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25); + MachineBasicBlock::LivenessQueryResult Liveness = + MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25); if (Liveness != MachineBasicBlock::LQR_Dead) return; // Check if VCC is referenced in range of (MI,MISucc]. @@ -1061,6 +1063,52 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI); } +/// Try to convert an \p MI in VOP3 which takes an src2 carry-in +/// operand into the corresponding VOP2 form which expects the +/// argument in VCC. To this end, add a copy from the carry-in to +/// VCC. The conversion will only be applied if \p MI can be shrunk +/// to VOP2 and if VCC can be proven to be dead before \p MI. 
+void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI, + const GCNSubtarget &ST) const { + assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64); + + LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI); + if (!TII->canShrink(MI, *MRI)) { + LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n"); + return; + } + + const MachineOperand &CarryIn = + *TII->getNamedOperand(MI, AMDGPU::OpName::src2); + Register CarryReg = CarryIn.getReg(); + MachineInstr *CarryDef = MRI->getVRegDef(CarryReg); + if (!CarryDef) { + LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n"); + return; + } + + // Make sure VCC or its subregs are dead before MI. + MCRegister Vcc = TRI->getVCC(); + MachineBasicBlock &MBB = *MI.getParent(); + MachineBasicBlock::LivenessQueryResult Liveness = + MBB.computeRegisterLiveness(TRI, Vcc, MI); + if (Liveness != MachineBasicBlock::LQR_Dead) { + LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n"); + return; + } + + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn); + + auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(), + TII->get(AMDGPU::getVOPe32(MI.getOpcode()))) + .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) + .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)) + .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1)) + .setMIFlags(MI.getFlags()); + LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted); + MI.eraseFromParent(); +} + namespace { bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST, @@ -1070,6 +1118,11 @@ bool isConvertibleToSDWA(MachineInstr &MI, if (TII->isSDWA(Opc)) return true; + // Can only be handled after earlier conversion to + // AMDGPU::V_CNDMASK_B32_e32 which is not always possible. 
+ if (Opc == AMDGPU::V_CNDMASK_B32_e64) + return false; + // Check if this instruction has opcode that supports SDWA if (AMDGPU::getSDWAOp(Opc) == -1) Opc = AMDGPU::getVOPe32(Opc); @@ -1108,10 +1161,6 @@ bool isConvertibleToSDWA(MachineInstr &MI, if (TII->pseudoToMCOpcode(Opc) == -1) return false; - // FIXME: has SDWA but require handling of implicit VCC use - if (Opc == AMDGPU::V_CNDMASK_B32_e32) - return false; - if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) { if (!Src0->isReg() && !Src0->isImm()) return false; @@ -1266,7 +1315,9 @@ MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) { SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1); } - return SDWAInst.getInstr(); + MachineInstr *Ret = SDWAInst.getInstr(); + TII->fixImplicitOperands(*Ret); + return Ret; } bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, @@ -1384,10 +1435,18 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) { for (const auto &OperandPair : SDWAOperands) { const auto &Operand = OperandPair.second; MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST); - if (PotentialMI && - (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 || - PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64)) + if (!PotentialMI) + continue; + + switch (PotentialMI->getOpcode()) { + case AMDGPU::V_ADD_CO_U32_e64: + case AMDGPU::V_SUB_CO_U32_e64: pseudoOpConvertToVOP2(*PotentialMI, ST); + break; + case AMDGPU::V_CNDMASK_B32_e64: + convertVcndmaskToVOP2(*PotentialMI, ST); + break; + }; } SDWAOperands.clear(); diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 19b6ff68b9869..c4957fd44e2be 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -38481,10 +38481,7 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -38494,9 +38491,7 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -38505,11 +38500,9 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -38577,29 +38570,24 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX8-LABEL: v_vselect_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; 
GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_vselect_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -38607,14 +38595,12 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX10-LABEL: v_vselect_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v0 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v2, s4 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -38771,13 +38757,12 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX8-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -38882,14 +38867,13 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -40792,48 +40776,42 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX9-LABEL: v_vselect_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX9-NEXT: s_mov_b64 vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v4bf16: ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v0 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11TRUE16-LABEL: v_vselect_v4bf16: @@ -41081,42 +41059,37 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX10-LABEL: v_vselect_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v5, 
1, v5 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v10 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v8 -; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v4 +; GFX10-NEXT: v_cndmask_b32_sdwa v4, v15, v11, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc_lo +; GFX10-NEXT: s_mov_b32 vcc_lo, s6 +; GFX10-NEXT: v_cndmask_b32_sdwa v6, v14, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s5 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v13, v9, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v8, 
vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v7, v12, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_perm_b32 v0, v7, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo -; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 -; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v11, vcc_lo +; GFX10-NEXT: v_perm_b32 v2, v6, v5, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11TRUE16-LABEL: v_vselect_v8bf16: diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 4fe11760e71fd..92ece0d007fe2 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -652,14 +652,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1 -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc 
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v3 +; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 @@ -760,16 +760,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 32 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 9fcfbba6fb235..3c45596fba14b 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -629,9 +629,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 32 
+; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 @@ -731,9 +731,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 32 +; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 @@ -1508,10 +1508,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1 -; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 -; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v3 +; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 @@ -1612,10 +1611,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 -; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; 
GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v2 +; GFX9-GISEL-NEXT: v_cndmask_b32_sdwa v1, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index ce7281702c108..a511233af0703 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -2775,11 +2775,10 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-LABEL: fmul_select_bf16_test1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3f80 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x4000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3f80 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -2794,11 +2793,11 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10-LABEL: fmul_select_bf16_test1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x4000 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x3f80 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -2910,11 +2909,10 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-LABEL: fmul_select_bf16_test2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3f80 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3f00 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3f80 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x3f00 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -2929,11 +2927,11 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10-LABEL: fmul_select_bf16_test2: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, 0x3f00 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x3f00 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x3f80 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -3051,25 +3049,24 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX9-LABEL: fmul_select_v2bf16_test3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, 0x3f80 -; GFX9-NEXT: v_mov_b32_e32 v6, 0x4000 +; GFX9-NEXT: v_cmp_eq_u32_e64 
s[4:5], v1, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3f80 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_mov_b64 vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -3081,24 +3078,24 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX10-LABEL: fmul_select_v2bf16_test3: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, 0x4000 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; 
GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo -; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX10-NEXT: v_mov_b32_e32 v4, 0x3f80 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_cndmask_b32_sdwa v3, v4, v2, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo @@ -3261,25 +3258,24 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX9-LABEL: fmul_select_v2bf16_test4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, 0x3f80 -; GFX9-NEXT: v_mov_b32_e32 v6, 0x3f00 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3f80 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x3f00 +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_mov_b64 vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc ; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 @@ -3291,24 +3287,24 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX10-LABEL: fmul_select_v2bf16_test4: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, 0x3f00 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo -; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX10-NEXT: v_mul_f32_e32 
v0, v0, v2 -; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x3f00 +; GFX10-NEXT: v_mov_b32_e32 v4, 0x3f80 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_cndmask_b32_sdwa v3, v4, v2, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v2, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo @@ -3466,11 +3462,10 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-LABEL: fmul_select_bf16_test5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4100 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x4000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4100 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -3485,11 +3480,11 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) 
; GFX10-LABEL: fmul_select_bf16_test5: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, 0x4000 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x4000 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x4100 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -3603,11 +3598,10 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-LABEL: fmul_select_bf16_test6: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4040 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffc100 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4040 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffffc100 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -3622,11 +3616,11 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10-LABEL: fmul_select_bf16_test6: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffc100 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffc100 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x4040 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -3739,11 +3733,10 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-LABEL: fmul_select_bf16_test7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffc080 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x4100 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffc080 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4100 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -3758,11 +3751,11 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10-LABEL: fmul_select_bf16_test7: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, 0x4100 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x4100 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffc080 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -3875,10 +3868,10 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-LABEL: fmul_select_bf16_test8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff8000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, 
v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -4004,11 +3997,10 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX9-LABEL: fmul_select_bf16_test9: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffc200 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffc180 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffc200 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffffc180 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -4023,11 +4015,11 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX10-LABEL: fmul_select_bf16_test9: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffc180 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffc180 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffc200 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -4141,11 +4133,10 @@ define bfloat 
@fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b ; GFX9-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffdb80 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffe000 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffdb80 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffffe000 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -4160,11 +4151,11 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b ; GFX10-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffe000 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffe000 +; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffdb80 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -4278,11 +4269,10 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b ; GFX9-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4c00 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3480 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; 
GFX9-NEXT: v_mov_b32_e32 v1, 0x4c00 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x3480 +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -4297,11 +4287,11 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b ; GFX10-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v3, 0x3480 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x3480 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x4c00 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index 8aab9ec885f3c..35fe6ebaf1b12 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -611,28 +611,24 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: v_cndmask_b32_sdwa v0, v1, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 3 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 4 -; 
VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 5 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 6 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 7 -; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: flat_store_short v[5:6], v0 ; VI-NEXT: s_endpgm ; @@ -864,31 +860,27 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 9 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 10 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 11 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 12 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 13 -; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; 
VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 14 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s4, 15 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v8, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: flat_store_short v[9:10], v0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll index db6ec2b32ad63..12b26cb6d8c19 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -1,18 +1,24 @@ -; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s -; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5 +; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s -; GCN-LABEL: {{^}}extract_vector_elt_v2i16: -; GCN: s_load_dword [[VEC:s[0-9]+]] -; SIVI: s_lshr_b32 
[[ELT1:s[0-9]+]], [[VEC]], 16 -; SIVI-DAG: v_mov_b32_e32 [[VELT0:v[0-9]+]], [[VEC]] -; SIVI-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] -; SIVI-DAG: buffer_store_short [[VELT0]] -; SIVI-DAG: buffer_store_short [[VELT1]] -; GFX9: v_mov_b32_e32 [[VVEC:v[0-9]+]], [[VEC]] -; GFX9: global_store_short_d16_hi v{{[0-9]+}}, [[VVEC]], -; GFX9: buffer_store_short [[VVEC]], define amdgpu_kernel void @extract_vector_elt_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { +; +; GCN-LABEL: extract_vector_elt_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s4, s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s5, s4, 16 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_short v1, off, s[0:3], 0 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:20 +; GCN-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %p0 = extractelement <2 x i16> %vec, i32 0 %p1 = extractelement <2 x i16> %vec, i32 1 @@ -22,33 +28,65 @@ define amdgpu_kernel void @extract_vector_elt_v2i16(ptr addrspace(1) %out, ptr a ret void } -; GCN-LABEL: {{^}}extract_vector_elt_v2i16_dynamic_sgpr: -; GCN: s_load_dword [[IDX:s[0-9]+]] -; GCN: s_load_dword [[VEC:s[0-9]+]] -; GCN: s_lshl_b32 [[IDX_SCALED:s[0-9]+]], [[IDX]], 4 -; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], [[IDX_SCALED]] -; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] -; GCN: buffer_store_short [[VELT1]] -; GCN: ScratchSize: 0 define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %idx) #0 { +; GCN-LABEL: extract_vector_elt_v2i16_dynamic_sgpr: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s4, s[4:5], 0x15 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 
s3, 0xf000 +; GCN-NEXT: s_lshl_b32 s4, s4, 4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s4, s2, s4 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; GFX89-LABEL: extract_vector_elt_v2i16_dynamic_sgpr: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_load_dword s8, s[4:5], 0x54 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_lshl_b32 s0, s8, 4 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_lshr_b32 s0, s2, s0 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %elt = extractelement <2 x i16> %vec, i32 %idx store i16 %elt, ptr addrspace(1) %out, align 2 ret void } -; GCN-LABEL: {{^}}extract_vector_elt_v2i16_dynamic_vgpr: -; GCN-DAG: {{flat|buffer|global}}_load_dword [[IDX:v[0-9]+]] -; GCN-DAG: v_lshlrev_b32_e32 [[IDX_SCALED:v[0-9]+]], 4, [[IDX]] -; GCN-DAG: s_load_dword [[VEC:s[0-9]+]] - -; SI: v_lshr_b32_e32 [[ELT:v[0-9]+]], [[VEC]], [[IDX_SCALED]] -; VI: v_lshrrev_b32_e64 [[ELT:v[0-9]+]], [[IDX_SCALED]], [[VEC]] - -; SI: buffer_store_short [[ELT]] -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]] -; GCN: ScratchSize: 0{{$}} define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(1) %idx.ptr) #0 { +; +; GCN-LABEL: extract_vector_elt_v2i16_dynamic_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: 
buffer_load_dword v3, v[1:2], s[4:7], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_lshr_b32_e32 v0, s2, v0 +; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext @@ -60,14 +98,39 @@ define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(ptr addrspace(1 ret void } -; GCN-LABEL: {{^}}extract_vector_elt_v3i16: -; GCN: s_load_dwordx4 - -; GCN-NOT: {{buffer|flat|global}}_load - -; GCN: buffer_store_short -; GCN: buffer_store_short define amdgpu_kernel void @extract_vector_elt_v3i16(ptr addrspace(1) %out, <3 x i16> %foo) #0 { +; +; +; +; +; GCN-LABEL: extract_vector_elt_v3i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2 +; GCN-NEXT: s_endpgm +; +; GFX89-LABEL: extract_vector_elt_v3i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: v_mov_b32_e32 v0, s3 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: v_mov_b32_e32 v0, s2 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2 +; GFX89-NEXT: s_endpgm %p0 = extractelement <3 x i16> %foo, i32 0 %p1 = extractelement <3 x 
i16> %foo, i32 2 %out1 = getelementptr i16, ptr addrspace(1) %out, i32 1 @@ -76,17 +139,40 @@ define amdgpu_kernel void @extract_vector_elt_v3i16(ptr addrspace(1) %out, <3 x ret void } -; GCN-LABEL: {{^}}extract_vector_elt_v4i16: -; SI: s_load_dwordx4 -; SI: buffer_store_short -; SI: buffer_store_short - -; GFX89-DAG: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[4:5], 0x24 -; GFX89-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], s[[#LOAD + 2]] -; GFX89-DAG: buffer_store_short [[VLOAD0]], off -; GFX89-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], s[[#LOAD + 3]] -; GFX89-DAG: buffer_store_short [[VLOAD1]], off define amdgpu_kernel void @extract_vector_elt_v4i16(ptr addrspace(1) %out, <4 x i16> %foo) #0 { +; +; +; GCN-LABEL: extract_vector_elt_v4i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm +; +; GFX89-LABEL: extract_vector_elt_v4i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: v_mov_b32_e32 v0, s3 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_mov_b32_e32 v0, s2 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:20 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm %p0 = extractelement <4 x i16> %foo, i32 0 %p1 = extractelement <4 x i16> %foo, i32 2 %out1 = getelementptr i16, ptr addrspace(1) %out, i32 10 @@ -95,36 +181,64 @@ define amdgpu_kernel void 
@extract_vector_elt_v4i16(ptr addrspace(1) %out, <4 x ret void } -; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16: -; SI: s_load_dword s -; SI: s_load_dwordx2 s -; SI: s_load_dwordx2 s - -; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[4:5], 0x24 -; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[4:5], 0x4c -; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x54 - -; GCN-NOT: {{buffer|flat|global}} - -; SICI: buffer_store_short -; SICI: buffer_store_short -; SICI: buffer_store_short - -; GFX9-NOT: s_pack_ll_b32_b16 -; GFX9-NOT: s_pack_lh_b32_b16 - -; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4 -; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s -; GCN: {{buffer|global}}_store_short define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(ptr addrspace(1) %out, [8 x i32], <3 x i16> %foo, i32 %idx) #0 { +; +; +; GCN-LABEL: dynamic_extract_vector_elt_v3i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x15 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s2, s2, 4 +; GCN-NEXT: s_lshr_b64 s[4:5], s[6:7], s2 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; GFX89-LABEL: dynamic_extract_vector_elt_v3i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dword s8, s[4:5], 0x54 +; GFX89-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_lshl_b32 s4, s8, 4 +; GFX89-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; GFX89-NEXT: v_mov_b32_e32 v0, s4 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX89-NEXT: s_endpgm %p0 = extractelement <3 x i16> %foo, i32 %idx %out1 = getelementptr i16, ptr addrspace(1) %out, i32 1 store i16 
%p0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}v_insertelement_v4i16_dynamic_sgpr: define amdgpu_kernel void @v_insertelement_v4i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) #0 { +; +; +; +; GCN-LABEL: v_insertelement_v4i16_dynamic_sgpr: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GCN-NEXT: s_lshl_b32 s4, s8, 4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshr_b64 v[3:4], v[3:4], s4 +; GCN-NEXT: buffer_store_short v3, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -135,13 +249,44 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_sgpr(ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}reduce_load_vector_v8i16_extract_01: -; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]], -; GCN-NOT: {{s|buffer|flat|global}}_load_ -; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0 -; GCN-NOT: {{s|buffer|flat|global}}_load_ -; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 define amdgpu_kernel void @reduce_load_vector_v8i16_extract_01(ptr addrspace(4) %ptr) #0 { +; +; +; +; +; GCN-LABEL: reduce_load_vector_v8i16_extract_01: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s1, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], 
0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm +; +; GFX89-LABEL: reduce_load_vector_v8i16_extract_01: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_lshr_b32 s1, s0, 16 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_mov_b32_e32 v0, s1 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(4) %ptr %elt0 = extractelement <16 x i16> %load, i32 0 %elt1 = extractelement <16 x i16> %load, i32 1 @@ -150,13 +295,44 @@ define amdgpu_kernel void @reduce_load_vector_v8i16_extract_01(ptr addrspace(4) ret void } -; GCN-LABEL: {{^}}reduce_load_vector_v8i16_extract_23: -; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]], -; GCN-NOT: {{s|buffer|flat|global}}_load_ -; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], {{0x1|0x4}} -; GCN-NOT: {{s|buffer|flat|global}}_load_ -; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 define amdgpu_kernel void @reduce_load_vector_v8i16_extract_23(ptr addrspace(4) %ptr) #0 { +; +; +; +; +; GCN-LABEL: reduce_load_vector_v8i16_extract_23: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[0:1], 0x1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s1, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm +; +; GFX89-LABEL: reduce_load_vector_v8i16_extract_23: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_load_dword s0, s[0:1], 0x4 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_lshr_b32 s1, s0, 16 +; GFX89-NEXT: v_mov_b32_e32 v0, s0 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_mov_b32_e32 v0, s1 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_endpgm %load = load <16 x i16>, ptr addrspace(4) %ptr %elt2 = extractelement <16 x i16> %load, i32 2 %elt3 = extractelement <16 x i16> %load, i32 3 @@ -165,14 +341,26 @@ define amdgpu_kernel void @reduce_load_vector_v8i16_extract_23(ptr addrspace(4) ret void } -; GCN-LABEL: {{^}}v_extractelement_v8i16_2: -; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:4 -; SI: buffer_store_short [[RES]] -; VI: flat_load_dword [[RES:v[0-9]+]] -; VI: flat_store_short v[{{[0-9:]+}}], [[RES]] -; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:4 -; GFX9: global_store_short v{{[0-9]+}}, [[RES]] define amdgpu_kernel void @v_extractelement_v8i16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; +; +; +; GCN-LABEL: v_extractelement_v8i16_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v3, v[1:2], 
s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -183,14 +371,26 @@ define amdgpu_kernel void @v_extractelement_v8i16_2(ptr addrspace(1) %out, ptr a ret void } -; GCN-LABEL: {{^}}v_extractelement_v8i16_6: -; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:12 -; SI: buffer_store_short [[RES]] -; VI: flat_load_dword [[RES:v[0-9]+]] -; VI: flat_store_short v[{{[0-9:]+}}], [[RES]] -; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:12 -; GFX9: global_store_short v{{[0-9]+}}, [[RES]] define amdgpu_kernel void @v_extractelement_v8i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; +; +; +; GCN-LABEL: v_extractelement_v8i16_6: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:12 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -201,9 +401,52 @@ define amdgpu_kernel void @v_extractelement_v8i16_6(ptr addrspace(1) %out, ptr a ret void } -; GCN-LABEL: {{^}}v_extractelement_v8i16_dynamic_sgpr: -; GCN-COUNT-7: v_cndmask_b32_e32 define amdgpu_kernel void @v_extractelement_v8i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { +; +; +; +; GCN-LABEL: v_extractelement_v8i16_dynamic_sgpr: +; GCN: ; %bb.0: +; 
GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; GCN-NEXT: v_mov_b32_e32 v7, v5 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_cmp_eq_u32 s8, 1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 2 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 4 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 5 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 7 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GCN-NEXT: buffer_store_short v0, v[6:7], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -214,14 +457,26 @@ define amdgpu_kernel void @v_extractelement_v8i16_dynamic_sgpr(ptr addrspace(1) ret void } -; GCN-LABEL: {{^}}v_extractelement_v16i16_2: -; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:4 -; SI: 
buffer_store_short [[RES]] -; VI: flat_load_dword [[RES:v[0-9]+]] -; VI: flat_store_short v[{{[0-9:]+}}], [[RES]] -; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:4 -; GFX9: global_store_short v{{[0-9]+}}, [[RES]] define amdgpu_kernel void @v_extractelement_v16i16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; +; +; +; GCN-LABEL: v_extractelement_v16i16_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 5, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -232,14 +487,26 @@ define amdgpu_kernel void @v_extractelement_v16i16_2(ptr addrspace(1) %out, ptr ret void } -; GCN-LABEL: {{^}}v_extractelement_v16i16_6: -; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:12 -; SI: buffer_store_short [[RES]] -; VI: flat_load_dword [[RES:v[0-9]+]] -; VI: flat_store_short v[{{[0-9:]+}}], [[RES]] -; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:12 -; GFX9: global_store_short v{{[0-9]+}}, [[RES]] define amdgpu_kernel void @v_extractelement_v16i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; +; +; +; GCN-LABEL: v_extractelement_v16i16_6: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 5, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; 
GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:12 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -250,9 +517,82 @@ define amdgpu_kernel void @v_extractelement_v16i16_6(ptr addrspace(1) %out, ptr ret void } -; GCN-LABEL: {{^}}v_extractelement_v16i16_dynamic_sgpr: -; GCN-COUNT-15: v_cndmask_b32_e32 define amdgpu_kernel void @v_extractelement_v16i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { +; +; +; +; GCN-LABEL: v_extractelement_v16i16_dynamic_sgpr: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; GCN-NEXT: v_mov_b32_e32 v6, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[5:6], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 1, v0 +; GCN-NEXT: v_mov_b32_e32 v10, v6 +; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: buffer_load_dwordx4 v[5:8], v[5:6], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: s_cmp_eq_u32 s8, 1 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 2 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, 
v8 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 4 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 5 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 7 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 8 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 9 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 10 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 11 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 12 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 13 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 14 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, 15 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; GCN-NEXT: buffer_store_short v0, v[9:10], s[0:3], 0 addr64 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll 
b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll index 09279f6f0768c..67a9c12dca94a 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -116,9 +116,8 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -223,13 +222,12 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-NNAN-LABEL: test_fmax_legacy_ugt_v3f16: diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 1b8a79ee982d1..567202be69fa6 100644 --- 
a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -2083,19 +2083,17 @@ define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v2, v1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v2, v5 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2131,19 +2129,17 @@ define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX942-NEXT: v_cmp_o_f16_sdwa 
vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2178,25 +2174,22 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 ; GFX942-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 -; GFX942-NEXT: v_pk_max_f16 v3, v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v3, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc -; GFX942-NEXT: v_perm_b32 v1, v4, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_perm_b32 v1, v6, v0, s0 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, |v2| src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v3, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc +; GFX942-NEXT: v_cndmask_b32_e32 
v0, v4, v1, vcc ; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2238,19 +2231,17 @@ define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2283,26 +2274,23 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { ; GFX942-LABEL: v_fmaximum3_v2f16__inlineimm1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cndmask_b32_sdwa v4, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: 
s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX942-NEXT: v_perm_b32 v2, v3, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v4, v0, s0 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v1 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v4, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v4, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v2f16__inlineimm1: @@ -2337,19 +2325,17 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) { ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v4, s0 ; GFX942-NEXT: v_pk_max_f16 v1, v1, 4.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX942-NEXT: 
v_cndmask_b32_sdwa v0, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v2, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2408,10 +2394,9 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2471,10 +2456,9 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2617,10 +2601,9 @@ define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 
s[30:31] ; @@ -2656,35 +2639,33 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX942-LABEL: v_fmaximum3_v3f16__inlineimm1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0 -; GFX942-NEXT: s_mov_b32 s1, 0x5040100 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX942-NEXT: s_mov_b32 s1, 0x5040100 ; GFX942-NEXT: s_movk_i32 s0, 0x7e00 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX942-NEXT: v_perm_b32 v4, v5, v0, s1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_perm_b32 v4, v6, v0, s1 ; GFX942-NEXT: v_pk_max_f16 v4, v4, v2 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v7, vcc ; GFX942-NEXT: v_pack_b32_f16 v7, v1, s0 ; GFX942-NEXT: v_pk_max_f16 v7, v7, v3 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 
v0, v6, v4, vcc -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_perm_b32 v0, v6, v0, s1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v3f16__inlineimm1: @@ -2743,10 +2724,9 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2785,37 +2765,33 @@ define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v5, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, 
vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v4, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 ; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2854,37 +2830,33 @@ define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: 
v_pk_max_f16 v2, v2, v5 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 ; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2947,17 +2919,14 @@ define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc ; GFX942-NEXT: v_perm_b32 v2, v8, v1, s0 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v11 ; GFX942-NEXT: v_perm_b32 v6, v9, v0, s0 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_sdwa v3, v12, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v6, v6, v10 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD 
src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc @@ -3012,37 +2981,33 @@ define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] ; 
GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 ; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -3078,42 +3043,40 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942-LABEL: v_fmaximum3_v4f16__inlineimm1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX942-NEXT: v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v7 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v5, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; 
GFX942-NEXT: v_perm_b32 v4, v8, v1, s0 -; GFX942-NEXT: v_pk_max_f16 v4, v4, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v8, v5, v0, s0 -; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX942-NEXT: v_pk_max_f16 v8, v8, v2 -; GFX942-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc +; GFX942-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_sdwa v7, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v6, v6, v0, s0 +; GFX942-NEXT: v_pk_max_f16 v6, v6, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX942-NEXT: v_perm_b32 v1, v7, v1, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; GFX942-NEXT: v_perm_b32 v0, v8, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm1: @@ -3151,37 +3114,33 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc 
+; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v4, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0 ; GFX942-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 ; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v4, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -4005,19 +3964,17 @@ define <4 x half> @v_no_fmaximum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: 
s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v1, v5, s0 ; GFX942-NEXT: v_pk_max_f16 v3, v0, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v1, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll index 049c6799da079..fd809c6103d2c 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -117,9 +117,8 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 { ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -224,13 +223,12 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 { ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; 
VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; VI-NNAN-LABEL: test_fmin_legacy_ule_v3f16: diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index 96e9aa375f5ee..81b8e8ebd10e3 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -2083,19 +2083,17 @@ define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v2, v1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v2, v5 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ 
-2131,19 +2129,17 @@ define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2178,25 +2174,22 @@ define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 ; GFX942-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 -; GFX942-NEXT: v_pk_min_f16 v3, v3, v4 -; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v3, v3, v4 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, 
|v1| -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc -; GFX942-NEXT: v_perm_b32 v1, v4, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_perm_b32 v1, v6, v0, s0 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, |v2| src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v3, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc ; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2238,19 +2231,17 @@ define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v5, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; 
GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2283,26 +2274,23 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { ; GFX942-LABEL: v_fminimum3_v2f16__inlineimm1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_min_f16 v2, v0, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v2, v0, 2.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX942-NEXT: v_cndmask_b32_sdwa v4, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX942-NEXT: v_perm_b32 v2, v3, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_perm_b32 v2, v4, v0, s0 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v1 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v1 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v4, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX942-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_perm_b32 v0, v4, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v2f16__inlineimm1: @@ -2337,19 +2325,17 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) { ; GFX942-NEXT: s_mov_b32 
s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v0, v4, s0 ; GFX942-NEXT: v_pk_min_f16 v1, v1, 4.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v2, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2408,10 +2394,9 @@ define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2471,10 +2456,9 @@ define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; 
GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2617,10 +2601,9 @@ define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2656,35 +2639,33 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX942-LABEL: v_fminimum3_v3f16__inlineimm1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v7, v1, 2.0 -; GFX942-NEXT: s_mov_b32 s1, 0x5040100 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX942-NEXT: s_mov_b32 s1, 0x5040100 ; GFX942-NEXT: s_movk_i32 s0, 0x7e00 -; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX942-NEXT: v_perm_b32 v4, v5, v0, s1 +; 
GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_perm_b32 v4, v6, v0, s1 ; GFX942-NEXT: v_pk_min_f16 v4, v4, v2 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v7, vcc ; GFX942-NEXT: v_pack_b32_f16 v7, v1, s0 ; GFX942-NEXT: v_pk_min_f16 v7, v7, v3 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s1 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_perm_b32 v0, v6, v0, s1 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v3f16__inlineimm1: @@ -2743,10 +2724,9 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v3, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2785,37 +2765,33 @@ define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 ; GFX942-NEXT: 
s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v5, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v4, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 ; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2854,37 +2830,33 @@ define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; 
GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v5 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 ; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -2947,17 +2919,14 @@ define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: s_nop 1 ; 
GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc ; GFX942-NEXT: v_perm_b32 v2, v8, v1, s0 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v11 ; GFX942-NEXT: v_perm_b32 v6, v9, v0, s0 -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cndmask_b32_sdwa v3, v12, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v6, v6, v10 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v7, v12, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc @@ -3012,37 +2981,33 @@ define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: 
v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v6, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v0, v8, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] ; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 ; GFX942-NEXT: v_perm_b32 v1, v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -3078,42 +3043,40 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX942-LABEL: v_fminimum3_v4f16__inlineimm1: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX942-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] +; GFX942-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX942-NEXT: v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0] -; GFX942-NEXT: s_mov_b32 s0, 0x5040100 -; GFX942-NEXT: 
v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v7 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc +; GFX942-NEXT: s_mov_b32 s0, 0x5040100 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e32 v8, v5, v8, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX942-NEXT: v_perm_b32 v4, v8, v1, s0 -; GFX942-NEXT: v_pk_min_f16 v4, v4, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: v_perm_b32 v8, v5, v0, s0 -; GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX942-NEXT: v_pk_min_f16 v8, v8, v2 -; GFX942-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc +; GFX942-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_sdwa v7, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX942-NEXT: v_perm_b32 v6, v6, v0, s0 +; GFX942-NEXT: v_pk_min_f16 v6, v6, v2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_sdwa v8, v5, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX942-NEXT: v_cmp_o_f16_e32 
vcc, v0, v2 ; GFX942-NEXT: v_perm_b32 v1, v7, v1, s0 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s0 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; GFX942-NEXT: v_perm_b32 v0, v8, v0, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-LABEL: v_fminimum3_v4f16__inlineimm1: @@ -3151,37 +3114,33 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: v_pk_min_f16 v2, v1, v3 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v1, v4, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v2, v0, v6, s0 ; GFX942-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 ; GFX942-NEXT: v_perm_b32 v1, 
v1, v3, s0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v0, v4, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -4005,19 +3964,17 @@ define <4 x half> @v_no_fminimum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, ; GFX942-NEXT: s_mov_b32 s0, 0x5040100 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v0, v1, v5, s0 ; GFX942-NEXT: v_pk_min_f16 v3, v0, v2 ; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX942-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX942-NEXT: v_perm_b32 v1, v1, v5, s0 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index ff894d184e6c4..924378eb2376d 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -204,14 +204,12 @@ define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i1 ; GFX9-LABEL: fneg_xor_select_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 @@ -746,25 +744,23 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v3 +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v4, v1, v0, s4 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v3, v0, s6 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX9-NEXT: s_mov_b64 vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s6 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: select_fneg_select_v2f16: @@ -854,25 +850,23 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v3 +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v4, v1, v0, s4 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v3, v0, s6 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX9-NEXT: s_mov_b64 vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; 
GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s6 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: select_fneg_xor_select_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 4b9da7b49e997..6925a98f643b9 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -345,11 +345,10 @@ define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec, ; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_cmp_lg_u32 s6, 6 -; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN-NEXT: v_cndmask_b32_sdwa v1, v0, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 ; GCN-NEXT: s_lshr_b32 s3, s2, 16 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GCN-NEXT: s_cmp_lg_u32 s6, 5 ; GCN-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index b51cb9df8d784..47a371d8de07c 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -1145,106 +1145,104 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s5, 1 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; VI-NEXT: v_cndmask_b32_e32 v8, v0, v6, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc ; VI-NEXT: 
v_lshlrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; ; GFX900-LABEL: v_insertelement_v8bf16_dynamic: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX900-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x10 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX900-NEXT: s_mov_b32 s14, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] -; GFX900-NEXT: s_cmp_eq_u32 s5, 6 -; GFX900-NEXT: v_mov_b32_e32 v5, s4 +; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[18:19] +; GFX900-NEXT: s_cmp_eq_u32 s13, 6 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 7 -; GFX900-NEXT: s_mov_b32 s2, 0x5040100 +; GFX900-NEXT: s_cmp_eq_u32 s13, 7 +; GFX900-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s13, 4 +; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s13, 5 +; GFX900-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s13, 2 +; GFX900-NEXT: v_mov_b32_e32 v5, s12 +; GFX900-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s13, 3 +; GFX900-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s13, 0 +; GFX900-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s13, 1 +; GFX900-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc -; GFX900-NEXT: 
v_lshrrev_b32_e32 v3, 16, v3 -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 4 -; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 5 -; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 2 -; GFX900-NEXT: v_perm_b32 v3, v3, v6, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 3 -; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 0 -; GFX900-NEXT: v_perm_b32 v2, v6, v2, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 1 -; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX900-NEXT: v_perm_b32 v1, v6, v1, s2 -; GFX900-NEXT: v_perm_b32 v0, v5, v0, s2 -; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX900-NEXT: s_mov_b64 vcc, s[0:1] +; GFX900-NEXT: v_cndmask_b32_sdwa v3, v3, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: s_mov_b64 vcc, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v2, v5, s[2:3] +; GFX900-NEXT: v_cndmask_b32_sdwa v2, v2, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: s_mov_b64 vcc, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v8, v1, v5, s[6:7] +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: s_mov_b64 vcc, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v0, v5, s[10:11] +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: v_perm_b32 v3, v3, v6, s14 +; GFX900-NEXT: v_perm_b32 v2, v2, v7, s14 +; GFX900-NEXT: v_perm_b32 v1, v1, v8, s14 +; GFX900-NEXT: v_perm_b32 v0, v0, v9, s14 +; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX900-NEXT: s_endpgm ; ; GFX942-LABEL: v_insertelement_v8bf16_dynamic: ; GFX942: ; %bb.0: -; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX942-NEXT: s_mov_b32 s14, 0x5040100 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] -; GFX942-NEXT: s_cmp_eq_u32 s7, 6 -; GFX942-NEXT: v_mov_b32_e32 v5, s6 +; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[18:19] +; GFX942-NEXT: s_cmp_eq_u32 s13, 6 ; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 7 -; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: s_cmp_eq_u32 s13, 7 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s13, 4 +; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s13, 5 +; GFX942-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s13, 2 +; GFX942-NEXT: v_mov_b32_e32 v5, s12 +; GFX942-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s13, 3 +; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s13, 0 +; GFX942-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s13, 1 +; GFX942-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 4 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 5 -; 
GFX942-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX942-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 2 -; GFX942-NEXT: v_perm_b32 v3, v3, v6, s2 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 3 -; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 0 -; GFX942-NEXT: v_perm_b32 v2, v6, v2, s2 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 1 -; GFX942-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX942-NEXT: v_perm_b32 v1, v6, v1, s2 -; GFX942-NEXT: v_perm_b32 v0, v5, v0, s2 -; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX942-NEXT: s_mov_b64 vcc, s[0:1] +; GFX942-NEXT: v_cndmask_b32_sdwa v3, v3, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_mov_b64 vcc, s[4:5] +; GFX942-NEXT: v_cndmask_b32_e64 v7, v2, v5, s[2:3] +; GFX942-NEXT: v_cndmask_b32_sdwa v2, v2, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_mov_b64 vcc, s[8:9] +; GFX942-NEXT: v_cndmask_b32_e64 v8, v1, v5, s[6:7] +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_mov_b64 vcc, s[12:13] +; GFX942-NEXT: v_cndmask_b32_e64 v9, v0, v5, s[10:11] +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_perm_b32 v3, v3, v6, s14 +; GFX942-NEXT: v_perm_b32 v2, v2, v7, s14 +; GFX942-NEXT: v_perm_b32 v1, v1, v8, s14 +; GFX942-NEXT: v_perm_b32 v0, v0, v9, s14 +; GFX942-NEXT: global_store_dwordx4 
v4, v[0:3], s[16:17] ; GFX942-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1559,163 +1557,163 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; ; GFX900-LABEL: v_insertelement_v16bf16_dynamic: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX900-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 +; GFX900-NEXT: s_load_dwordx2 s[28:29], s[8:9], 0x10 +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX900-NEXT: s_mov_b32 s30, 0x5040100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] -; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:16 -; GFX900-NEXT: s_cmp_eq_u32 s5, 6 -; GFX900-NEXT: v_mov_b32_e32 v9, s4 +; GFX900-NEXT: global_load_dwordx4 v[0:3], v8, s[38:39] +; GFX900-NEXT: global_load_dwordx4 v[4:7], v8, s[38:39] offset:16 +; GFX900-NEXT: s_cmp_eq_u32 s29, 6 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 7 -; GFX900-NEXT: s_mov_b32 s2, 0x5040100 +; GFX900-NEXT: s_cmp_eq_u32 s29, 7 +; GFX900-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 4 +; GFX900-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 5 +; GFX900-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 2 +; GFX900-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 3 +; GFX900-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 0 +; GFX900-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 1 +; GFX900-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 14 +; GFX900-NEXT: v_mov_b32_e32 v9, s28 +; GFX900-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 15 +; GFX900-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 12 +; GFX900-NEXT: s_cselect_b64 s[18:19], -1, 0 +; 
GFX900-NEXT: s_cmp_eq_u32 s29, 13 +; GFX900-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 10 +; GFX900-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 11 +; GFX900-NEXT: s_cselect_b64 s[24:25], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 8 +; GFX900-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GFX900-NEXT: s_cmp_eq_u32 s29, 9 +; GFX900-NEXT: s_cselect_b64 s[28:29], -1, 0 ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_cndmask_b32_e32 v10, v4, v9, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 4 -; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 5 -; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 2 -; GFX900-NEXT: v_perm_b32 v4, v4, v10, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 3 -; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 0 -; GFX900-NEXT: v_cndmask_b32_e32 v11, v12, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 1 -; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 14 -; GFX900-NEXT: v_cndmask_b32_e32 v12, v13, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 15 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc +; GFX900-NEXT: s_mov_b64 vcc, s[0:1] +; GFX900-NEXT: v_cndmask_b32_sdwa v3, v3, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: s_mov_b64 vcc, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v2, v9, s[2:3] +; 
GFX900-NEXT: v_cndmask_b32_sdwa v2, v2, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: s_mov_b64 vcc, s[8:9] +; GFX900-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[6:7] +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v1, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: s_mov_b64 vcc, s[12:13] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v0, v9, s[10:11] +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v0, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: s_mov_b64 vcc, s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 12 -; GFX900-NEXT: v_perm_b32 v1, v12, v1, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v12, v14, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 13 -; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 10 -; GFX900-NEXT: v_perm_b32 v8, v12, v8, s2 -; GFX900-NEXT: v_cndmask_b32_e32 v12, v15, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 11 -; GFX900-NEXT: v_perm_b32 v3, v10, v3, s2 -; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 8 -; GFX900-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: s_cmp_eq_u32 s5, 9 -; GFX900-NEXT: v_perm_b32 v2, v11, v2, s2 -; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX900-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX900-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc -; GFX900-NEXT: v_perm_b32 v7, v12, v7, s2 -; GFX900-NEXT: v_perm_b32 v6, v10, v6, s2 -; GFX900-NEXT: v_perm_b32 v5, v9, 
v5, s2 -; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 -; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] +; GFX900-NEXT: v_cndmask_b32_e64 v14, v7, v9, s[14:15] +; GFX900-NEXT: v_cndmask_b32_sdwa v7, v7, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: s_mov_b64 vcc, s[20:21] +; GFX900-NEXT: v_cndmask_b32_e64 v15, v6, v9, s[18:19] +; GFX900-NEXT: v_cndmask_b32_sdwa v6, v6, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: s_mov_b64 vcc, s[24:25] +; GFX900-NEXT: v_perm_b32 v3, v3, v10, s30 +; GFX900-NEXT: v_cndmask_b32_e64 v10, v5, v9, s[22:23] +; GFX900-NEXT: v_cndmask_b32_sdwa v5, v5, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: s_mov_b64 vcc, s[28:29] +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s30 +; GFX900-NEXT: v_cndmask_b32_e64 v11, v4, v9, s[26:27] +; GFX900-NEXT: v_cndmask_b32_sdwa v4, v4, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX900-NEXT: v_perm_b32 v7, v7, v14, s30 +; GFX900-NEXT: v_perm_b32 v6, v6, v15, s30 +; GFX900-NEXT: v_perm_b32 v5, v5, v10, s30 +; GFX900-NEXT: v_perm_b32 v4, v4, v11, s30 +; GFX900-NEXT: v_perm_b32 v1, v1, v12, s30 +; GFX900-NEXT: v_perm_b32 v0, v0, v13, s30 +; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[36:37] offset:16 +; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[36:37] ; GFX900-NEXT: s_endpgm ; ; GFX942-LABEL: v_insertelement_v16bf16_dynamic: ; GFX942: ; %bb.0: -; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[28:29], s[4:5], 0x10 ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX942-NEXT: s_mov_b32 s30, 0x5040100 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] -; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] 
offset:16 -; GFX942-NEXT: s_cmp_eq_u32 s7, 6 -; GFX942-NEXT: v_mov_b32_e32 v9, s6 +; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[38:39] +; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[38:39] offset:16 +; GFX942-NEXT: s_cmp_eq_u32 s29, 6 ; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 7 -; GFX942-NEXT: s_mov_b32 s2, 0x5040100 +; GFX942-NEXT: s_cmp_eq_u32 s29, 7 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 4 +; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 5 +; GFX942-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 2 +; GFX942-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 3 +; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 0 +; GFX942-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 1 +; GFX942-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 14 +; GFX942-NEXT: v_mov_b32_e32 v9, s28 +; GFX942-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 15 +; GFX942-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 12 +; GFX942-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 13 +; GFX942-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 10 +; GFX942-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 11 +; GFX942-NEXT: s_cselect_b64 s[24:25], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 8 +; GFX942-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GFX942-NEXT: s_cmp_eq_u32 s29, 9 +; GFX942-NEXT: s_cselect_b64 s[28:29], -1, 0 ; GFX942-NEXT: s_waitcnt vmcnt(1) ; GFX942-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc -; GFX942-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 4 -; GFX942-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 5 -; GFX942-NEXT: v_lshrrev_b32_e32 v11, 16, v2 
-; GFX942-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 2 -; GFX942-NEXT: v_perm_b32 v3, v3, v10, s2 -; GFX942-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 3 -; GFX942-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; GFX942-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 0 -; GFX942-NEXT: v_perm_b32 v2, v10, v2, s2 -; GFX942-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 1 -; GFX942-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; GFX942-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 14 -; GFX942-NEXT: v_perm_b32 v1, v10, v1, s2 -; GFX942-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 15 +; GFX942-NEXT: s_mov_b64 vcc, s[0:1] +; GFX942-NEXT: v_cndmask_b32_sdwa v3, v3, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_mov_b64 vcc, s[4:5] +; GFX942-NEXT: v_cndmask_b32_e64 v11, v2, v9, s[2:3] +; GFX942-NEXT: v_cndmask_b32_sdwa v2, v2, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_mov_b64 vcc, s[8:9] +; GFX942-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[6:7] +; GFX942-NEXT: v_cndmask_b32_sdwa v1, v1, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_mov_b64 vcc, s[12:13] +; GFX942-NEXT: v_cndmask_b32_e64 v13, v0, v9, s[10:11] +; GFX942-NEXT: v_cndmask_b32_sdwa v0, v0, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_mov_b64 vcc, s[16:17] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX942-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; 
GFX942-NEXT: s_cmp_eq_u32 s7, 12 -; GFX942-NEXT: v_perm_b32 v0, v10, v0, s2 -; GFX942-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 13 -; GFX942-NEXT: v_lshrrev_b32_e32 v15, 16, v6 -; GFX942-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 10 -; GFX942-NEXT: v_perm_b32 v7, v10, v7, s2 -; GFX942-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 11 -; GFX942-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 8 -; GFX942-NEXT: v_perm_b32 v6, v10, v6, s2 -; GFX942-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: s_cmp_eq_u32 s7, 9 -; GFX942-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc -; GFX942-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX942-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; GFX942-NEXT: v_perm_b32 v5, v10, v5, s2 -; GFX942-NEXT: v_perm_b32 v4, v9, v4, s2 -; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 -; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX942-NEXT: v_cndmask_b32_e64 v14, v7, v9, s[14:15] +; GFX942-NEXT: v_cndmask_b32_sdwa v7, v7, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_mov_b64 vcc, s[20:21] +; GFX942-NEXT: v_cndmask_b32_e64 v15, v6, v9, s[18:19] +; GFX942-NEXT: v_cndmask_b32_sdwa v6, v6, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_mov_b64 vcc, s[24:25] +; GFX942-NEXT: v_cndmask_b32_e64 v16, v5, v9, s[22:23] +; GFX942-NEXT: v_cndmask_b32_sdwa v5, v5, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: s_mov_b64 vcc, s[28:29] +; GFX942-NEXT: v_cndmask_b32_e64 v17, v4, v9, s[26:27] +; 
GFX942-NEXT: v_cndmask_b32_sdwa v4, v4, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX942-NEXT: v_perm_b32 v7, v7, v14, s30 +; GFX942-NEXT: v_perm_b32 v6, v6, v15, s30 +; GFX942-NEXT: v_perm_b32 v5, v5, v16, s30 +; GFX942-NEXT: v_perm_b32 v4, v4, v17, s30 +; GFX942-NEXT: v_perm_b32 v3, v3, v10, s30 +; GFX942-NEXT: v_perm_b32 v2, v2, v11, s30 +; GFX942-NEXT: v_perm_b32 v1, v1, v12, s30 +; GFX942-NEXT: v_perm_b32 v0, v0, v13, s30 +; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[36:37] offset:16 +; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[36:37] ; GFX942-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index e11900ac0ca68..e0dacb7a59a42 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2694,47 +2694,47 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; GFX9-LABEL: v_insertelement_v8f16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX9-NEXT: s_mov_b32 s14, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] -; GFX9-NEXT: s_cmp_eq_u32 s5, 6 -; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[18:19] +; GFX9-NEXT: s_cmp_eq_u32 s13, 6 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 7 -; GFX9-NEXT: s_mov_b32 s2, 0x5040100 +; GFX9-NEXT: s_cmp_eq_u32 s13, 7 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: 
s_cmp_eq_u32 s13, 4 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s13, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s13, 2 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s13, 3 +; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s13, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s13, 1 +; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 5 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 2 -; GFX9-NEXT: v_perm_b32 v3, v3, v6, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 3 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 0 -; GFX9-NEXT: v_perm_b32 v2, v6, v2, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX9-NEXT: v_perm_b32 v1, v6, v1, s2 -; GFX9-NEXT: v_perm_b32 v0, v5, v0, s2 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_mov_b64 vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v3, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b64 vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v2, v5, s[2:3] +; 
GFX9-NEXT: v_cndmask_b32_sdwa v2, v2, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b64 vcc, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b64 vcc, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v0, v5, s[10:11] +; GFX9-NEXT: v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v3, v3, v6, s14 +; GFX9-NEXT: v_perm_b32 v2, v2, v7, s14 +; GFX9-NEXT: v_perm_b32 v1, v1, v8, s14 +; GFX9-NEXT: v_perm_b32 v0, v0, v9, s14 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v8f16_dynamic: @@ -2783,14 +2783,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: s_cmp_eq_u32 s5, 1 -; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; VI-NEXT: v_cndmask_b32_e32 v8, v0, v6, vcc ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc ; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_cndmask_b32_sdwa v0, v0, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; @@ -3196,82 +3194,82 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) 
%out, ptr addrspace(1) %in, i32 %val, i32 %n) { ; GFX9-LABEL: v_insertelement_v16f16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX9-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[28:29], s[8:9], 0x10 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX9-NEXT: s_mov_b32 s30, 0x5040100 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] -; GFX9-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:16 -; GFX9-NEXT: s_cmp_eq_u32 s5, 6 -; GFX9-NEXT: v_mov_b32_e32 v9, s4 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[38:39] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[38:39] offset:16 +; GFX9-NEXT: s_cmp_eq_u32 s29, 6 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 7 -; GFX9-NEXT: s_mov_b32 s2, 0x5040100 +; GFX9-NEXT: s_cmp_eq_u32 s29, 7 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 4 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 2 +; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 3 +; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 1 +; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 14 +; GFX9-NEXT: v_mov_b32_e32 v9, s28 +; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 15 +; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 12 +; GFX9-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 13 +; GFX9-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 10 +; GFX9-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 11 +; GFX9-NEXT: s_cselect_b64 s[24:25], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 8 +; 
GFX9-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s29, 9 +; GFX9-NEXT: s_cselect_b64 s[28:29], -1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 2 -; GFX9-NEXT: v_perm_b32 v4, v4, v10, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 3 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 1 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 14 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 15 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc +; GFX9-NEXT: s_mov_b64 vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v3, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b64 vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v2, v9, s[2:3] +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v2, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b64 vcc, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v9, s[6:7] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v1, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b64 vcc, s[12:13] +; GFX9-NEXT: 
v_cndmask_b32_e64 v13, v0, v9, s[10:11] +; GFX9-NEXT: v_cndmask_b32_sdwa v0, v0, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b64 vcc, s[16:17] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 12 -; GFX9-NEXT: v_perm_b32 v1, v12, v1, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v14, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 13 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 10 -; GFX9-NEXT: v_perm_b32 v8, v12, v8, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v15, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 11 -; GFX9-NEXT: v_perm_b32 v3, v10, v3, s2 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 8 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s5, 9 -; GFX9-NEXT: v_perm_b32 v2, v11, v2, s2 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc -; GFX9-NEXT: v_perm_b32 v7, v12, v7, s2 -; GFX9-NEXT: v_perm_b32 v6, v10, v6, s2 -; GFX9-NEXT: v_perm_b32 v5, v9, v5, s2 -; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16 -; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v7, v9, s[14:15] +; GFX9-NEXT: v_cndmask_b32_sdwa v7, v7, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b64 vcc, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v15, v6, v9, s[18:19] +; GFX9-NEXT: v_cndmask_b32_sdwa v6, v6, v9, vcc dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b64 vcc, s[24:25] +; GFX9-NEXT: v_perm_b32 v3, v3, v10, s30 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v5, v9, s[22:23] +; GFX9-NEXT: v_cndmask_b32_sdwa v5, v5, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b64 vcc, s[28:29] +; GFX9-NEXT: v_perm_b32 v2, v2, v11, s30 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v4, v9, s[26:27] +; GFX9-NEXT: v_cndmask_b32_sdwa v4, v4, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v7, v7, v14, s30 +; GFX9-NEXT: v_perm_b32 v6, v6, v15, s30 +; GFX9-NEXT: v_perm_b32 v5, v5, v10, s30 +; GFX9-NEXT: v_perm_b32 v4, v4, v11, s30 +; GFX9-NEXT: v_perm_b32 v1, v1, v12, s30 +; GFX9-NEXT: v_perm_b32 v0, v0, v13, s30 +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[36:37] offset:16 +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[36:37] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v16f16_dynamic: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 362b9dacaf257..17fdc841a1258 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -666,14 +666,13 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_max_f16_e32 v4, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; GFX8-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX8-NEXT: v_max_f16_e32 v2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v4, v0, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -684,9 +683,8 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -701,12 +699,12 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v2, s4 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximum_v2f16: @@ -836,14 +834,13 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: 
v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_max_f16_e32 v4, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; GFX8-NEXT: v_max_f16_e32 v3, v0, v1 +; GFX8-NEXT: v_max_f16_e32 v2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v4, v0, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -854,9 +851,8 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -871,12 +867,12 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX10-NEXT: v_mov_b32_e32 v3, 
0x7e00 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v2, s4 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximum_v2f16__nsz: @@ -1014,15 +1010,14 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-NEXT: s_lshr_b32 s4, s17, 16 ; GFX8-NEXT: s_lshr_b32 s5, s16, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_max_f16_e32 v1, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NEXT: v_max_f16_e32 v3, s16, v1 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX8-NEXT: v_max_f16_e32 v0, s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x7e00 +; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v3, s16, v2 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 @@ -1041,9 +1036,8 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX900-NEXT: s_lshr_b32 s5, s16, 16 ; GFX900-NEXT: v_mov_b32_e32 v3, s4 ; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s5, v3 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX900-NEXT: 
v_lshl_or_b32 v0, v1, 16, v0 ; GFX900-NEXT: ;;#ASMSTART @@ -1065,16 +1059,16 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX10-LABEL: s_maximum_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v0, s16, s17 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s16, s17 ; GFX10-NEXT: s_lshr_b32 s4, s17, 16 ; GFX10-NEXT: s_lshr_b32 s5, s16, 16 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo +; GFX10-NEXT: v_pk_max_f16 v0, s16, s17 ; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s5, s4 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, s16, s17 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x7e00, v0, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v0 ; GFX10-NEXT: ;;#ASMEND @@ -1172,18 +1166,17 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: v_max_f16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc -; GFX8-NEXT: v_max_f16_e32 v5, v1, v3 +; GFX8-NEXT: v_max_f16_e32 v4, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX8-NEXT: 
v_max_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v3f16: @@ -1196,9 +1189,8 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1214,15 +1206,15 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v5, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v4, s4 +; GFX10-NEXT: v_pk_max_f16 v4, v1, v3 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: 
v_perm_b32 v0, v0, v4, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximum_v3f16: @@ -1379,18 +1371,17 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: v_max_f16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc -; GFX8-NEXT: v_max_f16_e32 v5, v1, v3 +; GFX8-NEXT: v_max_f16_e32 v4, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v6, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX8-NEXT: v_max_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v3f16__nsz: @@ -1403,9 +1394,8 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; 
GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1421,15 +1411,15 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v5, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v4, s4 +; GFX10-NEXT: v_pk_max_f16 v4, v1, v3 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximum_v3f16__nsz: @@ -1593,25 +1583,24 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_max_f16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_max_f16_e32 v8, v6, v5 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, 
v8, vcc -; GFX8-NEXT: v_max_f16_e32 v6, v1, v3 +; GFX8-NEXT: v_max_f16_e32 v4, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v8, v7, v6 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v7, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: v_max_f16_e32 v7, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX8-NEXT: v_max_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v4f16: @@ -1621,15 +1610,13 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX900-NEXT: 
v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 @@ -1645,20 +1632,20 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX10-LABEL: v_maximum_v4f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_pk_max_f16 v5, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v5, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 +; GFX10-NEXT: v_pk_max_f16 v6, v1, v3 +; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_max_f16 v5, v0, v2 +; GFX10-NEXT: v_cmp_o_f16_e64 s5, v0, v2 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v6, vcc_lo +; 
GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximum_v4f16: @@ -1836,25 +1823,24 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_max_f16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_max_f16_e32 v8, v6, v5 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX8-NEXT: v_max_f16_e32 v6, v1, v3 +; GFX8-NEXT: v_max_f16_e32 v4, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v8, v7, v6 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v7, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: v_max_f16_e32 v7, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX8-NEXT: v_max_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_maximum_v4f16__nsz: @@ -1864,15 +1850,13 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_max_f16 v3, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 @@ -1888,20 +1872,20 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX10-LABEL: v_maximum_v4f16__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_pk_max_f16 v5, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v5, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 
src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 +; GFX10-NEXT: v_pk_max_f16 v6, v1, v3 +; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_max_f16 v5, v0, v2 +; GFX10-NEXT: v_cmp_o_f16_e64 s5, v0, v2 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v6, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximum_v4f16__nsz: @@ -2195,34 +2179,34 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX10-LABEL: v_maximum_v8f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_pk_max_f16 v8, v3, v7 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 -; GFX10-NEXT: v_pk_max_f16 v9, v2, v6 -; GFX10-NEXT: v_pk_max_f16 v12, v1, v5 -; GFX10-NEXT: v_pk_max_f16 v13, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v10, 0x7e00, v8, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX10-NEXT: v_pk_max_f16 v10, v2, v6 +; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_sdwa s5, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_max_f16 v12, v0, v4 +; GFX10-NEXT: v_cndmask_b32_sdwa v11, v9, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 
16, v9 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v9, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v11, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_sdwa v6, v9, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_pk_max_f16 v10, v1, v5 +; GFX10-NEXT: s_mov_b32 vcc_lo, s5 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_sdwa v13, v9, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v13, 0x7e00, v13, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_perm_b32 v0, v0, v13, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v12, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v12, vcc_lo +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v4, v9, v12, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 +; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v10, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX10-NEXT: 
v_perm_b32 v1, v13, v1, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v3, v10, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v11, v3, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_maximum_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index f6d37b34807b1..b8e5be785a77d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -556,14 +556,13 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_min_f16_e32 v4, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; GFX8-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX8-NEXT: v_min_f16_e32 v2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v4, v0, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -574,9 +573,8 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -591,12 +589,12 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v2, s4 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimum_v2f16: @@ -691,14 +689,13 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_min_f16_e32 v4, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; GFX8-NEXT: v_min_f16_e32 v3, v0, v1 +; GFX8-NEXT: v_min_f16_e32 v2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX8-NEXT: v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v4, v0, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -709,9 +706,8 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v3, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -726,12 +722,12 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v1 +; GFX10-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v2, s4 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimum_v2f16__nsz: @@ -827,15 +823,14 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-NEXT: s_lshr_b32 s4, s17, 16 ; GFX8-NEXT: s_lshr_b32 s5, s16, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_min_f16_e32 v1, s5, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s5, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; 
GFX8-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NEXT: v_min_f16_e32 v3, s16, v1 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX8-NEXT: v_min_f16_e32 v0, s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x7e00 +; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v3, s16, v2 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s16, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 @@ -854,9 +849,8 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX900-NEXT: s_lshr_b32 s5, s16, 16 ; GFX900-NEXT: v_mov_b32_e32 v3, s4 ; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, s5, v3 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX900-NEXT: ;;#ASMSTART @@ -878,16 +872,16 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX10-LABEL: s_minimum_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_min_f16 v0, s16, s17 -; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s16, s17 ; GFX10-NEXT: s_lshr_b32 s4, s17, 16 ; GFX10-NEXT: s_lshr_b32 s5, s16, 16 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo +; GFX10-NEXT: v_pk_min_f16 v0, s16, s17 ; GFX10-NEXT: v_cmp_o_f16_e64 vcc_lo, s5, s4 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo -; GFX10-NEXT: v_lshl_or_b32 v0, 
v1, 16, v0 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, s16, s17 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x7e00 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0x7e00, v0, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v0, v1, v0, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v0 ; GFX10-NEXT: ;;#ASMEND @@ -958,18 +952,17 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: v_min_f16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc -; GFX8-NEXT: v_min_f16_e32 v5, v1, v3 +; GFX8-NEXT: v_min_f16_e32 v4, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX8-NEXT: v_min_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v3f16: @@ -982,9 +975,8 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v3, 
16, v3 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1000,15 +992,15 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v5, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v4, s4 +; GFX10-NEXT: v_pk_min_f16 v4, v1, v3 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimum_v3f16: @@ -1118,18 +1110,17 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: v_min_f16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc -; GFX8-NEXT: v_min_f16_e32 v5, v1, v3 +; GFX8-NEXT: v_min_f16_e32 v4, v5, v4 
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; GFX8-NEXT: v_min_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v3f16__nsz: @@ -1142,9 +1133,8 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1160,15 +1150,15 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v5, 
vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX10-NEXT: v_cmp_o_f16_e64 s4, v0, v2 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v5, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v4, s4 +; GFX10-NEXT: v_pk_min_f16 v4, v1, v3 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimum_v3f16__nsz: @@ -1278,25 +1268,24 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_min_f16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_min_f16_e32 v8, v6, v5 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX8-NEXT: v_min_f16_e32 v6, v1, v3 +; GFX8-NEXT: v_min_f16_e32 v4, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v8, v7, v6 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v7, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: v_min_f16_e32 v7, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX8-NEXT: v_min_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v3, 
vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v4f16: @@ -1306,15 +1295,13 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 @@ -1330,20 +1317,20 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX10-LABEL: v_minimum_v4f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_pk_min_f16 v5, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v4, 
vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v5, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 +; GFX10-NEXT: v_pk_min_f16 v6, v1, v3 +; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_min_f16 v5, v0, v2 +; GFX10-NEXT: v_cmp_o_f16_e64 s5, v0, v2 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v6, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimum_v4f16: @@ -1462,25 +1449,24 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_min_f16_e32 v6, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_min_f16_e32 v8, v6, v5 -; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX8-NEXT: 
v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX8-NEXT: v_min_f16_e32 v6, v1, v3 +; GFX8-NEXT: v_min_f16_e32 v4, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX8-NEXT: v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v8, v7, v6 +; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v7, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: v_min_f16_e32 v7, v1, v3 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GFX8-NEXT: v_min_f16_e32 v3, v0, v2 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v6 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_minimum_v4f16__nsz: @@ -1490,15 +1476,13 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX900-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: v_pk_min_f16 v3, v0, v2 ; GFX900-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, 
v3, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX900-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX900-NEXT: s_mov_b32 s4, 0x5040100 ; GFX900-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 @@ -1514,20 +1498,20 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX10-LABEL: v_minimum_v4f16__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_pk_min_f16 v5, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v5, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, 0x7e00 ; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_perm_b32 v0, v0, v5, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 +; GFX10-NEXT: v_pk_min_f16 v6, v1, v3 +; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_min_f16 v5, v0, v2 +; GFX10-NEXT: v_cmp_o_f16_e64 s5, v0, v2 +; GFX10-NEXT: v_cndmask_b32_sdwa v2, v4, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0x7e00, v5, s5 +; GFX10-NEXT: 
v_cndmask_b32_e32 v1, 0x7e00, v6, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimum_v4f16__nsz: @@ -1734,34 +1718,34 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX10-LABEL: v_minimum_v8f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_pk_min_f16 v8, v3, v7 -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 -; GFX10-NEXT: v_pk_min_f16 v9, v2, v6 -; GFX10-NEXT: v_pk_min_f16 v12, v1, v5 -; GFX10-NEXT: v_pk_min_f16 v13, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v10, 0x7e00, v8, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v9, 0x7e00 +; GFX10-NEXT: v_pk_min_f16 v10, v2, v6 +; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_sdwa s5, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_pk_min_f16 v12, v0, v4 +; GFX10-NEXT: v_cndmask_b32_sdwa v11, v9, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v9, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v11, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v13 -; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cmp_o_f16_sdwa s4, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: v_cndmask_b32_sdwa v6, v9, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_pk_min_f16 v10, v1, 
v5 +; GFX10-NEXT: s_mov_b32 vcc_lo, s5 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_sdwa v13, v9, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v13, 0x7e00, v13, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_perm_b32 v0, v0, v13, 0x5040100 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v12, vcc_lo -; GFX10-NEXT: v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v12, vcc_lo +; GFX10-NEXT: s_mov_b32 vcc_lo, s4 +; GFX10-NEXT: v_cndmask_b32_sdwa v4, v9, v12, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 +; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v10, vcc_lo +; GFX10-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX10-NEXT: v_perm_b32 v1, v13, v1, 0x5040100 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo -; GFX10-NEXT: v_perm_b32 v3, v3, v10, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v11, v3, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: v_minimum_v8f16: diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir new file mode 100644 index 0000000000000..4b45c54a3b83d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir @@ -0,0 +1,232 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc %s -mtriple=amdgcn -mcpu=gfx1030 -run-pass=si-peephole-sdwa -o - | FileCheck %s + +--- +name: cndmask_b32 # 
can be directly converted to SDWA without a copy to VCC +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vcc + + ; CHECK-LABEL: name: cndmask_b32 + ; CHECK: liveins: $vgpr0, $vgpr1, $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY1]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[COPY]], 0, [[COPY1]], 0, 6, 0, 5, 5, implicit $vcc_lo, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:vgpr_32 = COPY $vgpr1 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_e64 16, %0, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e32 killed %2, killed %3, implicit $exec, implicit $vcc + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 +... + +# For SDWA conversion of V_CNDMASK, the carry-in operand must be +# available in VCC_LO. This is achieved by introducing a COPY +# instruction. Comparison instructions could be changed to VOP2 form +# instead, but we prefer to use a COPY. 
+ +--- +name: carry-compare +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: carry-compare + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: $vcc_lo = COPY killed [[V_CMP_EQ_U32_e64_]] + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc_lo, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32_xm0_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 undef %0, 1, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %2, implicit $exec + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 +... 
+ +--- +name: carry-compare-class +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: carry-compare-class + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CMP_CLASS_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_CLASS_F32_e64 2, undef [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: $vcc_lo = COPY killed [[V_CMP_CLASS_F32_e64_]] + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc_lo, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32_xm0_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:sreg_32_xm0_xexec = V_CMP_CLASS_F32_e64 2, undef %0, 1, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %2, implicit $exec + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 +... + +... 
+--- +name: carry-non-compare +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: carry-non-compare + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[DEF1]], 8, [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec + ; CHECK-NEXT: $vcc_lo = COPY killed undef [[DEF]] + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc_lo, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32_xm0_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = V_PK_MAX_F16 8, %1, 8, %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed undef %0, implicit $exec + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 +... + +... 
+--- +name: carry-multiuse +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: carry-multiuse + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[DEF1]], 8, [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec + ; CHECK-NEXT: $vcc_lo = COPY undef [[DEF]] + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc_lo, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: $vgpr1 = COPY [[DEF]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32_xm0_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = V_PK_MAX_F16 8, %1, 8, %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, undef %0, implicit $exec + $vgpr0 = COPY %4 + $vgpr1 = COPY %0 + SI_RETURN implicit $vgpr0 +... + +... 
+--- +name: live-vcc # cannot convert because of live VCC +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: live-vcc + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF1]], implicit $exec + ; CHECK-NEXT: V_CMP_EQ_U32_e32 1, undef [[DEF1]], implicit-def $vcc_lo, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 killed [[V_LSHRREV_B32_e64_]], killed [[V_LSHRREV_B32_e64_]], implicit $vcc_lo, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32_xm0_xexec = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 %0, 1, implicit $exec + %5:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec + V_CMP_EQ_U32_e32 1, undef %2, implicit-def $vcc, implicit $exec + %6:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %5, %3, implicit $exec + %7:vgpr_32 = V_CNDMASK_B32_e32 killed %5, killed %5, implicit $vcc, implicit $exec + $vgpr0 = COPY %6 + SI_RETURN implicit $vgpr0 +... + +... 
+--- +name: cannot-shrink-with-source-mods +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: cannot-shrink-with-source-mods + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[DEF]], 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 1, 0, 0, [[V_LSHRREV_B32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32_xm0_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 %0, 0, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 1, 0, 0, %3, killed %2, implicit $exec + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 +... + +... 
+--- +name: missing-carry-def +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_32_xm0_xexec } +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: missing-carry-def + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], undef %0:sreg_32_xm0_xexec, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %1:sreg_32_xm0_xexec = IMPLICIT_DEF + %2:vgpr_32 = V_LSHRREV_B32_e64 16, %1:sreg_32_xm0_xexec, implicit $exec + %3:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %2, undef %0:sreg_32_xm0_xexec, implicit $exec + $vgpr0 = COPY %3 + SI_RETURN implicit $vgpr0 +... diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir new file mode 100644 index 0000000000000..e243df4077ff4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir @@ -0,0 +1,233 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc %s -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - | FileCheck %s + +--- +name: cndmask_b32 # can be directly converted to SDWA without a copy to VCC +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vcc + + ; CHECK-LABEL: name: cndmask_b32 + ; CHECK: liveins: $vgpr0, $vgpr1, $vcc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY1]], implicit $exec + ; CHECK-NEXT: 
[[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[COPY]], 0, [[COPY1]], 0, 6, 0, 5, 5, implicit $vcc, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:vgpr_32 = COPY $vgpr1 + %1:vgpr_32 = COPY $vgpr0 + %2:vgpr_32 = V_LSHRREV_B32_e64 16, %0, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e32 killed %2, killed %3, implicit $exec, implicit $vcc + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 + +... + +# For SDWA conversion of V_CNDMASK, the carry-in operand must be +# available in VCC. This is achieved by introducing a COPY +# instruction. Comparison instructions could be changed to VOP2 form +# instead, but we prefer to use a COPY. + +--- +name: carry-compare +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: carry-compare + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF1]], implicit $exec + ; CHECK-NEXT: $vcc = COPY killed [[V_CMP_EQ_U32_e64_]] + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:sreg_64_xexec = V_CMP_EQ_U32_e64 %0, 1, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %2, implicit $exec + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 +... 
+ +--- +name: carry-compare-class +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: carry-compare-class + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CMP_CLASS_F32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_CLASS_F32_e64 2, [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF1]], implicit $exec + ; CHECK-NEXT: $vcc = COPY killed [[V_CMP_CLASS_F32_e64_]] + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32_xm0_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:sreg_64_xexec = V_CMP_CLASS_F32_e64 2,%0, 1, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %2, implicit $exec + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 +... + +... 
+--- +name: carry-non-compare +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: carry-non-compare + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[DEF1]], 8, [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec + ; CHECK-NEXT: $vcc = COPY killed [[DEF]] + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = V_PK_MAX_F16 8, %1, 8, %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %0, implicit $exec + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 +... + +... 
+--- +name: carry-multiuse +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: carry-multiuse + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, undef [[DEF1]], 8, undef [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec + ; CHECK-NEXT: $vcc = COPY [[DEF]] + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]] + ; CHECK-NEXT: $vgpr1 = COPY [[DEF]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_64_xexec = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, %0, implicit $exec + $vgpr0 = COPY %4 + $vgpr1 = COPY %0 + SI_RETURN implicit $vgpr0 +... + +... 
+--- +name: live-vcc # cannot convert because of live VCC +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: live-vcc + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[DEF]], 1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF1]], implicit $exec + ; CHECK-NEXT: V_CMP_EQ_U32_e32 1, [[DEF1]], implicit-def $vcc, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], [[V_CMP_EQ_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 killed [[V_LSHRREV_B32_e64_]], killed [[V_LSHRREV_B32_e64_]], implicit $vcc, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:sreg_64_xexec = V_CMP_EQ_U32_e64 %0, 1, implicit $exec + %5:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec + V_CMP_EQ_U32_e32 1, %2, implicit-def $vcc, implicit $exec + %6:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %5, %3, implicit $exec + %7:vgpr_32 = V_CNDMASK_B32_e32 killed %5, killed %5, implicit $vcc, implicit $exec + $vgpr0 = COPY %6 + SI_RETURN implicit $vgpr0 +... + +... 
+--- +name: cannot-shrink-with-source-mods +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: cannot-shrink-with-source-mods + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 1, 0, 0, [[V_LSHRREV_B32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:sreg_64_xexec = V_CMP_EQ_U32_e64 undef %0, 0, implicit $exec + %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %4:vgpr_32 = V_CNDMASK_B32_e64 1, 0, 0, %3, killed %2, implicit $exec + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 +... + +... +--- +name: missing-carry-def +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64_xexec } +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: missing-carry-def + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF]], implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], undef %0:sreg_64_xexec, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec + %3:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %2, undef %0:sreg_64_xexec, implicit $exec + $vgpr0 = COPY %3 + SI_RETURN implicit $vgpr0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll index a2e1ae83a6e5f..7ed27f008083e 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll @@ -38,14 +38,12 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_fabs_fabs_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -54,14 +52,12 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_fabs_fabs_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; 
GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 @@ -190,18 +186,16 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x ; GFX9-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v5 -; GFX9-NEXT: v_pk_add_f16 v1, v2, v4 +; GFX9-NEXT: v_pk_add_f16 v1, v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SAFE-TRUE16-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16: @@ -309,35 +303,29 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1 ; VI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 
-; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v2, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_add_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_mov_b32_e32 v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SAFE-TRUE16-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: @@ -469,18 +457,16 @@ define { <2 x half>, <2 x half> } 
@add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x ; GFX9-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 -; GFX9-NEXT: v_pk_add_f16 v1, v3, v5 +; GFX9-NEXT: v_pk_add_f16 v1, v2, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SAFE-TRUE16-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16: @@ -588,14 +574,12 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; VI-LABEL: add_select_fabs_var_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; VI-NEXT: v_cndmask_b32_sdwa v2, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; VI-NEXT: v_add_f16_sdwa v1, v2, v4 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -603,15 +587,13 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX9-LABEL: add_select_fabs_var_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -702,13 +684,12 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_fabs_negk_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0xbc00 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0xbc00 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; 
VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -717,13 +698,12 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_fabs_negk_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xbc00 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xbc00 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v3 @@ -1039,9 +1019,8 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-NEXT: v_mov_b32_e32 v4, 0xbc00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1054,9 +1033,8 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-NEXT: v_mov_b32_e32 v4, 0xbc00 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc 
+; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v3 @@ -1152,9 +1130,8 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x ; VI-NEXT: v_mov_b32_e32 v4, 0xe400 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1167,9 +1144,8 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x ; GFX9-NEXT: v_mov_b32_e32 v4, 0xe400 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v3 @@ -1260,13 +1236,12 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_fabs_posk_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0x3c00 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 
0x7fff7fff, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1275,13 +1250,12 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_fabs_posk_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3c00 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x3c00 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v3 @@ -1376,9 +1350,8 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-NEXT: v_mov_b32_e32 v4, 0x3c00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, 
v0, v1 @@ -1391,9 +1364,8 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-NEXT: v_mov_b32_e32 v4, 0x3c00 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v3 @@ -1488,10 +1460,8 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_fneg_fneg_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -1502,10 +1472,8 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_fneg_fneg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 @@ -1605,16 +1573,14 @@ define { <2 x half>, <2 x half> } 
@add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x ; VI-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v4, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_sub_f16_sdwa v1, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_sub_f16_sdwa v1, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_sub_f16_e32 v2, v5, v2 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1622,10 +1588,8 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x ; GFX9-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 @@ -1729,33 +1693,27 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f1 ; VI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; VI-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; VI-NEXT: v_cndmask_b32_sdwa v5, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_xor_b32_e32 v5, 0x80008000, v2 -; VI-NEXT: v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 +; VI-NEXT: v_sub_f16_sdwa v2, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v4, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_mov_b32_e32 v1, v5 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v5, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_xor_b32_e32 v5, 0x80008000, v2 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4 +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 ; GFX9-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SAFE-TRUE16-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: @@ -1856,16 +1814,14 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x ; VI-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; VI-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; VI-NEXT: v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v4, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_sub_f16_sdwa v1, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_sub_f16_e32 v2, v5, v3 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1873,10 +1829,8 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x ; GFX9-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 @@ -1982,14 +1936,12 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; VI-LABEL: add_select_fneg_var_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 +; VI-NEXT: v_cndmask_b32_sdwa v2, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; 
VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; VI-NEXT: v_add_f16_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -1997,15 +1949,13 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX9-LABEL: add_select_fneg_var_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2096,13 +2046,12 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_fneg_negk_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: v_mov_b32_e32 v5, 0x3c00 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; VI-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; VI-NEXT: 
v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2110,14 +2059,13 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_fneg_negk_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x3c00 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX9-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2201,13 +2149,12 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-LABEL: add_select_fneg_inv2pi_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: v_mov_b32_e32 v5, 0xb118 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0xb118 +; VI-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; VI-NEXT: v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v0, v1, 
v2, vcc +; VI-NEXT: v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2215,14 +2162,13 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-LABEL: add_select_fneg_inv2pi_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0xb118 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, 0xb118 +; GFX9-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2306,13 +2252,12 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x, ; VI-LABEL: add_select_fneg_neginv2pi_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: v_mov_b32_e32 v5, 0x3118 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x3118 +; VI-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; VI-NEXT: v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: 
v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2320,14 +2265,13 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX9-LABEL: add_select_fneg_neginv2pi_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x3118 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3118 +; GFX9-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2725,13 +2669,12 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_negk_fneg_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: v_mov_b32_e32 v5, 0x3c00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; VI-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; VI-NEXT: v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2739,14 +2682,13 @@ define <2 x 
half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_negk_fneg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x3c00 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX9-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2829,13 +2771,12 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_fneg_posk_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: v_mov_b32_e32 v5, 0xbc00 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0xbc00 +; VI-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; VI-NEXT: v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2843,14 +2784,13 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_fneg_posk_v2f16: ; GFX9: ; %bb.0: ; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0xbc00 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, 0xbc00 +; GFX9-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2933,13 +2873,12 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; VI-LABEL: add_select_posk_fneg_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: v_mov_b32_e32 v5, 0xbc00 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0xbc00 +; VI-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; VI-NEXT: v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -2947,14 +2886,13 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-LABEL: add_select_posk_fneg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0xbc00 ; 
GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, 0xbc00 +; GFX9-NEXT: v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3045,14 +2983,12 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-LABEL: add_select_negfabs_fabs_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; VI-NEXT: v_or_b32_e32 v1, 0x80008000, v2 +; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3061,14 +2997,12 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-LABEL: add_select_negfabs_fabs_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: v_or_b32_e32 v1, 0x80008000, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 @@ -3177,14 +3111,12 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-LABEL: add_select_fabs_negfabs_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; VI-NEXT: v_or_b32_e32 v3, 0x80008000, v3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v3 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3193,14 +3125,12 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-LABEL: add_select_fabs_negfabs_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_or_b32_e32 v3, 0x80008000, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_or_b32_e32 v2, 0x80008000, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 @@ -3309,14 +3239,12 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; VI-LABEL: add_select_neg_fabs_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; VI-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 +; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3325,14 +3253,12 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX9-LABEL: add_select_neg_fabs_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; 
GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 @@ -3440,14 +3366,12 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; VI-LABEL: add_select_fabs_neg_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; VI-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; VI-NEXT: v_xor_b32_e32 v2, 0x80008000, v3 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_add_f16_e32 v0, v0, v4 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3456,14 +3380,12 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX9-LABEL: add_select_fabs_neg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX9-NEXT: v_xor_b32_e32 
v3, 0x80008000, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, 0x80008000, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 @@ -3567,14 +3489,12 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-LABEL: add_select_neg_negfabs_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v3 +; VI-NEXT: v_cndmask_b32_sdwa v3, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_sub_f16_sdwa v1, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v4, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3582,15 +3502,13 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-LABEL: add_select_neg_negfabs_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v3 +; GFX9-NEXT: v_cndmask_b32_sdwa v3, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3687,14 +3605,12 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-LABEL: add_select_negfabs_neg_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; VI-NEXT: v_cndmask_b32_sdwa v2, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: v_sub_f16_sdwa v1, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_sub_f16_e32 v0, v4, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -3702,15 +3618,13 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-LABEL: add_select_negfabs_neg_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2 +; GFX9-NEXT: v_cndmask_b32_sdwa v2, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX9-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -3807,13 +3721,12 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-LABEL: mul_select_negfabs_posk_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0x4400 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_or_b32_e32 v1, 0x80008000, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0x4400 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3822,13 +3735,12 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-LABEL: mul_select_negfabs_posk_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x4400 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_or_b32_e32 v1, 0x80008000, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4400 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 @@ -3928,9 +3840,8 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-NEXT: v_mov_b32_e32 v4, 0x4400 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3943,9 +3854,8 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x4400 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 @@ -4041,13 +3951,12 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-LABEL: mul_select_negfabs_negk_v2f16: ; VI: ; %bb.0: ; VI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0xc400 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_or_b32_e32 v1, 0x80008000, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0xc400 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -4056,13 +3965,12 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-LABEL: mul_select_negfabs_negk_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xc400 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_or_b32_e32 v1, 0x80008000, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xc400 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 @@ -4162,9 +4070,8 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; VI-NEXT: v_mov_b32_e32 v4, 0xc400 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; 
VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_mul_f16_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -4177,9 +4084,8 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xc400 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 @@ -4279,32 +4185,29 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; VI-SAFE-LABEL: select_fneg_posk_src_add_v2f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x4400 -; VI-SAFE-NEXT: v_add_f16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-SAFE-NEXT: v_add_f16_e32 v2, 4.0, v2 -; VI-SAFE-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 -; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x4400 +; VI-SAFE-NEXT: v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-SAFE-NEXT: v_add_f16_e32 v2, 4.0, 
v2 +; VI-SAFE-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; VI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; VI-SAFE-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SAFE-LABEL: select_fneg_posk_src_add_v2f16: ; GFX9-SAFE: ; %bb.0: ; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX9-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX9-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-SAFE-NEXT: v_pk_add_f16 v1, v2, 4.0 op_sel_hi:[1,0] +; GFX9-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX9-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-SAFE-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-SAFE-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-SAFE-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -4355,28 +4258,26 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; VI-NSZ-LABEL: select_fneg_posk_src_add_v2f16: ; VI-NSZ: ; %bb.0: ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NSZ-NEXT: v_mov_b32_e32 v3, 0xc400 -; VI-NSZ-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NSZ-NEXT: v_sub_f16_e32 v2, -4.0, v2 -; VI-NSZ-NEXT: v_mov_b32_e32 
v4, 0x4000 -; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; VI-NSZ-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0xc400 +; VI-NSZ-NEXT: v_sub_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NSZ-NEXT: v_sub_f16_e32 v2, -4.0, v2 +; VI-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NSZ-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NSZ-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NSZ-LABEL: select_fneg_posk_src_add_v2f16: ; GFX9-NSZ: ; %bb.0: ; GFX9-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NSZ-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] -; GFX9-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NSZ-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NSZ-NEXT: v_pk_add_f16 v1, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NSZ-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NSZ-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NSZ-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NSZ-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NSZ-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NSZ-NEXT: s_setpc_b64 s[30:31] @@ -4439,32 +4340,29 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; VI-SAFE-LABEL: select_fneg_posk_src_sub_v2f16: ; VI-SAFE: ; %bb.0: ; 
VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0xc400 -; VI-SAFE-NEXT: v_add_f16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-SAFE-NEXT: v_add_f16_e32 v2, -4.0, v2 -; VI-SAFE-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 -; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0xc400 +; VI-SAFE-NEXT: v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-SAFE-NEXT: v_add_f16_e32 v2, -4.0, v2 +; VI-SAFE-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; VI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; VI-SAFE-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SAFE-LABEL: select_fneg_posk_src_sub_v2f16: ; GFX9-SAFE: ; %bb.0: ; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] -; GFX9-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX9-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-SAFE-NEXT: v_pk_add_f16 v1, v2, -4.0 op_sel_hi:[1,0] +; GFX9-SAFE-NEXT: v_xor_b32_e32 v1, 
0x80008000, v1 +; GFX9-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-SAFE-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-SAFE-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-SAFE-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -4515,28 +4413,26 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; VI-NSZ-LABEL: select_fneg_posk_src_sub_v2f16: ; VI-NSZ: ; %bb.0: ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NSZ-NEXT: v_mov_b32_e32 v3, 0x4400 -; VI-NSZ-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NSZ-NEXT: v_sub_f16_e32 v2, 4.0, v2 -; VI-NSZ-NEXT: v_mov_b32_e32 v4, 0x4000 -; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; VI-NSZ-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x4400 +; VI-NSZ-NEXT: v_sub_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NSZ-NEXT: v_sub_f16_e32 v2, 4.0, v2 +; VI-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NSZ-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NSZ-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NSZ-LABEL: select_fneg_posk_src_sub_v2f16: ; GFX9-NSZ: ; %bb.0: ; GFX9-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NSZ-NEXT: v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] -; GFX9-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v0 -; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NSZ-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NSZ-NEXT: v_pk_add_f16 v1, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NSZ-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NSZ-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NSZ-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NSZ-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NSZ-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NSZ-NEXT: s_setpc_b64 s[30:31] @@ -4591,28 +4487,26 @@ define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) { ; VI-LABEL: select_fneg_posk_src_mul_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, 0xc400 -; VI-NEXT: v_mul_f16_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_mul_f16_e32 v2, -4.0, v2 -; VI-NEXT: v_mov_b32_e32 v4, 0x4000 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_mov_b32_e32 v1, 0xc400 +; VI-NEXT: v_mul_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v2, -4.0, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: select_fneg_posk_src_mul_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-NEXT: v_pk_mul_f16 v2, v2, -4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NEXT: v_pk_mul_f16 v1, v2, -4.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4704,34 +4598,31 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < ; VI-SAFE-LABEL: select_fneg_posk_src_fma_v2f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-SAFE-NEXT: v_fma_f16 v4, v5, 4.0, v4 -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; VI-SAFE-NEXT: v_fma_f16 v2, v2, 4.0, v3 -; VI-SAFE-NEXT: v_or_b32_e32 v2, v2, v4 -; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 -; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-SAFE-NEXT: v_fma_f16 v1, v4, 4.0, v1 ; VI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-SAFE-NEXT: v_fma_f16 v2, v2, 4.0, v3 +; VI-SAFE-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; VI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; VI-SAFE-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 
v2, v1, s[4:5] +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SAFE-LABEL: select_fneg_posk_src_fma_v2f16: ; GFX9-SAFE: ; %bb.0: ; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1] -; GFX9-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX9-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-SAFE-NEXT: v_pk_fma_f16 v1, v2, 4.0, v3 op_sel_hi:[1,0,1] +; GFX9-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX9-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-SAFE-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-SAFE-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-SAFE-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -4767,29 +4658,27 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < ; VI-NSZ-LABEL: select_fneg_posk_src_fma_v2f16: ; VI-NSZ: ; %bb.0: ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NSZ-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; VI-NSZ-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-NSZ-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NSZ-NEXT: v_fma_f16 v1, v4, -4.0, -v1 ; VI-NSZ-NEXT: v_fma_f16 v2, v2, -4.0, -v3 ; VI-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 -; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NSZ-NEXT: v_fma_f16 v4, v5, -4.0, 
-v4 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NSZ-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NSZ-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NSZ-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NSZ-LABEL: select_fneg_posk_src_fma_v2f16: ; GFX9-NSZ: ; %bb.0: ; GFX9-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NSZ-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GFX9-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NSZ-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NSZ-NEXT: v_pk_fma_f16 v1, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX9-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NSZ-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NSZ-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NSZ-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NSZ-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NSZ-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NSZ-NEXT: s_setpc_b64 s[30:31] @@ -4858,34 +4747,31 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; VI-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16: ; VI-SAFE: ; %bb.0: ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-SAFE-NEXT: v_fma_f16 v4, v5, 4.0, v4 -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; VI-SAFE-NEXT: v_fma_f16 v2, v2, 
4.0, v3 -; VI-SAFE-NEXT: v_or_b32_e32 v2, v2, v4 -; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 -; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-SAFE-NEXT: v_fma_f16 v1, v4, 4.0, v1 ; VI-SAFE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-SAFE-NEXT: v_fma_f16 v2, v2, 4.0, v3 +; VI-SAFE-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; VI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; VI-SAFE-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; VI-SAFE-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16: ; GFX9-SAFE: ; %bb.0: ; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1] -; GFX9-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX9-SAFE-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-SAFE-NEXT: v_pk_fma_f16 v1, v2, 4.0, v3 op_sel_hi:[1,0,1] +; GFX9-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX9-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-SAFE-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-SAFE-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-SAFE-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] @@ -4942,29 +4828,27 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; VI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16: ; VI-NSZ: ; %bb.0: ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NSZ-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; VI-NSZ-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NSZ-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-NSZ-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NSZ-NEXT: v_fma_f16 v1, v4, -4.0, -v1 ; VI-NSZ-NEXT: v_fma_f16 v2, v2, -4.0, -v3 ; VI-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 -; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NSZ-NEXT: v_fma_f16 v4, v5, -4.0, -v4 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; VI-NSZ-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NSZ-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NSZ-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NSZ-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16: ; GFX9-NSZ: ; %bb.0: ; GFX9-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NSZ-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GFX9-NSZ-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NSZ-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NSZ-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GFX9-NSZ-NEXT: v_pk_fma_f16 v1, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] 
neg_hi:[0,0,1] +; GFX9-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NSZ-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NSZ-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NSZ-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NSZ-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NSZ-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NSZ-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 7339b545686f5..21719226710de 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -806,45 +806,41 @@ define amdgpu_kernel void @select_v2f16( ; VI-LABEL: select_v2f16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x44 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s22, s6 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s16, s10 -; VI-NEXT: s_mov_b32 s17, s11 -; VI-NEXT: s_mov_b32 s18, s2 -; VI-NEXT: s_mov_b32 s19, s3 ; VI-NEXT: s_mov_b32 s20, s12 ; VI-NEXT: s_mov_b32 s21, s13 -; VI-NEXT: s_mov_b32 s22, s2 -; VI-NEXT: s_mov_b32 s23, s3 +; VI-NEXT: s_mov_b32 s23, s7 +; VI-NEXT: s_mov_b32 s16, s10 +; VI-NEXT: s_mov_b32 s17, s11 +; VI-NEXT: s_mov_b32 s18, s6 +; VI-NEXT: s_mov_b32 s19, s7 +; VI-NEXT: buffer_load_dword v0, off, s[20:23], 0 +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 ; VI-NEXT: s_mov_b32 s12, s14 ; VI-NEXT: s_mov_b32 s13, s15 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0 -; VI-NEXT: buffer_load_dword v2, off, s[16:19], 0 -; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: 
s_mov_b32 s15, s7 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], 0 ; VI-NEXT: buffer_load_dword v3, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s0, s8 -; VI-NEXT: s_mov_b32 s1, s9 +; VI-NEXT: s_mov_b32 s4, s8 +; VI-NEXT: s_mov_b32 s5, s9 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; VI-NEXT: v_cmp_lt_f16_e32 vcc, v2, v1 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v5, v4 +; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], v1, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v3, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: select_v2f16: @@ -1012,6 +1008,7 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; VI-NEXT: s_mov_b32 s16, s4 ; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s18, s10 @@ -1020,7 +1017,6 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; VI-NEXT: s_movk_i32 s2, 0x3900 @@ -1028,14 +1024,11 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt 
vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, s2, v3 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], 0.5, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm @@ -1191,6 +1184,7 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; VI-NEXT: s_mov_b32 s16, s4 ; VI-NEXT: s_mov_b32 s17, s5 ; VI-NEXT: s_mov_b32 s18, s10 @@ -1199,7 +1193,6 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; VI-NEXT: s_mov_b32 s5, s7 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; VI-NEXT: s_movk_i32 s2, 0x3900 @@ -1207,14 +1200,11 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; VI-NEXT: v_cmp_gt_f16_e32 vcc, s2, v3 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cmp_gt_f16_e64 s[0:1], 0.5, v0 +; VI-NEXT: s_waitcnt vmcnt(0) 
+; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1] +; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm @@ -1362,42 +1352,40 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; ; VI-LABEL: select_v2f16_imm_c: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s18, s2 -; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s16, s8 -; VI-NEXT: s_mov_b32 s17, s9 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s8, s10 -; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s11, s3 -; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3800 -; VI-NEXT: v_mov_b32_e32 v4, 0x3900 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: 
v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v6, v5 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v5, v4 +; VI-NEXT: v_mov_b32_e32 v1, 0x3900 +; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: select_v2f16_imm_c: @@ -1543,42 +1531,40 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; ; VI-LABEL: select_v2f16_imm_d: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s18, s2 -; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s16, s8 -; VI-NEXT: s_mov_b32 s17, s9 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s12, s6 -; VI-NEXT: s_mov_b32 s13, s7 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: s_mov_b32 s8, s10 -; VI-NEXT: s_mov_b32 s9, s11 -; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 -; VI-NEXT: s_mov_b32 s11, s3 -; 
VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; VI-NEXT: v_mov_b32_e32 v3, 0x3800 -; VI-NEXT: v_mov_b32_e32 v4, 0x3900 -; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; VI-NEXT: v_cmp_lt_f16_e32 vcc, v1, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_cmp_lt_f16_e32 vcc, v5, v4 +; VI-NEXT: v_mov_b32_e32 v1, 0x3900 +; VI-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: select_v2f16_imm_d: