From a2c7ebca3deffb9779dd7a2309eb0c0c209bc0a7 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 6 Sep 2024 14:23:06 +0100 Subject: [PATCH 1/4] Legalize llvm.amdgcn.set.inactive.b64 to b32 This matches the handling of llvm.amdgcn.readlane and others and avoids some messy expansion of V_SET_INACTIVE_B64 in expandPostRAPseudo. --- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 14 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 16 +- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 71 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 11 - .../Target/AMDGPU/SIPreAllocateWWMRegs.cpp | 3 +- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 7 +- .../GlobalISel/llvm.amdgcn.set.inactive.ll | 71 +- .../atomic_optimizations_global_pointer.ll | 758 ++-- .../atomic_optimizations_local_pointer.ll | 3778 ++++++++--------- .../AMDGPU/global_atomics_scan_fadd.ll | 525 +-- .../AMDGPU/global_atomics_scan_fmax.ll | 342 +- .../AMDGPU/global_atomics_scan_fmin.ll | 342 +- .../AMDGPU/global_atomics_scan_fsub.ll | 525 +-- .../llvm.amdgcn.set.inactive.chain.arg.ll | 55 +- .../AMDGPU/llvm.amdgcn.set.inactive.ll | 71 +- llvm/test/CodeGen/AMDGPU/wave32.ll | 4 +- .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 490 ++- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 340 +- 18 files changed, 3732 insertions(+), 3691 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 56f4efda7925f..e657f668cc656 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5439,6 +5439,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 || IID == Intrinsic::amdgcn_permlanex16; + bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive || + IID == Intrinsic::amdgcn_set_inactive_chain_arg; auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1, Register Src2, LLT VT) -> Register { @@ -5448,6 +5450,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, case Intrinsic::amdgcn_permlane64: return LaneOp.getReg(0); case Intrinsic::amdgcn_readlane: + case Intrinsic::amdgcn_set_inactive: + case Intrinsic::amdgcn_set_inactive_chain_arg: return LaneOp.addUse(Src1).getReg(0); case Intrinsic::amdgcn_writelane: return LaneOp.addUse(Src1).addUse(Src2).getReg(0); @@ -5472,7 +5476,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, Register Src0 = MI.getOperand(2).getReg(); Register Src1, Src2; if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || - IsPermLane16) { + IsSetInactive || IsPermLane16) { Src1 = MI.getOperand(3).getReg(); if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) { Src2 = MI.getOperand(4).getReg(); @@ -5490,7 +5494,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, if (Size < 32) { Src0 = B.buildAnyExt(S32, Src0).getReg(0); - if (IsPermLane16) + if (IsSetInactive || IsPermLane16) Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0); if (IID == Intrinsic::amdgcn_writelane) @@ -5526,7 +5530,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0); MachineInstrBuilder Src1Parts, Src2Parts; - if (IsPermLane16) + if (IsSetInactive || IsPermLane16) Src1Parts = B.buildUnmerge(PartialResTy, Src1); if (IID == Intrinsic::amdgcn_writelane) @@ -5535,7 +5539,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, for (unsigned i = 0; i < NumParts; ++i) { Src0 = Src0Parts.getReg(i); - if (IsPermLane16) + if (IsSetInactive || IsPermLane16) Src1 = Src1Parts.getReg(i); if (IID == Intrinsic::amdgcn_writelane) @@ -7496,6 +7500,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_permlane16: case Intrinsic::amdgcn_permlanex16: case Intrinsic::amdgcn_permlane64: + case Intrinsic::amdgcn_set_inactive: + case Intrinsic::amdgcn_set_inactive_chain_arg: return legalizeLaneOp(Helper, MI, IntrID); case Intrinsic::amdgcn_s_buffer_prefetch_data: return legalizeSBufferPrefetch(Helper, MI); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 25cb8341c51d5..04d95693f7599 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6108,6 +6108,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, unsigned IID = N->getConstantOperandVal(0); bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 || IID == Intrinsic::amdgcn_permlanex16; + bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive || + IID == Intrinsic::amdgcn_set_inactive_chain_arg; SDLoc SL(N); MVT IntVT = MVT::getIntegerVT(ValSize); @@ -6125,6 +6127,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, Operands.push_back(Src2); [[fallthrough]]; case Intrinsic::amdgcn_readlane: + case Intrinsic::amdgcn_set_inactive: + case Intrinsic::amdgcn_set_inactive_chain_arg: Operands.push_back(Src1); [[fallthrough]]; case Intrinsic::amdgcn_readfirstlane: @@ -6151,7 +6155,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SDValue Src0 = N->getOperand(1); SDValue Src1, Src2; if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || - IsPermLane16) { + IsSetInactive || IsPermLane16) { Src1 = N->getOperand(2); if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) Src2 = N->getOperand(3); @@ -6167,7 +6171,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0, SL, MVT::i32); - if (IsPermLane16) { + if (IsSetInactive || IsPermLane16) { Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1, SL, MVT::i32); } @@ -6243,7 +6247,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0, DAG.getConstant(EltIdx, SL, MVT::i32)); - if (IsPermLane16) + if (IsSetInactive || IsPermLane16) Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1, DAG.getConstant(EltIdx, SL, MVT::i32)); @@ -6252,7 +6256,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, DAG.getConstant(EltIdx, SL, MVT::i32)); Pieces.push_back( - IsPermLane16 + IsSetInactive || IsPermLane16 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT) : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT)); EltIdx += 2; @@ -6268,7 +6272,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32); Src0 = DAG.getBitcast(VecVT, Src0); - if (IsPermLane16) + if (IsSetInactive || IsPermLane16) Src1 = DAG.getBitcast(VecVT, Src1); if (IID == Intrinsic::amdgcn_writelane) @@ -8751,6 +8755,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_permlane16: case Intrinsic::amdgcn_permlanex16: case Intrinsic::amdgcn_permlane64: + case Intrinsic::amdgcn_set_inactive: + case Intrinsic::amdgcn_set_inactive_chain_arg: return lowerLaneOp(*this, Op.getNode(), DAG); default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index c6f28af1e5e73..34298e81f1530 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2099,8 +2099,7 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { } Register SIInstrInfo::findSetInactiveMask(const MachineInstr &MI) { - assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || - MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64); + assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32); for (auto &Op : MI.implicit_operands()) { if (Op.isDef()) continue; @@ -2287,13 +2286,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } - case AMDGPU::V_SET_INACTIVE_B32: - case AMDGPU::V_SET_INACTIVE_B64: { + case AMDGPU::V_SET_INACTIVE_B32: { unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64 - ? AMDGPU::V_MOV_B64_PSEUDO - : AMDGPU::V_MOV_B32_e32; Register ExecReg = RI.getExec(); Register DstReg = MI.getOperand(0).getReg(); MachineOperand &ActiveSrc = MI.getOperand(1); @@ -2311,7 +2306,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { // constant bus constraints and the presence of literal constants // present an issue. // Fallback to V_MOV base lowering in all but the common cases. - const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32; MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64; @@ -2319,15 +2313,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0); const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0); - const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue()); - const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue()); - const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue()); - const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue()); int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); - int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64); + int ConstantBusLimit = ST.getConstantBusLimit(Opcode); int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; int ConstantBusUses = 1 + // Starts at 1 for ExecSrcReg @@ -2345,47 +2335,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { bool UseVCndMask = ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit; - if (VMov64 && UseVCndMask) { - // Decomposition must not introduce new literals. - UseVCndMask &= - ActiveSrc.isReg() || - (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) || - (!isInlineConstant(ActiveImm)); - UseVCndMask &= InactiveSrc.isReg() || - (isInlineConstant(InactiveImmLo) && - isInlineConstant(InactiveImmHi)) || - (!isInlineConstant(InactiveImm)); - } - - if (UseVCndMask && VMov64) { - // Dual V_CNDMASK_B32 - MachineOperand ActiveLo = buildExtractSubRegOrImm( - MI, MRI, ActiveSrc, nullptr, AMDGPU::sub0, nullptr); - MachineOperand ActiveHi = buildExtractSubRegOrImm( - MI, MRI, ActiveSrc, nullptr, AMDGPU::sub1, nullptr); - MachineOperand InactiveLo = buildExtractSubRegOrImm( - MI, MRI, InactiveSrc, nullptr, AMDGPU::sub0, nullptr); - MachineOperand InactiveHi = buildExtractSubRegOrImm( - MI, MRI, InactiveSrc, nullptr, AMDGPU::sub1, nullptr); - if (ActiveSrc.isReg()) - ActiveHi.setIsKill(ActiveSrc.isKill()); - if (InactiveSrc.isReg()) - InactiveHi.setIsKill(InactiveSrc.isKill()); - BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub0)) - .addImm(0) - .add(InactiveLo) - .addImm(0) - .add(ActiveLo) - .addReg(ExecSrcReg) - .addReg(DstReg, RegState::ImplicitDefine); - BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub1)) - .addImm(0) - .add(InactiveHi) - .addImm(0) - .add(ActiveHi) - .addReg(ExecSrcReg) - .addReg(DstReg, RegState::ImplicitDefine); - } else if (UseVCndMask) { + + if (UseVCndMask) { // Single V_CNDMASK_B32 BuildMI(MBB, MI, DL, Desc, DstReg) .addImm(0) @@ -2410,20 +2361,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .setOperandDead(3); // Dead scc } // Copy inactive lanes - MachineInstr *VMov = - BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc); - if (VMov64) - expandPostRAPseudo(*VMov); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstReg).add(InactiveSrc); } if (!DstIsActive) { // Set exec mask to active lanes BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg); // Copy active lanes - MachineInstr *VMov = - BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg()) - .add(ActiveSrc); - if (VMov64) - expandPostRAPseudo(*VMov); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) + .add(ActiveSrc); } // Restore WWM BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index b7543238c1300..1549fddec8f0e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -246,9 +246,6 @@ def : GCNPat <(f32 (fptrunc_round f64:$src0, (i32 SupportedRoundMode:$round))), let Defs = [SCC], isConvergent = 1 in { def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst), (ins VSrc_b32: $src, VSrc_b32:$inactive), []>; - -def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst), - (ins VSrc_b64: $src, VSrc_b64:$inactive), []>; } // End Defs = [SCC] foreach vt = Reg32Types.types in { @@ -256,17 +253,9 @@ def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)), (V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>; } -foreach vt = Reg64Types.types in { -def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)), - (V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>; -} - def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)), (V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>; -def : GCNPat<(i64 (int_amdgcn_set_inactive_chain_arg i64:$src, i64:$inactive)), - (V_SET_INACTIVE_B64 VReg_64:$src, VReg_64:$inactive)>; - let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), (ins VSrc_b32: $src, VSrc_b32:$strategy), diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp index 29fef49ee7095..3bf2ea0f9e53e 100644 --- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -215,8 +215,7 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock *MBB : RPOT) { bool InWWM = false; for (MachineInstr &MI : *MBB) { - if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || - MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64) + if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32) RegsAssigned |= processDef(MI.getOperand(0)); if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR) { diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index f9d7ead4ff3ec..c267882c68936 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -557,8 +557,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, // This avoid unnecessarily marking M0 as requiring WQM. III.Needs |= StateStrictWQM; GlobalFlags |= StateStrictWQM; - } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || - Opcode == AMDGPU::V_SET_INACTIVE_B64) { + } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) { // Ignore these if V_SET_INACTIVE which already has exec src register. // These are generated by an earlier pass which has seperately ensured // WWM and provided a mask of inactive lanes. @@ -1078,7 +1077,6 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { ActiveLanesReg = 0; break; case AMDGPU::V_SET_INACTIVE_B32: - case AMDGPU::V_SET_INACTIVE_B64: if (ActiveLanesReg) { MI.addOperand(*MBB.getParent(), MachineOperand::CreateReg(ActiveLanesReg, false, true)); @@ -1528,8 +1526,7 @@ bool SIWholeQuadMode::lowerCopyInstrs() { LLVM_DEBUG(dbgs() << "simplify: " << *MI); Register RecomputeReg = 0; - if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || - MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) { + if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) { assert(MI->getNumExplicitOperands() == 3); if (MI->getOperand(2).isReg()) RecomputeReg = MI->getOperand(2).getReg(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index 137366a45cbdf..c78e072cc708f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -47,10 +47,12 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -175,6 +177,8 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -241,13 +245,13 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s6, 1 -; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -268,13 +272,13 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s6, 1.0 -; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, 1.0 ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: s_mov_b64 exec, s[4:5] @@ -318,18 +322,21 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s6, 0x10001 -; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, 0x10001 ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v1, 0x10001 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v2 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 @@ -345,18 +352,21 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s6, 0x3c003c00 -; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v2 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 @@ -372,18 +382,21 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s6, 0x3f803f80 -; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v1, v2 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 @@ -400,10 +413,12 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: s_mov_b64 exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 5a8df7b84bf2f..b17dfc7c3754a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -2890,68 +2890,65 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-LABEL: add_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[4:5] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[4:5] +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v4, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB5_2 @@ -2970,8 +2967,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX8_DPP-NEXT: v_add_u32_e32 v7, vcc, s5, v7 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -2984,68 +2981,65 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-LABEL: add_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v4, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB5_2 @@ -3064,8 +3058,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v8 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX9_DPP-NEXT: v_add_co_u32_e32 v7, vcc, s1, v7 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -3077,23 +3071,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1064_DPP-LABEL: add_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 @@ -3184,23 +3176,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032_DPP-LABEL: add_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 @@ -3275,31 +3265,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf @@ -3388,53 +3377,53 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 @@ -3478,31 +3467,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf @@ -3594,29 +3582,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 -; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf @@ -6611,68 +6600,65 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-LABEL: sub_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[4:5] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[4:5] +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s7, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s6, v4, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB11_2 @@ -6691,8 +6677,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 ; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX8_DPP-NEXT: v_sub_u32_e32 v7, vcc, s5, v7 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6705,68 +6691,65 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-LABEL: sub_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v4, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2 @@ -6785,8 +6768,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v8 ; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 ; GFX9_DPP-NEXT: v_sub_co_u32_e32 v7, vcc, s1, v7 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) @@ -6798,23 +6781,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1064_DPP-LABEL: sub_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 @@ -6905,23 +6886,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1032_DPP-LABEL: sub_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 @@ -6996,31 +6975,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf @@ -7109,53 +7087,53 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 ; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 @@ -7199,31 +7177,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf @@ -7315,29 +7292,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0 -; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0 -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo -; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 6bf03a202c143..60d9881a78ae0 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -2361,84 +2361,82 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: add_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v4, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_add_rtn_u64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: ds_add_rtn_u64 v[7:8], v9, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB6_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v9 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX8_DPP-NEXT: v_add_u32_e32 v7, vcc, s5, v7 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -2450,83 +2448,81 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: add_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v4, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 -; GFX9_DPP-NEXT: ds_add_rtn_u64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: ds_add_rtn_u64 v[7:8], v9, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB6_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v9 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX9_DPP-NEXT: v_add_co_u32_e32 v7, vcc, s5, v7 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -2538,23 +2534,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: add_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 @@ -2605,57 +2599,56 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1064_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_add_rtn_u64 v[9:10], v0, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB6_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10 ; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s0, v9 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s1, v10, vcc +; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s0, v11 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s1, v12, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: add_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 @@ -2685,33 +2678,34 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1032_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_add_rtn_u64 v[9:10], v0, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB6_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s0, v9 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v10, vcc_lo +; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s0, v11 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 ; GFX1032_DPP-NEXT: s_endpgm @@ -2722,31 +2716,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf @@ -2790,34 +2783,35 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s0 -; GFX1164_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s0 +; GFX1164_DPP-NEXT: ds_add_rtn_u64 v[8:9], v0, v[8:9] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB6_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v2 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32 v8, vcc, s0, v8 -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s1, v9, vcc +; GFX1164_DPP-NEXT: v_add_co_u32 v8, vcc, s0, v10 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s1, v11, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1164_DPP-NEXT: s_nop 0 @@ -2830,77 +2824,77 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4 -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s4 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 -; GFX1132_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0 +; GFX1132_DPP-NEXT: ds_add_rtn_u64 v[8:9], v0, v[8:9] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB6_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v2 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s0, v8 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s1, v9, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s0, v10 +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s1, v11, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1132_DPP-NEXT: s_nop 0 @@ -3163,164 +3157,158 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; ; GFX8_DPP-LABEL: add_i64_varying_nouse: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v8 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s3, v2, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v4, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX8_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s0 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_add_u64 v8, v[9:10] +; GFX8_DPP-NEXT: ds_add_u64 v7, v[8:9] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB7_2: ; GFX8_DPP-NEXT: s_endpgm ; ; GFX9_DPP-LABEL: add_i64_varying_nouse: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v8 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v1, vcc, v3, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s3, v2, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63 +; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9_DPP-NEXT: s_mov_b64 s[0:1], s[2:3] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX9_DPP-NEXT: ds_add_u64 v8, v[9:10] +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s0 +; GFX9_DPP-NEXT: ds_add_u64 v7, v[8:9] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB7_2: ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: add_i64_varying_nouse: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 @@ -3345,16 +3333,17 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 32 ; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_add_u32 s0, s3, s4 ; GFX1064_DPP-NEXT: s_addc_u32 s1, s2, s5 -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1064_DPP-NEXT: ds_add_u64 v10, v[11:12] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_add_u64 v0, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB7_2: @@ -3362,25 +3351,21 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; ; GFX1032_DPP-LABEL: add_i64_varying_nouse: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 @@ -3398,14 +3383,15 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v11, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: ds_add_u64 v10, v[11:12] +; GFX1032_DPP-NEXT: ds_add_u64 v0, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB7_2: @@ -3417,30 +3403,29 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v6, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v4, v1, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v4, v4 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc @@ -3460,15 +3445,16 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3 ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v1, v4, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: ds_add_u64 v7, v[8:9] +; GFX1164_DPP-NEXT: ds_add_u64 v0, v[6:7] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB7_2: @@ -3480,49 +3466,49 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_and_b32 v6, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s0 -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v4, v4 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v3, 0, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_co_u32 v2, vcc_lo, v3, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v4, vcc_lo ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v7, v3 ; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB7_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: ds_add_u64 v7, v[8:9] +; GFX1132_DPP-NEXT: ds_add_u64 v0, v[6:7] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB7_2: @@ -5909,84 +5895,82 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: sub_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v4, 63 ; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: ds_sub_rtn_u64 v[7:8], v9, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB14_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v9 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX8_DPP-NEXT: v_sub_u32_e32 v7, vcc, s5, v7 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -5998,83 +5982,81 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: sub_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3 +; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v4, 63 ; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 -; GFX9_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: ds_sub_rtn_u64 v[7:8], v9, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB14_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v9 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 ; GFX9_DPP-NEXT: v_sub_co_u32_e32 v7, vcc, s5, v7 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 @@ -6086,23 +6068,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: sub_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc +; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4 +; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc ; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7 @@ -6153,57 +6133,56 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1064_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v0, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB14_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10 ; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s0, v9 -; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s1, v10, vcc +; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s0, v11 +; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s1, v12, vcc ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: sub_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo +; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4 +; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo ; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7 @@ -6233,33 +6212,34 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1032_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v0, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB14_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s0, v9 -; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v10, vcc_lo +; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s0, v11 +; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0 ; GFX1032_DPP-NEXT: s_endpgm @@ -6270,31 +6250,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc -; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc +; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf @@ -6338,34 +6317,35 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s0 -; GFX1164_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s0 +; GFX1164_DPP-NEXT: ds_sub_rtn_u64 v[8:9], v0, v[8:9] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB14_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v2 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, s0, v8 -; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s1, v9, vcc +; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, s0, v10 +; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s1, v11, vcc ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1164_DPP-NEXT: s_nop 0 @@ -6378,77 +6358,77 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4 -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s4 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s4 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo +; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo ; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB14_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 -; GFX1132_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0 +; GFX1132_DPP-NEXT: ds_sub_rtn_u64 v[8:9], v0, v[8:9] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB14_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v1 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v2 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s0, v8 -; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s1, v9, vcc_lo +; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s0, v10 +; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s1, v11, vcc_lo ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1132_DPP-NEXT: s_nop 0 @@ -7432,129 +7412,121 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: and_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1] -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: s_mov_b32 s6, -1 +; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8] +; GFX8_DPP-NEXT: ds_and_rtn_b64 v[5:6], v7, v[5:6] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB16_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 -; GFX8_DPP-NEXT: s_mov_b32 s2, -1 -; GFX8_DPP-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX8_DPP-NEXT: v_and_b32_e32 v5, s5, v5 +; GFX8_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v6 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX8_DPP-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX8_DPP-NEXT: v_and_b32_e32 v5, s1, v5 ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[4:7], 0 ; GFX8_DPP-NEXT: s_endpgm ; ; GFX9_DPP-LABEL: and_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1] -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: s_mov_b32 s6, -1 +; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 -; GFX9_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8] +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX9_DPP-NEXT: ds_and_rtn_b64 v[5:6], v7, v[5:6] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB16_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 -; GFX9_DPP-NEXT: s_mov_b32 s2, -1 -; GFX9_DPP-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX9_DPP-NEXT: v_and_b32_e32 v5, s5, v5 +; GFX9_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v6 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000 +; GFX9_DPP-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX9_DPP-NEXT: v_and_b32_e32 v5, s1, v5 ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0 +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[4:7], 0 ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: and_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1] -; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7575,63 +7547,62 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1064_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1064_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB16_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s0, v8 -; GFX1064_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1064_DPP-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s1, v8 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: and_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4 -; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1 +; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -7643,199 +7614,201 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1032_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1032_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB16_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s0, v8 -; GFX1032_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1032_DPP-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s1, v8 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: and_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1] -; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 -; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1164_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1164_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB16_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s0, v8 -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1164_DPP-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s1, v8 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: and_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4 -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s4 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 -; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB16_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 -; GFX1132_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 +; GFX1132_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB16_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s0, v8 -; GFX1132_DPP-NEXT: v_and_b32_e32 v7, s1, v7 +; GFX1132_DPP-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s1, v8 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -8816,54 +8789,52 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: or_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8] +; GFX8_DPP-NEXT: ds_or_rtn_b64 v[5:6], v7, v[5:6] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB18_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s2, -1 ; GFX8_DPP-NEXT: v_or_b32_e32 v6, s4, v6 @@ -8874,53 +8845,51 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: or_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 -; GFX9_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8] +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX9_DPP-NEXT: ds_or_rtn_b64 v[5:6], v7, v[5:6] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB18_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s2, -1 ; GFX9_DPP-NEXT: v_or_b32_e32 v6, s4, v6 @@ -8931,14 +8900,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: or_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] -; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -8959,63 +8926,62 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1064_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1064_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB18_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s0, v8 -; GFX1064_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1064_DPP-NEXT: v_or_b32_e32 v9, s0, v9 +; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s1, v8 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: or_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 -; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -9027,199 +8993,201 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1032_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1032_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB18_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s0, v8 -; GFX1032_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1032_DPP-NEXT: v_or_b32_e32 v9, s0, v9 +; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s1, v8 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: or_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] -; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 -; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1164_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1164_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB18_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s0, v8 -; GFX1164_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1164_DPP-NEXT: v_or_b32_e32 v9, s0, v9 +; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s1, v8 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: or_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 -; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB18_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 -; GFX1132_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 +; GFX1132_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB18_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s0, v8 -; GFX1132_DPP-NEXT: v_or_b32_e32 v7, s1, v7 +; GFX1132_DPP-NEXT: v_or_b32_e32 v9, s0, v9 +; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s1, v8 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -10200,54 +10168,52 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: xor_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8] +; GFX8_DPP-NEXT: ds_xor_rtn_b64 v[5:6], v7, v[5:6] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB20_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX8_DPP-NEXT: s_mov_b32 s2, -1 ; GFX8_DPP-NEXT: v_xor_b32_e32 v6, s4, v6 @@ -10258,53 +10224,51 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: xor_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1] -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 -; GFX9_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8] +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4 +; GFX9_DPP-NEXT: ds_xor_rtn_b64 v[5:6], v7, v[5:6] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB20_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 ; GFX9_DPP-NEXT: s_mov_b32 s2, -1 ; GFX9_DPP-NEXT: v_xor_b32_e32 v6, s4, v6 @@ -10315,14 +10279,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: xor_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] -; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10343,63 +10305,62 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48 -; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48 +; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1064_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1064_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB20_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s0, v8 -; GFX1064_DPP-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX1064_DPP-NEXT: v_xor_b32_e32 v9, s0, v9 +; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s1, v8 ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 ; GFX1064_DPP-NEXT: s_endpgm ; ; GFX1032_DPP-LABEL: xor_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 -; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -10411,199 +10372,201 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 ; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1032_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1032_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB20_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s0, v8 -; GFX1032_DPP-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX1032_DPP-NEXT: v_xor_b32_e32 v9, s0, v9 +; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s1, v8 ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 +; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 ; GFX1032_DPP-NEXT: s_endpgm ; ; GFX1164_DPP-LABEL: xor_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1] -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1] -; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1] +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 -; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 15 +; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16 +; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 63 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16 -; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63 -; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 63 +; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32 +; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47 ; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] -; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48 -; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48 +; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 -; GFX1164_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 +; GFX1164_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB20_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s0, v8 -; GFX1164_DPP-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX1164_DPP-NEXT: v_xor_b32_e32 v9, s0, v9 +; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s1, v8 ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1164_DPP-NEXT: s_nop 0 ; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164_DPP-NEXT: s_endpgm ; ; GFX1132_DPP-LABEL: xor_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1 ; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1 -; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31 ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15 -; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16 -; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16 +; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB20_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 -; GFX1132_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0 +; GFX1132_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB20_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10 -; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v7 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s0, v8 -; GFX1132_DPP-NEXT: v_xor_b32_e32 v7, s1, v7 +; GFX1132_DPP-NEXT: v_xor_b32_e32 v9, s0, v9 +; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s1, v8 ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0 +; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0 ; GFX1132_DPP-NEXT: s_nop 0 ; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132_DPP-NEXT: s_endpgm @@ -11885,20 +11848,18 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-LABEL: max_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry ; GFX8_DPP-NEXT: s_mov_b32 s0, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX8_DPP-NEXT: s_brev_b32 s1, 1 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_mov_b64 exec, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 @@ -11953,22 +11914,22 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX8_DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: ds_max_rtn_i64 v[7:8], v9, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB23_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8] ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 @@ -11984,20 +11945,18 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-LABEL: max_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry ; GFX9_DPP-NEXT: s_mov_b32 s0, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX9_DPP-NEXT: s_brev_b32 s1, 1 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_mov_b64 exec, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 @@ -12052,21 +12011,21 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] ; GFX9_DPP-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 -; GFX9_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: ds_max_rtn_i64 v[7:8], v9, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB23_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8] ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 @@ -12081,15 +12040,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: max_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: s_mov_b32 s4, 0 ; GFX1064_DPP-NEXT: s_brev_b32 s5, 1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s5 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, s4, v9, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s5, v10, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -12156,30 +12113,31 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1064_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB23_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[9:10] @@ -12191,15 +12149,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: max_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: s_mov_b32 s0, 0 ; GFX1032_DPP-NEXT: s_brev_b32 s1, 1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s0, v9, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s1, v10, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -12244,29 +12200,30 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1032_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB23_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[9:10] @@ -12278,79 +12235,79 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_DPP-LABEL: max_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164_DPP-NEXT: s_mov_b32 s0, 0 -; GFX1164_DPP-NEXT: s_brev_b32 s1, 1 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s0 -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s[4:5] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s[4:5] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_mov_b32 s4, 0 +; GFX1164_DPP-NEXT: s_brev_b32 s5, 1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s5 +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1164_DPP-NEXT: v_readlane_b32 s0, v4, 31 -; GFX1164_DPP-NEXT: v_readlane_b32 s1, v3, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1 +; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -12369,29 +12326,30 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1164_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB23_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -12406,55 +12364,54 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_DPP-LABEL: max_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: s_mov_b32 s0, 0 ; GFX1132_DPP-NEXT: s_brev_b32 s1, 1 -; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -12462,27 +12419,28 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB23_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 -; GFX1132_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB23_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13771,21 +13729,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: min_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: s_mov_b32 s6, -1 ; GFX8_DPP-NEXT: s_brev_b32 s7, -2 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s6 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s7 +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX8_DPP-NEXT: s_mov_b64 exec, -1 +; GFX8_DPP-NEXT: s_mov_b32 s6, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 @@ -13839,22 +13795,22 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: ds_min_rtn_i64 v[7:8], v9, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB26_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v7 ; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX8_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v10 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8] ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s1 @@ -13868,21 +13824,19 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: min_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: s_mov_b32 s6, -1 ; GFX9_DPP-NEXT: s_brev_b32 s7, -2 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s6 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s7 +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8 +; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX9_DPP-NEXT: s_mov_b64 exec, -1 +; GFX9_DPP-NEXT: s_mov_b32 s6, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 @@ -13936,21 +13890,21 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 -; GFX9_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: ds_min_rtn_i64 v[7:8], v9, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB26_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v7 ; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v10 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8] ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s1 @@ -13964,15 +13918,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: min_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1064_DPP-NEXT: s_brev_b32 s7, -2 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -14039,29 +13991,30 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s7, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s10, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s11, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1064_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB26_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[9:10] @@ -14073,15 +14026,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: min_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_brev_b32 s7, -2 -; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s7 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 @@ -14126,28 +14077,29 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1032_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB26_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[9:10] @@ -14159,79 +14111,79 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX1164_DPP-LABEL: min_i64_varying: ; GFX1164_DPP: ; %bb.0: ; %entry -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1164_DPP-NEXT: s_brev_b32 s7, -2 -; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s6 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s7 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[5:6], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -14250,29 +14202,29 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s10, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s11, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9] -; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1164_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB26_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -14287,82 +14239,82 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; ; GFX1132_DPP-LABEL: min_i64_varying: ; GFX1132_DPP: ; %bb.0: ; %entry -; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_brev_b32 s7, -2 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132_DPP-NEXT: s_brev_b32 s7, -2 +; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1132_DPP-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s4 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s4 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB26_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 -; GFX1132_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB26_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -15634,85 +15586,84 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: umax_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: ds_max_rtn_u64 v[7:8], v9, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB29_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8] ; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc @@ -15726,84 +15677,83 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX9_DPP-LABEL: umax_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1] -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 -; GFX9_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: ds_max_rtn_u64 v[7:8], v9, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB29_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3 ; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8] ; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc @@ -15817,18 +15767,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX1064_DPP-LABEL: umax_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -15892,30 +15838,31 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1064_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB29_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[9:10] @@ -15927,13 +15874,11 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: umax_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 @@ -15978,29 +15923,30 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1032_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB29_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[9:10] @@ -16016,73 +15962,73 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -16101,29 +16047,30 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1164_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB29_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -16142,49 +16089,49 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -16192,27 +16139,28 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB29_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 -; GFX1132_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB29_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -17485,201 +17433,193 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; ; GFX8_DPP-LABEL: umin_i64_varying: ; GFX8_DPP: ; %bb.0: ; %entry -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1] -; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s[0:1] +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX8_DPP-NEXT: s_mov_b32 s6, -1 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX8_DPP-NEXT: s_nop 0 -; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX8_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX8_DPP-NEXT: ; %bb.1: -; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4 ; GFX8_DPP-NEXT: s_mov_b32 m0, -1 -; GFX8_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10] +; GFX8_DPP-NEXT: ds_min_rtn_u64 v[7:8], v9, v[7:8] ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX8_DPP-NEXT: .LBB32_2: ; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10 -; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9 -; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8] -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v8 +; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX8_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[7:8] +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s1 ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc -; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4 -; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000 -; GFX8_DPP-NEXT: s_mov_b32 s2, -1 +; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX8_DPP-NEXT: s_mov_b32 s7, 0xf000 ; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc ; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 ; GFX8_DPP-NEXT: s_endpgm ; ; GFX9_DPP-LABEL: umin_i64_varying: ; GFX9_DPP: ; %bb.0: ; %entry -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0 -; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0 +; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1] -; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s[0:1] +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 +; GFX9_DPP-NEXT: s_mov_b32 s6, -1 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 ; GFX9_DPP-NEXT: s_nop 0 -; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63 -; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63 -; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6] +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9_DPP-NEXT: v_readlane_b32 s5, v2, 63 +; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX9_DPP-NEXT: ; %bb.1: -; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5 -; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4 -; GFX9_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10] +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4 +; GFX9_DPP-NEXT: ds_min_rtn_u64 v[7:8], v9, v[7:8] ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9_DPP-NEXT: .LBB32_2: ; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1 -; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10 -; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9 -; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8] -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v8 +; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v7 +; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3 +; GFX9_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[7:8] +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s1 ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc -; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4 -; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000 -; GFX9_DPP-NEXT: s_mov_b32 s2, -1 +; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0 +; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000 ; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc ; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0) -; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0 +; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0 ; GFX9_DPP-NEXT: s_endpgm ; ; GFX1064_DPP-LABEL: umin_i64_varying: ; GFX1064_DPP: ; %bb.0: ; %entry -; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s[0:1] -; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s[0:1] +; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1] ; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -17743,30 +17683,31 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48 ; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX1064_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1064_DPP-NEXT: ; %bb.1: -; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1064_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] +; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1064_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10] ; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064_DPP-NEXT: buffer_gl0_inv ; GFX1064_DPP-NEXT: .LBB32_2: ; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[9:10] @@ -17778,13 +17719,11 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; ; GFX1032_DPP-LABEL: umin_i64_varying: ; GFX1032_DPP: ; %bb.0: ; %entry -; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s4 -; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s4 +; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s4 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2 @@ -17829,29 +17768,30 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15 ; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 +; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1032_DPP-NEXT: ; %bb.1: -; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1032_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] +; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1032_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10] ; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032_DPP-NEXT: buffer_gl0_inv ; GFX1032_DPP-NEXT: .LBB32_2: ; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[9:10] @@ -17867,73 +17807,73 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0 -; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0 +; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1 -; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s[0:1] -; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s[0:1] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 +; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1] ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 -; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[5:6], v[3:4] -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 ; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31 +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31 -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6] ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -17952,29 +17892,30 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48 ; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48 ; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GFX1164_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec -; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1164_DPP-NEXT: ; %bb.1: -; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1 -; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0 -; GFX1164_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] +; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1 +; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0 +; GFX1164_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10] ; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164_DPP-NEXT: buffer_gl0_inv ; GFX1164_DPP-NEXT: .LBB32_2: ; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -17993,49 +17934,49 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0 +; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s4 -; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s4 +; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 +; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s4 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 -; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[5:6], v[3:4] -; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 ; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6] ; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15 ; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31 ; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -18043,27 +17984,28 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 ; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo -; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12 -; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 +; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB32_2 ; GFX1132_DPP-NEXT: ; %bb.1: -; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0 -; GFX1132_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12] +; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0 +; GFX1132_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10] ; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132_DPP-NEXT: buffer_gl0_inv ; GFX1132_DPP-NEXT: .LBB32_2: ; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v9 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12 -; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11 ; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 311c609291886..009f8b2704d50 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -7330,44 +7330,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -7463,9 +7466,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -7585,9 +7588,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -7691,12 +7694,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -7813,42 +7816,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -8907,44 +8910,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -9008,9 +9014,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9096,9 +9102,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9168,12 +9174,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -9259,42 +9265,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -10330,44 +10336,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -10431,9 +10440,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -10519,9 +10528,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -10591,12 +10600,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -10682,42 +10691,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -11235,44 +11244,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -11336,9 +11348,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11424,9 +11436,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11496,12 +11508,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -11587,42 +11599,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -13305,44 +13317,47 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -13438,9 +13453,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -13560,9 +13575,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -13666,12 +13681,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -13788,42 +13803,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1132-DPP-NEXT: ; %bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 9dc82b17bd3f4..f82b5af855ef3 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -4742,51 +4742,54 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -4884,9 +4887,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5016,9 +5019,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5130,12 +5133,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -5264,51 +5267,50 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -6188,51 +6190,54 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -6298,9 +6303,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6381,9 +6386,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6446,12 +6451,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] @@ -6550,51 +6555,50 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -8238,51 +8242,54 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -8380,9 +8387,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8512,9 +8519,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8626,12 +8633,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -8760,51 +8767,50 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 945583c88ce26..c43e4e1dd2eaa 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -4742,51 +4742,54 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -4884,9 +4887,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5016,9 +5019,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -5130,12 +5133,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -5264,51 +5267,50 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -6188,51 +6190,54 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8] -; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4] -; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -6298,9 +6303,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6381,9 +6386,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -6446,12 +6451,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] @@ -6550,51 +6555,50 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -8238,51 +8242,54 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] -; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] -; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -8380,9 +8387,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8512,9 +8519,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8626,12 +8633,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] @@ -8760,51 +8767,50 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1132-DPP-NEXT: ; %bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 3bc0f2546794d..c02a1b2c56e95 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -7650,44 +7650,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -7783,9 +7786,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -7905,9 +7908,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -8011,12 +8014,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -8133,42 +8136,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -9226,44 +9229,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -9327,9 +9333,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9415,9 +9421,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -9487,12 +9493,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -9578,42 +9584,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -10649,44 +10655,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -10750,9 +10759,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -10838,9 +10847,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -10910,12 +10919,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -11001,42 +11010,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -11554,44 +11563,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] -; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8] +; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4] +; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -11655,9 +11667,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11743,9 +11755,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -11815,12 +11827,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -11906,42 +11918,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3 ; GFX1132-DPP-NEXT: ; %bb.1: @@ -13623,44 +13635,47 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] -; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] -; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 ; GFX9-DPP-NEXT: s_nop 0 -; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13] ; GFX9-DPP-NEXT: s_nop 1 -; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] +; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 @@ -13756,9 +13771,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] +; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -13878,9 +13893,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 +; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf @@ -13984,12 +13999,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1] +; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8 -; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1] ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9 -; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) @@ -14106,42 +14121,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0 +; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9] -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3 ; GFX1132-DPP-NEXT: ; %bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll index c1b58f1795aae..fbf8c203dcb39 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll @@ -93,18 +93,18 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6 ; DAGISEL11: ; %bb.0: ; DAGISEL11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1 -; DAGISEL11-NEXT: v_dual_mov_b32 v1, v11 :: v_dual_mov_b32 v0, v10 +; DAGISEL11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v10 ; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0 ; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1 ; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; DAGISEL11-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0 -; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; DAGISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0 +; DAGISEL11-NEXT: v_cndmask_b32_e64 v2, v0, v13, s0 +; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v12, s0 ; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0 -; DAGISEL11-NEXT: v_mov_b32_e32 v2, v0 -; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; DAGISEL11-NEXT: v_mov_b32_e32 v3, v1 -; DAGISEL11-NEXT: global_store_b64 v[8:9], v[2:3], off +; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; DAGISEL11-NEXT: v_mov_b32_e32 v4, v2 +; DAGISEL11-NEXT: global_store_b64 v[8:9], v[3:4], off ; DAGISEL11-NEXT: s_endpgm ; ; GISEL10-LABEL: set_inactive_chain_arg_64: @@ -127,16 +127,16 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6 ; DAGISEL10: ; %bb.0: ; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1 -; DAGISEL10-NEXT: v_mov_b32_e32 v1, v11 -; DAGISEL10-NEXT: v_mov_b32_e32 v0, v10 +; DAGISEL10-NEXT: v_mov_b32_e32 v0, v11 +; DAGISEL10-NEXT: v_mov_b32_e32 v1, v10 ; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0 ; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1 -; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0 -; DAGISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0 +; DAGISEL10-NEXT: v_cndmask_b32_e64 v2, v0, v13, s0 +; DAGISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v12, s0 ; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0 -; DAGISEL10-NEXT: v_mov_b32_e32 v2, v0 ; DAGISEL10-NEXT: v_mov_b32_e32 v3, v1 -; DAGISEL10-NEXT: global_store_dwordx2 v[8:9], v[2:3], off +; DAGISEL10-NEXT: v_mov_b32_e32 v4, v2 +; DAGISEL10-NEXT: global_store_dwordx2 v[8:9], v[3:4], off ; DAGISEL10-NEXT: s_endpgm ; ; GISEL11_W64-LABEL: set_inactive_chain_arg_64: @@ -162,18 +162,19 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6 ; DAGISEL11_W64: ; %bb.0: ; DAGISEL11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v1, v11 -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v10 +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v11 +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v1, v10 ; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 -; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] -; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v2, v0, v13, s[0:1] +; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[0:1] ; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1] -; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; DAGISEL11_W64-NEXT: v_mov_b32_e32 v2, v0 ; DAGISEL11_W64-NEXT: v_mov_b32_e32 v3, v1 -; DAGISEL11_W64-NEXT: global_store_b64 v[8:9], v[2:3], off +; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_3) +; DAGISEL11_W64-NEXT: v_mov_b32_e32 v4, v2 +; DAGISEL11_W64-NEXT: global_store_b64 v[8:9], v[3:4], off ; DAGISEL11_W64-NEXT: s_endpgm ; ; GISEL10_W64-LABEL: set_inactive_chain_arg_64: @@ -196,16 +197,16 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6 ; DAGISEL10_W64: ; %bb.0: ; DAGISEL10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v1, v11 -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v10 +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v11 +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v1, v10 ; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] ; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1 -; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1] -; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] +; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v2, v0, v13, s[0:1] +; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[0:1] ; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1] -; DAGISEL10_W64-NEXT: v_mov_b32_e32 v2, v0 ; DAGISEL10_W64-NEXT: v_mov_b32_e32 v3, v1 -; DAGISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[2:3], off +; DAGISEL10_W64-NEXT: v_mov_b32_e32 v4, v2 +; DAGISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[3:4], off ; DAGISEL10_W64-NEXT: s_endpgm %tmp = call i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64 %active, i64 %inactive) #0 %wwm = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll index 6dc4a2ce0504b..59110255c49db 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -47,7 +47,6 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -55,9 +54,12 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 @@ -171,18 +173,20 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: s_mov_b32 s10, 0x4010cccc +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s10 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: s_mov_b32 s0, 0xcccccccd -; GCN-NEXT: s_mov_b32 s1, 0x4010cccc ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: v_mov_b32_e32 v2, v0 @@ -248,19 +252,19 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: s_mov_b32 s10, 1 -; GCN-NEXT: s_mov_b32 s11, s10 -; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: v_mov_b32_e32 v1, 1 ; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 @@ -277,19 +281,19 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: s_mov_b32 s10, 1.0 -; GCN-NEXT: s_mov_b32 s11, s10 -; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: v_mov_b32_e32 v1, 1.0 ; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 1.0 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 @@ -331,18 +335,19 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-NEXT: s_mov_b32 s10, 0x10001 -; GCN-NEXT: s_mov_b32 s11, s10 -; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: v_mov_b32_e32 v1, s10 ; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 @@ -360,18 +365,19 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-NEXT: s_mov_b32 s10, 0x3c003c00 -; GCN-NEXT: s_mov_b32 s11, s10 -; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: v_mov_b32_e32 v1, s10 ; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 @@ -389,18 +395,19 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-NEXT: s_mov_b32 s10, 0x3f803f80 -; GCN-NEXT: s_mov_b32 s11, s10 -; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: v_mov_b32_e32 v1, s10 ; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 @@ -417,7 +424,6 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -425,9 +431,12 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index ff692acda3c25..92117e0688f65 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1708,8 +1708,8 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, s6, s0 ; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s7, s0 +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, s6, s0 ; GFX1032-NEXT: s_mov_b32 exec_lo, s0 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0 @@ -1722,8 +1722,8 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, s6, s[0:1] ; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s7, s[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, s6, s[0:1] ; GFX1064-NEXT: s_mov_b64 exec, s[0:1] ; GFX1064-NEXT: v_mov_b32_e32 v2, v0 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 47e1897f6b420..4dda8a58a8d5c 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -545,14 +545,14 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill @@ -563,20 +563,20 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[38:39], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s38, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s39, 1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-O0-NEXT: v_writelane_b32 v0, s36, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s37, 1 ; GFX9-O0-NEXT: s_mov_b32 s34, s8 ; GFX9-O0-NEXT: s_mov_b32 s40, s6 -; GFX9-O0-NEXT: s_mov_b32 s36, s4 +; GFX9-O0-NEXT: s_mov_b32 s38, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 ; GFX9-O0-NEXT: s_mov_b32 s41, s7 ; GFX9-O0-NEXT: s_mov_b32 s35, s41 ; GFX9-O0-NEXT: s_mov_b32 s42, s40 -; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37 -; GFX9-O0-NEXT: s_mov_b32 s37, s5 -; GFX9-O0-NEXT: s_mov_b32 s43, s37 -; GFX9-O0-NEXT: s_mov_b32 s44, s36 +; GFX9-O0-NEXT: ; kill: def $sgpr38 killed $sgpr38 def $sgpr38_sgpr39 +; GFX9-O0-NEXT: s_mov_b32 s39, s5 +; GFX9-O0-NEXT: s_mov_b32 s43, s39 +; GFX9-O0-NEXT: s_mov_b32 s44, s38 ; GFX9-O0-NEXT: ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45_sgpr46_sgpr47 ; GFX9-O0-NEXT: s_mov_b32 s45, s43 ; GFX9-O0-NEXT: s_mov_b32 s46, s42 @@ -590,18 +590,27 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s9 -; GFX9-O0-NEXT: ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35 -; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s36 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37 -; GFX9-O0-NEXT: s_mov_b64 exec, s[38:39] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35 +; GFX9-O0-NEXT: ; kill: def $sgpr38_sgpr39 killed $sgpr34_sgpr35 +; GFX9-O0-NEXT: s_mov_b32 s40, s35 +; GFX9-O0-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-O0-NEXT: s_mov_b32 s41, s39 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41 +; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s40 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 killed $sgpr34_sgpr35 +; GFX9-O0-NEXT: s_mov_b32 s35, s38 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s35 +; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s34 ; GFX9-O0-NEXT: s_mov_b64 exec, -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 ; GFX9-O0-NEXT: s_mov_b32 s34, 32 -; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s34, v[8:9] +; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s34, v[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 ; GFX9-O0-NEXT: s_getpc_b64 s[34:35] ; GFX9-O0-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4 ; GFX9-O0-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12 @@ -610,8 +619,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_mov_b64 s[36:37], s[0:1] ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 @@ -647,12 +656,12 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload @@ -671,7 +680,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill @@ -683,24 +691,26 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 ; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: s_getpc_b64 s[34:35] ; GFX9-O3-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4 ; GFX9-O3-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12 ; GFX9-O3-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, s8 ; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 +; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc ; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3 @@ -735,8 +745,10 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_mov_b32 s36, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 @@ -747,73 +759,88 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: s_mov_b32 s34, 5 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: buffer_load_dwordx4 v[10:13], v0, s[36:39], s34 offen -; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], v0, s[36:39], s34 offen offset:16 +; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[36:39], s34 offen +; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[36:39], s34 offen offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff -; GFX9-O0-NEXT: s_mov_b32 s42, -1 -; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43 -; GFX9-O0-NEXT: s_mov_b32 s43, s35 +; GFX9-O0-NEXT: s_mov_b32 s44, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45 +; GFX9-O0-NEXT: s_mov_b32 s45, s35 +; GFX9-O0-NEXT: s_mov_b32 s42, s45 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43 ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b32 s35, s44 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 ; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr43 +; GFX9-O0-NEXT: ; implicit-def: $sgpr43 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43 ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 ; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr43 +; GFX9-O0-NEXT: ; implicit-def: $sgpr43 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43 ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr35 +; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 -; GFX9-O0-NEXT: buffer_store_dwordx4 v[5:8], v0, s[36:39], s34 offen -; GFX9-O0-NEXT: buffer_store_dwordx2 v[3:4], v0, s[36:39], s34 offen offset:16 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: buffer_store_dwordx4 v[6:9], v0, s[36:39], s34 offen +; GFX9-O0-NEXT: buffer_store_dwordx2 v[4:5], v0, s[36:39], s34 offen offset:16 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -836,28 +863,23 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen ; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: s_mov_b32 s36, -1 -; GFX9-O3-NEXT: s_brev_b32 s37, -2 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, s36 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s37 +; GFX9-O3-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s36 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, s37 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[34:35] +; GFX9-O3-NEXT: v_bfrev_b32_e32 v4, -2 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 ; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v5, s36 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s37 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, -1, v9, s[34:35] +; GFX9-O3-NEXT: v_bfrev_b32_e32 v6, -2 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, -1, v11, s[34:35] ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 @@ -908,9 +930,11 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -961,110 +985,113 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v34, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v35, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v39, s17 -; GFX9-O0-NEXT: v_mov_b32_e32 v38, s18 -; GFX9-O0-NEXT: v_mov_b32_e32 v37, s19 -; GFX9-O0-NEXT: v_mov_b32_e32 v36, s20 -; GFX9-O0-NEXT: v_mov_b32_e32 v35, s21 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-O0-NEXT: v_mov_b32_e32 v40, s18 +; GFX9-O0-NEXT: v_mov_b32_e32 v39, s19 +; GFX9-O0-NEXT: v_mov_b32_e32 v38, s20 +; GFX9-O0-NEXT: v_mov_b32_e32 v37, s21 +; GFX9-O0-NEXT: v_mov_b32_e32 v36, s22 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v46, s23 -; GFX9-O0-NEXT: v_mov_b32_e32 v45, s24 -; GFX9-O0-NEXT: v_mov_b32_e32 v44, s25 -; GFX9-O0-NEXT: v_mov_b32_e32 v43, s26 -; GFX9-O0-NEXT: v_mov_b32_e32 v42, s27 -; GFX9-O0-NEXT: v_mov_b32_e32 v41, s28 -; GFX9-O0-NEXT: v_mov_b32_e32 v40, s29 +; GFX9-O0-NEXT: v_mov_b32_e32 v46, s24 +; GFX9-O0-NEXT: v_mov_b32_e32 v45, s25 +; GFX9-O0-NEXT: v_mov_b32_e32 v44, s26 +; GFX9-O0-NEXT: v_mov_b32_e32 v43, s27 +; GFX9-O0-NEXT: v_mov_b32_e32 v42, s28 +; GFX9-O0-NEXT: v_mov_b32_e32 v41, s29 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v39 -; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v38 -; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v37 -; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v36 -; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v35 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v40 +; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v39 +; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v38 +; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v37 +; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v36 +; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(5) -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v34 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v46 -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v45 -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v44 -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v43 -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v42 -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v41 -; GFX9-O0-NEXT: v_mov_b32_e32 v25, v40 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v35 +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v46 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v45 +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v44 +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v42 +; GFX9-O0-NEXT: v_mov_b32_e32 v25, v41 ; GFX9-O0-NEXT: s_waitcnt vmcnt(5) -; GFX9-O0-NEXT: v_mov_b32_e32 v26, v39 +; GFX9-O0-NEXT: v_mov_b32_e32 v26, v40 ; GFX9-O0-NEXT: s_waitcnt vmcnt(4) -; GFX9-O0-NEXT: v_mov_b32_e32 v27, v38 +; GFX9-O0-NEXT: v_mov_b32_e32 v27, v39 ; GFX9-O0-NEXT: s_waitcnt vmcnt(3) -; GFX9-O0-NEXT: v_mov_b32_e32 v28, v37 +; GFX9-O0-NEXT: v_mov_b32_e32 v28, v38 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_mov_b32_e32 v29, v36 +; GFX9-O0-NEXT: v_mov_b32_e32 v29, v37 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v30, v35 -; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr34 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v30, v36 +; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr35 killed $exec ; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -1104,63 +1131,110 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 +; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 -; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-O0-NEXT: s_mov_b32 s37, s39 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v0 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v32, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: s_mov_b32 s36, s38 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38 +; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v32 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v34 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v32, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v32 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v32, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38 +; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v32 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v34 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v32, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v32 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v32, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38 +; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v32 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v34 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v0 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v0 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38 +; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v34 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v0 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v0 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36 +; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v33 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10 @@ -1219,8 +1293,10 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -1242,32 +1318,32 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O3-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:16 ; GFX9-O3-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v1, s[34:35] ; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v2, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v1, s[34:35] ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v32 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v33 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v3, s[34:35] ; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v4, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v3, s[34:35] ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v32 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v33 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v5, s[34:35] ; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v5, s[34:35] ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v5, v32 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, v33 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[34:35] ; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v8, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[34:35] ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v32 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v33 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v9, s[34:35] ; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v10, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v9, s[34:35] ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v9, v32 ; GFX9-O3-NEXT: v_mov_b32_e32 v10, v33 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 7f0db3e362b30..80717e82d5f2c 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -594,20 +594,20 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 ; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c -; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5 +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s6, s19 -; GFX9-O0-NEXT: s_mov_b32 s7, s18 +; GFX9-O0-NEXT: s_mov_b32 s2, s19 +; GFX9-O0-NEXT: s_mov_b32 s3, s18 ; GFX9-O0-NEXT: s_mov_b32 s15, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 ; GFX9-O0-NEXT: s_mov_b32 s17, s15 -; GFX9-O0-NEXT: s_mov_b32 s18, s7 -; GFX9-O0-NEXT: s_mov_b32 s19, s6 +; GFX9-O0-NEXT: s_mov_b32 s18, s3 +; GFX9-O0-NEXT: s_mov_b32 s19, s2 ; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 ; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 ; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 @@ -615,13 +615,26 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 -; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-O0-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-O0-NEXT: s_mov_b32 s16, s3 +; GFX9-O0-NEXT: s_mov_b32 s15, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s16 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s15 ; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b32 s3, s2 +; GFX9-O0-NEXT: s_mov_b32 s2, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-O0-NEXT: s_mov_b32 s2, 32 +; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s2, v[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -631,11 +644,6 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: s_addc_u32 s0, s0, s1 ; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 ; GFX9-O0-NEXT: s_mov_b32 s9, s0 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: s_mov_b32 s0, 32 -; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 -; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s0, v[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 ; GFX9-O0-NEXT: s_getpc_b64 s[0:1] ; GFX9-O0-NEXT: s_add_u32 s0, s0, called_i64@gotpcrel32@lo+4 ; GFX9-O0-NEXT: s_addc_u32 s1, s1, called_i64@gotpcrel32@hi+12 @@ -650,8 +658,8 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -703,11 +711,13 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O3-NEXT: v_mov_b32_e32 v7, s4 ; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 ; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 @@ -724,14 +734,14 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O3-NEXT: s_mov_b32 s13, s7 ; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v7, vcc +; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc ; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v4 @@ -763,70 +773,83 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O0-NEXT: s_mov_b32 s4, 5 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: buffer_load_dwordx4 v[10:13], v0, s[0:3], s4 offen -; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16 +; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen +; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff -; GFX9-O0-NEXT: s_mov_b32 s8, -1 -; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GFX9-O0-NEXT: s_mov_b32 s9, s5 +; GFX9-O0-NEXT: s_mov_b32 s10, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 +; GFX9-O0-NEXT: s_mov_b32 s11, s5 +; GFX9-O0-NEXT: s_mov_b32 s8, s11 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 ; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 ; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 -; GFX9-O0-NEXT: buffer_store_dwordx4 v[5:8], v0, s[0:3], s4 offen -; GFX9-O0-NEXT: buffer_store_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: buffer_store_dwordx4 v[6:9], v0, s[0:3], s4 offen +; GFX9-O0-NEXT: buffer_store_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16 ; GFX9-O0-NEXT: s_endpgm ; ; GFX9-O3-LABEL: _amdgpu_cs_main: @@ -835,28 +858,23 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen ; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-O3-NEXT: s_mov_b32 s6, -1 -; GFX9-O3-NEXT: s_brev_b32 s7, -2 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-O3-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[4:5] +; GFX9-O3-NEXT: v_bfrev_b32_e32 v4, -2 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 ; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, -1, v9, s[4:5] +; GFX9-O3-NEXT: v_bfrev_b32_e32 v6, -2 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, -1, v11, s[4:5] ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 @@ -1483,20 +1501,20 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 ; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c -; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5 +; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s6, s19 -; GFX9-O0-NEXT: s_mov_b32 s7, s18 +; GFX9-O0-NEXT: s_mov_b32 s2, s19 +; GFX9-O0-NEXT: s_mov_b32 s3, s18 ; GFX9-O0-NEXT: s_mov_b32 s15, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 ; GFX9-O0-NEXT: s_mov_b32 s17, s15 -; GFX9-O0-NEXT: s_mov_b32 s18, s7 -; GFX9-O0-NEXT: s_mov_b32 s19, s6 +; GFX9-O0-NEXT: s_mov_b32 s18, s3 +; GFX9-O0-NEXT: s_mov_b32 s19, s2 ; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 ; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 ; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 @@ -1504,13 +1522,26 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7 -; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-O0-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-O0-NEXT: s_mov_b32 s16, s3 +; GFX9-O0-NEXT: s_mov_b32 s15, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s16 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s15 ; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b32 s3, s2 +; GFX9-O0-NEXT: s_mov_b32 s2, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr2 +; GFX9-O0-NEXT: ; implicit-def: $sgpr2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 +; GFX9-O0-NEXT: s_mov_b32 s2, 32 +; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s2, v[8:9] +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -1520,11 +1551,6 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: s_addc_u32 s0, s0, s1 ; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 ; GFX9-O0-NEXT: s_mov_b32 s9, s0 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: s_mov_b32 s0, 32 -; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 -; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s0, v[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 ; GFX9-O0-NEXT: s_getpc_b64 s[0:1] ; GFX9-O0-NEXT: s_add_u32 s0, s0, strict_wwm_called_i64@gotpcrel32@lo+4 ; GFX9-O0-NEXT: s_addc_u32 s1, s1, strict_wwm_called_i64@gotpcrel32@hi+12 @@ -1539,8 +1565,8 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -1592,11 +1618,13 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O3-NEXT: v_mov_b32_e32 v7, s4 ; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 ; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 @@ -1613,14 +1641,14 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O3-NEXT: s_mov_b32 s13, s7 ; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v7, vcc +; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc ; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v4 @@ -1652,70 +1680,83 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O0-NEXT: s_mov_b32 s4, 5 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0 ; GFX9-O0-NEXT: s_mov_b32 s4, 0 -; GFX9-O0-NEXT: buffer_load_dwordx4 v[10:13], v0, s[0:3], s4 offen -; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16 +; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen +; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff -; GFX9-O0-NEXT: s_mov_b32 s8, -1 -; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 -; GFX9-O0-NEXT: s_mov_b32 s9, s5 +; GFX9-O0-NEXT: s_mov_b32 s10, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 +; GFX9-O0-NEXT: s_mov_b32 s11, s5 +; GFX9-O0-NEXT: s_mov_b32 s8, s11 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: s_mov_b32 s5, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 ; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 ; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: ; implicit-def: $sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O0-NEXT: s_mov_b64 exec, -1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 +; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 -; GFX9-O0-NEXT: buffer_store_dwordx4 v[5:8], v0, s[0:3], s4 offen -; GFX9-O0-NEXT: buffer_store_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16 +; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: buffer_store_dwordx4 v[6:9], v0, s[0:3], s4 offen +; GFX9-O0-NEXT: buffer_store_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16 ; GFX9-O0-NEXT: s_endpgm ; ; GFX9-O3-LABEL: strict_wwm_amdgpu_cs_main: @@ -1724,28 +1765,23 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen ; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-O3-NEXT: s_mov_b32 s6, -1 -; GFX9-O3-NEXT: s_brev_b32 s7, -2 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-O3-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[4:5] +; GFX9-O3-NEXT: v_bfrev_b32_e32 v4, -2 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9 ; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 ; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, -1, v9, s[4:5] +; GFX9-O3-NEXT: v_bfrev_b32_e32 v6, -2 ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11 ; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 ; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, -1, v11, s[4:5] ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 ; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 From 03605287cb6ae1f7bb4da00afc5c180d184ebf52 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 6 Sep 2024 17:31:59 +0100 Subject: [PATCH 2/4] Make V_SET_INACTIVE_B32 a VOP3 instruction This makes it trivial to expand it to V_CNDMASK_B32 in expandPostRAPseudo so we no longer need the fallback option of expanding it with V_MOV_B32 and exec mask manipulations. --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 106 +--- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 - llvm/lib/Target/AMDGPU/SIInstructions.td | 10 +- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 62 +-- .../global-atomic-fadd.f32-no-rtn.ll | 12 +- .../GlobalISel/global-atomic-fadd.f32-rtn.ll | 9 +- .../GlobalISel/llvm.amdgcn.set.inactive.ll | 240 ++++----- .../atomic_optimizations_local_pointer.ll | 56 +- .../AMDGPU/global-atomic-fadd.f32-no-rtn.ll | 13 +- .../AMDGPU/global-atomic-fadd.f32-rtn.ll | 37 +- .../AMDGPU/global_atomics_scan_fadd.ll | 55 +- .../AMDGPU/global_atomics_scan_fmax.ll | 33 +- .../AMDGPU/global_atomics_scan_fmin.ll | 33 +- .../AMDGPU/global_atomics_scan_fsub.ll | 55 +- .../AMDGPU/llvm.amdgcn.set.inactive.ll | 258 ++++----- .../AMDGPU/sgpr-spill-overlap-wwm-reserve.mir | 4 +- llvm/test/CodeGen/AMDGPU/wqm.mir | 29 +- .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 296 +++++------ llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 496 +++++++++--------- 19 files changed, 737 insertions(+), 1069 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 34298e81f1530..59900d7e7fe9e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2098,20 +2098,6 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { } } -Register SIInstrInfo::findSetInactiveMask(const MachineInstr &MI) { - assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32); - for (auto &Op : MI.implicit_operands()) { - if (Op.isDef()) - continue; - Register OpReg = Op.getReg(); - if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO || - OpReg == AMDGPU::SCC) - continue; - return OpReg; - } - return Register(); -} - bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MBB.findDebugLoc(MI); @@ -2287,92 +2273,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { break; } case AMDGPU::V_SET_INACTIVE_B32: { - unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; - unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - Register ExecReg = RI.getExec(); + // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32. Register DstReg = MI.getOperand(0).getReg(); - MachineOperand &ActiveSrc = MI.getOperand(1); - MachineOperand &InactiveSrc = MI.getOperand(2); - - // Find implicit register defining lanes active outside WWM. - Register ExecSrcReg = findSetInactiveMask(MI); - assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region"); - // Note: default here is set to ExecReg so that functional MIR is still - // generated if implicit def is not found and assertions are disabled. - if (!ExecSrcReg) - ExecSrcReg = ExecReg; - - // Ideally in WWM this operation is lowered to V_CNDMASK; however, - // constant bus constraints and the presence of literal constants - // present an issue. - // Fallback to V_MOV base lowering in all but the common cases. - MachineFunction *MF = MBB.getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64; - const MCInstrDesc &Desc = get(Opcode); - - const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0); - const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0); - - int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); - - int ConstantBusLimit = ST.getConstantBusLimit(Opcode); - int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; - int ConstantBusUses = - 1 + // Starts at 1 for ExecSrcReg - (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) + - (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0); - int LiteralConstants = - ((ActiveSrc.isReg() || - (ActiveSrc.isImm() && isInlineConstant(ActiveImm))) - ? 0 - : 1) + - ((InactiveSrc.isReg() || - (InactiveSrc.isImm() && isInlineConstant(InactiveImm))) - ? 0 - : 1); - - bool UseVCndMask = - ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit; - - if (UseVCndMask) { - // Single V_CNDMASK_B32 - BuildMI(MBB, MI, DL, Desc, DstReg) - .addImm(0) - .add(InactiveSrc) - .addImm(0) - .add(ActiveSrc) - .addReg(ExecSrcReg); - } else { - // Fallback V_MOV case. - // Avoid unnecessary work if a source VGPR is also the destination. - // This can happen if WWM register allocation was efficient. - // Note: this assumes WWM execution. - bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg; - bool DstIsInactive = - InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg; - if (!DstIsInactive) { - // Set exec mask to inactive lanes, - // but only if active lanes would be overwritten. - if (DstIsActive) { - BuildMI(MBB, MI, DL, get(NotOpc), ExecReg) - .addReg(ExecSrcReg) - .setOperandDead(3); // Dead scc - } - // Copy inactive lanes - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstReg).add(InactiveSrc); - } - if (!DstIsActive) { - // Set exec mask to active lanes - BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg); - // Copy active lanes - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) - .add(ActiveSrc); - } - // Restore WWM - BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1); - } + BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) + .add(MI.getOperand(5)); MI.eraseFromParent(); break; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 71432510fdee4..4fd9b4366159b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1437,8 +1437,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { // This is used if an operand is a 32 bit register but needs to be aligned // regardless. void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const; - - static Register findSetInactiveMask(const MachineInstr &MI); }; /// \brief Returns true if a reg:subreg pair P has a TRC class diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 1549fddec8f0e..5df595ff2cf4a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -243,18 +243,16 @@ def : GCNPat <(f32 (fptrunc_round f64:$src0, (i32 SupportedRoundMode:$round))), // Invert the exec mask and overwrite the inactive lanes of dst with inactive, // restoring it after we're done. -let Defs = [SCC], isConvergent = 1 in { -def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst), - (ins VSrc_b32: $src, VSrc_b32:$inactive), []>; -} // End Defs = [SCC] +let isConvergent = 1 in +def V_SET_INACTIVE_B32 : VOP3_Pseudo<"v_set_inactive_b32", VOP2e_I32_I32_I32_I1>; foreach vt = Reg32Types.types in { def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)), - (V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>; + (V_SET_INACTIVE_B32 0, VSrc_b32:$src, 0, VSrc_b32:$inactive, (IMPLICIT_DEF))>; } def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)), - (V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>; + (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>; let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index c267882c68936..208af0634f81c 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -558,24 +558,17 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, III.Needs |= StateStrictWQM; GlobalFlags |= StateStrictWQM; } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) { - // Ignore these if V_SET_INACTIVE which already has exec src register. - // These are generated by an earlier pass which has seperately ensured - // WWM and provided a mask of inactive lanes. - Register ExecSrc = TII->findSetInactiveMask(MI); - if (!ExecSrc) { - // Disable strict states; StrictWQM will be added as required later. - III.Disabled = StateStrict; - MachineOperand &Inactive = MI.getOperand(2); - if (Inactive.isReg()) { - if (Inactive.isUndef()) { - LowerToCopyInstrs.insert(&MI); - } else { - markOperand(MI, Inactive, StateStrictWWM, Worklist); - } - } - SetInactiveInstrs.push_back(&MI); - BBI.NeedsLowering = true; + // Disable strict states; StrictWQM will be added as required later. + III.Disabled = StateStrict; + MachineOperand &Inactive = MI.getOperand(4); + if (Inactive.isReg()) { + if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0) + LowerToCopyInstrs.insert(&MI); + else + markOperand(MI, Inactive, StateStrictWWM, Worklist); } + SetInactiveInstrs.push_back(&MI); + BBI.NeedsLowering = true; } else if (TII->isDisableWQM(MI)) { BBI.Needs |= StateExact; if (!(BBI.InNeeds & StateExact)) { @@ -1078,8 +1071,10 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { break; case AMDGPU::V_SET_INACTIVE_B32: if (ActiveLanesReg) { - MI.addOperand(*MBB.getParent(), - MachineOperand::CreateReg(ActiveLanesReg, false, true)); + LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg()); + MRI->constrainRegClass(ActiveLanesReg, TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID)); + MI.getOperand(5).setReg(ActiveLanesReg); + LIS->shrinkToUses(&LI); } else { assert(State == StateExact || State == StateWQM); } @@ -1525,12 +1520,20 @@ bool SIWholeQuadMode::lowerCopyInstrs() { for (MachineInstr *MI : LowerToCopyInstrs) { LLVM_DEBUG(dbgs() << "simplify: " << *MI); - Register RecomputeReg = 0; if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) { - assert(MI->getNumExplicitOperands() == 3); - if (MI->getOperand(2).isReg()) - RecomputeReg = MI->getOperand(2).getReg(); - MI->removeOperand(2); + assert(MI->getNumExplicitOperands() == 6); + + LiveInterval *LI = nullptr; + if (MI->getOperand(4).isReg()) + LI = &LIS->getInterval(MI->getOperand(4).getReg()); + + MI->removeOperand(5); + MI->removeOperand(4); + MI->removeOperand(3); + MI->removeOperand(1); + + if (LI) + LIS->shrinkToUses(LI); } else { assert(MI->getNumExplicitOperands() == 2); } @@ -1539,19 +1542,8 @@ bool SIWholeQuadMode::lowerCopyInstrs() { ? (unsigned)AMDGPU::COPY : TII->getMovOpcode(TRI->getRegClassForOperandReg( *MRI, MI->getOperand(0))); - int Index = MI->findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - while (Index >= 0) { - MI->removeOperand(Index); - Index = MI->findRegisterUseOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - } - MI->setDesc(TII->get(CopyOp)); LLVM_DEBUG(dbgs() << " -> " << *MI); - - if (RecomputeReg) { - LIS->removeInterval(RecomputeReg); - LIS->createAndComputeVirtRegInterval(RecomputeReg); - } } return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty(); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll index 9c634aba348d2..89c3bbc83018e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll @@ -85,7 +85,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX908-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 ; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec + ; GFX908-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec ; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] ; GFX908-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX908-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec @@ -159,7 +160,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec ; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec + ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] ; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX90A-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec @@ -233,7 +235,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec ; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 ; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec + ; GFX940-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec ; GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] ; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec @@ -300,7 +303,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec + ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY6]], [[DEF1]], implicit $exec ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 353, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll index fdce9d9258c88..8eef3d4ac4a3d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll @@ -77,7 +77,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec ; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF1]], implicit $exec ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] ; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX90A-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec @@ -170,7 +171,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec ; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 ; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] - ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF1]], implicit $exec ; GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] ; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec @@ -255,7 +257,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec + ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY6]], [[DEF1]], implicit $exec ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index c78e072cc708f..f015099517902 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -4,15 +4,13 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -45,19 +43,15 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 @@ -91,22 +85,18 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x ; GCN-LABEL: set_inactive_scc: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34 +; GCN-NEXT: s_load_dword s8, s[2:3], 0x2c ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_buffer_load_dword s6, s[4:7], 0x0 -; GCN-NEXT: s_load_dword s7, s[2:3], 0x2c +; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s2, 1 +; GCN-NEXT: v_mov_b32_e32 v1, s8 +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[2:3] +; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s6, 56 -; GCN-NEXT: s_cselect_b32 s3, 1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_cmp_lg_u32 s4, 56 ; GCN-NEXT: v_mov_b32_e32 v1, v0 -; GCN-NEXT: s_cmp_lg_u32 s3, 0 +; GCN-NEXT: s_mov_b32 s2, 1 ; GCN-NEXT: s_cbranch_scc0 .LBB4_2 ; GCN-NEXT: ; %bb.1: ; %.one ; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v1 @@ -147,15 +137,16 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000 +; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -176,15 +167,15 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-NEXT: v_mov_b32_e32 v1, 0x4010cccc ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3] +; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -197,15 +188,16 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x10001 +; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -220,15 +212,16 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00 +; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -244,19 +237,15 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 1, v2, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v1, 1, v3, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 @@ -271,19 +260,15 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: v_mov_b32_e32 v0, 1.0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 1.0, v2, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 @@ -297,15 +282,16 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 +; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -325,21 +311,17 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % ; GCN-NEXT: v_mov_b32_e32 v0, 0x10001 ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s3 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v1, 0x10001 -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v1, v2 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v3, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v4, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[3:4], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> ) #0 %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0) @@ -355,21 +337,17 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> ; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s3 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00 -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v1, v2 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v3, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v4, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[3:4], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> ) #0 %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0) @@ -385,21 +363,17 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa ; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_mov_b32_e32 v3, s2 +; GCN-NEXT: v_mov_b32_e32 v4, s3 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v3, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v4, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v1, v2 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[3:4], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> ) #0 %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0) @@ -411,19 +385,15 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 @@ -437,15 +407,13 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -460,15 +428,13 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -483,15 +449,13 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 @@ -506,15 +470,13 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 60d9881a78ae0..988bc8eec6e51 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -10897,11 +10897,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_mov_b64 exec, -1 -; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -10945,11 +10942,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_mov_b64 exec, -1 -; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -11853,12 +11847,10 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: s_brev_b32 s1, 1 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, -1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5] ; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 @@ -11950,12 +11942,10 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: s_brev_b32 s1, 1 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5] -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, -1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s1 +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5] ; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 @@ -12779,11 +12769,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX8_DPP-NEXT: s_mov_b64 exec, -1 -; GFX8_DPP-NEXT: s_nop 0 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] +; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX8_DPP-NEXT: s_nop 1 ; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -12827,11 +12814,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0 -; GFX9_DPP-NEXT: s_mov_b64 exec, -1 -; GFX9_DPP-NEXT: s_nop 0 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] +; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX9_DPP-NEXT: s_nop 1 ; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf @@ -13733,14 +13717,12 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8_DPP-NEXT: s_brev_b32 s7, -2 -; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX8_DPP-NEXT: s_mov_b64 exec, -1 ; GFX8_DPP-NEXT: s_mov_b32 s6, -1 +; GFX8_DPP-NEXT: s_brev_b32 s7, -2 ; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[0:1] ; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] ; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4 @@ -13828,14 +13810,12 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0 ; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7 ; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9_DPP-NEXT: s_brev_b32 s7, -2 -; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0 -; GFX9_DPP-NEXT: s_mov_b64 exec, -1 ; GFX9_DPP-NEXT: s_mov_b32 s6, -1 +; GFX9_DPP-NEXT: s_brev_b32 s7, -2 ; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6 +; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s7 +; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[0:1] ; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] ; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3 ; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll index 6bd0b11acc3ea..d62ff3795d8b8 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll @@ -59,7 +59,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX908-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec ; GFX908-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec - ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec + ; GFX908-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF]], implicit $exec ; GFX908-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX908-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec ; GFX908-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec @@ -120,7 +121,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec ; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec - ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec + ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF]], implicit $exec ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec @@ -177,7 +179,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX11_GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX11_GFX12-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[S_MOV_B32_]], implicit $exec ; GFX11_GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec - ; GFX11_GFX12-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec + ; GFX11_GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; GFX11_GFX12-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF]], implicit $exec ; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 353, 15, 15, 0, implicit $exec ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec ; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 354, 15, 15, 0, implicit $exec @@ -186,8 +189,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec ; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 360, 15, 15, 0, implicit $exec ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec - ; GFX11_GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX11_GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; GFX11_GFX12-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX11_GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF1]] ; GFX11_GFX12-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], [[COPY5]], 0, implicit $exec ; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec ; GFX11_GFX12-NEXT: early-clobber %1:vgpr_32 = STRICT_WWM killed [[V_ADD_F32_e64_4]], implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll index 6766c0c1fdaeb..946ee9eb9c065 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll @@ -61,7 +61,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec ; GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec - ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec + ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF1]], implicit $exec ; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX90A-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec ; GFX90A-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec @@ -80,7 +81,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec ; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] - ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; GFX90A-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.2 ; GFX90A-NEXT: {{ $}} @@ -102,11 +103,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX90A-NEXT: bb.4 (%ir-block.35): ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 + ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 ; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec - ; GFX90A-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec - ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec + ; GFX90A-NEXT: early-clobber %46:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec + ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec ; GFX90A-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]] ; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] ; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec @@ -142,7 +143,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec ; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec ; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec - ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec + ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF1]], implicit $exec ; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec ; GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec @@ -161,7 +163,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX940-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec ; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec ; GFX940-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] - ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX940-NEXT: S_BRANCH %bb.2 ; GFX940-NEXT: {{ $}} @@ -183,11 +185,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX940-NEXT: bb.4 (%ir-block.35): ; GFX940-NEXT: successors: %bb.3(0x80000000) ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 + ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 ; GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec - ; GFX940-NEXT: early-clobber %44:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec - ; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec + ; GFX940-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec + ; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec ; GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]] ; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]] ; GFX940-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec @@ -219,7 +221,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[S_MOV_B32_]], implicit $exec ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec - ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec + ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF + ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF1]], implicit $exec ; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec @@ -229,8 +232,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF1]] + ; GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] ; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[COPY5]], 0, implicit $exec ; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], killed [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec ; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec @@ -244,7 +247,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_1]], implicit $exec ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]] - ; GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX11-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: S_BRANCH %bb.2 ; GFX11-NEXT: {{ $}} @@ -266,11 +269,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace ; GFX11-NEXT: bb.4 (%ir-block.32): ; GFX11-NEXT: successors: %bb.3(0x80000000) ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 + ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF3]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2 ; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec - ; GFX11-NEXT: early-clobber %46:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec - ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: early-clobber %47:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec + ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %47, 0, 0, implicit $mode, implicit $exec ; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY6]] ; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_5]], 0, [[V_READFIRSTLANE_B32_]], [[COPY8]], implicit $exec ; GFX11-NEXT: S_BRANCH %bb.3 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 009f8b2704d50..2b18f472c8c40 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -816,10 +816,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2027,10 +2024,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3298,10 +3292,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -4065,10 +4056,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -5361,10 +5349,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -7330,12 +7315,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 @@ -8910,12 +8893,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 @@ -10336,12 +10317,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 @@ -11244,12 +11223,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 @@ -13317,12 +13294,10 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index f82b5af855ef3..e3144ae24ae8d 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -718,10 +718,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -1756,10 +1753,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2794,10 +2788,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -4742,12 +4733,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 @@ -6190,12 +6179,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 @@ -8242,12 +8229,10 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index c43e4e1dd2eaa..ddc103184cdf3 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -718,10 +718,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -1756,10 +1753,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2794,10 +2788,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -4742,12 +4733,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 @@ -6190,12 +6179,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 @@ -8242,12 +8229,10 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index c02a1b2c56e95..f353edff1b477 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -894,10 +894,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -2217,10 +2214,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3540,10 +3534,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -4359,10 +4350,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -5681,10 +5669,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1] ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1 ; GFX9-DPP-NEXT: s_nop 1 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf @@ -7650,12 +7635,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 @@ -9229,12 +9212,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 @@ -10655,12 +10636,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 @@ -11563,12 +11542,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6 @@ -13635,12 +13612,10 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1 -; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-DPP-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-DPP-NEXT: s_mov_b64 exec, -1 ; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1 +; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1] ; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1] ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll index 59110255c49db..6fb5a9ce47a84 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -5,16 +5,14 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -46,21 +44,19 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) { ; GCN-LABEL: set_inactive_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 @@ -98,11 +94,9 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; GCN-NEXT: v_mov_b32_e32 v1, s8 ; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GCN-NEXT: v_mov_b32_e32 v0, 42 -; GCN-NEXT: s_mov_b64 exec, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[2:3] ; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s4, 56 @@ -147,17 +141,15 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) { ; GCN-LABEL: set_inactive_f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s7, 0x40400000 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -172,23 +164,21 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { ; GCN-LABEL: set_inactive_f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: s_mov_b32 s10, 0x4010cccc ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s10 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b32 s0, 0xcccccccd -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x4010cccc +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0xcccccccd +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 @@ -202,17 +192,15 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) { define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) { ; GCN-LABEL: set_inactive_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s7, 0x10001 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x10001 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -226,17 +214,15 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> % define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; GCN-LABEL: set_inactive_v2f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s7, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -251,21 +237,19 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 1, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 1, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 @@ -280,21 +264,19 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 1.0 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 1.0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 1.0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 @@ -308,17 +290,15 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) { ; GCN-LABEL: set_inactive_v2bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s7, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -333,22 +313,20 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: s_mov_b32 s10, 0x10001 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s10 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x10001 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 @@ -363,22 +341,20 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: s_mov_b32 s10, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s10 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 @@ -393,22 +369,20 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: s_mov_b32 s10, 0x3f803f80 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s10 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 @@ -423,21 +397,19 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { ; GCN-LABEL: set_inactive_p0: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 @@ -451,16 +423,14 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) { define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) { ; GCN-LABEL: set_inactive_p2: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -474,16 +444,14 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) { ; GCN-LABEL: set_inactive_p3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -497,16 +465,14 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) { ; GCN-LABEL: set_inactive_p5: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 @@ -520,16 +486,14 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) { ; GCN-LABEL: set_inactive_p6: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c +; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir index 8a5f75332557e..b0fb24e60bead 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir @@ -111,7 +111,7 @@ body: | ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr5 ; GCN-NEXT: $sgpr22 = IMPLICIT_DEF ; GCN-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr2 - ; GCN-NEXT: dead $vgpr1 = V_SET_INACTIVE_B32 $vgpr0, 0, implicit $exec, implicit-def $scc + ; GCN-NEXT: dead $vgpr1 = V_SET_INACTIVE_B32 0, $vgpr0, 0, 0, $sgpr_null, implicit $exec, implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) @@ -212,7 +212,7 @@ body: | $sgpr22 = IMPLICIT_DEF SI_SPILL_S32_SAVE $sgpr22, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5) - %0:vgpr_32 = V_SET_INACTIVE_B32 $vgpr0, 0, implicit $exec, implicit-def $scc + %0:vgpr_32 = V_SET_INACTIVE_B32 0, $vgpr0, 0, 0, $sgpr_null, implicit $exec, implicit-def $scc bb.1: KILL implicit-def $vcc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99_sgpr100_sgpr101_sgpr102_sgpr103 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir index 64a7c4457395c..3013aabbd3bd4 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.mir +++ b/llvm/test/CodeGen/AMDGPU/wqm.mir @@ -40,9 +40,6 @@ define amdgpu_vs void @no_wqm_in_vs() { ret void } - define amdgpu_ps void @preloaded_set_inactive() { - ret void - } ... --- @@ -155,7 +152,7 @@ registers: - { id: 9, class: sreg_32, preferred-register: '' } - { id: 10, class: vgpr_32, preferred-register: '' } - { id: 11, class: vgpr_32, preferred-register: '' } - - { id: 12, class: sreg_32, preferred-register: '' } + - { id: 12, class: vgpr_32, preferred-register: '' } - { id: 13, class: vgpr_32, preferred-register: '' } - { id: 14, class: vgpr_32, preferred-register: '' } - { id: 15, class: vgpr_32, preferred-register: '' } @@ -179,7 +176,8 @@ body: | %8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %6, %7, 0, 0, 0, implicit $exec %16:vgpr_32 = COPY %8.sub1 %11:vgpr_32 = COPY %16 - %10:vgpr_32 = V_SET_INACTIVE_B32 %11, undef %12:sreg_32, implicit $exec, implicit-def $scc + %17:sreg_64_xexec = IMPLICIT_DEF + %10:vgpr_32 = V_SET_INACTIVE_B32 0, %11, 0, undef %12, undef %17, implicit $exec, implicit-def $scc %14:vgpr_32 = COPY %7 %13:vgpr_32 = V_MOV_B32_dpp %14, killed %10, 323, 12, 15, 0, implicit $exec early-clobber %15:vgpr_32 = STRICT_WWM killed %13, implicit $exec @@ -298,8 +296,9 @@ body: | %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:vgpr_32 = COPY $vgpr0 %2:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN %1:vgpr_32, %0:sgpr_128, 0, 0, 0, 0, implicit $exec - %2.sub0:vreg_64 = V_SET_INACTIVE_B32 %2.sub0:vreg_64, 0, implicit $exec, implicit-def $scc - %2.sub1:vreg_64 = V_SET_INACTIVE_B32 %2.sub1:vreg_64, 0, implicit $exec, implicit-def $scc + %4:sreg_64_xexec = IMPLICIT_DEF + %2.sub0:vreg_64 = V_SET_INACTIVE_B32 0, %2.sub0:vreg_64, 0, 0, undef %4, implicit $exec, implicit-def $scc + %2.sub1:vreg_64 = V_SET_INACTIVE_B32 0, %2.sub1:vreg_64, 0, 0, undef %4, implicit $exec, implicit-def $scc %3:vreg_64 = nnan nsz arcp contract reassoc nofpexcept V_MAX_F64_e64 0, %2:vreg_64, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec $vgpr0 = STRICT_WWM %3.sub0:vreg_64, implicit $exec $vgpr1 = STRICT_WWM %3.sub1:vreg_64, implicit $exec @@ -446,19 +445,3 @@ body: | %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4) ... - ---- -# Preserve V_SET_INACTIVE with exec mask already specified -#CHECK-LABEL: name: preloaded_set_inactive -#CHECK: V_SET_INACTIVE_B32 -name: preloaded_set_inactive -tracksRegLiveness: true -body: | - bb.0: - liveins: $vgpr1, $vgpr2 - - %0:vgpr_32 = COPY $vgpr1 - %1:vgpr_32 = COPY $vgpr2 - %mask:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - %value:vgpr_32 = V_SET_INACTIVE_B32 %0:vgpr_32, %1:vgpr_32, implicit $exec, implicit-def $scc, implicit %mask:sreg_64 -... diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 4dda8a58a8d5c..b35ef6497f14a 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -30,15 +30,15 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr40_sgpr41 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[40:41] ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr40_sgpr41 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -170,11 +170,10 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[36:37] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -201,6 +200,9 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[34:35] ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -352,32 +354,32 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 ; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 -; GFX9-O0-NEXT: s_mov_b32 s42, s6 +; GFX9-O0-NEXT: s_mov_b32 s40, s6 ; GFX9-O0-NEXT: s_mov_b32 s34, s4 -; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43 -; GFX9-O0-NEXT: s_mov_b32 s43, s7 -; GFX9-O0-NEXT: s_mov_b32 s44, s43 -; GFX9-O0-NEXT: s_mov_b32 s45, s42 +; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 +; GFX9-O0-NEXT: s_mov_b32 s41, s7 +; GFX9-O0-NEXT: s_mov_b32 s42, s41 +; GFX9-O0-NEXT: s_mov_b32 s43, s40 ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s5 -; GFX9-O0-NEXT: s_mov_b32 s46, s35 +; GFX9-O0-NEXT: s_mov_b32 s44, s35 ; GFX9-O0-NEXT: s_mov_b32 s36, s34 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 -; GFX9-O0-NEXT: s_mov_b32 s37, s46 -; GFX9-O0-NEXT: s_mov_b32 s38, s45 -; GFX9-O0-NEXT: s_mov_b32 s39, s44 +; GFX9-O0-NEXT: s_mov_b32 s37, s44 +; GFX9-O0-NEXT: s_mov_b32 s38, s43 +; GFX9-O0-NEXT: s_mov_b32 s39, s42 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 -; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr40_sgpr41 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[40:41] ; GFX9-O0-NEXT: s_getpc_b64 s[42:43] ; GFX9-O0-NEXT: s_add_u32 s42, s42, strict_wwm_called@rel32@lo+4 ; GFX9-O0-NEXT: s_addc_u32 s43, s43, strict_wwm_called@rel32@hi+12 @@ -396,8 +398,8 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; GFX9-O0-NEXT: s_mov_b32 s33, s48 @@ -417,11 +419,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 ; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: s_getpc_b64 s[36:37] ; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called@rel32@lo+4 @@ -539,11 +539,11 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-LABEL: strict_wwm_call_i64: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s50, s33 +; GFX9-O0-NEXT: s_mov_b32 s48, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill @@ -560,50 +560,52 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x1000 -; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane +; GFX9-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane ; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s36, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s37, 1 ; GFX9-O0-NEXT: s_mov_b32 s34, s8 -; GFX9-O0-NEXT: s_mov_b32 s40, s6 -; GFX9-O0-NEXT: s_mov_b32 s38, s4 -; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 -; GFX9-O0-NEXT: s_mov_b32 s41, s7 -; GFX9-O0-NEXT: s_mov_b32 s35, s41 -; GFX9-O0-NEXT: s_mov_b32 s42, s40 +; GFX9-O0-NEXT: s_mov_b32 s38, s6 +; GFX9-O0-NEXT: s_mov_b32 s36, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr38 killed $sgpr38 def $sgpr38_sgpr39 -; GFX9-O0-NEXT: s_mov_b32 s39, s5 -; GFX9-O0-NEXT: s_mov_b32 s43, s39 +; GFX9-O0-NEXT: s_mov_b32 s39, s7 +; GFX9-O0-NEXT: s_mov_b32 s35, s39 ; GFX9-O0-NEXT: s_mov_b32 s44, s38 -; GFX9-O0-NEXT: ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45_sgpr46_sgpr47 -; GFX9-O0-NEXT: s_mov_b32 s45, s43 -; GFX9-O0-NEXT: s_mov_b32 s46, s42 -; GFX9-O0-NEXT: s_mov_b32 s47, s35 -; GFX9-O0-NEXT: v_writelane_b32 v0, s44, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s45, 3 -; GFX9-O0-NEXT: v_writelane_b32 v0, s46, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s47, 5 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] +; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37 +; GFX9-O0-NEXT: s_mov_b32 s37, s5 +; GFX9-O0-NEXT: s_mov_b32 s45, s37 +; GFX9-O0-NEXT: s_mov_b32 s40, s36 +; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43 +; GFX9-O0-NEXT: s_mov_b32 s41, s45 +; GFX9-O0-NEXT: s_mov_b32 s42, s44 +; GFX9-O0-NEXT: s_mov_b32 s43, s35 +; GFX9-O0-NEXT: v_writelane_b32 v1, s40, 0 +; GFX9-O0-NEXT: v_writelane_b32 v1, s41, 1 +; GFX9-O0-NEXT: v_writelane_b32 v1, s42, 2 +; GFX9-O0-NEXT: v_writelane_b32 v1, s43, 3 ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s9 -; GFX9-O0-NEXT: ; kill: def $sgpr38_sgpr39 killed $sgpr34_sgpr35 -; GFX9-O0-NEXT: s_mov_b32 s40, s35 -; GFX9-O0-NEXT: s_mov_b64 s[38:39], 0 -; GFX9-O0-NEXT: s_mov_b32 s41, s39 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s41 -; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-O0-NEXT: ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35 +; GFX9-O0-NEXT: s_mov_b32 s38, s35 +; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-O0-NEXT: s_mov_b32 s40, s37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr42_sgpr43 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s38 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[38:39], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s40 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[38:39] +; GFX9-O0-NEXT: s_mov_b64 exec, s[38:39] ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 killed $sgpr34_sgpr35 -; GFX9-O0-NEXT: s_mov_b32 s35, s38 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s35 -; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s34 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: v_writelane_b32 v1, s34, 4 +; GFX9-O0-NEXT: v_writelane_b32 v1, s35, 5 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s36 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[34:35] ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 @@ -623,20 +625,20 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] +; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 0 -; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 1 -; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 2 -; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 3 -; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 4 -; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 5 +; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 4 +; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 5 +; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 0 +; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 1 +; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 2 +; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] +; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 @@ -654,7 +656,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: ; kill: killed $vgpr0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload @@ -668,7 +670,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000 -; GFX9-O0-NEXT: s_mov_b32 s33, s50 +; GFX9-O0-NEXT: s_mov_b32 s33, s48 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; @@ -690,19 +692,17 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 ; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[34:35] +; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s9 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 ; GFX9-O3-NEXT: s_getpc_b64 s[34:35] ; GFX9-O3-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4 ; GFX9-O3-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12 ; GFX9-O3-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) @@ -770,15 +770,13 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45 ; GFX9-O0-NEXT: s_mov_b32 s45, s35 ; GFX9-O0-NEXT: s_mov_b32 s42, s45 +; GFX9-O0-NEXT: ; implicit-def: $sgpr46_sgpr47 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 -; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[40:41] ; GFX9-O0-NEXT: s_mov_b32 s35, s44 +; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35 -; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[40:41] ; GFX9-O0-NEXT: ; implicit-def: $sgpr43 ; GFX9-O0-NEXT: ; implicit-def: $sgpr43 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 @@ -788,14 +786,12 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 -; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[40:41] +; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35 -; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[40:41] ; GFX9-O0-NEXT: ; implicit-def: $sgpr43 ; GFX9-O0-NEXT: ; implicit-def: $sgpr43 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 @@ -805,17 +801,15 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42 -; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[40:41] ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr42_sgpr43 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35 -; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[40:41] ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 @@ -850,45 +844,38 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) +; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) +; GFX9-O3-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O3-NEXT: s_waitcnt vmcnt(0) +; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen -; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16 +; GFX9-O3-NEXT: buffer_load_dwordx4 v[8:11], v0, s[4:7], 0 offen +; GFX9-O3-NEXT: buffer_load_dwordx2 v[12:13], v0, s[4:7], 0 offen offset:16 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[34:35] -; GFX9-O3-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, -1, v9, s[34:35] -; GFX9-O3-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, v1, v9, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, v1, v11, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, -1, v11, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, v1, v13, s[34:35] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, -1, v12, s[34:35] ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v11, v5 ; GFX9-O3-NEXT: v_mov_b32_e32 v12, v6 -; GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[4:7], 0 offen -; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16 +; GFX9-O3-NEXT: v_mov_b32_e32 v13, v7 +; GFX9-O3-NEXT: buffer_store_dwordx4 v[8:11], v0, s[4:7], 0 offen +; GFX9-O3-NEXT: buffer_store_dwordx2 v[12:13], v0, s[4:7], 0 offen offset:16 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -896,6 +883,7 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in ; GFX9-O3-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] @@ -1141,18 +1129,16 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: s_mov_b64 s[38:39], 0 ; GFX9-O0-NEXT: s_mov_b32 s37, s39 +; GFX9-O0-NEXT: ; implicit-def: $sgpr40_sgpr41 ; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37 -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v32, v0 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: s_mov_b32 s36, s38 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 ; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36 -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v33, v0 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35] ; GFX9-O0-NEXT: ; implicit-def: $sgpr38 ; GFX9-O0-NEXT: ; implicit-def: $sgpr38 ; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32 @@ -1161,17 +1147,15 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 ; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37 -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v32, v0 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 ; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36 -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v33, v0 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35] ; GFX9-O0-NEXT: ; implicit-def: $sgpr38 ; GFX9-O0-NEXT: ; implicit-def: $sgpr38 ; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32 @@ -1180,17 +1164,15 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 ; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37 -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v32, v0 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 ; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36 -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v33, v0 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35] ; GFX9-O0-NEXT: ; implicit-def: $sgpr38 ; GFX9-O0-NEXT: ; implicit-def: $sgpr38 ; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32 @@ -1199,17 +1181,15 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 ; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37 -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v32, v0 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 ; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36 -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v33, v0 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35] ; GFX9-O0-NEXT: ; implicit-def: $sgpr38 ; GFX9-O0-NEXT: ; implicit-def: $sgpr38 ; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32 @@ -1218,17 +1198,15 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 ; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37 -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v32, v0 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39 ; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36 -; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v33, v0 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35] ; GFX9-O0-NEXT: ; implicit-def: $sgpr36 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36 ; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 80717e82d5f2c..7fecab068396e 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -26,15 +26,15 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -147,11 +147,10 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -178,6 +177,9 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-O0-NEXT: ; implicit-def: $sgpr0_sgpr1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -336,40 +338,40 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: v_readlane_b32 s1, v7, 3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s3, s9 -; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 -; GFX9-O0-NEXT: s_mov_b32 s9, s17 -; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 +; GFX9-O0-NEXT: s_mov_b32 s3, s7 +; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 +; GFX9-O0-NEXT: s_mov_b32 s7, s9 +; GFX9-O0-NEXT: s_mov_b32 s16, s8 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s9 -; GFX9-O0-NEXT: s_mov_b32 s18, s8 +; GFX9-O0-NEXT: s_mov_b32 s17, s7 +; GFX9-O0-NEXT: s_mov_b32 s18, s6 ; GFX9-O0-NEXT: s_mov_b32 s19, s3 -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 -; GFX9-O0-NEXT: s_mov_b32 s3, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4 +; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5 +; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6 +; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7 +; GFX9-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 9 +; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 10 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[2:3] ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -398,13 +400,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6 -; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7 -; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8 -; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9 -; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4 -; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5 -; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10 +; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 +; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5 +; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6 +; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7 +; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9 +; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10 +; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -434,15 +436,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[20:21] ; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] @@ -587,47 +587,49 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: v_readlane_b32 s1, v12, 3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s2, s19 -; GFX9-O0-NEXT: s_mov_b32 s3, s18 -; GFX9-O0-NEXT: s_mov_b32 s15, s17 +; GFX9-O0-NEXT: s_mov_b32 s6, s9 +; GFX9-O0-NEXT: s_mov_b32 s7, s8 +; GFX9-O0-NEXT: s_mov_b32 s8, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s15 -; GFX9-O0-NEXT: s_mov_b32 s18, s3 -; GFX9-O0-NEXT: s_mov_b32 s19, s2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 +; GFX9-O0-NEXT: s_mov_b32 s17, s8 +; GFX9-O0-NEXT: s_mov_b32 s18, s7 +; GFX9-O0-NEXT: s_mov_b32 s19, s6 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4 +; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5 +; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6 +; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s15, s7 +; GFX9-O0-NEXT: s_mov_b32 s8, s3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr16_sgpr17 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s15 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[8:9] +; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 +; GFX9-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 8 +; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-O0-NEXT: s_mov_b32 s16, s3 -; GFX9-O0-NEXT: s_mov_b32 s15, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s16 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s15 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 -; GFX9-O0-NEXT: s_mov_b32 s3, s2 -; GFX9-O0-NEXT: s_mov_b32 s2, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s2 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3] ; GFX9-O0-NEXT: ; implicit-def: $sgpr2 ; GFX9-O0-NEXT: ; implicit-def: $sgpr2 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 @@ -666,12 +668,12 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6 -; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7 -; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8 -; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9 -; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5 +; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 +; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5 +; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6 +; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7 +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -709,16 +711,13 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s4 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[8:9] +; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 ; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 ; GFX9-O3-NEXT: s_getpc_b64 s[2:3] @@ -727,6 +726,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x0 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[20:21] ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] @@ -784,15 +784,13 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 ; GFX9-O0-NEXT: s_mov_b32 s11, s5 ; GFX9-O0-NEXT: s_mov_b32 s8, s11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] ; GFX9-O0-NEXT: s_mov_b32 s5, s10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7] ; GFX9-O0-NEXT: ; implicit-def: $sgpr9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr9 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 @@ -802,14 +800,12 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7] ; GFX9-O0-NEXT: ; implicit-def: $sgpr9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr9 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 @@ -819,17 +815,15 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[6:7] ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 @@ -855,35 +849,27 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { ; GFX9-O3-LABEL: _amdgpu_cs_main: ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen -; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16 +; GFX9-O3-NEXT: buffer_load_dwordx4 v[8:11], v0, s[0:3], 0 offen +; GFX9-O3-NEXT: buffer_load_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-O3-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O3-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[4:5] -; GFX9-O3-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, -1, v9, s[4:5] -; GFX9-O3-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, v1, v9, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, v1, v11, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, -1, v11, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, v1, v13, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, -1, v12, s[4:5] ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v11, v5 ; GFX9-O3-NEXT: v_mov_b32_e32 v12, v6 -; GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[0:3], 0 offen -; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16 +; GFX9-O3-NEXT: v_mov_b32_e32 v13, v7 +; GFX9-O3-NEXT: buffer_store_dwordx4 v[8:11], v0, s[0:3], 0 offen +; GFX9-O3-NEXT: buffer_store_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16 ; GFX9-O3-NEXT: s_endpgm %tmp17 = shl i32 %index, 5 %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0) @@ -933,15 +919,15 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -1054,11 +1040,10 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -1085,6 +1070,9 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-O0-NEXT: ; implicit-def: $sgpr0_sgpr1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] ; GFX9-O0-NEXT: s_nop 1 ; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf @@ -1243,40 +1231,40 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: v_readlane_b32 s1, v7, 3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 +; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c ; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s3, s9 -; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 -; GFX9-O0-NEXT: s_mov_b32 s9, s17 -; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 +; GFX9-O0-NEXT: s_mov_b32 s3, s7 +; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 +; GFX9-O0-NEXT: s_mov_b32 s7, s9 +; GFX9-O0-NEXT: s_mov_b32 s16, s8 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s9 -; GFX9-O0-NEXT: s_mov_b32 s18, s8 +; GFX9-O0-NEXT: s_mov_b32 s17, s7 +; GFX9-O0-NEXT: s_mov_b32 s18, s6 ; GFX9-O0-NEXT: s_mov_b32 s19, s3 -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 -; GFX9-O0-NEXT: s_mov_b32 s3, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4 +; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5 +; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6 +; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7 +; GFX9-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-O0-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 9 +; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 10 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[2:3] ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56 ; GFX9-O0-NEXT: s_mov_b32 s2, s0 ; GFX9-O0-NEXT: s_mov_b32 s0, s1 @@ -1305,13 +1293,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6 -; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7 -; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8 -; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9 -; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4 -; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5 -; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10 +; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4 +; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5 +; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6 +; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7 +; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9 +; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10 +; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -1341,15 +1329,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 56 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[20:21] ; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] @@ -1494,47 +1480,49 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: v_readlane_b32 s1, v12, 3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c -; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s2, s19 -; GFX9-O0-NEXT: s_mov_b32 s3, s18 -; GFX9-O0-NEXT: s_mov_b32 s15, s17 +; GFX9-O0-NEXT: s_mov_b32 s6, s9 +; GFX9-O0-NEXT: s_mov_b32 s7, s8 +; GFX9-O0-NEXT: s_mov_b32 s8, s17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 ; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 -; GFX9-O0-NEXT: s_mov_b32 s17, s15 -; GFX9-O0-NEXT: s_mov_b32 s18, s3 -; GFX9-O0-NEXT: s_mov_b32 s19, s2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6 -; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7 -; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8 -; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9 +; GFX9-O0-NEXT: s_mov_b32 s17, s8 +; GFX9-O0-NEXT: s_mov_b32 s18, s7 +; GFX9-O0-NEXT: s_mov_b32 s19, s6 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4 +; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5 +; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6 +; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b32 s15, s7 +; GFX9-O0-NEXT: s_mov_b32 s8, s3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr16_sgpr17 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s15 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[8:9] +; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7 +; GFX9-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 8 +; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 9 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O0-NEXT: s_mov_b64 s[2:3], 0 -; GFX9-O0-NEXT: s_mov_b32 s16, s3 -; GFX9-O0-NEXT: s_mov_b32 s15, s9 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s16 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s15 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 -; GFX9-O0-NEXT: s_mov_b32 s3, s2 -; GFX9-O0-NEXT: s_mov_b32 s2, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s2 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3] ; GFX9-O0-NEXT: ; implicit-def: $sgpr2 ; GFX9-O0-NEXT: ; implicit-def: $sgpr2 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 @@ -1573,12 +1561,12 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6 -; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7 -; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8 -; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9 -; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4 -; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5 +; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4 +; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5 +; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6 +; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7 +; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8 +; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload @@ -1616,16 +1604,13 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13] ; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 ; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24 -; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, s5 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s4 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[8:9] +; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O3-NEXT: s_add_u32 s8, s2, 60 ; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0 ; GFX9-O3-NEXT: s_getpc_b64 s[2:3] @@ -1634,6 +1619,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x0 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 +; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[20:21] ; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] @@ -1691,15 +1677,13 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11 ; GFX9-O0-NEXT: s_mov_b32 s11, s5 ; GFX9-O0-NEXT: s_mov_b32 s8, s11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] ; GFX9-O0-NEXT: s_mov_b32 s5, s10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7] ; GFX9-O0-NEXT: ; implicit-def: $sgpr9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr9 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 @@ -1709,14 +1693,12 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7] ; GFX9-O0-NEXT: ; implicit-def: $sgpr9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr9 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 @@ -1726,17 +1708,15 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[6:7] ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr5 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 @@ -1762,35 +1742,27 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind ; GFX9-O3-LABEL: strict_wwm_amdgpu_cs_main: ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen -; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16 +; GFX9-O3-NEXT: buffer_load_dwordx4 v[8:11], v0, s[0:3], 0 offen +; GFX9-O3-NEXT: buffer_load_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-O3-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O3-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX9-O3-NEXT: s_waitcnt vmcnt(1) -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[4:5] -; GFX9-O3-NEXT: v_bfrev_b32_e32 v4, -2 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, -1, v9, s[4:5] -; GFX9-O3-NEXT: v_bfrev_b32_e32 v6, -2 -; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, v1, v9, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, v1, v11, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[4:5] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12 -; GFX9-O3-NEXT: s_mov_b64 exec, -1 -; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, -1, v11, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, v1, v13, s[4:5] +; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, -1, v12, s[4:5] ; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v11, v5 ; GFX9-O3-NEXT: v_mov_b32_e32 v12, v6 -; GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[0:3], 0 offen -; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16 +; GFX9-O3-NEXT: v_mov_b32_e32 v13, v7 +; GFX9-O3-NEXT: buffer_store_dwordx4 v[8:11], v0, s[0:3], 0 offen +; GFX9-O3-NEXT: buffer_store_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16 ; GFX9-O3-NEXT: s_endpgm %tmp17 = shl i32 %index, 5 %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0) From b02aadf8b343a809397958c78bf0dc9b3b79d1f9 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 9 Sep 2024 17:44:59 +0100 Subject: [PATCH 3/4] clang-format --- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 208af0634f81c..8064c07310d09 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -1072,7 +1072,8 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { case AMDGPU::V_SET_INACTIVE_B32: if (ActiveLanesReg) { LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg()); - MRI->constrainRegClass(ActiveLanesReg, TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID)); + MRI->constrainRegClass( + ActiveLanesReg, TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID)); MI.getOperand(5).setReg(ActiveLanesReg); LIS->shrinkToUses(&LI); } else { From 990c94964d890cb9f8f68eec3c1befcf1affbbfb Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 11 Sep 2024 11:35:29 +0100 Subject: [PATCH 4/4] use getWaveMaskRegClass --- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 8064c07310d09..31e1c1036f3e4 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -1072,8 +1072,7 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { case AMDGPU::V_SET_INACTIVE_B32: if (ActiveLanesReg) { LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg()); - MRI->constrainRegClass( - ActiveLanesReg, TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID)); + MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass()); MI.getOperand(5).setReg(ActiveLanesReg); LIS->shrinkToUses(&LI); } else {