diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h index fac7fa6417265..7b0475ac2481d 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -1073,6 +1073,11 @@ class MachineIRBuilder { /// Build and insert an unmerge of \p Res sized pieces to cover \p Op MachineInstrBuilder buildUnmerge(LLT Res, const SrcOp &Op); + /// Build and insert an unmerge of pieces with \p Attrs register attributes to + /// cover \p Op + MachineInstrBuilder buildUnmerge(MachineRegisterInfo::VRegAttrs Attrs, + const SrcOp &Op); + /// Build and insert \p Res = G_BUILD_VECTOR \p Op0, ... /// /// G_BUILD_VECTOR creates a vector value from multiple scalar registers. diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h index 91f68581df48c..4fddc2033b81b 100644 --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -674,6 +674,12 @@ class MachineRegisterInfo { return dyn_cast_if_present(Val); } + /// Return the register bank of \p Reg. + /// This shouldn't be used directly unless \p Reg has a register bank. + const RegisterBank *getRegBank(Register Reg) const { + return cast(VRegInfo[Reg.id()].first); + } + /// Return the register bank of \p Reg, or null if Reg has not been assigned /// a register bank or has been assigned a register class. /// \note It is possible to get the register bank from the register class via diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index be347006a81f9..db59ca1be281c 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -698,6 +698,15 @@ MachineInstrBuilder MachineIRBuilder::buildUnmerge(LLT Res, return buildInstr(TargetOpcode::G_UNMERGE_VALUES, TmpVec, Op); } +MachineInstrBuilder +MachineIRBuilder::buildUnmerge(MachineRegisterInfo::VRegAttrs Attrs, + const SrcOp &Op) { + LLT OpTy = Op.getLLTTy(*getMRI()); + unsigned NumRegs = OpTy.getSizeInBits() / Attrs.Ty.getSizeInBits(); + SmallVector TmpVec(NumRegs, Attrs); + return buildInstr(TargetOpcode::G_UNMERGE_VALUES, TmpVec, Op); +} + MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef Res, const SrcOp &Op) { // Unfortunately to convert from ArrayRef to ArrayRef, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp index 2321a52102e60..d64337c4cb909 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp @@ -7,10 +7,13 @@ //===----------------------------------------------------------------------===// #include "AMDGPUGlobalISelUtils.h" +#include "AMDGPURegisterBankInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGenTypes/LowLevelType.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IntrinsicsAMDGPU.h" @@ -106,3 +109,59 @@ void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) { S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg()); } } + +static LLT getReadAnyLaneSplitTy(LLT Ty) { + if (Ty.isVector()) { + LLT ElTy = Ty.getElementType(); + if 
(ElTy.getSizeInBits() == 16) + return LLT::fixed_vector(2, ElTy); + // S32, S64 or pointer + return ElTy; + } + + // Large scalars and 64-bit pointers + return LLT::scalar(32); +} + +static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc, + const RegisterBankInfo &RBI); + +static void unmergeReadAnyLane(MachineIRBuilder &B, + SmallVectorImpl &SgprDstParts, + LLT UnmergeTy, Register VgprSrc, + const RegisterBankInfo &RBI) { + const RegisterBank *VgprRB = &RBI.getRegBank(AMDGPU::VGPRRegBankID); + auto Unmerge = B.buildUnmerge({VgprRB, UnmergeTy}, VgprSrc); + for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { + SgprDstParts.push_back(buildReadAnyLane(B, Unmerge.getReg(i), RBI)); + } +} + +static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc, + const RegisterBankInfo &RBI) { + LLT Ty = B.getMRI()->getType(VgprSrc); + const RegisterBank *SgprRB = &RBI.getRegBank(AMDGPU::SGPRRegBankID); + if (Ty.getSizeInBits() == 32) { + return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {{SgprRB, Ty}}, {VgprSrc}) + .getReg(0); + } + + SmallVector SgprDstParts; + unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI); + + return B.buildMergeLikeInstr({SgprRB, Ty}, SgprDstParts).getReg(0); +} + +void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, + Register VgprSrc, const RegisterBankInfo &RBI) { + LLT Ty = B.getMRI()->getType(VgprSrc); + if (Ty.getSizeInBits() == 32) { + B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc}); + return; + } + + SmallVector SgprDstParts; + unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI); + + B.buildMergeLikeInstr(SgprDst, SgprDstParts).getReg(0); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h index b53091325ccbd..27f8fed86d647 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h @@ -20,6 +20,8 @@ class GCNSubtarget; class GISelKnownBits; class LLT; class MachineFunction; +class MachineIRBuilder; +class RegisterBankInfo; namespace AMDGPU { @@ -48,6 +50,9 @@ class IntrinsicLaneMaskAnalyzer { // This will not be needed when we turn off LCSSA for global-isel. void findLCSSAPhi(Register Reg); }; + +void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, + const RegisterBankInfo &RBI); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 926c1e4b23b4a..40eaba2c09209 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -217,6 +217,74 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { return true; } +bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const { + const DebugLoc &DL = I.getDebugLoc(); + MachineBasicBlock *BB = I.getParent(); + + unsigned CmpOpc = + STI.isWave64() ? 
AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32; + MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc)) + .addReg(I.getOperand(1).getReg()) + .addImm(0); + if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI)) + return false; + + Register DstReg = I.getOperand(0).getReg(); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC); + + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); +} + +bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const { + const DebugLoc &DL = I.getDebugLoc(); + MachineBasicBlock *BB = I.getParent(); + + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + std::optional Arg = + getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI); + + if (Arg) { + const int64_t Value = Arg->Value.getZExtValue(); + if (Value == 0) { + unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; + BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0); + } else { + assert(Value == 1); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec()); + } + I.eraseFromParent(); + return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI); + } + + // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0). + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg); + + unsigned SelectOpcode = + STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; + MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) + .addReg(TRI.getExec()) + .addImm(0); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); +} + +bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const { + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + + const DebugLoc &DL = I.getDebugLoc(); + MachineBasicBlock *BB = I.getParent(); + + auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg) + .addReg(SrcReg); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI); +} + bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { const Register DefReg = I.getOperand(0).getReg(); const LLT DefTy = MRI->getType(DefReg); @@ -249,7 +317,21 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { } } - // TODO: Verify that all registers have the same bank + // If inputs have register bank, assign corresponding reg class. + // Note: registers don't need to have the same reg bank. 
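+  // For example, a divergent G_PHI may mix banks:
+  //   %dst:vgpr(s32) = G_PHI %a:sgpr(s32), %bb.0, %b:vgpr(s32), %bb.1
+  // Each input is constrained to a class matching its own bank; only the def
+  // below has to match DefRC.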
+ for (unsigned i = 1; i != I.getNumOperands(); i += 2) { + const Register SrcReg = I.getOperand(i).getReg(); + + const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg); + if (RB) { + const LLT SrcTy = MRI->getType(SrcReg); + const TargetRegisterClass *SrcRC = + TRI.getRegClassForTypeOnBank(SrcTy, *RB); + if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) + return false; + } + } + I.setDesc(TII.get(TargetOpcode::PHI)); return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI); } @@ -4014,6 +4096,12 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { return selectStackRestore(I); case AMDGPU::G_PHI: return selectPHI(I); + case AMDGPU::G_AMDGPU_COPY_SCC_VCC: + return selectCOPY_SCC_VCC(I); + case AMDGPU::G_AMDGPU_COPY_VCC_SCC: + return selectCOPY_VCC_SCC(I); + case AMDGPU::G_AMDGPU_READANYLANE: + return selectReadAnyLane(I); case TargetOpcode::G_CONSTANT: case TargetOpcode::G_FCONSTANT: default: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index d294300be4049..b0d2a73fe31d2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -87,6 +87,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool constrainCopyLikeIntrin(MachineInstr &MI, unsigned NewOpc) const; bool selectCOPY(MachineInstr &I) const; + bool selectCOPY_SCC_VCC(MachineInstr &I) const; + bool selectCOPY_VCC_SCC(MachineInstr &I) const; + bool selectReadAnyLane(MachineInstr &I) const; bool selectPHI(MachineInstr &I) const; bool selectG_TRUNC(MachineInstr &I) const; bool selectG_SZA_EXT(MachineInstr &I) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index 283173deaeedc..8d3e7829e10e1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -18,12 +18,20 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUGlobalISelUtils.h" +#include "AMDGPURegBankLegalizeHelper.h" +#include "GCNSubtarget.h" +#include "llvm/CodeGen/GlobalISel/CSEInfo.h" +#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineUniformityAnalysis.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/InitializePasses.h" #define DEBUG_TYPE "amdgpu-regbanklegalize" using namespace llvm; +using namespace AMDGPU; namespace { @@ -43,6 +51,9 @@ class AMDGPURegBankLegalize : public MachineFunctionPass { } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -58,6 +69,9 @@ class AMDGPURegBankLegalize : public MachineFunctionPass { INITIALIZE_PASS_BEGIN(AMDGPURegBankLegalize, DEBUG_TYPE, "AMDGPU Register Bank Legalize", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass) INITIALIZE_PASS_END(AMDGPURegBankLegalize, DEBUG_TYPE, "AMDGPU Register Bank Legalize", false, false) @@ -69,11 +83,291 @@ FunctionPass *llvm::createAMDGPURegBankLegalizePass() { return new AMDGPURegBankLegalize(); } -using namespace AMDGPU; +const RegBankLegalizeRules &getRules(const GCNSubtarget &ST, + MachineRegisterInfo &MRI) { + static std::mutex GlobalMutex; + static SmallDenseMap> + CacheForRuleSet; + std::lock_guard 
Lock(GlobalMutex); + if (!CacheForRuleSet.contains(ST.getGeneration())) { + auto Rules = std::make_unique(ST, MRI); + CacheForRuleSet[ST.getGeneration()] = std::move(Rules); + } else { + CacheForRuleSet[ST.getGeneration()]->refreshRefs(ST, MRI); + } + return *CacheForRuleSet[ST.getGeneration()]; +} + +class AMDGPURegBankLegalizeCombiner { + MachineIRBuilder &B; + MachineRegisterInfo &MRI; + const SIRegisterInfo &TRI; + const RegisterBank *SgprRB; + const RegisterBank *VgprRB; + const RegisterBank *VccRB; + + static constexpr LLT S1 = LLT::scalar(1); + static constexpr LLT S16 = LLT::scalar(16); + static constexpr LLT S32 = LLT::scalar(32); + static constexpr LLT S64 = LLT::scalar(64); + +public: + AMDGPURegBankLegalizeCombiner(MachineIRBuilder &B, const SIRegisterInfo &TRI, + const RegisterBankInfo &RBI) + : B(B), MRI(*B.getMRI()), TRI(TRI), + SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), + VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), + VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}; + + bool isLaneMask(Register Reg) { + const RegisterBank *RB = MRI.getRegBankOrNull(Reg); + if (RB && RB->getID() == AMDGPU::VCCRegBankID) + return true; + + const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); + return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); + } + + void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) { + MI.eraseFromParent(); + if (Optional0 && isTriviallyDead(*Optional0, MRI)) + Optional0->eraseFromParent(); + } + + std::pair tryMatch(Register Src, unsigned Opcode) { + MachineInstr *MatchMI = MRI.getVRegDef(Src); + if (MatchMI->getOpcode() != Opcode) + return {nullptr, Register()}; + return {MatchMI, MatchMI->getOperand(1).getReg()}; + } + + void tryCombineCopy(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + // Skip copies of physical registers. + if (!Dst.isVirtual() || !Src.isVirtual()) + return; + + // This is a cross bank copy, sgpr S1 to lane mask. + // + // %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32) + // %Dst:lane-mask(s1) = COPY %Src:sgpr(s1) + // -> + // %Dst:lane-mask(s1) = G_AMDGPU_COPY_VCC_SCC %TruncS32Src:sgpr(s32) + if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) { + auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC); + assert(Trunc && MRI.getType(TruncS32Src) == S32 && + "sgpr S1 must be result of G_TRUNC of sgpr S32"); + + B.setInstr(MI); + // Ensure that truncated bits in BoolSrc are 0. + auto One = B.buildConstant({SgprRB, S32}, 1); + auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One); + B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc}); + cleanUpAfterCombine(MI, Trunc); + return; + } + + // Src = G_AMDGPU_READANYLANE RALSrc + // Dst = COPY Src + // -> + // Dst = RALSrc + if (MRI.getRegBankOrNull(Dst) == VgprRB && + MRI.getRegBankOrNull(Src) == SgprRB) { + auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); + if (!RAL) + return; + + assert(MRI.getRegBank(RALSrc) == VgprRB); + MRI.replaceRegWith(Dst, RALSrc); + cleanUpAfterCombine(MI, RAL); + return; + } + } + + void tryCombineS1AnyExt(MachineInstr &MI) { + // %Src:sgpr(S1) = G_TRUNC %TruncSrc + // %Dst = G_ANYEXT %Src:sgpr(S1) + // -> + // %Dst = G_... 
%TruncSrc + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + if (MRI.getType(Src) != S1) + return; + + auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC); + if (!Trunc) + return; + + LLT DstTy = MRI.getType(Dst); + LLT TruncSrcTy = MRI.getType(TruncSrc); + + if (DstTy == TruncSrcTy) { + MRI.replaceRegWith(Dst, TruncSrc); + cleanUpAfterCombine(MI, Trunc); + return; + } + + B.setInstr(MI); + + if (DstTy == S32 && TruncSrcTy == S64) { + auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc); + MRI.replaceRegWith(Dst, Unmerge.getReg(0)); + cleanUpAfterCombine(MI, Trunc); + return; + } + + if (DstTy == S32 && TruncSrcTy == S16) { + B.buildAnyExt(Dst, TruncSrc); + cleanUpAfterCombine(MI, Trunc); + return; + } + + if (DstTy == S16 && TruncSrcTy == S32) { + B.buildTrunc(Dst, TruncSrc); + cleanUpAfterCombine(MI, Trunc); + return; + } + + llvm_unreachable("missing anyext + trunc combine"); + } +}; + +// Search through MRI for virtual registers with sgpr register bank and S1 LLT. +[[maybe_unused]] static Register getAnySgprS1(const MachineRegisterInfo &MRI) { + const LLT S1 = LLT::scalar(1); + for (unsigned i = 0; i < MRI.getNumVirtRegs(); ++i) { + Register Reg = Register::index2VirtReg(i); + if (MRI.def_empty(Reg) || MRI.getType(Reg) != S1) + continue; + + const RegisterBank *RB = MRI.getRegBankOrNull(Reg); + if (RB && RB->getID() == AMDGPU::SGPRRegBankID) { + LLVM_DEBUG(dbgs() << "Warning: detected sgpr S1 register in: "; + MRI.getVRegDef(Reg)->dump();); + return Reg; + } + } + + return {}; +} bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) { if (MF.getProperties().hasProperty( MachineFunctionProperties::Property::FailedISel)) return false; + + // Setup the instruction builder with CSE. + const TargetPassConfig &TPC = getAnalysis(); + GISelCSEAnalysisWrapper &Wrapper = + getAnalysis().getCSEWrapper(); + GISelCSEInfo &CSEInfo = Wrapper.get(TPC.getCSEConfig()); + GISelObserverWrapper Observer; + Observer.addObserver(&CSEInfo); + + CSEMIRBuilder B(MF); + B.setCSEInfo(&CSEInfo); + B.setChangeObserver(Observer); + + RAIIDelegateInstaller DelegateInstaller(MF, &Observer); + RAIIMFObserverInstaller MFObserverInstaller(MF, Observer); + + const GCNSubtarget &ST = MF.getSubtarget(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const RegisterBankInfo &RBI = *ST.getRegBankInfo(); + const MachineUniformityInfo &MUI = + getAnalysis().getUniformityInfo(); + + // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes. + const RegBankLegalizeRules &RBLRules = getRules(ST, MRI); + + // Logic that does legalization based on IDs assigned to Opcode. + RegBankLegalizeHelper RBLHelper(B, MUI, RBI, RBLRules); + + SmallVector AllInst; + + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + AllInst.push_back(&MI); + } + } + + for (MachineInstr *MI : AllInst) { + if (!MI->isPreISelOpcode()) + continue; + + unsigned Opc = MI->getOpcode(); + // Insert point for use operands needs some calculation. + if (Opc == AMDGPU::G_PHI) { + RBLHelper.applyMappingPHI(*MI); + continue; + } + + // Opcodes that support pretty much all combinations of reg banks and LLTs + // (except S1). There is no point in writing rules for them. + if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES || + Opc == AMDGPU::G_MERGE_VALUES) { + RBLHelper.applyMappingTrivial(*MI); + continue; + } + + // Opcodes that also support S1. 
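+    // For example, a uniform S1 G_CONSTANT goes through the Sgpr32Trunc +
+    // UniCstExt rule and becomes an S32 constant plus a trunc:
+    //   %c:sgpr(s32) = G_CONSTANT i32 1
+    //   %dst:sgpr(s1) = G_TRUNC %c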
+ if ((Opc == AMDGPU::G_CONSTANT || Opc == AMDGPU::G_FCONSTANT || + Opc == AMDGPU::G_IMPLICIT_DEF)) { + Register Dst = MI->getOperand(0).getReg(); + // Non S1 types are trivially accepted. + if (MRI.getType(Dst) != LLT::scalar(1)) { + assert(MRI.getRegBank(Dst)->getID() == AMDGPU::SGPRRegBankID); + continue; + } + + // S1 rules are in RegBankLegalizeRules. + } + + RBLHelper.findRuleAndApplyMapping(*MI); + } + + // Sgpr S1 clean up combines: + // - Sgpr S1(S32) to sgpr S1(S32) Copy: anyext + trunc combine. + // In RegBankLegalize 'S1 Dst' are legalized into S32 as + // 'S1Dst = Trunc S32Dst' and 'S1 Src' into 'S32Src = Anyext S1Src'. + // S1 Truncs and Anyexts that come from legalizer, that can have non-S32 + // types e.g. S16 = Anyext S1 or S1 = Trunc S64, will also be cleaned up. + // - Sgpr S1(S32) to vcc Copy: G_AMDGPU_COPY_VCC_SCC combine. + // Divergent instruction uses sgpr S1 as input that should be lane mask(vcc) + // Legalizing this use creates sgpr S1(S32) to vcc Copy. + + // Note: Remaining S1 copies, S1s are either sgpr S1(S32) or vcc S1: + // - Vcc to vcc Copy: nothing to do here, just a regular copy. + // - Vcc to sgpr S1 Copy: Should not exist in a form of COPY instruction(*). + // Note: For 'uniform-in-vcc to sgpr-S1 copy' G_AMDGPU_COPY_SCC_VCC is used + // instead. When only available instruction creates vcc result, use of + // UniformInVcc results in creating G_AMDGPU_COPY_SCC_VCC. + + // (*)Explanation for 'sgpr S1(uniform) = COPY vcc(divergent)': + // Copy from divergent to uniform register indicates an error in either: + // - Uniformity analysis: Uniform instruction has divergent input. If one of + // the inputs is divergent, instruction should be divergent! + // - RegBankLegalizer not executing in waterfall loop (missing implementation) + + AMDGPURegBankLegalizeCombiner Combiner(B, *ST.getRegisterInfo(), RBI); + + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : make_early_inc_range(MBB)) { + if (MI.getOpcode() == AMDGPU::COPY) { + Combiner.tryCombineCopy(MI); + continue; + } + if (MI.getOpcode() == AMDGPU::G_ANYEXT) { + Combiner.tryCombineS1AnyExt(MI); + continue; + } + } + } + + assert(!getAnySgprS1(MRI).isValid() && + "Registers with sgpr reg bank and S1 LLT are not legal after " + "AMDGPURegBankLegalize. Should lower to sgpr S32"); + return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp new file mode 100644 index 0000000000000..d27fa1f62538b --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -0,0 +1,416 @@ +//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// Implements actual lowering algorithms for each ID that can be used in +/// Rule.OperandMapping. Similar to legalizer helper but with register banks. 
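+/// For example, the Sgpr32Trunc dst ID replaces a narrow sgpr def with a new
+/// S32 def plus a G_TRUNC back to the original register, while the Vgpr* src
+/// IDs copy non-vgpr operands to vgpr.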
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPURegBankLegalizeHelper.h" +#include "AMDGPUGlobalISelUtils.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPURegisterBankInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineUniformityAnalysis.h" + +#define DEBUG_TYPE "amdgpu-regbanklegalize" + +using namespace llvm; +using namespace AMDGPU; + +RegBankLegalizeHelper::RegBankLegalizeHelper( + MachineIRBuilder &B, const MachineUniformityInfo &MUI, + const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules) + : B(B), MRI(*B.getMRI()), MUI(MUI), RBI(RBI), RBLRules(RBLRules), + SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), + VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), + VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} + +void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { + const SetOfRulesForOpcode &RuleSet = RBLRules.getRulesForOpc(MI); + const RegBankLLTMapping &Mapping = RuleSet.findMappingForMI(MI, MRI, MUI); + + SmallSet WaterfallSgprs; + unsigned OpIdx = 0; + if (Mapping.DstOpMapping.size() > 0) { + B.setInsertPt(*MI.getParent(), std::next(MI.getIterator())); + applyMappingDst(MI, OpIdx, Mapping.DstOpMapping); + } + if (Mapping.SrcOpMapping.size() > 0) { + B.setInstr(MI); + applyMappingSrc(MI, OpIdx, Mapping.SrcOpMapping, WaterfallSgprs); + } + + lower(MI, Mapping, WaterfallSgprs); +} + +void RegBankLegalizeHelper::lower(MachineInstr &MI, + const RegBankLLTMapping &Mapping, + SmallSet &WaterfallSgprs) { + + switch (Mapping.LoweringMethod) { + case DoNotLower: + return; + case UniExtToSel: { + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + auto True = B.buildConstant({SgprRB, Ty}, + MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1); + auto False = B.buildConstant({SgprRB, Ty}, 0); + // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare. + // We are making select here. S1 cond was already 'any-extended to S32' + + // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg. + B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True, + False); + MI.eraseFromParent(); + return; + } + case Ext32To64: { + const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg()); + MachineInstrBuilder Hi; + + if (MI.getOpcode() == AMDGPU::G_ZEXT) { + Hi = B.buildConstant({RB, S32}, 0); + } else { + // Replicate sign bit from 32-bit extended part. + auto ShiftAmt = B.buildConstant({RB, S32}, 31); + Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt); + } + + B.buildMergeLikeInstr(MI.getOperand(0).getReg(), + {MI.getOperand(1).getReg(), Hi}); + MI.eraseFromParent(); + return; + } + case UniCstExt: { + uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue(); + B.buildConstant(MI.getOperand(0).getReg(), ConstVal); + + MI.eraseFromParent(); + return; + } + case VgprToVccCopy: { + Register Src = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(Src); + // Take lowest bit from each lane and put it in lane mask. + // Lowering via compare, but we need to clean high bits first as compare + // compares all bits in register. 
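+    // For S32/S16 sources this expands to:
+    //   %bool:vgpr = G_AND %src, 1
+    //   %dst:vcc(s1) = G_ICMP intpred(ne), %bool, 0
+    // For S64 the low half is masked with 1 and the high half zeroed first.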
+ Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty}); + if (Ty == S64) { + auto Src64 = B.buildUnmerge({VgprRB, Ty}, Src); + auto One = B.buildConstant(VgprRB_S32, 1); + auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One); + auto Zero = B.buildConstant(VgprRB_S32, 0); + auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero); + B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi}); + } else { + assert(Ty == S32 || Ty == S16); + auto One = B.buildConstant({VgprRB, Ty}, 1); + B.buildAnd(BoolSrc, Src, One); + } + auto Zero = B.buildConstant({VgprRB, Ty}, 0); + B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero); + MI.eraseFromParent(); + return; + } + case SplitTo32: { + auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg()); + auto Op2 = B.buildUnmerge(VgprRB_S32, MI.getOperand(2).getReg()); + unsigned Opc = MI.getOpcode(); + auto Lo = B.buildInstr(Opc, {VgprRB_S32}, {Op1.getReg(0), Op2.getReg(0)}); + auto Hi = B.buildInstr(Opc, {VgprRB_S32}, {Op1.getReg(1), Op2.getReg(1)}); + B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi}); + MI.eraseFromParent(); + break; + } + } + + // TODO: executeInWaterfallLoop(... WaterfallSgprs) +} + +LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { + switch (ID) { + case Vcc: + case UniInVcc: + return LLT::scalar(1); + case Sgpr16: + return LLT::scalar(16); + case Sgpr32: + case Sgpr32Trunc: + case Sgpr32AExt: + case Sgpr32AExtBoolInReg: + case Sgpr32SExt: + case UniInVgprS32: + case Vgpr32: + return LLT::scalar(32); + case Sgpr64: + case Vgpr64: + return LLT::scalar(64); + case SgprV4S32: + case VgprV4S32: + case UniInVgprV4S32: + return LLT::fixed_vector(4, 32); + case VgprP1: + return LLT::pointer(1, 64); + default: + return LLT(); + } +} + +const RegisterBank * +RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { + switch (ID) { + case Vcc: + return VccRB; + case Sgpr16: + case Sgpr32: + case Sgpr64: + case SgprV4S32: + case UniInVcc: + case UniInVgprS32: + case UniInVgprV4S32: + case Sgpr32Trunc: + case Sgpr32AExt: + case Sgpr32AExtBoolInReg: + case Sgpr32SExt: + return SgprRB; + case Vgpr32: + case Vgpr64: + case VgprP1: + case VgprV4S32: + return VgprRB; + default: + return nullptr; + } +} + +void RegBankLegalizeHelper::applyMappingDst( + MachineInstr &MI, unsigned &OpIdx, + const SmallVectorImpl &MethodIDs) { + // Defs start from operand 0 + for (; OpIdx < MethodIDs.size(); ++OpIdx) { + if (MethodIDs[OpIdx] == None) + continue; + MachineOperand &Op = MI.getOperand(OpIdx); + Register Reg = Op.getReg(); + LLT Ty = MRI.getType(Reg); + [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg); + + switch (MethodIDs[OpIdx]) { + // vcc, sgpr and vgpr scalars, pointers and vectors + case Vcc: + case Sgpr16: + case Sgpr32: + case Sgpr64: + case SgprV4S32: + case Vgpr32: + case Vgpr64: + case VgprP1: + case VgprV4S32: { + assert(Ty == getTyFromID(MethodIDs[OpIdx])); + assert(RB == getRegBankFromID(MethodIDs[OpIdx])); + break; + } + // uniform in vcc/vgpr: scalars and vectors + case UniInVcc: { + assert(Ty == S1); + assert(RB == SgprRB); + Register NewDst = MRI.createVirtualRegister(VccRB_S1); + Op.setReg(NewDst); + auto CopyS32_Vcc = + B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst}); + B.buildTrunc(Reg, CopyS32_Vcc); + break; + } + case UniInVgprS32: + case UniInVgprV4S32: { + assert(Ty == getTyFromID(MethodIDs[OpIdx])); + assert(RB == SgprRB); + Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty}); + Op.setReg(NewVgprDst); + buildReadAnyLane(B, 
Reg, NewVgprDst, RBI); + break; + } + // sgpr trunc + case Sgpr32Trunc: { + assert(Ty.getSizeInBits() < 32); + assert(RB == SgprRB); + Register NewDst = MRI.createVirtualRegister(SgprRB_S32); + Op.setReg(NewDst); + B.buildTrunc(Reg, NewDst); + break; + } + case InvalidMapping: { + LLVM_DEBUG(dbgs() << "Instruction with Invalid mapping: "; MI.dump();); + llvm_unreachable("missing fast rule for MI"); + } + default: + llvm_unreachable("ID not supported"); + } + } +} + +void RegBankLegalizeHelper::applyMappingSrc( + MachineInstr &MI, unsigned &OpIdx, + const SmallVectorImpl &MethodIDs, + SmallSet &SgprWaterfallOperandRegs) { + for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) { + if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm) + continue; + + MachineOperand &Op = MI.getOperand(OpIdx); + Register Reg = Op.getReg(); + LLT Ty = MRI.getType(Reg); + const RegisterBank *RB = MRI.getRegBank(Reg); + + switch (MethodIDs[i]) { + case Vcc: { + assert(Ty == S1); + assert(RB == VccRB || RB == SgprRB); + if (RB == SgprRB) { + auto Aext = B.buildAnyExt(SgprRB_S32, Reg); + auto CopyVcc_Scc = + B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext}); + Op.setReg(CopyVcc_Scc.getReg(0)); + } + break; + } + // sgpr scalars, pointers and vectors + case Sgpr16: + case Sgpr32: + case Sgpr64: + case SgprV4S32: { + assert(Ty == getTyFromID(MethodIDs[i])); + assert(RB == getRegBankFromID(MethodIDs[i])); + break; + } + // vgpr scalars, pointers and vectors + case Vgpr32: + case Vgpr64: + case VgprP1: + case VgprV4S32: { + assert(Ty == getTyFromID(MethodIDs[i])); + if (RB != VgprRB) { + auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg); + Op.setReg(CopyToVgpr.getReg(0)); + } + break; + } + // sgpr and vgpr scalars with extend + case Sgpr32AExt: { + // Note: this ext allows S1, and it is meant to be combined away. + assert(Ty.getSizeInBits() < 32); + assert(RB == SgprRB); + auto Aext = B.buildAnyExt(SgprRB_S32, Reg); + Op.setReg(Aext.getReg(0)); + break; + } + case Sgpr32AExtBoolInReg: { + // Note: this ext allows S1, and it is meant to be combined away. + assert(Ty.getSizeInBits() == 1); + assert(RB == SgprRB); + auto Aext = B.buildAnyExt(SgprRB_S32, Reg); + // Zext SgprS1 is not legal, this instruction is most of times meant to be + // combined away in RB combiner, so do not make AND with 1. 
+ auto Cst1 = B.buildConstant(SgprRB_S32, 1); + auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1); + Op.setReg(BoolInReg.getReg(0)); + break; + } + case Sgpr32SExt: { + assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32); + assert(RB == SgprRB); + auto Sext = B.buildSExt(SgprRB_S32, Reg); + Op.setReg(Sext.getReg(0)); + break; + } + default: + llvm_unreachable("ID not supported"); + } + } +} + +void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(Dst); + + if (Ty == LLT::scalar(1) && MUI.isUniform(Dst)) { + B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI()); + + Register NewDst = MRI.createVirtualRegister(SgprRB_S32); + MI.getOperand(0).setReg(NewDst); + B.buildTrunc(Dst, NewDst); + + for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { + Register UseReg = MI.getOperand(i).getReg(); + + auto DefMI = MRI.getVRegDef(UseReg)->getIterator(); + MachineBasicBlock *DefMBB = DefMI->getParent(); + + B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + + auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg); + MI.getOperand(i).setReg(NewUse.getReg(0)); + } + + return; + } + + // ALL divergent i1 phis should be already lowered and inst-selected into PHI + // with sgpr reg class and S1 LLT. + // Note: this includes divergent phis that don't require lowering. + if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) { + LLVM_DEBUG(dbgs() << "Divergent S1 G_PHI: "; MI.dump();); + llvm_unreachable("Make sure to run AMDGPUGlobalISelDivergenceLowering " + "before RegBankLegalize to lower lane mask(vcc) phis"); + } + + // We accept all types that can fit in some register class. + // Uniform G_PHIs have all sgpr registers. + // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr. + if (Ty == LLT::scalar(32)) { + return; + } + + LLVM_DEBUG(dbgs() << "G_PHI not handled: "; MI.dump();); + llvm_unreachable("type not supported"); +} + +[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI, + const RegisterBank *RB, + MachineRegisterInfo &MRI, + unsigned StartOpIdx, + unsigned EndOpIdx) { + for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) { + if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB) + return false; + } + return true; +} + +void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) { + const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg()); + // Put RB on all registers + unsigned NumDefs = MI.getNumDefs(); + unsigned NumOperands = MI.getNumOperands(); + + assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1)); + if (RB == SgprRB) + assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1)); + + if (RB == VgprRB) { + B.setInstr(MI); + for (unsigned i = NumDefs; i < NumOperands; ++i) { + Register Reg = MI.getOperand(i).getReg(); + if (MRI.getRegBank(Reg) != RB) { + auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg); + MI.getOperand(i).setReg(Copy.getReg(0)); + } + } + } +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h new file mode 100644 index 0000000000000..a518b7e1fa1d3 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -0,0 +1,111 @@ +//===- AMDGPURegBankLegalizeHelper ------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGBANKLEGALIZEHELPER_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGBANKLEGALIZEHELPER_H + +#include "AMDGPURegBankLegalizeRules.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +namespace llvm { + +class MachineIRBuilder; + +namespace AMDGPU { + +// Receives list of RegBankLLTMappingApplyID and applies register banks on all +// operands. It is user's responsibility to provide RegBankLLTMappingApplyIDs +// for all register operands, there is no need to specify NonReg for trailing +// imm operands. This finishes selection of register banks if there is no need +// to replace instruction. In other case InstApplyMethod will create new +// instruction(s). +class RegBankLegalizeHelper { + MachineIRBuilder &B; + MachineRegisterInfo &MRI; + const MachineUniformityInfo &MUI; + const RegisterBankInfo &RBI; + const RegBankLegalizeRules &RBLRules; + const RegisterBank *SgprRB; + const RegisterBank *VgprRB; + const RegisterBank *VccRB; + + static constexpr LLT S1 = LLT::scalar(1); + static constexpr LLT S16 = LLT::scalar(16); + static constexpr LLT S32 = LLT::scalar(32); + static constexpr LLT S64 = LLT::scalar(64); + static constexpr LLT S96 = LLT::scalar(96); + static constexpr LLT S128 = LLT::scalar(128); + static constexpr LLT S256 = LLT::scalar(256); + + static constexpr LLT V2S16 = LLT::fixed_vector(2, 16); + static constexpr LLT V4S16 = LLT::fixed_vector(4, 16); + static constexpr LLT V6S16 = LLT::fixed_vector(6, 16); + static constexpr LLT V8S16 = LLT::fixed_vector(8, 16); + static constexpr LLT V16S16 = LLT::fixed_vector(16, 16); + static constexpr LLT V32S16 = LLT::fixed_vector(32, 16); + + static constexpr LLT V2S32 = LLT::fixed_vector(2, 32); + static constexpr LLT V3S32 = LLT::fixed_vector(3, 32); + static constexpr LLT V4S32 = LLT::fixed_vector(4, 32); + static constexpr LLT V6S32 = LLT::fixed_vector(6, 32); + static constexpr LLT V7S32 = LLT::fixed_vector(7, 32); + static constexpr LLT V8S32 = LLT::fixed_vector(8, 32); + static constexpr LLT V16S32 = LLT::fixed_vector(16, 32); + + static constexpr LLT V2S64 = LLT::fixed_vector(2, 64); + static constexpr LLT V3S64 = LLT::fixed_vector(3, 64); + static constexpr LLT V4S64 = LLT::fixed_vector(4, 64); + static constexpr LLT V8S64 = LLT::fixed_vector(8, 64); + static constexpr LLT V16S64 = LLT::fixed_vector(16, 64); + + static constexpr LLT P1 = LLT::pointer(1, 64); + static constexpr LLT P4 = LLT::pointer(4, 64); + static constexpr LLT P6 = LLT::pointer(6, 32); + + MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32}; + MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32}; + MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1}; + +public: + RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, + const RegisterBankInfo &RBI, + const RegBankLegalizeRules &RBLRules); + + void findRuleAndApplyMapping(MachineInstr &MI); + + // Manual apply helpers. 
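+  // applyMappingPHI handles G_PHI; applyMappingTrivial handles opcodes that
+  // accept any bank/LLT combination (G_MERGE_VALUES, G_UNMERGE_VALUES,
+  // G_BUILD_VECTOR) without consulting the rule tables.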
+ void applyMappingPHI(MachineInstr &MI); + void applyMappingTrivial(MachineInstr &MI); + +private: + bool executeInWaterfallLoop(MachineIRBuilder &B, + iterator_range Range, + SmallSet &SgprOperandRegs); + + LLT getTyFromID(RegBankLLTMappingApplyID ID); + + const RegisterBank *getRegBankFromID(RegBankLLTMappingApplyID ID); + + void + applyMappingDst(MachineInstr &MI, unsigned &OpIdx, + const SmallVectorImpl &MethodIDs); + + void + applyMappingSrc(MachineInstr &MI, unsigned &OpIdx, + const SmallVectorImpl &MethodIDs, + SmallSet &SgprWaterfallOperandRegs); + + void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, + SmallSet &SgprWaterfallOperandRegs); +}; + +} // end namespace AMDGPU +} // end namespace llvm + +#endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp new file mode 100644 index 0000000000000..68cfbd62b8e1c --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -0,0 +1,332 @@ +//===-- AMDGPURegBankLegalizeRules.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// Definitions of RegBankLegalize Rules for all opcodes. +/// Implementation of container for all the Rules and search. +/// Fast search for most common case when Rule.Predicate checks LLT and +/// uniformity of register in operand 0. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPURegBankLegalizeRules.h" +#include "GCNSubtarget.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/MachineUniformityAnalysis.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" + +#define DEBUG_TYPE "amdgpu-regbanklegalize" + +using namespace llvm; +using namespace AMDGPU; + +RegBankLLTMapping::RegBankLLTMapping( + std::initializer_list DstOpMappingList, + std::initializer_list SrcOpMappingList, + LoweringMethodID LoweringMethod) + : DstOpMapping(DstOpMappingList), SrcOpMapping(SrcOpMappingList), + LoweringMethod(LoweringMethod) {} + +PredicateMapping::PredicateMapping( + std::initializer_list OpList, + std::function TestFunc) + : OpUniformityAndTypes(OpList), TestFunc(TestFunc) {} + +bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, + const MachineUniformityInfo &MUI, + const MachineRegisterInfo &MRI) { + switch (UniID) { + case S1: + return MRI.getType(Reg) == LLT::scalar(1); + case S16: + return MRI.getType(Reg) == LLT::scalar(16); + case S32: + return MRI.getType(Reg) == LLT::scalar(32); + case S64: + return MRI.getType(Reg) == LLT::scalar(64); + case P1: + return MRI.getType(Reg) == LLT::pointer(1, 64); + case UniS1: + return MRI.getType(Reg) == LLT::scalar(1) && MUI.isUniform(Reg); + case UniS16: + return MRI.getType(Reg) == LLT::scalar(16) && MUI.isUniform(Reg); + case UniS32: + return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg); + case UniS64: + return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg); + case DivS1: + return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg); + case DivS32: + return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg); + case DivS64: + return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg); + case DivP1: + return MRI.getType(Reg) == LLT::pointer(1, 64) && 
MUI.isDivergent(Reg); + case _: + return true; + default: + llvm_unreachable("missing matchUniformityAndLLT"); + } +} + +bool PredicateMapping::match(const MachineInstr &MI, + const MachineUniformityInfo &MUI, + const MachineRegisterInfo &MRI) const { + // Check LLT signature. + for (unsigned i = 0; i < OpUniformityAndTypes.size(); ++i) { + if (OpUniformityAndTypes[i] == _) { + if (MI.getOperand(i).isReg()) + return false; + continue; + } + + // Remaining IDs check registers. + if (!MI.getOperand(i).isReg()) + return false; + + if (!matchUniformityAndLLT(MI.getOperand(i).getReg(), + OpUniformityAndTypes[i], MUI, MRI)) + return false; + } + + // More complex check. + if (TestFunc) + return TestFunc(MI); + + return true; +} + +SetOfRulesForOpcode::SetOfRulesForOpcode() {} + +SetOfRulesForOpcode::SetOfRulesForOpcode(FastRulesTypes FastTypes) + : FastTypes(FastTypes) {} + +UniformityLLTOpPredicateID LLTToId(LLT Ty) { + if (Ty == LLT::scalar(16)) + return S16; + if (Ty == LLT::scalar(32)) + return S32; + if (Ty == LLT::scalar(64)) + return S64; + if (Ty == LLT::fixed_vector(2, 16)) + return V2S16; + if (Ty == LLT::fixed_vector(2, 32)) + return V2S32; + if (Ty == LLT::fixed_vector(3, 32)) + return V3S32; + if (Ty == LLT::fixed_vector(4, 32)) + return V4S32; + return _; +} + +const RegBankLLTMapping & +SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + const MachineUniformityInfo &MUI) const { + // Search in "Fast Rules". + // Note: if fast rules are enabled, RegBankLLTMapping must be added in each + // slot that could "match fast Predicate". If not, InvalidMapping is + // returned which results in failure, does not search "Slow Rules". + if (FastTypes != NoFastRules) { + Register Reg = MI.getOperand(0).getReg(); + int Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg))); + if (Slot != -1) + return MUI.isUniform(Reg) ? Uni[Slot] : Div[Slot]; + } + + // Slow search for more complex rules. 
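+  // Each slow rule checks uniformity and LLT on several operands (plus an
+  // optional TestFunc), e.g. the {UniS1, _, S32} G_ICMP rule.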
+ for (const RegBankLegalizeRule &Rule : Rules) { + if (Rule.Predicate.match(MI, MUI, MRI)) + return Rule.OperandMapping; + } + + LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); + llvm_unreachable("None of the rules defined for MI's opcode matched MI"); +} + +void SetOfRulesForOpcode::addRule(RegBankLegalizeRule Rule) { + Rules.push_back(Rule); +} + +void SetOfRulesForOpcode::addFastRuleDivergent(UniformityLLTOpPredicateID Ty, + RegBankLLTMapping RuleApplyIDs) { + int Slot = getFastPredicateSlot(Ty); + assert(Slot != -1 && "Ty unsupported in this FastRulesTypes"); + Div[Slot] = RuleApplyIDs; +} + +void SetOfRulesForOpcode::addFastRuleUniform(UniformityLLTOpPredicateID Ty, + RegBankLLTMapping RuleApplyIDs) { + int Slot = getFastPredicateSlot(Ty); + assert(Slot != -1 && "Ty unsupported in this FastRulesTypes"); + Uni[Slot] = RuleApplyIDs; +} + +int SetOfRulesForOpcode::getFastPredicateSlot( + UniformityLLTOpPredicateID Ty) const { + switch (FastTypes) { + case Standard: { + switch (Ty) { + case S32: + return 0; + case S16: + return 1; + case S64: + return 2; + case V2S16: + return 3; + default: + return -1; + } + case Vector: + switch (Ty) { + case S32: + return 0; + case V2S32: + return 1; + case V3S32: + return 2; + case V4S32: + return 3; + default: + return -1; + } + } + default: + return -1; + } +} + +RegBankLegalizeRules::RuleSetInitializer +RegBankLegalizeRules::addRulesForGOpcs(std::initializer_list OpcList, + FastRulesTypes FastTypes) { + return RuleSetInitializer(OpcList, GRulesAlias, GRules, FastTypes); +} + +RegBankLegalizeRules::RuleSetInitializer +RegBankLegalizeRules::addRulesForIOpcs(std::initializer_list OpcList, + FastRulesTypes FastTypes) { + return RuleSetInitializer(OpcList, IRulesAlias, IRules, FastTypes); +} + +const SetOfRulesForOpcode & +RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const { + unsigned Opc = MI.getOpcode(); + if (Opc == AMDGPU::G_INTRINSIC || Opc == AMDGPU::G_INTRINSIC_CONVERGENT || + Opc == AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS || + Opc == AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS) { + unsigned IntrID = cast(MI).getIntrinsicID(); + if (!IRulesAlias.contains(IntrID)) { + LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); + llvm_unreachable("No rules defined for intrinsic opcode"); + } + return IRules.at(IRulesAlias.at(IntrID)); + } + + if (!GRulesAlias.contains(Opc)) { + LLVM_DEBUG(dbgs() << "MI: "; MI.dump();); + llvm_unreachable("No rules defined for generic opcode"); + } + return GRules.at(GRulesAlias.at(Opc)); +} + +// Initialize rules +RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, + MachineRegisterInfo &_MRI) + : ST(&_ST), MRI(&_MRI) { + + addRulesForGOpcs({G_ADD}, Standard) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + + addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + + addRulesForGOpcs({G_XOR, G_OR, G_AND}, Standard) + .Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}}) + .Any({{DivS1}, {{Vcc}, {Vcc, Vcc}}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}, SplitTo32}); + + addRulesForGOpcs({G_SHL}, Standard) + .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}}); + + // Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT, G_FCONSTANT + // and G_FREEZE here, rest is trivially regbankselected earlier + addRulesForGOpcs({G_CONSTANT}) + .Any({{UniS1, _}, {{Sgpr32Trunc}, {None}, UniCstExt}}); + + addRulesForGOpcs({G_ICMP}) + .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}}) + .Any({{DivS1, _, S32}, {{Vcc}, {None, 
Vgpr32, Vgpr32}}}); + + addRulesForGOpcs({G_FCMP}) + .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}}) + .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}}); + + addRulesForGOpcs({G_BRCOND}) + .Any({{UniS1}, {{}, {Sgpr32AExtBoolInReg}}}) + .Any({{DivS1}, {{}, {Vcc}}}); + + addRulesForGOpcs({G_BR}).Any({{_}, {{}, {None}}}); + + addRulesForGOpcs({G_SELECT}, Standard) + .Div(S32, {{Vgpr32}, {Vcc, Vgpr32, Vgpr32}}) + .Uni(S32, {{Sgpr32}, {Sgpr32AExtBoolInReg, Sgpr32, Sgpr32}}); + + addRulesForGOpcs({G_ANYEXT}).Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}); + + // In global-isel G_TRUNC in-reg is treated as no-op, inst selected into COPY. + // It is up to user to deal with truncated bits. + addRulesForGOpcs({G_TRUNC}) + .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}) + // This is non-trivial. VgprToVccCopy is done using compare instruction. + .Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}}); + + addRulesForGOpcs({G_ZEXT, G_SEXT}) + .Any({{UniS32, S1}, {{Sgpr32}, {Sgpr32AExtBoolInReg}, UniExtToSel}}) + .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}}) + .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}}); + + addRulesForGOpcs({G_LOAD}).Any({{DivS32, DivP1}, {{Vgpr32}, {VgprP1}}}); + + addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector) + .Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}) + .Uni(V4S32, {{UniInVgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}); + + addRulesForGOpcs({G_STORE}) + .Any({{S32, P1}, {{}, {Vgpr32, VgprP1}}}) + .Any({{S64, P1}, {{}, {Vgpr64, VgprP1}}}) + .Any({{V4S32, P1}, {{}, {VgprV4S32, VgprP1}}}); + + addRulesForGOpcs({G_PTR_ADD}).Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}}); + + addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); + + bool hasSALUFloat = ST->hasSALUFloatInsts(); + + addRulesForGOpcs({G_FADD}, Standard) + .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat) + .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat) + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + + addRulesForGOpcs({G_FPTOUI}) + .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) + .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat); + + addRulesForGOpcs({G_UITOFP}) + .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) + .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat); + + using namespace Intrinsic; + + // This is "intrinsic lane mask" it was set to i32/i64 in llvm-ir. + addRulesForIOpcs({amdgcn_end_cf}).Any({{_, S32}, {{}, {None, Sgpr32}}}); + + addRulesForIOpcs({amdgcn_if_break}, Standard) + .Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}}); + +} // end initialize rules diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h new file mode 100644 index 0000000000000..a65345e7ac49b --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -0,0 +1,268 @@ +//===- AMDGPURegBankLegalizeRules --------------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGBANKLEGALIZERULES_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGBANKLEGALIZERULES_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include + +namespace llvm { + +class MachineRegisterInfo; +class MachineInstr; +class GCNSubtarget; +class MachineFunction; +template class GenericUniformityInfo; +template class GenericSSAContext; +using MachineSSAContext = GenericSSAContext; +using MachineUniformityInfo = GenericUniformityInfo; + +namespace AMDGPU { + +// IDs used to build predicate for RegBankLegalizeRule. Predicate can have one +// or more IDs and each represents a check for 'uniform or divergent' + LLT or +// just LLT on register operand. +// Most often checking one operand is enough to decide which RegBankLLTMapping +// to apply (see Fast Rules), IDs are useful when two or more operands need to +// be checked. +enum UniformityLLTOpPredicateID { + _, + // scalars + S1, + S16, + S32, + S64, + + UniS1, + UniS16, + UniS32, + UniS64, + + DivS1, + DivS32, + DivS64, + + // pointers + P1, + + DivP1, + + // vectors + V2S16, + V2S32, + V3S32, + V4S32, +}; + +// How to apply register bank on register operand. +// In most cases, this serves as a LLT and register bank assert. +// Can change operands and insert copies, extends, truncs, and read-any-lanes. +// Anything more complicated requires LoweringMethod. +enum RegBankLLTMappingApplyID { + InvalidMapping, + None, + IntrId, + Imm, + Vcc, + + // sgpr scalars, pointers, vectors and B-types + Sgpr16, + Sgpr32, + Sgpr64, + SgprV4S32, + + // vgpr scalars, pointers, vectors and B-types + Vgpr32, + Vgpr64, + VgprP1, + VgprV4S32, + + // Dst only modifiers: read-any-lane and truncs + UniInVcc, + UniInVgprS32, + UniInVgprV4S32, + + Sgpr32Trunc, + + // Src only modifiers: waterfalls, extends + Sgpr32AExt, + Sgpr32AExtBoolInReg, + Sgpr32SExt, +}; + +// Instruction needs to be replaced with sequence of instructions. Lowering was +// not done by legalizer since instructions is available in either sgpr or vgpr. +// For example S64 AND is available on sgpr, for that reason S64 AND is legal in +// context of Legalizer that only checks LLT. But S64 AND is not available on +// vgpr. Lower it to two S32 vgpr ANDs. +enum LoweringMethodID { + DoNotLower, + UniExtToSel, + VgprToVccCopy, + SplitTo32, + Ext32To64, + UniCstExt, +}; + +enum FastRulesTypes { + NoFastRules, + Standard, // S16, S32, S64, V2S16 + Vector, // S32, V2S32, V3S32, V4S32 +}; + +struct RegBankLLTMapping { + SmallVector DstOpMapping; + SmallVector SrcOpMapping; + LoweringMethodID LoweringMethod; + RegBankLLTMapping( + std::initializer_list DstOpMappingList, + std::initializer_list SrcOpMappingList, + LoweringMethodID LoweringMethod = DoNotLower); +}; + +struct PredicateMapping { + SmallVector OpUniformityAndTypes; + std::function TestFunc; + PredicateMapping( + std::initializer_list OpList, + std::function TestFunc = nullptr); + + bool match(const MachineInstr &MI, const MachineUniformityInfo &MUI, + const MachineRegisterInfo &MRI) const; +}; + +struct RegBankLegalizeRule { + PredicateMapping Predicate; + RegBankLLTMapping OperandMapping; +}; + +class SetOfRulesForOpcode { + // "Slow Rules". More complex 'Rules[i].Predicate', check them one by one. 
+ SmallVector Rules; + + // "Fast Rules" + // Instead of testing each 'Rules[i].Predicate' we do direct access to + // RegBankLLTMapping using getFastPredicateSlot. For example if: + // - FastTypes == Standard Uni[0] holds Mapping in case Op 0 is uniform S32 + // - FastTypes == Vector Div[3] holds Mapping in case Op 0 is divergent V4S32 + FastRulesTypes FastTypes = NoFastRules; +#define InvMapping RegBankLLTMapping({InvalidMapping}, {InvalidMapping}) + RegBankLLTMapping Uni[4] = {InvMapping, InvMapping, InvMapping, InvMapping}; + RegBankLLTMapping Div[4] = {InvMapping, InvMapping, InvMapping, InvMapping}; + +public: + SetOfRulesForOpcode(); + SetOfRulesForOpcode(FastRulesTypes FastTypes); + + const RegBankLLTMapping & + findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, + const MachineUniformityInfo &MUI) const; + + void addRule(RegBankLegalizeRule Rule); + + void addFastRuleDivergent(UniformityLLTOpPredicateID Ty, + RegBankLLTMapping RuleApplyIDs); + void addFastRuleUniform(UniformityLLTOpPredicateID Ty, + RegBankLLTMapping RuleApplyIDs); + +private: + int getFastPredicateSlot(UniformityLLTOpPredicateID Ty) const; +}; + +// Essentially 'map' but a +// little more efficient. +class RegBankLegalizeRules { + const GCNSubtarget *ST; + MachineRegisterInfo *MRI; + // Separate maps for G-opcodes and instrinsics since they are in different + // enums. Multiple opcodes can share same set of rules. + // RulesAlias = map + // Rules = map + SmallDenseMap GRulesAlias; + SmallDenseMap GRules; + SmallDenseMap IRulesAlias; + SmallDenseMap IRules; + class RuleSetInitializer { + SetOfRulesForOpcode *RuleSet; + + public: + // Used for clang-format line breaks and to force writing all rules for + // opcode in same place. + template + RuleSetInitializer(std::initializer_list OpcList, + AliasMap &RulesAlias, RulesMap &Rules, + FastRulesTypes FastTypes = NoFastRules) { + unsigned KeyOpcode = *OpcList.begin(); + for (unsigned Opc : OpcList) { + [[maybe_unused]] auto [_, NewInput] = + RulesAlias.try_emplace(Opc, KeyOpcode); + assert(NewInput && "Can't redefine existing Rules"); + } + + auto [DenseMapIter, NewInput] = Rules.try_emplace(KeyOpcode, FastTypes); + assert(NewInput && "Can't redefine existing Rules"); + + RuleSet = &DenseMapIter->second; + } + + RuleSetInitializer(const RuleSetInitializer &) = delete; + RuleSetInitializer &operator=(const RuleSetInitializer &) = delete; + RuleSetInitializer(RuleSetInitializer &&) = delete; + RuleSetInitializer &operator=(RuleSetInitializer &&) = delete; + ~RuleSetInitializer() = default; + + RuleSetInitializer &Div(UniformityLLTOpPredicateID Ty, + RegBankLLTMapping RuleApplyIDs, + bool STPred = true) { + if (STPred) + RuleSet->addFastRuleDivergent(Ty, RuleApplyIDs); + return *this; + } + + RuleSetInitializer &Uni(UniformityLLTOpPredicateID Ty, + RegBankLLTMapping RuleApplyIDs, + bool STPred = true) { + if (STPred) + RuleSet->addFastRuleUniform(Ty, RuleApplyIDs); + return *this; + } + + RuleSetInitializer &Any(RegBankLegalizeRule Init, bool STPred = true) { + if (STPred) + RuleSet->addRule(Init); + return *this; + } + }; + + RuleSetInitializer addRulesForGOpcs(std::initializer_list OpcList, + FastRulesTypes FastTypes = NoFastRules); + + RuleSetInitializer addRulesForIOpcs(std::initializer_list OpcList, + FastRulesTypes FastTypes = NoFastRules); + +public: + // Initialize rules for all opcodes. 
+
+// Essentially a 'map<Opcode (or intrinsic id), SetOfRulesForOpcode>', but a
+// little more efficient.
+class RegBankLegalizeRules {
+  const GCNSubtarget *ST;
+  MachineRegisterInfo *MRI;
+  // Separate maps for G-opcodes and intrinsics since they are in different
+  // enums. Multiple opcodes can share the same set of rules.
+  // RulesAlias = map<Opcode, KeyOpcode>
+  // Rules = map<KeyOpcode, SetOfRulesForOpcode>
+  SmallDenseMap<unsigned, unsigned> GRulesAlias;
+  SmallDenseMap<unsigned, SetOfRulesForOpcode> GRules;
+  SmallDenseMap<unsigned, unsigned> IRulesAlias;
+  SmallDenseMap<unsigned, SetOfRulesForOpcode> IRules;
+  class RuleSetInitializer {
+    SetOfRulesForOpcode *RuleSet;
+
+  public:
+    // Used for clang-format line breaks and to force writing all rules for an
+    // opcode in the same place.
+    template <typename AliasMap, typename RulesMap>
+    RuleSetInitializer(std::initializer_list<unsigned> OpcList,
+                       AliasMap &RulesAlias, RulesMap &Rules,
+                       FastRulesTypes FastTypes = NoFastRules) {
+      unsigned KeyOpcode = *OpcList.begin();
+      for (unsigned Opc : OpcList) {
+        [[maybe_unused]] auto [_, NewInput] =
+            RulesAlias.try_emplace(Opc, KeyOpcode);
+        assert(NewInput && "Can't redefine existing Rules");
+      }
+
+      auto [DenseMapIter, NewInput] = Rules.try_emplace(KeyOpcode, FastTypes);
+      assert(NewInput && "Can't redefine existing Rules");
+
+      RuleSet = &DenseMapIter->second;
+    }
+
+    RuleSetInitializer(const RuleSetInitializer &) = delete;
+    RuleSetInitializer &operator=(const RuleSetInitializer &) = delete;
+    RuleSetInitializer(RuleSetInitializer &&) = delete;
+    RuleSetInitializer &operator=(RuleSetInitializer &&) = delete;
+    ~RuleSetInitializer() = default;
+
+    RuleSetInitializer &Div(UniformityLLTOpPredicateID Ty,
+                            RegBankLLTMapping RuleApplyIDs,
+                            bool STPred = true) {
+      if (STPred)
+        RuleSet->addFastRuleDivergent(Ty, RuleApplyIDs);
+      return *this;
+    }
+
+    RuleSetInitializer &Uni(UniformityLLTOpPredicateID Ty,
+                            RegBankLLTMapping RuleApplyIDs,
+                            bool STPred = true) {
+      if (STPred)
+        RuleSet->addFastRuleUniform(Ty, RuleApplyIDs);
+      return *this;
+    }
+
+    RuleSetInitializer &Any(RegBankLegalizeRule Init, bool STPred = true) {
+      if (STPred)
+        RuleSet->addRule(Init);
+      return *this;
+    }
+  };
+
+  RuleSetInitializer addRulesForGOpcs(std::initializer_list<unsigned> OpcList,
+                                      FastRulesTypes FastTypes = NoFastRules);
+
+  RuleSetInitializer addRulesForIOpcs(std::initializer_list<unsigned> OpcList,
+                                      FastRulesTypes FastTypes = NoFastRules);
+
+public:
+  // Initialize rules for all opcodes.
+  RegBankLegalizeRules(const GCNSubtarget &ST, MachineRegisterInfo &MRI);
+
+  // In case we don't want to regenerate the same rules, reuse the already
+  // generated rules, but refresh the references to objects that are created
+  // for this run.
+  void refreshRefs(const GCNSubtarget &_ST, MachineRegisterInfo &_MRI) {
+    ST = &_ST;
+    MRI = &_MRI;
+  }
+
+  const SetOfRulesForOpcode &getRulesForOpc(MachineInstr &MI) const;
+};
+
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 97a0d59cfeeda..91cae76256306 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -94,6 +94,8 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUPromoteKernelArguments.cpp
   AMDGPURegBankCombiner.cpp
   AMDGPURegBankLegalize.cpp
+  AMDGPURegBankLegalizeHelper.cpp
+  AMDGPURegBankLegalizeRules.cpp
   AMDGPURegBankSelect.cpp
   AMDGPURegisterBankInfo.cpp
   AMDGPURemoveIncompatibleFunctions.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 40a20fa9cb15e..da44faac2f910 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -4158,6 +4158,36 @@ def G_SI_CALL : AMDGPUGenericInstruction {
   let isConvergent = 1;
 }
+// Uniform in vgpr - a vgpr with the same value in all active lanes.
+
+// $dst = $src0 != 0, selected as:
+// $dst(SCC) = s_cmp_lg $src0, 0
+// src0 is either exec or 0 (the same value for all active lanes),
+// for example the result of a comparison of two uniform-in-vgpr values.
+def G_AMDGPU_COPY_SCC_VCC : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type1:$src0);
+  let hasSideEffects = 0;
+}
+
+// $dst = $src0 ? exec : 0, selected as:
+// SCC = COPY $src0
+// $dst(SReg_32/64) = s_cselect exec, 0
+def G_AMDGPU_COPY_VCC_SCC : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type1:$src0);
+  let hasSideEffects = 0;
+}
+
+// Move uniform in vgpr to sgpr. Selected as v_readfirstlane_b32.
+// Semantic difference in READ ANY instead of FIRST(active) LANE allows for +// vgpr to sgpr back-to vgpr combine, vgpr has same value in all active lanes +// vgprDst = COPY (G_AMDGPU_READANYLANE vgprSrc) -> vgprDst = sgprSrc +def G_AMDGPU_READANYLANE : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0); + let hasSideEffects = 0; +} //============================================================================// // Dummy Instructions diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbanklegalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbanklegalize.mir index 0113a5318bbf7..534dca9ede84a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbanklegalize.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbanklegalize.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=none %s -verify-machineinstrs -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-regbanklegalize %s -verify-machineinstrs -o - | FileCheck %s --- name: uniform_in_vgpr @@ -16,9 +16,12 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; CHECK-NEXT: [[FPTOUI:%[0-9]+]]:sgpr(s32) = G_FPTOUI [[COPY]](s32) - ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[FPTOUI]], [[COPY1]] - ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[FPTOUI:%[0-9]+]]:vgpr(s32) = G_FPTOUI [[COPY4]](s32) + ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[FPTOUI]] + ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[AMDGPU_READANYLANE]], [[COPY1]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; CHECK-NEXT: G_STORE [[COPY5]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 @@ -47,10 +50,14 @@ body: | ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) - ; CHECK-NEXT: [[FADD:%[0-9]+]]:sgpr(s32) = G_FADD [[COPY]], [[COPY1]] - ; CHECK-NEXT: [[FPTOUI:%[0-9]+]]:sgpr(s32) = G_FPTOUI [[FADD]](s32) - ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[FPTOUI]], [[COPY2]] - ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK-NEXT: [[FADD:%[0-9]+]]:vgpr(s32) = G_FADD [[COPY5]], [[COPY6]] + ; CHECK-NEXT: [[FPTOUI:%[0-9]+]]:vgpr(s32) = G_FPTOUI [[FADD]](s32) + ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[FPTOUI]] + ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[AMDGPU_READANYLANE]], [[COPY2]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; CHECK-NEXT: G_STORE [[COPY7]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 @@ -85,11 +92,20 @@ body: | ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY5]](s32), 
[[COPY6]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) + ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]] + ; CHECK-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]] + ; CHECK-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV2]] + ; CHECK-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV3]] + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32), [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) - ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[UV1]], [[C1]] - ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[UV5]], [[C1]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; CHECK-NEXT: G_STORE [[COPY9]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 @@ -129,10 +145,12 @@ body: | ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY5]](s32), [[COPY6]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) - ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[UV1]], [[C1]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[UV1]], [[COPY8]] ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 @@ -172,8 +190,12 @@ body: | ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 ; CHECK-NEXT: [[MV2:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s64) = G_AND [[MV]], [[MV1]] - ; CHECK-NEXT: G_STORE [[AND]](s64), [[MV2]](p1) :: (store (s64), addrspace 1) + ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV]](s64) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[MV1]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[UV]], [[UV2]] + ; CHECK-NEXT: [[AND1:%[0-9]+]]:vgpr(s32) = G_AND [[UV1]], [[UV3]] + ; CHECK-NEXT: [[MV3:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[AND]](s32), [[AND1]](s32) + ; CHECK-NEXT: G_STORE [[MV3]](s64), [[MV2]](p1) :: (store (s64), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 @@ -204,9 +226,12 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) - ; CHECK-NEXT: [[ABS:%[0-9]+]]:sgpr(s16) = G_ABS [[TRUNC]] - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[ABS]](s16) - ; CHECK-NEXT: G_STORE [[ANYEXT]](s32), [[MV]](p1) :: (store (s16), addrspace 1) + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s16) + ; CHECK-NEXT: [[ABS:%[0-9]+]]:sgpr(s32) = G_ABS [[SEXT]] + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[ABS]](s32) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC1]](s16) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: G_STORE [[COPY3]](s32), [[MV]](p1) :: (store (s16), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s16) = G_TRUNC %0(s32) @@ -235,24 +260,31 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr1 ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]] ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]] - ; CHECK-NEXT: G_BRCOND [[ICMP1]](s1), %bb.2 + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP1]], [[C2]] + ; CHECK-NEXT: G_BRCOND [[AND]](s32), %bb.2 ; CHECK-NEXT: G_BR %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:sgpr(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]] + ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[COPY2]](s32), [[C3]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: [[PHI:%[0-9]+]]:sgpr(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1 - ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[PHI]](s1) - ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 - ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SEXT]], [[C3]] - ; CHECK-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[ICMP]](s32), %bb.0, [[ICMP2]](s32), 
%bb.1 + ; CHECK-NEXT: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[PHI]], [[C4]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND1]](s32), [[C5]], [[C6]] + ; CHECK-NEXT: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SELECT]], [[C7]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; CHECK-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1(0x30000000), %bb.2(0x50000000) @@ -302,9 +334,15 @@ body: | ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 - ; CHECK-NEXT: [[FCMP:%[0-9]+]]:sgpr(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[C]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY1]], [[COPY2]] - ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY5]](s32), [[COPY6]] + ; CHECK-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[FCMP]](s1) + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[AMDGPU_COPY_SCC_VCC]], [[C1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[COPY1]], [[COPY2]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; CHECK-NEXT: G_STORE [[COPY7]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 @@ -336,8 +374,9 @@ body: | ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[COPY1]], [[COPY2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_AMDGPU_COPY_VCC_SCC [[ICMP]](s32) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_COPY_VCC_SCC]](s1), [[COPY1]], [[COPY2]] ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 @@ -369,8 +408,11 @@ body: | ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32) - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vcc(s1) = G_TRUNC [[COPY]](s32) - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[TRUNC]](s1), [[COPY1]], [[COPY2]] + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), [[C1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[COPY1]], [[COPY2]] ; CHECK-NEXT: 
G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:vgpr(s32) = COPY $vgpr0 @@ -400,9 +442,13 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[ICMP]](s1) - ; CHECK-NEXT: G_STORE [[ZEXT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C1]], [[C2]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; CHECK-NEXT: G_STORE [[COPY3]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = COPY $vgpr0 @@ -430,9 +476,14 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] - ; CHECK-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[ICMP]](s1) - ; CHECK-NEXT: G_STORE [[SEXT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C2]], [[C3]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; CHECK-NEXT: G_STORE [[COPY3]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = COPY $vgpr0 @@ -461,9 +512,11 @@ body: | ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(uge), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(uge), [[COPY]](s32), [[COPY4]] ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(uge), [[COPY1]](s32), [[C1]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(uge), [[COPY1]](s32), [[COPY5]] ; CHECK-NEXT: [[AND:%[0-9]+]]:vcc(s1) = G_AND [[ICMP]], [[ICMP1]] ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[AND]](s1), [[COPY]], [[COPY1]] ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) @@ -499,12 +552,15 @@ body: | ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s1) = G_ICMP 
intpred(uge), [[COPY]](s32), [[C]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY]](s32), [[C]] ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s1) = G_ICMP intpred(uge), [[COPY1]](s32), [[C1]] - ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s1) = G_AND [[ICMP]], [[ICMP1]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s1), [[COPY]], [[COPY1]] - ; CHECK-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY1]](s32), [[C1]] + ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP]], [[ICMP1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[AND]], [[C2]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND1]](s32), [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32) + ; CHECK-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s32) = COPY $sgpr1 @@ -536,9 +592,10 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1) - ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY3]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1) + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY4]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: G_BR %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -548,8 +605,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[C]](s32), %bb.0, [[C1]](s32), %bb.1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF]](s32) - ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY4]](s32) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF]](s32) + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY5]](s32) ; CHECK-NEXT: G_STORE [[PHI]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 bb.0: @@ -603,11 +660,12 @@ body: | ; CHECK-NEXT: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1 ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[PHI1]], [[C2]] - ; CHECK-NEXT: [[UITOFP:%[0-9]+]]:sgpr(s32) = G_UITOFP [[ADD]](s32) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; CHECK-NEXT: [[UITOFP:%[0-9]+]]:vgpr(s32) = G_UITOFP [[COPY3]](s32) ; CHECK-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec(s32) = COPY [[INT]](s32) - ; CHECK-NEXT: SI_LOOP [[COPY3]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s32) = COPY [[INT]](s32) + 
; CHECK-NEXT: SI_LOOP [[COPY4]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: G_BR %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -615,7 +673,8 @@ body: | ; CHECK-NEXT: [[PHI3:%[0-9]+]]:sgpr(s32) = G_PHI [[INT]](s32), %bb.1 ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s32) ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10 - ; CHECK-NEXT: [[MUL:%[0-9]+]]:vgpr(s32) = G_MUL [[PHI2]], [[C3]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:vgpr(s32) = G_MUL [[PHI2]], [[COPY5]] ; CHECK-NEXT: G_STORE [[MUL]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 bb.0: @@ -683,37 +742,43 @@ body: | ; CHECK-NEXT: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0 ; CHECK-NEXT: [[PHI2:%[0-9]+]]:vgpr(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3 ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) - ; CHECK-NEXT: [[SEXT:%[0-9]+]]:vgpr(s64) = G_SEXT [[PHI2]](s32) - ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:vgpr(s64) = G_SHL [[SEXT]], [[C1]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[PHI2]], [[C1]](s32) + ; CHECK-NEXT: [[MV3:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[PHI2]](s32), [[ASHR]](s32) + ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY7]](s32) ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) - ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C2]] - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1) - ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) + ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[COPY8]] + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1) + ; CHECK-NEXT: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:sreg_32(s1) = G_AMDGPU_COPY_VCC_SCC [[C4]](s32) ; CHECK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[AMDGPU_COPY_VCC_SCC]](s1), implicit-def $scc ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) - ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY9]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: G_BR %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.4(0x40000000), 
%bb.5(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:vgpr(s64) = G_SHL [[SEXT]], [[C4]](s32) + ; CHECK-NEXT: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[C5]](s32) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY11]](s32) ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1) - ; CHECK-NEXT: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[C5]] - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP1]](s1) - ; CHECK-NEXT: [[C6:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C6]](s1) - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[COPY11]](s1) - ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY10]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[C6]](s32) + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[COPY12]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP1]](s1) + ; CHECK-NEXT: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AMDGPU_COPY_VCC_SCC1:%[0-9]+]]:sreg_32(s1) = G_AMDGPU_COPY_VCC_SCC [[C7]](s32) + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[AMDGPU_COPY_VCC_SCC1]](s1) + ; CHECK-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY13]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: G_BR %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: @@ -721,44 +786,48 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %45(s1), %bb.5 ; CHECK-NEXT: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI %46(s32), %bb.5, [[DEF]](s32), %bb.1 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF]](s32) - ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY14]](s32) - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vcc(s1) = COPY [[COPY13]](s1) - ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI1]](s32) - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sreg_32_xm0_xexec(s32) = COPY [[INT]](s32) - ; CHECK-NEXT: SI_LOOP [[COPY16]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF]](s32) + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY16]](s32) + ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vcc(s1) = COPY [[COPY15]](s1) + ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY17]](s1), [[PHI1]](s32) + ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s32) = COPY [[INT]](s32) + ; CHECK-NEXT: SI_LOOP [[COPY18]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK-NEXT: G_BR %bb.6 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 - ; CHECK-NEXT: [[SHL2:%[0-9]+]]:vgpr(s64) = G_SHL [[SEXT]], [[C7]](s32) + ; CHECK-NEXT: 
[[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[C8]](s32) + ; CHECK-NEXT: [[SHL2:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY19]](s32) ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[MV]], [[SHL2]](s64) ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s32), addrspace 1) - ; CHECK-NEXT: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[LOAD2]], [[C8]] + ; CHECK-NEXT: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[C9]](s32) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[LOAD2]], [[COPY20]] ; CHECK-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD2]](p1) :: (store (s32), addrspace 1) - ; CHECK-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[PHI2]], [[C8]] - ; CHECK-NEXT: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 100 - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C9]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) - ; CHECK-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc + ; CHECK-NEXT: [[COPY21:%[0-9]+]]:vgpr(s32) = COPY [[C9]](s32) + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[PHI2]], [[COPY21]] + ; CHECK-NEXT: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 100 + ; CHECK-NEXT: [[COPY22:%[0-9]+]]:vgpr(s32) = COPY [[C10]](s32) + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[COPY22]] + ; CHECK-NEXT: [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) + ; CHECK-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY14]](s1), $exec_lo, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY23]](s1), implicit-def $scc ; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY11]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4 + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[AMDGPU_COPY_VCC_SCC1]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4 ; CHECK-NEXT: [[PHI6:%[0-9]+]]:vgpr(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2 - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[COPY18]](s1) - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF1]](s32) - ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY20]](s32) - ; CHECK-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc - ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY19]](s1), implicit-def $scc + ; CHECK-NEXT: [[COPY24:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) + ; CHECK-NEXT: [[COPY25:%[0-9]+]]:sreg_32(s1) = COPY [[COPY24]](s1) + ; CHECK-NEXT: [[COPY26:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF1]](s32) + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY26]](s32) + ; CHECK-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc + ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY25]](s1), implicit-def $scc ; CHECK-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 
[[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc ; CHECK-NEXT: G_BR %bb.3 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.ll index 0b4eb458b254f..f8f7f972f6c0a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=OLD_RBS_GFX10 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=OLD_RBS_GFX12 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=NEW_RBS_GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=NEW_RBS_GFX12 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=NEW_RBS_GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -new-reg-bank-select < %s | FileCheck -check-prefix=NEW_RBS_GFX12 %s define amdgpu_ps void @salu_float(float inreg %a, float inreg %b, i32 inreg %c, ptr addrspace(1) %ptr) { ; OLD_RBS_GFX10-LABEL: salu_float: @@ -28,7 +28,9 @@ define amdgpu_ps void @salu_float(float inreg %a, float inreg %b, i32 inreg %c, ; NEW_RBS_GFX10: ; %bb.0: ; NEW_RBS_GFX10-NEXT: v_add_f32_e64 v2, s0, s1 ; NEW_RBS_GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 -; NEW_RBS_GFX10-NEXT: v_add_nc_u32_e32 v2, s2, v2 +; NEW_RBS_GFX10-NEXT: v_readfirstlane_b32 s0, v2 +; NEW_RBS_GFX10-NEXT: s_add_i32 s0, s0, s2 +; NEW_RBS_GFX10-NEXT: v_mov_b32_e32 v2, s0 ; NEW_RBS_GFX10-NEXT: global_store_dword v[0:1], v2, off ; NEW_RBS_GFX10-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.mir index 98a8f4f04e49d..733f3d302472c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-salu-float.mir @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=regbankselect %s -o - | FileCheck %s -check-prefixes=OLD_RBS_GFX10 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=regbankselect %s -o - | FileCheck %s -check-prefixes=OLD_RBS_GFX12 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=regbankselect %s -o - | FileCheck %s -check-prefixes=NEW_RBS_GFX10 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=regbankselect %s -o - | FileCheck %s -check-prefixes=NEW_RBS_GFX12 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=regbankselect %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=OLD_RBS_GFX10 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=OLD_RBS_GFX12 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=NEW_RBS_GFX10 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=NEW_RBS_GFX12 --- name: salu_float @@ -58,9 +58,10 @@ body: | ; 
NEW_RBS_GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; NEW_RBS_GFX10-NEXT: [[FADD:%[0-9]+]]:vgpr(s32) = G_FADD [[COPY5]], [[COPY6]] ; NEW_RBS_GFX10-NEXT: [[FPTOUI:%[0-9]+]]:vgpr(s32) = G_FPTOUI [[FADD]](s32) - ; NEW_RBS_GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; NEW_RBS_GFX10-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[FPTOUI]], [[COPY7]] - ; NEW_RBS_GFX10-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS_GFX10-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[FPTOUI]] + ; NEW_RBS_GFX10-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[AMDGPU_READANYLANE]], [[COPY2]] + ; NEW_RBS_GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; NEW_RBS_GFX10-NEXT: G_STORE [[COPY7]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; NEW_RBS_GFX10-NEXT: S_ENDPGM 0 ; ; NEW_RBS_GFX12-LABEL: name: salu_float diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll index 287a8ab0e52f5..63dbf3a8d3164 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=OLD_RBS %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=NEW_RBS %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -new-reg-bank-select < %s | FileCheck -check-prefix=NEW_RBS %s ; if instruction is uniform and there is available instruction, select SALU instruction define amdgpu_ps void @uniform_in_vgpr(float inreg %a, i32 inreg %b, ptr addrspace(1) %ptr) { @@ -14,7 +14,9 @@ define amdgpu_ps void @uniform_in_vgpr(float inreg %a, i32 inreg %b, ptr addrspa ; NEW_RBS-LABEL: uniform_in_vgpr: ; NEW_RBS: ; %bb.0: ; NEW_RBS-NEXT: v_cvt_u32_f32_e32 v2, s0 -; NEW_RBS-NEXT: v_add_nc_u32_e32 v2, s1, v2 +; NEW_RBS-NEXT: v_readfirstlane_b32 s0, v2 +; NEW_RBS-NEXT: s_add_i32 s0, s0, s1 +; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 ; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off ; NEW_RBS-NEXT: s_endpgm %a.i32 = fptoui float %a to i32 @@ -37,7 +39,9 @@ define amdgpu_ps void @back_to_back_uniform_in_vgpr(float inreg %a, float inreg ; NEW_RBS: ; %bb.0: ; NEW_RBS-NEXT: v_add_f32_e64 v2, s0, s1 ; NEW_RBS-NEXT: v_cvt_u32_f32_e32 v2, v2 -; NEW_RBS-NEXT: v_add_nc_u32_e32 v2, s2, v2 +; NEW_RBS-NEXT: v_readfirstlane_b32 s0, v2 +; NEW_RBS-NEXT: s_add_i32 s0, s0, s2 +; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 ; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off ; NEW_RBS-NEXT: s_endpgm %add = fadd float %a, %b @@ -63,7 +67,9 @@ define amdgpu_cs void @buffer_load_uniform(<4 x i32> inreg %rsrc, i32 inreg %vof ; NEW_RBS-NEXT: v_mov_b32_e32 v2, s4 ; NEW_RBS-NEXT: buffer_load_dwordx4 v[2:5], v2, s[0:3], 0 offen ; NEW_RBS-NEXT: s_waitcnt vmcnt(0) -; NEW_RBS-NEXT: v_add_nc_u32_e32 v2, 1, v3 +; NEW_RBS-NEXT: v_readfirstlane_b32 s0, v3 +; NEW_RBS-NEXT: s_add_i32 s0, s0, 1 +; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 ; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off ; NEW_RBS-NEXT: s_endpgm .entry: @@ -168,7 +174,8 @@ define amdgpu_ps void @uniform_i1_phi(ptr addrspace(1) %out, i32 inreg %tid, i32 ; NEW_RBS-NEXT: s_cmp_lt_u32 s0, 1 ; NEW_RBS-NEXT: s_cselect_b32 s2, 1, 0 ; NEW_RBS-NEXT: .LBB6_2: ; %exit -; NEW_RBS-NEXT: s_bfe_i32 s0, s2, 0x10000 +; NEW_RBS-NEXT: s_cmp_lg_u32 s2, 0 +; NEW_RBS-NEXT: 
s_cselect_b32 s0, -1, 0 ; NEW_RBS-NEXT: s_add_i32 s0, s0, 2 ; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 ; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off @@ -202,9 +209,13 @@ define amdgpu_ps void @vcc_to_scc(float inreg %a, i32 inreg %b, i32 inreg %c, pt ; ; NEW_RBS-LABEL: vcc_to_scc: ; NEW_RBS: ; %bb.0: -; NEW_RBS-NEXT: v_mov_b32_e32 v2, s2 ; NEW_RBS-NEXT: v_cmp_eq_f32_e64 s0, s0, 0 -; NEW_RBS-NEXT: v_cndmask_b32_e64 v2, v2, s1, s0 +; NEW_RBS-NEXT: s_cmp_lg_u32 s0, 0 +; NEW_RBS-NEXT: s_cselect_b32 s0, 1, 0 +; NEW_RBS-NEXT: s_and_b32 s0, s0, 1 +; NEW_RBS-NEXT: s_cmp_lg_u32 s0, 0 +; NEW_RBS-NEXT: s_cselect_b32 s0, s1, s2 +; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 ; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off ; NEW_RBS-NEXT: s_endpgm %vcc_to_scc = fcmp oeq float %a, 0.0 @@ -228,9 +239,7 @@ define amdgpu_ps void @scc_to_vcc(i32 inreg %a, i32 %b, i32 %c, ptr addrspace(1) ; NEW_RBS-LABEL: scc_to_vcc: ; NEW_RBS: ; %bb.0: ; NEW_RBS-NEXT: s_cmp_eq_u32 s0, 0 -; NEW_RBS-NEXT: s_cselect_b32 s0, 1, 0 -; NEW_RBS-NEXT: s_and_b32 s0, 1, s0 -; NEW_RBS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; NEW_RBS-NEXT: s_cselect_b32 vcc_lo, exec_lo, 0 ; NEW_RBS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; NEW_RBS-NEXT: global_store_dword v[2:3], v0, off ; NEW_RBS-NEXT: s_endpgm @@ -300,8 +309,7 @@ define amdgpu_ps void @sext(i32 inreg %a, ptr addrspace(1) %ptr) { ; NEW_RBS-LABEL: sext: ; NEW_RBS: ; %bb.0: ; NEW_RBS-NEXT: s_cmp_eq_u32 s0, 10 -; NEW_RBS-NEXT: s_cselect_b32 s0, 1, 0 -; NEW_RBS-NEXT: s_bfe_i32 s0, s0, 0x10000 +; NEW_RBS-NEXT: s_cselect_b32 s0, -1, 0 ; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 ; NEW_RBS-NEXT: global_store_dword v[0:1], v2, off ; NEW_RBS-NEXT: s_endpgm @@ -362,7 +370,6 @@ define amdgpu_ps void @and_i1_scc(i32 inreg %a, i32 inreg %b, ptr addrspace(1) % ; NEW_RBS-NEXT: s_cmp_ge_u32 s1, 20 ; NEW_RBS-NEXT: s_cselect_b32 s3, 1, 0 ; NEW_RBS-NEXT: s_and_b32 s2, s2, s3 -; NEW_RBS-NEXT: s_and_b32 s2, s2, 1 ; NEW_RBS-NEXT: s_cmp_lg_u32 s2, 0 ; NEW_RBS-NEXT: s_cselect_b32 s0, s0, s1 ; NEW_RBS-NEXT: v_mov_b32_e32 v2, s0 @@ -395,12 +402,13 @@ define amdgpu_ps void @divergent_phi_with_uniform_inputs(i32 %a, ptr addrspace(1 ; NEW_RBS: ; %bb.0: ; %A ; NEW_RBS-NEXT: s_mov_b32 s0, 0 ; NEW_RBS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; NEW_RBS-NEXT: s_and_saveexec_b32 s1, vcc_lo +; NEW_RBS-NEXT: v_mov_b32_e32 v0, s0 +; NEW_RBS-NEXT: s_and_saveexec_b32 s0, vcc_lo ; NEW_RBS-NEXT: ; %bb.1: ; %B -; NEW_RBS-NEXT: s_mov_b32 s0, 1 +; NEW_RBS-NEXT: s_mov_b32 s1, 1 +; NEW_RBS-NEXT: v_mov_b32_e32 v0, s1 ; NEW_RBS-NEXT: ; %bb.2: ; %exit -; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; NEW_RBS-NEXT: v_mov_b32_e32 v0, s0 +; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; NEW_RBS-NEXT: global_store_dword v[1:2], v0, off ; NEW_RBS-NEXT: s_endpgm A: @@ -443,19 +451,19 @@ define amdgpu_ps void @divergent_because_of_temporal_divergent_use(float %val, p ; NEW_RBS-LABEL: divergent_because_of_temporal_divergent_use: ; NEW_RBS: ; %bb.0: ; %entry ; NEW_RBS-NEXT: s_mov_b32 s0, -1 -; NEW_RBS-NEXT: v_mov_b32_e32 v3, s0 -; NEW_RBS-NEXT: s_mov_b32 s0, 0 +; NEW_RBS-NEXT: s_mov_b32 s1, 0 ; NEW_RBS-NEXT: .LBB15_1: ; %loop ; NEW_RBS-NEXT: ; =>This Inner Loop Header: Depth=1 -; NEW_RBS-NEXT: v_add_nc_u32_e32 v3, 1, v3 -; NEW_RBS-NEXT: v_cvt_f32_u32_e32 v4, v3 -; NEW_RBS-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0 -; NEW_RBS-NEXT: s_or_b32 s0, vcc_lo, s0 -; NEW_RBS-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; NEW_RBS-NEXT: s_add_i32 s0, s0, 1 +; NEW_RBS-NEXT: v_cvt_f32_u32_e32 v3, s0 +; NEW_RBS-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v0 +; NEW_RBS-NEXT: s_or_b32 s1, vcc_lo, 
s1 +; NEW_RBS-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 ; NEW_RBS-NEXT: s_cbranch_execnz .LBB15_1 ; NEW_RBS-NEXT: ; %bb.2: ; %exit -; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; NEW_RBS-NEXT: v_mul_lo_u32 v0, v3, 10 +; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; NEW_RBS-NEXT: v_mov_b32_e32 v0, s0 +; NEW_RBS-NEXT: v_mul_lo_u32 v0, v0, 10 ; NEW_RBS-NEXT: global_store_dword v[1:2], v0, off ; NEW_RBS-NEXT: s_endpgm entry: @@ -550,9 +558,9 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; NEW_RBS-NEXT: .LBB16_1: ; %Flow3 ; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 ; NEW_RBS-NEXT: s_waitcnt_depctr 0xffe3 -; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; NEW_RBS-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; NEW_RBS-NEXT: s_andn2_b32 s1, s1, exec_lo -; NEW_RBS-NEXT: s_and_b32 s3, exec_lo, s4 +; NEW_RBS-NEXT: s_and_b32 s3, exec_lo, s3 ; NEW_RBS-NEXT: s_or_b32 s1, s1, s3 ; NEW_RBS-NEXT: .LBB16_2: ; %Flow ; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 @@ -565,7 +573,7 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; NEW_RBS-NEXT: ; =>This Inner Loop Header: Depth=1 ; NEW_RBS-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; NEW_RBS-NEXT: s_andn2_b32 s1, s1, exec_lo -; NEW_RBS-NEXT: s_and_b32 s2, exec_lo, -1 +; NEW_RBS-NEXT: s_and_b32 s2, exec_lo, exec_lo ; NEW_RBS-NEXT: s_or_b32 s1, s1, s2 ; NEW_RBS-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] ; NEW_RBS-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 @@ -579,11 +587,11 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 ; NEW_RBS-NEXT: v_add_co_u32 v9, vcc_lo, v4, v7 ; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo -; NEW_RBS-NEXT: s_mov_b32 s4, -1 +; NEW_RBS-NEXT: s_mov_b32 s3, exec_lo ; NEW_RBS-NEXT: global_load_dword v9, v[9:10], off ; NEW_RBS-NEXT: s_waitcnt vmcnt(0) ; NEW_RBS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; NEW_RBS-NEXT: s_and_saveexec_b32 s3, vcc_lo +; NEW_RBS-NEXT: s_and_saveexec_b32 s4, vcc_lo ; NEW_RBS-NEXT: s_cbranch_execz .LBB16_1 ; NEW_RBS-NEXT: ; %bb.5: ; %loop.body ; NEW_RBS-NEXT: ; in Loop: Header=BB16_3 Depth=1 @@ -591,11 +599,11 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; NEW_RBS-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo ; NEW_RBS-NEXT: v_add_nc_u32_e32 v10, 1, v6 ; NEW_RBS-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6 -; NEW_RBS-NEXT: s_andn2_b32 s4, -1, exec_lo +; NEW_RBS-NEXT: s_andn2_b32 s3, s3, exec_lo ; NEW_RBS-NEXT: global_load_dword v9, v[7:8], off ; NEW_RBS-NEXT: v_mov_b32_e32 v6, v10 ; NEW_RBS-NEXT: s_and_b32 s5, exec_lo, vcc_lo -; NEW_RBS-NEXT: s_or_b32 s4, s4, s5 +; NEW_RBS-NEXT: s_or_b32 s3, s3, s5 ; NEW_RBS-NEXT: s_waitcnt vmcnt(0) ; NEW_RBS-NEXT: v_add_nc_u32_e32 v9, 1, v9 ; NEW_RBS-NEXT: global_store_dword v[7:8], v9, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.mir index ef3a0a3a67594..1f3b7211eebf8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=regbankselect %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=OLD_RBS -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=regbankselect %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=NEW_RBS +# RUN: llc -mtriple=amdgcn 
-mcpu=gfx1010 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=NEW_RBS --- name: uniform_in_vgpr @@ -34,9 +34,10 @@ body: | ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) ; NEW_RBS-NEXT: [[FPTOUI:%[0-9]+]]:vgpr(s32) = G_FPTOUI [[COPY4]](s32) - ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[FPTOUI]], [[COPY5]] - ; NEW_RBS-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[FPTOUI]] + ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[AMDGPU_READANYLANE]], [[COPY1]] + ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; NEW_RBS-NEXT: G_STORE [[COPY5]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; NEW_RBS-NEXT: S_ENDPGM 0 %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 @@ -87,9 +88,10 @@ body: | ; NEW_RBS-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; NEW_RBS-NEXT: [[FADD:%[0-9]+]]:vgpr(s32) = G_FADD [[COPY5]], [[COPY6]] ; NEW_RBS-NEXT: [[FPTOUI:%[0-9]+]]:vgpr(s32) = G_FPTOUI [[FADD]](s32) - ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[FPTOUI]], [[COPY7]] - ; NEW_RBS-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[FPTOUI]] + ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[AMDGPU_READANYLANE]], [[COPY2]] + ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) + ; NEW_RBS-NEXT: G_STORE [[COPY7]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; NEW_RBS-NEXT: S_ENDPGM 0 %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 @@ -150,11 +152,17 @@ body: | ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; NEW_RBS-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) ; NEW_RBS-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]], [[C]], 0, 0, 0 :: (dereferenceable load (<4 x s32>), align 1, addrspace 8) - ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 ; NEW_RBS-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_BUFFER_LOAD]](<4 x s32>) - ; NEW_RBS-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[UV1]], [[COPY9]] - ; NEW_RBS-NEXT: G_STORE [[ADD]](s32), [[MV]](p1) :: (store (s32), addrspace 1) + ; NEW_RBS-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]] + ; NEW_RBS-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]] + ; NEW_RBS-NEXT: [[AMDGPU_READANYLANE2:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV2]] + ; NEW_RBS-NEXT: [[AMDGPU_READANYLANE3:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV3]] + ; NEW_RBS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32), [[AMDGPU_READANYLANE2]](s32), [[AMDGPU_READANYLANE3]](s32) + ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; NEW_RBS-NEXT: [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<4 x s32>) + ; NEW_RBS-NEXT: 
[[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[UV5]], [[C1]]
+    ; NEW_RBS-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
+    ; NEW_RBS-NEXT: G_STORE [[COPY9]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
@@ -407,29 +415,28 @@ body: |
     ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
     ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
     ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
-    ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
     ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
     ; NEW_RBS-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]]
-    ; NEW_RBS-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32)
-    ; NEW_RBS-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC1]](s1)
-    ; NEW_RBS-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
-    ; NEW_RBS-NEXT: G_BRCOND [[ZEXT]](s32), %bb.2
+    ; NEW_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP1]], [[C2]]
+    ; NEW_RBS-NEXT: G_BRCOND [[AND]](s32), %bb.2
     ; NEW_RBS-NEXT: G_BR %bb.1
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.1:
     ; NEW_RBS-NEXT: successors: %bb.2(0x80000000)
     ; NEW_RBS-NEXT: {{ $}}
-    ; NEW_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; NEW_RBS-NEXT: [[ICMP2:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
-    ; NEW_RBS-NEXT: [[TRUNC2:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP2]](s32)
-    ; NEW_RBS-NEXT: [[ANYEXT1:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC2]](s1)
+    ; NEW_RBS-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; NEW_RBS-NEXT: [[ICMP2:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(ult), [[COPY2]](s32), [[C3]]
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.2:
-    ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[ANYEXT]](s32), %bb.0, [[ANYEXT1]](s32), %bb.1
-    ; NEW_RBS-NEXT: [[TRUNC3:%[0-9]+]]:sgpr(s1) = G_TRUNC [[PHI]](s32)
-    ; NEW_RBS-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC3]](s1)
-    ; NEW_RBS-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
-    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SEXT]], [[C3]]
+    ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[ICMP]](s32), %bb.0, [[ICMP2]](s32), %bb.1
+    ; NEW_RBS-NEXT: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; NEW_RBS-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[PHI]], [[C4]]
+    ; NEW_RBS-NEXT: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+    ; NEW_RBS-NEXT: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND1]](s32), [[C5]], [[C6]]
+    ; NEW_RBS-NEXT: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SELECT]], [[C7]]
     ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
     ; NEW_RBS-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
@@ -503,10 +510,12 @@ body: |
     ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
     ; NEW_RBS-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; NEW_RBS-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY5]](s32), [[COPY6]]
-    ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
-    ; NEW_RBS-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
-    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY7]], [[COPY8]]
-    ; NEW_RBS-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
+    ; NEW_RBS-NEXT: [[AMDGPU_COPY_SCC_VCC:%[0-9]+]]:sgpr(s32) = G_AMDGPU_COPY_SCC_VCC [[FCMP]](s1)
+    ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[AMDGPU_COPY_SCC_VCC]], [[C1]]
+    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[COPY1]], [[COPY2]]
+    ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
+    ; NEW_RBS-NEXT: G_STORE [[COPY7]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
     %0:_(s32) = COPY $sgpr0
     %1:_(s32) = COPY $sgpr1
@@ -556,9 +565,8 @@ body: |
     ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
     ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
     ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]]
-    ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
-    ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1)
-    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY5]](s1), [[COPY1]], [[COPY2]]
+    ; NEW_RBS-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:vcc(s1) = G_AMDGPU_COPY_VCC_SCC [[ICMP]](s32)
+    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_COPY_VCC_SCC]](s1), [[COPY1]], [[COPY2]]
     ; NEW_RBS-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
     %0:_(s32) = COPY $sgpr0
@@ -605,9 +613,11 @@ body: |
     ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
     ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
     ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
-    ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s1) = G_TRUNC [[COPY]](s32)
-    ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1)
-    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[COPY5]](s1), [[COPY1]], [[COPY2]]
+    ; NEW_RBS-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1
+    ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[COPY]], [[C]]
+    ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
+    ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[AND]](s32), [[C1]]
+    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[COPY1]], [[COPY2]]
     ; NEW_RBS-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
     %0:_(s32) = COPY $vgpr0
@@ -653,9 +663,11 @@ body: |
     ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
     ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
     ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]]
-    ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
-    ; NEW_RBS-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC]](s1)
-    ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[ZEXT]](s32)
+    ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP]], [[C1]]
+    ; NEW_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C1]], [[C2]]
+    ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
     ; NEW_RBS-NEXT: G_STORE [[COPY3]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
     %0:_(s32) = COPY $sgpr0
@@ -700,9 +712,12 @@ body: |
     ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
     ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
     ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[C]]
-    ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
-    ; NEW_RBS-NEXT: [[SEXT:%[0-9]+]]:sgpr(s32) = G_SEXT [[TRUNC]](s1)
-    ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SEXT]](s32)
+    ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP]], [[C1]]
+    ; NEW_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
+    ; NEW_RBS-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND]](s32), [[C2]], [[C3]]
+    ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
     ; NEW_RBS-NEXT: G_STORE [[COPY3]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
     %0:_(s32) = COPY $sgpr0
@@ -816,16 +831,12 @@ body: |
     ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
     ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
     ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY]](s32), [[C]]
-    ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
     ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 20
     ; NEW_RBS-NEXT: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(uge), [[COPY1]](s32), [[C1]]
-    ; NEW_RBS-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP1]](s32)
-    ; NEW_RBS-NEXT: [[ANYEXT:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC]](s1)
-    ; NEW_RBS-NEXT: [[ANYEXT1:%[0-9]+]]:sgpr(s32) = G_ANYEXT [[TRUNC1]](s1)
-    ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ANYEXT]], [[ANYEXT1]]
-    ; NEW_RBS-NEXT: [[TRUNC2:%[0-9]+]]:sgpr(s1) = G_TRUNC [[AND]](s32)
-    ; NEW_RBS-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[TRUNC2]](s1)
-    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ZEXT]](s32), [[COPY]], [[COPY1]]
+    ; NEW_RBS-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[ICMP]], [[ICMP1]]
+    ; NEW_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; NEW_RBS-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[AND]], [[C2]]
+    ; NEW_RBS-NEXT: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[AND1]](s32), [[COPY]], [[COPY1]]
     ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT]](s32)
     ; NEW_RBS-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
@@ -887,8 +898,9 @@ body: |
     ; NEW_RBS-NEXT: [[MV:%[0-9]+]]:vgpr(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
     ; NEW_RBS-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
     ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
-    ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
-    ; NEW_RBS-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
+    ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1)
+    ; NEW_RBS-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY4]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
     ; NEW_RBS-NEXT: G_BR %bb.1
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.1:
@@ -897,10 +909,10 @@ body: |
     ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.2:
-    ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI [[C]](s32), %bb.0, [[C1]](s32), %bb.1
-    ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-    ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[PHI]](s32)
-    ; NEW_RBS-NEXT: G_STORE [[COPY4]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
+    ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:vgpr(s32) = G_PHI [[C]](s32), %bb.0, [[C1]](s32), %bb.1
+    ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF]](s32)
+    ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY5]](s32)
+    ; NEW_RBS-NEXT: G_STORE [[PHI]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
   bb.0:
     successors: %bb.1(0x40000000), %bb.2(0x40000000)
@@ -983,15 +995,16 @@ body: |
     ; NEW_RBS-NEXT: bb.1:
     ; NEW_RBS-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
     ; NEW_RBS-NEXT: {{ $}}
-    ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
-    ; NEW_RBS-NEXT: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1
+    ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:sgpr(s32) = G_PHI %17(s32), %bb.1, [[C1]](s32), %bb.0
+    ; NEW_RBS-NEXT: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1
     ; NEW_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
-    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[PHI1]], [[COPY3]]
-    ; NEW_RBS-NEXT: [[UITOFP:%[0-9]+]]:vgpr(s32) = G_UITOFP [[ADD]](s32)
+    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[PHI1]], [[C2]]
+    ; NEW_RBS-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
+    ; NEW_RBS-NEXT: [[UITOFP:%[0-9]+]]:vgpr(s32) = G_UITOFP [[COPY3]](s32)
     ; NEW_RBS-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
-    ; NEW_RBS-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
-    ; NEW_RBS-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; NEW_RBS-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+    ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s32) = COPY [[INT]](s32)
+    ; NEW_RBS-NEXT: SI_LOOP [[COPY4]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     ; NEW_RBS-NEXT: G_BR %bb.2
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.2:
@@ -999,8 +1012,8 @@ body: |
     ; NEW_RBS-NEXT: [[PHI3:%[0-9]+]]:sgpr(s32) = G_PHI [[INT]](s32), %bb.1
     ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s32)
     ; NEW_RBS-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
-    ; NEW_RBS-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32)
-    ; NEW_RBS-NEXT: [[MUL:%[0-9]+]]:vgpr(s32) = G_MUL [[PHI2]], [[COPY4]]
+    ; NEW_RBS-NEXT: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32)
+    ; NEW_RBS-NEXT: [[MUL:%[0-9]+]]:vgpr(s32) = G_MUL [[PHI2]], [[COPY5]]
     ; NEW_RBS-NEXT: G_STORE [[MUL]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
     ; NEW_RBS-NEXT: S_ENDPGM 0
   bb.0:
@@ -1180,47 +1193,46 @@ body: |
     ; NEW_RBS-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %13(s1), %bb.3
-    ; NEW_RBS-NEXT: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0
+    ; NEW_RBS-NEXT: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI %68(s32), %bb.3, [[C]](s32), %bb.0
     ; NEW_RBS-NEXT: [[PHI2:%[0-9]+]]:vgpr(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3
     ; NEW_RBS-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
-    ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[PHI2]](s32)
     ; NEW_RBS-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 31
-    ; NEW_RBS-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[COPY7]], [[C1]](s32)
-    ; NEW_RBS-NEXT: [[MV3:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY7]](s32), [[ASHR]](s32)
+    ; NEW_RBS-NEXT: [[ASHR:%[0-9]+]]:vgpr(s32) = G_ASHR [[PHI2]], [[C1]](s32)
+    ; NEW_RBS-NEXT: [[MV3:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[PHI2]](s32), [[ASHR]](s32)
     ; NEW_RBS-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
-    ; NEW_RBS-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
-    ; NEW_RBS-NEXT: [[SHL:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY8]](s32)
+    ; NEW_RBS-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
+    ; NEW_RBS-NEXT: [[SHL:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY7]](s32)
     ; NEW_RBS-NEXT: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
     ; NEW_RBS-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
     ; NEW_RBS-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
-    ; NEW_RBS-NEXT: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32)
-    ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[COPY9]]
+    ; NEW_RBS-NEXT: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[C3]](s32)
+    ; NEW_RBS-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[COPY8]]
+    ; NEW_RBS-NEXT: [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1)
     ; NEW_RBS-NEXT: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; NEW_RBS-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C4]](s32)
-    ; NEW_RBS-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[TRUNC]](s1)
+    ; NEW_RBS-NEXT: [[AMDGPU_COPY_VCC_SCC:%[0-9]+]]:sreg_32(s1) = G_AMDGPU_COPY_VCC_SCC [[C4]](s32)
     ; NEW_RBS-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
-    ; NEW_RBS-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+    ; NEW_RBS-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[AMDGPU_COPY_VCC_SCC]](s1), implicit-def $scc
     ; NEW_RBS-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
-    ; NEW_RBS-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-    ; NEW_RBS-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; NEW_RBS-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
+    ; NEW_RBS-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY9]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
     ; NEW_RBS-NEXT: G_BR %bb.2
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.2:
     ; NEW_RBS-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
-    ; NEW_RBS-NEXT: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[C5]](s32)
-    ; NEW_RBS-NEXT: [[SHL1:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY12]](s32)
+    ; NEW_RBS-NEXT: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[C5]](s32)
+    ; NEW_RBS-NEXT: [[SHL1:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY11]](s32)
     ; NEW_RBS-NEXT: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64)
     ; NEW_RBS-NEXT: [[LOAD1:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
     ; NEW_RBS-NEXT: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
-    ; NEW_RBS-NEXT: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[C6]](s32)
-    ; NEW_RBS-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[COPY13]]
+    ; NEW_RBS-NEXT: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[C6]](s32)
+    ; NEW_RBS-NEXT: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[COPY12]]
+    ; NEW_RBS-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP1]](s1)
     ; NEW_RBS-NEXT: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; NEW_RBS-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C7]](s32)
-    ; NEW_RBS-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[TRUNC1]](s1)
-    ; NEW_RBS-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1)
-    ; NEW_RBS-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; NEW_RBS-NEXT: [[AMDGPU_COPY_VCC_SCC1:%[0-9]+]]:sreg_32(s1) = G_AMDGPU_COPY_VCC_SCC [[C7]](s32)
+    ; NEW_RBS-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[AMDGPU_COPY_VCC_SCC1]](s1)
+    ; NEW_RBS-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY13]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
     ; NEW_RBS-NEXT: G_BR %bb.4
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.3:
@@ -1228,44 +1240,48 @@ body: |
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %43(s1), %bb.5
     ; NEW_RBS-NEXT: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI %44(s32), %bb.5, [[DEF]](s32), %bb.1
-    ; NEW_RBS-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
-    ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-    ; NEW_RBS-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY16]](s1), [[PHI1]](s32)
-    ; NEW_RBS-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    ; NEW_RBS-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
+    ; NEW_RBS-NEXT: [[COPY16:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF]](s32)
+    ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY16]](s32)
+    ; NEW_RBS-NEXT: [[COPY17:%[0-9]+]]:vcc(s1) = COPY [[COPY15]](s1)
+    ; NEW_RBS-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY17]](s1), [[PHI1]](s32)
+    ; NEW_RBS-NEXT: [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s32) = COPY [[INT]](s32)
+    ; NEW_RBS-NEXT: SI_LOOP [[COPY18]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     ; NEW_RBS-NEXT: G_BR %bb.6
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.4:
     ; NEW_RBS-NEXT: successors: %bb.5(0x80000000)
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
-    ; NEW_RBS-NEXT: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[C8]](s32)
-    ; NEW_RBS-NEXT: [[SHL2:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY17]](s32)
+    ; NEW_RBS-NEXT: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[C8]](s32)
+    ; NEW_RBS-NEXT: [[SHL2:%[0-9]+]]:vgpr(s64) = G_SHL [[MV3]], [[COPY19]](s32)
     ; NEW_RBS-NEXT: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[MV]], [[SHL2]](s64)
     ; NEW_RBS-NEXT: [[LOAD2:%[0-9]+]]:vgpr(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s32), addrspace 1)
     ; NEW_RBS-NEXT: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; NEW_RBS-NEXT: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[C9]](s32)
-    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[LOAD2]], [[COPY18]]
+    ; NEW_RBS-NEXT: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[C9]](s32)
+    ; NEW_RBS-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[LOAD2]], [[COPY20]]
     ; NEW_RBS-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD2]](p1) :: (store (s32), addrspace 1)
-    ; NEW_RBS-NEXT: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[C9]](s32)
-    ; NEW_RBS-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[PHI2]], [[COPY19]]
+    ; NEW_RBS-NEXT: [[COPY21:%[0-9]+]]:vgpr(s32) = COPY [[C9]](s32)
+    ; NEW_RBS-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[PHI2]], [[COPY21]]
     ; NEW_RBS-NEXT: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 100
-    ; NEW_RBS-NEXT: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[C10]](s32)
-    ; NEW_RBS-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[COPY20]]
-    ; NEW_RBS-NEXT: [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
-    ; NEW_RBS-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY15]](s1), $exec_lo, implicit-def $scc
-    ; NEW_RBS-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY21]](s1), implicit-def $scc
+    ; NEW_RBS-NEXT: [[COPY22:%[0-9]+]]:vgpr(s32) = COPY [[C10]](s32)
+    ; NEW_RBS-NEXT: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[COPY22]]
+    ; NEW_RBS-NEXT: [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+    ; NEW_RBS-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY14]](s1), $exec_lo, implicit-def $scc
+    ; NEW_RBS-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY23]](s1), implicit-def $scc
     ; NEW_RBS-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
     ; NEW_RBS-NEXT: {{ $}}
     ; NEW_RBS-NEXT: bb.5:
     ; NEW_RBS-NEXT: successors: %bb.3(0x80000000)
     ; NEW_RBS-NEXT: {{ $}}
-    ; NEW_RBS-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY14]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
+    ; NEW_RBS-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[AMDGPU_COPY_VCC_SCC1]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
     ; NEW_RBS-NEXT: [[PHI6:%[0-9]+]]:vgpr(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2
-    ; NEW_RBS-NEXT: [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
-    ; NEW_RBS-NEXT: [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY [[COPY22]](s1)
-    ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
-    ; NEW_RBS-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc
-    ; NEW_RBS-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY23]](s1), implicit-def $scc
+    ; NEW_RBS-NEXT: [[COPY24:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
+    ; NEW_RBS-NEXT: [[COPY25:%[0-9]+]]:sreg_32(s1) = COPY [[COPY24]](s1)
+    ; NEW_RBS-NEXT: [[COPY26:%[0-9]+]]:sgpr(s32) = COPY [[SI_IF1]](s32)
+    ; NEW_RBS-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[COPY26]](s32)
+    ; NEW_RBS-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
+    ; NEW_RBS-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY25]](s1), implicit-def $scc
     ; NEW_RBS-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
     ; NEW_RBS-NEXT: G_BR %bb.3
     ; NEW_RBS-NEXT: {{ $}}