llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
Lines changed: 10 additions & 2 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Lines changed: 77 additions & 6 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
Lines changed: 3 additions & 0 deletions
@@ -278,8 +278,8 @@ def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
 def AMDGPUround : SDNode<"ISD::FROUND",
                          SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
 
-def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
-def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
 def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
 def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
 
@@ -460,3 +460,11 @@ def AMDGPUmul_u24 : PatFrags<(ops node:$src0, node:$src1),
 def AMDGPUmul_i24 : PatFrags<(ops node:$src0, node:$src1),
   [(int_amdgcn_mul_i24 node:$src0, node:$src1),
    (AMDGPUmul_i24_impl node:$src0, node:$src1)]>;
+
+def AMDGPUbfe_i32 : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+  [(int_amdgcn_sbfe node:$src0, node:$src1, node:$src2),
+   (AMDGPUbfe_i32_impl node:$src0, node:$src1, node:$src2)]>;
+
+def AMDGPUbfe_u32 : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+  [(int_amdgcn_ubfe node:$src0, node:$src1, node:$src2),
+   (AMDGPUbfe_u32_impl node:$src0, node:$src1, node:$src2)]>;
@@ -160,8 +160,9 @@ class ApplyRegBankMapping final : public GISelChangeObserver {
       if (!Op.isReg())
         continue;
 
+      // We may see physical registers if building a real MI
       Register Reg = Op.getReg();
-      if (MRI.getRegClassOrRegBank(Reg))
+      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
         continue;
 
       const RegisterBank *RB = NewBank;
@@ -1444,6 +1445,65 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
   return true;
 }
 
+bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
+  const OperandsMapper &OpdMapper, bool Signed) const {
+  MachineInstr &MI = OpdMapper.getMI();
+  MachineRegisterInfo &MRI = OpdMapper.getMRI();
+
+  // Insert basic copies; the non-VGPR path below replaces MI with S_BFE_*.
+  applyDefaultMapping(OpdMapper);
+
+  Register DstReg = MI.getOperand(0).getReg();
+  LLT Ty = MRI.getType(DstReg);
+
+  const LLT S32 = LLT::scalar(32);
+
+  const RegisterBank *DstBank =
+    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+  if (DstBank == &AMDGPU::VGPRRegBank) {
+    if (Ty == S32)
+      return true;
+
+    // TODO: 64-bit version is scalar only, so the VGPR case needs expanding.
+    return false;
+  }
+
+  Register SrcReg = MI.getOperand(2).getReg();
+  Register OffsetReg = MI.getOperand(3).getReg();
+  Register WidthReg = MI.getOperand(4).getReg();
+
+  // The scalar form takes the offset and width packed into a single operand.
+
+  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
+  GISelObserverWrapper Observer(&ApplyBank);
+  MachineIRBuilder B(MI);
+  B.setChangeObserver(Observer);
+
+  // Keep only offset bits [5:0] so nothing spills into the width field.
+  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
+  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
+
+  // Shifting left by 16 zeros the low bits, so don't bother clamping the width.
+  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
+
+  // Pack the offset and width of a BFE into the format expected by
+  // S_BFE_I32 / S_BFE_U32 (and the 64-bit forms): in the second source
+  // operand, bits [5:0] hold the offset and bits [22:16] the width.
+  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
+
+  // TODO: It might be worth using a pseudo here to avoid the scc clobber
+  // and the register class constraints.
+  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
+                             (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+
+  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
+  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
+    llvm_unreachable("failed to constrain BFE");
+
+  MI.eraseFromParent();
+  return true;
+}
+
 // FIXME: Duplicated from LegalizerHelper
 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
   switch (Opc) {
@@ -2592,8 +2652,12 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
       constrainOpWithReadfirstlane(MI, MRI, 5);
       return;
     }
-    default:
-      break;
+    case Intrinsic::amdgcn_sbfe:
+      applyMappingBFEIntrinsic(OpdMapper, true);
+      return;
+    case Intrinsic::amdgcn_ubfe:
+      applyMappingBFEIntrinsic(OpdMapper, false);
+      return;
     }
     break;
   }
@@ -2687,7 +2751,11 @@ AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
 
   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
-    unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
+    const MachineOperand &SrcOp = MI.getOperand(i);
+    if (!SrcOp.isReg())
+      continue;
+
+    unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
   }
   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
@@ -3498,8 +3566,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_fmad_ftz:
     case Intrinsic::amdgcn_mbcnt_lo:
     case Intrinsic::amdgcn_mbcnt_hi:
-    case Intrinsic::amdgcn_ubfe:
-    case Intrinsic::amdgcn_sbfe:
     case Intrinsic::amdgcn_mul_u24:
     case Intrinsic::amdgcn_mul_i24:
     case Intrinsic::amdgcn_lerp:
@@ -3521,6 +3587,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_sdot8:
     case Intrinsic::amdgcn_udot8:
       return getDefaultMappingVOP(MI);
+    case Intrinsic::amdgcn_sbfe:
+    case Intrinsic::amdgcn_ubfe:
+      if (isSALUMapping(MI))
+        return getDefaultMappingSOP(MI);
+      return getDefaultMappingVOP(MI);
     case Intrinsic::amdgcn_ds_swizzle:
     case Intrinsic::amdgcn_ds_permute:
     case Intrinsic::amdgcn_ds_bpermute:
 
@@ -78,6 +78,9 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
                     MachineRegisterInfo &MRI, int RSrcIdx) const;
   bool applyMappingSBufferLoad(const OperandsMapper &OpdMapper) const;
 
+  bool applyMappingBFEIntrinsic(const OperandsMapper &OpdMapper,
+                                bool Signed) const;
+
   void lowerScalarMinMax(MachineIRBuilder &B, MachineInstr &MI) const;
 
   Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,