Skip to content

Commit 361f2a7

Browse files
committed
AMDGPU/GlobalISel: Handle sbfe/ubfe intrinsic
Try to handle arbitrary scalar BFEs by packing the operands. The DAG gives up on non-constant arguments. We're still missing any constant folding, so we end up with pretty ugly code most of the time. Also handle the 64-bit scalar case, which the DAG doesn't try to do.
1 parent 323db5d commit 361f2a7

File tree

5 files changed

+2131
-8
lines changed

5 files changed

+2131
-8
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -278,8 +278,8 @@ def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
278278
def AMDGPUround : SDNode<"ISD::FROUND",
279279
SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
280280

281-
def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
282-
def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
281+
// SelectionDAG node for AMDGPUISD::BFE_U32, declared with the
// AMDGPUDTIntTernaryOp type profile.
def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
282+
// SelectionDAG node for AMDGPUISD::BFE_I32, declared with the
// AMDGPUDTIntTernaryOp type profile.
def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
283283
def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
284284
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
285285

@@ -460,3 +460,11 @@ def AMDGPUmul_u24 : PatFrags<(ops node:$src0, node:$src1),
460460
def AMDGPUmul_i24 : PatFrags<(ops node:$src0, node:$src1),
461461
[(int_amdgcn_mul_i24 node:$src0, node:$src1),
462462
(AMDGPUmul_i24_impl node:$src0, node:$src1)]>;
463+
464+
// Pattern fragment with two alternatives over the same three operands
// (src0, src1, src2): the int_amdgcn_sbfe intrinsic, or the
// AMDGPUbfe_i32_impl node. A pattern written against AMDGPUbfe_i32 therefore
// covers both forms.
def AMDGPUbfe_i32 : PatFrags<(ops node:$src0, node:$src1, node:$src2),
465+
[(int_amdgcn_sbfe node:$src0, node:$src1, node:$src2),
466+
(AMDGPUbfe_i32_impl node:$src0, node:$src1, node:$src2)]>;
467+
468+
// Pattern fragment with two alternatives over the same three operands
// (src0, src1, src2): the int_amdgcn_ubfe intrinsic, or the
// AMDGPUbfe_u32_impl node. A pattern written against AMDGPUbfe_u32 therefore
// covers both forms.
def AMDGPUbfe_u32 : PatFrags<(ops node:$src0, node:$src1, node:$src2),
469+
[(int_amdgcn_ubfe node:$src0, node:$src1, node:$src2),
470+
(AMDGPUbfe_u32_impl node:$src0, node:$src1, node:$src2)]>;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 77 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -160,8 +160,9 @@ class ApplyRegBankMapping final : public GISelChangeObserver {
160160
if (!Op.isReg())
161161
continue;
162162

163+
// We may see physical registers if building a real MI
163164
Register Reg = Op.getReg();
164-
if (MRI.getRegClassOrRegBank(Reg))
165+
if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
165166
continue;
166167

167168
const RegisterBank *RB = NewBank;
@@ -1444,6 +1445,65 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
14441445
return true;
14451446
}
14461447

1448+
/// Apply the selected register bank mapping to an amdgcn.sbfe / amdgcn.ubfe
/// intrinsic.
///
/// The default mapping is applied first. If the result was mapped to the VGPR
/// bank, a 32-bit result needs no further work and any other result type is
/// not handled yet. Otherwise the intrinsic is replaced in place by one of
/// S_BFE_I32 / S_BFE_U32 / S_BFE_I64 / S_BFE_U64, whose second source carries
/// the offset and width packed into one 32-bit value.
///
/// \param OpdMapper supplies the instruction, its MachineRegisterInfo and the
///        instruction mapping that was chosen for it.
/// \param Signed picks the S_BFE_I* opcodes when true and the S_BFE_U*
///        opcodes when false.
/// \returns true when the instruction was handled; false for a VGPR result
///          that is not 32 bits wide.
bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
1449+
const OperandsMapper &OpdMapper, bool Signed) const {
1450+
MachineInstr &MI = OpdMapper.getMI();
1451+
MachineRegisterInfo &MRI = OpdMapper.getMRI();
1452+
1453+
// Insert basic copies
1454+
applyDefaultMapping(OpdMapper);
1455+
1456+
// Operand 0 is the result; its type decides between the 32-bit and 64-bit
// forms below.
Register DstReg = MI.getOperand(0).getReg();
1457+
LLT Ty = MRI.getType(DstReg);
1458+
1459+
const LLT S32 = LLT::scalar(32);
1460+
1461+
// Bank assigned to the first piece of the result's value mapping.
const RegisterBank *DstBank =
1462+
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1463+
if (DstBank == &AMDGPU::VGPRRegBank) {
1464+
// A 32-bit VGPR result is left as the intrinsic; the default mapping
// applied above is all that is done here.
if (Ty == S32)
1465+
return true;
1466+
1467+
// TODO: 64-bit version is scalar only, so we need to expand this.
1468+
return false;
1469+
}
1470+
1471+
// Operands 2, 3 and 4 are read as the source value, the bit offset and the
// bit width. NOTE(review): operand 1 is presumably the intrinsic ID -- it is
// never read in this function; confirm against the intrinsic's MI layout.
Register SrcReg = MI.getOperand(2).getReg();
1472+
Register OffsetReg = MI.getOperand(3).getReg();
1473+
Register WidthReg = MI.getOperand(4).getReg();
1474+
1475+
// The scalar form packs the offset and width in a single operand.
1476+
1477+
// The builder inserts at MI and reports to an ApplyRegBankMapping observer
// constructed with the SGPR bank. NOTE(review): presumably this is what
// gives the temporaries created below a register bank; confirm against
// ApplyRegBankMapping.
ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1478+
GISelObserverWrapper Observer(&ApplyBank);
1479+
MachineIRBuilder B(MI);
1480+
B.setChangeObserver(Observer);
1481+
1482+
// Ensure the high bits are clear to insert the offset: only the low 6 bits
// of the offset are kept.
1483+
auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1484+
auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1485+
1486+
// The left shift by 16 leaves the low 16 bits zero, so the shifted width
// cannot disturb the offset field and the width is not masked first.
// NOTE(review): width bits that land above bit 22 are not cleared --
// presumably the instruction ignores them; confirm against the ISA manual.
1487+
auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1488+
1489+
// Transformation function, pack the offset and width of a BFE into
1490+
// the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1491+
// source, bits [5:0] contain the offset and bits [22:16] the width.
1492+
auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1493+
1494+
// TODO: It might be worth using a pseudo here to avoid scc clobber and
1495+
// register class constraints.
// Any result type other than s32 selects the 64-bit opcode.
1496+
unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1497+
(Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1498+
1499+
// A target instruction is built directly here, so its register operands
// are constrained immediately; failing to do so is treated as unreachable.
auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1500+
if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1501+
llvm_unreachable("failed to constrain BFE");
1502+
1503+
// The S_BFE defines DstReg now, so the original intrinsic is removed.
MI.eraseFromParent();
1504+
return true;
1505+
}
1506+
14471507
// FIXME: Duplicated from LegalizerHelper
14481508
static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
14491509
switch (Opc) {
@@ -2592,8 +2652,12 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
25922652
constrainOpWithReadfirstlane(MI, MRI, 5);
25932653
return;
25942654
}
2595-
default:
2596-
break;
2655+
case Intrinsic::amdgcn_sbfe:
2656+
applyMappingBFEIntrinsic(OpdMapper, true);
2657+
return;
2658+
case Intrinsic::amdgcn_ubfe:
2659+
applyMappingBFEIntrinsic(OpdMapper, false);
2660+
return;
25972661
}
25982662
break;
25992663
}
@@ -2687,7 +2751,11 @@ AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
26872751
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
26882752

26892753
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2690-
unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
2754+
const MachineOperand &SrcOp = MI.getOperand(i);
2755+
if (!SrcOp.isReg())
2756+
continue;
2757+
2758+
unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
26912759
OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
26922760
}
26932761
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
@@ -3498,8 +3566,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
34983566
case Intrinsic::amdgcn_fmad_ftz:
34993567
case Intrinsic::amdgcn_mbcnt_lo:
35003568
case Intrinsic::amdgcn_mbcnt_hi:
3501-
case Intrinsic::amdgcn_ubfe:
3502-
case Intrinsic::amdgcn_sbfe:
35033569
case Intrinsic::amdgcn_mul_u24:
35043570
case Intrinsic::amdgcn_mul_i24:
35053571
case Intrinsic::amdgcn_lerp:
@@ -3521,6 +3587,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
35213587
case Intrinsic::amdgcn_sdot8:
35223588
case Intrinsic::amdgcn_udot8:
35233589
return getDefaultMappingVOP(MI);
3590+
case Intrinsic::amdgcn_sbfe:
3591+
case Intrinsic::amdgcn_ubfe:
3592+
if (isSALUMapping(MI))
3593+
return getDefaultMappingSOP(MI);
3594+
return getDefaultMappingVOP(MI);
35243595
case Intrinsic::amdgcn_ds_swizzle:
35253596
case Intrinsic::amdgcn_ds_permute:
35263597
case Intrinsic::amdgcn_ds_bpermute:

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
7878
MachineRegisterInfo &MRI, int RSrcIdx) const;
7979
bool applyMappingSBufferLoad(const OperandsMapper &OpdMapper) const;
8080

81+
/// Apply the register bank mapping to an amdgcn.sbfe / amdgcn.ubfe
/// intrinsic. \p Signed is true for sbfe and false for ubfe. Returns false
/// when the mapped form is not handled yet.
bool applyMappingBFEIntrinsic(const OperandsMapper &OpdMapper,
82+
bool Signed) const;
83+
8184
void lowerScalarMinMax(MachineIRBuilder &B, MachineInstr &MI) const;
8285

8386
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,

0 commit comments

Comments
 (0)