Skip to content

Commit f33ec0a

Browse files
shiltianjayfoadkosarev
committed
[AMDGPU] Support lowering of cluster-related intrinsics
Because much of the code is interconnected, this also changes how the workgroup ID is lowered. Co-authored-by: Jay Foad <[email protected]> Co-authored-by: Ivan Kosarev <[email protected]>
1 parent 3e18b5a commit f33ec0a

21 files changed

+4100
-43
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1812,6 +1812,13 @@ The AMDGPU backend supports the following LLVM IR attributes.
18121812
offset by one less than the number of dynamic VGPR blocks required
18131813
by the function encoded in bits 5..3.
18141814

1815+
"amdgpu-cluster-dims"="x,y,z" Specify the cluster workgroup dimensions. A value of "0,0,0" indicates that
1816+
clusters are disabled. A value of "1024,1024,1024" indicates that clusters are enabled,
1817+
but the dimensions cannot be determined at compile time. Any other value explicitly
1818+
specifies the cluster dimensions.
1819+
1820+
This is only relevant on targets with cluster support.
1821+
18151822
================================================ ==========================================================
18161823

18171824
Calling Conventions

llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,14 @@ AMDGPUFunctionArgInfo::getPreloadedValue(
107107
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
108108
return std::tuple(WorkGroupIDZ ? &WorkGroupIDZ : nullptr,
109109
&AMDGPU::SGPR_32RegClass, LLT::scalar(32));
110+
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
111+
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
112+
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
113+
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
114+
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
115+
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
116+
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
117+
return std::tuple(nullptr, &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
110118
case AMDGPUFunctionArgInfo::LDS_KERNEL_ID:
111119
return std::tuple(LDSKernelId ? &LDSKernelId : nullptr,
112120
&AMDGPU::SGPR_32RegClass, LLT::scalar(32));

llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -111,18 +111,25 @@ struct AMDGPUFunctionArgInfo {
111111
DISPATCH_ID = 4,
112112
FLAT_SCRATCH_INIT = 5,
113113
LDS_KERNEL_ID = 6, // LLVM internal, not part of the ABI
114-
WORKGROUP_ID_X = 10,
115-
WORKGROUP_ID_Y = 11,
116-
WORKGROUP_ID_Z = 12,
114+
WORKGROUP_ID_X = 10, // Also used for cluster ID X.
115+
WORKGROUP_ID_Y = 11, // Also used for cluster ID Y.
116+
WORKGROUP_ID_Z = 12, // Also used for cluster ID Z.
117117
PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
118118
IMPLICIT_BUFFER_PTR = 15,
119119
IMPLICIT_ARG_PTR = 16,
120120
PRIVATE_SEGMENT_SIZE = 17,
121+
CLUSTER_WORKGROUP_ID_X = 21,
122+
CLUSTER_WORKGROUP_ID_Y = 22,
123+
CLUSTER_WORKGROUP_ID_Z = 23,
124+
CLUSTER_WORKGROUP_MAX_ID_X = 24,
125+
CLUSTER_WORKGROUP_MAX_ID_Y = 25,
126+
CLUSTER_WORKGROUP_MAX_ID_Z = 26,
127+
CLUSTER_WORKGROUP_MAX_FLAT_ID = 27,
121128

122129
// VGPRS:
123-
WORKITEM_ID_X = 18,
124-
WORKITEM_ID_Y = 19,
125-
WORKITEM_ID_Z = 20,
130+
WORKITEM_ID_X = 28,
131+
WORKITEM_ID_Y = 29,
132+
WORKITEM_ID_Z = 30,
126133
FIRST_VGPR_VALUE = WORKITEM_ID_X
127134
};
128135
// clang-format on

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 212 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4452,6 +4452,74 @@ void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
44524452
}
44534453
}
44544454

4455+
// Legalizes a workgroup-ID intrinsic: builds the value into MI's first operand
// and erases MI. Returns false (leaving MI in place) if any required input
// value cannot be loaded through loadInputValue; otherwise returns true.
//
// WorkGroupIdPV is loaded directly as the result when the subtarget reports no
// cluster support. When clusters are supported it is treated as the cluster
// ID, and ClusterMaxIdPV / ClusterWorkGroupIdPV supply the other two inputs of
// the formula documented in the body.
bool AMDGPULegalizerInfo::legalizeWorkGroupId(
4456+
MachineInstr &MI, MachineIRBuilder &B,
4457+
AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
4458+
AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
4459+
AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
4460+
Register DstReg = MI.getOperand(0).getReg();
4461+
// No cluster support on this subtarget: the preloaded value is the result.
if (!ST.hasClusters()) {
4462+
if (!loadInputValue(DstReg, B, WorkGroupIdPV))
4463+
return false;
4464+
MI.eraseFromParent();
4465+
return true;
4466+
}
4467+
4468+
// Clusters are supported. Return the global position in the grid. If clusters
4469+
// are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
4470+
4471+
// WorkGroupIdXYZ = ClusterId == 0 ?
4472+
// ClusterIdXYZ :
4473+
// ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
4474+
MachineRegisterInfo &MRI = *B.getMRI();
4475+
const LLT S32 = LLT::scalar(32);
4476+
Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
4477+
Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
4478+
Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
4479+
// All three inputs are loaded up front; failure to load any one aborts the
// legalization before MI is erased.
if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
4480+
!loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
4481+
!loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
4482+
return false;
4483+
4484+
// The global-ID arithmetic is built unconditionally, before the cluster kind
// is inspected; the NoCluster case below does not use GlobalIdXYZ.
auto One = B.buildConstant(S32, 1);
4485+
auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
4486+
auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
4487+
B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
4488+
4489+
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4490+
4491+
// Pick the result according to the function's cluster-dimension kind.
switch (MFI->getClusterDims().getKind()) {
4492+
// Fixed or variable dimensions: the result is always the computed global ID.
case AMDGPU::ClusterDimsAttr::Kind::FixedDims:
4493+
case AMDGPU::ClusterDimsAttr::Kind::VariableDims: {
4494+
B.buildCopy(DstReg, GlobalIdXYZ);
4495+
MI.eraseFromParent();
4496+
return true;
4497+
}
4498+
// No cluster: the value loaded from WorkGroupIdPV is used unchanged.
case AMDGPU::ClusterDimsAttr::Kind::NoCluster: {
4499+
B.buildCopy(DstReg, ClusterIdXYZ);
4500+
MI.eraseFromParent();
4501+
return true;
4502+
}
4503+
// Kind not known at compile time: read a hardware-register field and select
// between the two candidates at run time.
case AMDGPU::ClusterDimsAttr::Kind::Unknown: {
4504+
using namespace AMDGPU::Hwreg;
4505+
// Field of IB_STS2 encoded with (6, 4); legalizeConstHwRegRead passes
// (LowBit, Width) to the same encode call, so this is presumably low bit 6,
// width 4.
// NOTE(review): presumably a zero field means the wave is not part of a
// cluster - confirm against the ISA documentation.
unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4506+
Register ClusterId = MRI.createGenericVirtualRegister(S32);
4507+
MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4508+
B.buildInstr(AMDGPU::S_GETREG_B32_const)
4509+
.addDef(ClusterId)
4510+
.addImm(ClusterIdField);
4511+
auto Zero = B.buildConstant(S32, 0);
4512+
auto NoClusters =
4513+
B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
4514+
// Field == 0 selects ClusterIdXYZ, otherwise the computed global ID.
B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4515+
MI.eraseFromParent();
4516+
return true;
4517+
}
4518+
}
4519+
4520+
llvm_unreachable("nothing should reach here");
4521+
}
4522+
44554523
bool AMDGPULegalizerInfo::loadInputValue(
44564524
Register DstReg, MachineIRBuilder &B,
44574525
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
@@ -4471,8 +4539,31 @@ bool AMDGPULegalizerInfo::loadInputValue(
44714539
AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
44724540
const ArgDescriptor WorkGroupIDZ =
44734541
ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4542+
const ArgDescriptor ClusterWorkGroupIDX =
4543+
ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
4544+
const ArgDescriptor ClusterWorkGroupIDY =
4545+
ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
4546+
const ArgDescriptor ClusterWorkGroupIDZ =
4547+
ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
4548+
const ArgDescriptor ClusterWorkGroupMaxIDX =
4549+
ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
4550+
const ArgDescriptor ClusterWorkGroupMaxIDY =
4551+
ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
4552+
const ArgDescriptor ClusterWorkGroupMaxIDZ =
4553+
ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
4554+
const ArgDescriptor ClusterWorkGroupMaxFlatID =
4555+
ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
4556+
4557+
auto LoadConstant = [&](unsigned N) {
4558+
B.buildConstant(DstReg, N);
4559+
return true;
4560+
};
4561+
44744562
if (ST.hasArchitectedSGPRs() &&
44754563
(AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4564+
AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4565+
bool HasFixedDims = ClusterDims.isFixedDims();
4566+
44764567
switch (ArgType) {
44774568
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
44784569
Arg = &WorkGroupIDX;
@@ -4489,6 +4580,53 @@ bool AMDGPULegalizerInfo::loadInputValue(
44894580
ArgRC = &AMDGPU::SReg_32RegClass;
44904581
ArgTy = LLT::scalar(32);
44914582
break;
4583+
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
4584+
if (HasFixedDims && ClusterDims.getDims()[0] == 1)
4585+
return LoadConstant(0);
4586+
Arg = &ClusterWorkGroupIDX;
4587+
ArgRC = &AMDGPU::SReg_32RegClass;
4588+
ArgTy = LLT::scalar(32);
4589+
break;
4590+
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
4591+
if (HasFixedDims && ClusterDims.getDims()[1] == 1)
4592+
return LoadConstant(0);
4593+
Arg = &ClusterWorkGroupIDY;
4594+
ArgRC = &AMDGPU::SReg_32RegClass;
4595+
ArgTy = LLT::scalar(32);
4596+
break;
4597+
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
4598+
if (HasFixedDims && ClusterDims.getDims()[2] == 1)
4599+
return LoadConstant(0);
4600+
Arg = &ClusterWorkGroupIDZ;
4601+
ArgRC = &AMDGPU::SReg_32RegClass;
4602+
ArgTy = LLT::scalar(32);
4603+
break;
4604+
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
4605+
if (HasFixedDims)
4606+
return LoadConstant(ClusterDims.getDims()[0] - 1);
4607+
Arg = &ClusterWorkGroupMaxIDX;
4608+
ArgRC = &AMDGPU::SReg_32RegClass;
4609+
ArgTy = LLT::scalar(32);
4610+
break;
4611+
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
4612+
if (HasFixedDims)
4613+
return LoadConstant(ClusterDims.getDims()[1] - 1);
4614+
Arg = &ClusterWorkGroupMaxIDY;
4615+
ArgRC = &AMDGPU::SReg_32RegClass;
4616+
ArgTy = LLT::scalar(32);
4617+
break;
4618+
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
4619+
if (HasFixedDims)
4620+
return LoadConstant(ClusterDims.getDims()[2] - 1);
4621+
Arg = &ClusterWorkGroupMaxIDZ;
4622+
ArgRC = &AMDGPU::SReg_32RegClass;
4623+
ArgTy = LLT::scalar(32);
4624+
break;
4625+
case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
4626+
Arg = &ClusterWorkGroupMaxFlatID;
4627+
ArgRC = &AMDGPU::SReg_32RegClass;
4628+
ArgTy = LLT::scalar(32);
4629+
break;
44924630
default:
44934631
break;
44944632
}
@@ -4499,10 +4637,9 @@ bool AMDGPULegalizerInfo::loadInputValue(
44994637

45004638
if (!Arg) {
45014639
if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4502-
// The intrinsic may appear when we have a 0 sized kernarg segment, in which
4503-
// case the pointer argument may be missing and we use null.
4504-
B.buildConstant(DstReg, 0);
4505-
return true;
4640+
// The intrinsic may appear when we have a 0 sized kernarg segment, in
4641+
// which case the pointer argument may be missing and we use null.
4642+
return LoadConstant(0);
45064643
}
45074644

45084645
// It's undefined behavior if a function marked with the amdgpu-no-*
@@ -7415,6 +7552,22 @@ bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
74157552
return true;
74167553
}
74177554

7555+
// Legalizes MI into a single S_GETREG_B32_const that defines MI's first
// operand, with an immediate encoded from (HwReg, LowBit, Width), and then
// erases MI. Always returns true.
bool AMDGPULegalizerInfo::legalizeConstHwRegRead(MachineInstr &MI,
7556+
MachineIRBuilder &B,
7557+
AMDGPU::Hwreg::Id HwReg,
7558+
unsigned LowBit,
7559+
unsigned Width) const {
7560+
MachineRegisterInfo &MRI = *B.getMRI();
7561+
Register DstReg = MI.getOperand(0).getReg();
7562+
// Assign SReg_32 only when the destination has no register class yet, so an
// existing class is left untouched.
if (!MRI.getRegClassOrNull(DstReg))
7563+
MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7564+
B.buildInstr(AMDGPU::S_GETREG_B32_const)
7565+
.addDef(DstReg)
7566+
.addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
7567+
// The original instruction is replaced by the S_GETREG_B32_const built above.
MI.eraseFromParent();
7568+
return true;
7569+
}
7570+
74187571
static constexpr unsigned FPEnvModeBitField =
74197572
AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
74207573

@@ -7577,14 +7730,64 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
75777730
return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
75787731
AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
75797732
case Intrinsic::amdgcn_workgroup_id_x:
7580-
return legalizePreloadedArgIntrin(MI, MRI, B,
7581-
AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7733+
return legalizeWorkGroupId(
7734+
MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
7735+
AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X,
7736+
AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
75827737
case Intrinsic::amdgcn_workgroup_id_y:
7583-
return legalizePreloadedArgIntrin(MI, MRI, B,
7584-
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7738+
return legalizeWorkGroupId(
7739+
MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
7740+
AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y,
7741+
AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
75857742
case Intrinsic::amdgcn_workgroup_id_z:
7586-
return legalizePreloadedArgIntrin(MI, MRI, B,
7743+
return legalizeWorkGroupId(
7744+
MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
7745+
AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z,
7746+
AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
7747+
case Intrinsic::amdgcn_cluster_id_x:
7748+
return ST.hasGFX1250Insts() &&
7749+
legalizePreloadedArgIntrin(MI, MRI, B,
7750+
AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7751+
case Intrinsic::amdgcn_cluster_id_y:
7752+
return ST.hasGFX1250Insts() &&
7753+
legalizePreloadedArgIntrin(MI, MRI, B,
7754+
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7755+
case Intrinsic::amdgcn_cluster_id_z:
7756+
return ST.hasGFX1250Insts() &&
7757+
legalizePreloadedArgIntrin(MI, MRI, B,
75877758
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7759+
case Intrinsic::amdgcn_cluster_workgroup_id_x:
7760+
return ST.hasGFX1250Insts() &&
7761+
legalizePreloadedArgIntrin(
7762+
MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
7763+
case Intrinsic::amdgcn_cluster_workgroup_id_y:
7764+
return ST.hasGFX1250Insts() &&
7765+
legalizePreloadedArgIntrin(
7766+
MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
7767+
case Intrinsic::amdgcn_cluster_workgroup_id_z:
7768+
return ST.hasGFX1250Insts() &&
7769+
legalizePreloadedArgIntrin(
7770+
MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
7771+
case Intrinsic::amdgcn_cluster_workgroup_flat_id:
7772+
return AMDGPU::isGFX1250(ST) &&
7773+
legalizeConstHwRegRead(MI, B, AMDGPU::Hwreg::ID_IB_STS2, 21, 4);
7774+
case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
7775+
return ST.hasGFX1250Insts() &&
7776+
legalizePreloadedArgIntrin(
7777+
MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X);
7778+
case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
7779+
return ST.hasGFX1250Insts() &&
7780+
legalizePreloadedArgIntrin(
7781+
MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y);
7782+
case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
7783+
return ST.hasGFX1250Insts() &&
7784+
legalizePreloadedArgIntrin(
7785+
MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z);
7786+
case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
7787+
return ST.hasGFX1250Insts() &&
7788+
legalizePreloadedArgIntrin(
7789+
MI, MRI, B,
7790+
AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID);
75887791
case Intrinsic::amdgcn_wave_id:
75897792
return legalizeWaveID(MI, B);
75907793
case Intrinsic::amdgcn_lds_kernel_id:

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,11 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
114114
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B,
115115
const ArgDescriptor *Arg,
116116
const TargetRegisterClass *ArgRC, LLT ArgTy) const;
117+
bool legalizeWorkGroupId(
118+
MachineInstr &MI, MachineIRBuilder &B,
119+
AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV,
120+
AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
121+
AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const;
117122
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
118123
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
119124

@@ -218,6 +223,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
218223

219224
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const;
220225
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const;
226+
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B,
227+
AMDGPU::Hwreg::Id HwReg, unsigned LowBit,
228+
unsigned Width) const;
221229

222230
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI,
223231
MachineIRBuilder &B) const;

0 commit comments

Comments
 (0)