@@ -4452,6 +4452,74 @@ void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
   }
 }
 
+bool AMDGPULegalizerInfo::legalizeWorkGroupId(
+    MachineInstr &MI, MachineIRBuilder &B,
+    AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
+    AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
+    AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
+  Register DstReg = MI.getOperand(0).getReg();
+  if (!ST.hasClusters()) {
+    if (!loadInputValue(DstReg, B, WorkGroupIdPV))
+      return false;
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // Clusters are supported. Return the global position in the grid. If
+  // clusters are enabled, WorkGroupIdPV returns the cluster ID, not the
+  // workgroup ID.
+
+  // WorkGroupIdXYZ = ClusterId == 0 ?
+  //                  ClusterIdXYZ :
+  //                  ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
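+  // For example, with 4 workgroups per cluster along a dimension
+  // (ClusterMaxId = 3), cluster 2, cluster-local workgroup 1 maps to global
+  // workgroup 2 * 4 + 1 = 9.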
+  MachineRegisterInfo &MRI = *B.getMRI();
+  const LLT S32 = LLT::scalar(32);
+  Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
+  Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
+  Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
+  if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
+      !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
+      !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
+    return false;
+
+  auto One = B.buildConstant(S32, 1);
+  auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
+  auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
+                                B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
+
+  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+
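+  // Fixed or variable cluster dims always use the combined global ID; with no
+  // clusters the preloaded value already is the workgroup ID; otherwise the
+  // decision is made at run time below.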
+  switch (MFI->getClusterDims().getKind()) {
+  case AMDGPU::ClusterDimsAttr::Kind::FixedDims:
+  case AMDGPU::ClusterDimsAttr::Kind::VariableDims: {
+    B.buildCopy(DstReg, GlobalIdXYZ);
+    MI.eraseFromParent();
+    return true;
+  }
+  case AMDGPU::ClusterDimsAttr::Kind::NoCluster: {
+    B.buildCopy(DstReg, ClusterIdXYZ);
+    MI.eraseFromParent();
+    return true;
+  }
+  case AMDGPU::ClusterDimsAttr::Kind::Unknown: {
+    using namespace AMDGPU::Hwreg;
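+    // Cluster usage is not known at compile time: read the IB_STS2 field and
+    // fall back to the plain preloaded ID when it reads zero (no clusters).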
+    unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
+    Register ClusterId = MRI.createGenericVirtualRegister(S32);
+    MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
+    B.buildInstr(AMDGPU::S_GETREG_B32_const)
+        .addDef(ClusterId)
+        .addImm(ClusterIdField);
+    auto Zero = B.buildConstant(S32, 0);
+    auto NoClusters =
+        B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
+    B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
+    MI.eraseFromParent();
+    return true;
+  }
+  }
+
+  llvm_unreachable("nothing should reach here");
+}
+
 bool AMDGPULegalizerInfo::loadInputValue(
     Register DstReg, MachineIRBuilder &B,
     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
@@ -4471,8 +4539,31 @@ bool AMDGPULegalizerInfo::loadInputValue(
       AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
   const ArgDescriptor WorkGroupIDZ =
       ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
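+  // TTMP6 packs the cluster workgroup IDs and max IDs as 4-bit fields:
+  // [3:0] = ID X, [7:4] = ID Y, [11:8] = ID Z, [15:12] = max ID X,
+  // [19:16] = max ID Y, [23:20] = max ID Z, [27:24] = max flat ID.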
+  const ArgDescriptor ClusterWorkGroupIDX =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
+  const ArgDescriptor ClusterWorkGroupIDY =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
+  const ArgDescriptor ClusterWorkGroupIDZ =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
+  const ArgDescriptor ClusterWorkGroupMaxIDX =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
+  const ArgDescriptor ClusterWorkGroupMaxIDY =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
+  const ArgDescriptor ClusterWorkGroupMaxIDZ =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
+  const ArgDescriptor ClusterWorkGroupMaxFlatID =
+      ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
+
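+  // Materialize values that are known at compile time instead of reading an
+  // argument register.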
+  auto LoadConstant = [&](unsigned N) {
+    B.buildConstant(DstReg, N);
+    return true;
+  };
+
   if (ST.hasArchitectedSGPRs() &&
       (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
+    AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
+    bool HasFixedDims = ClusterDims.isFixedDims();
+
     switch (ArgType) {
     case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
       Arg = &WorkGroupIDX;
@@ -4489,6 +4580,53 @@ bool AMDGPULegalizerInfo::loadInputValue(
       ArgRC = &AMDGPU::SReg_32RegClass;
       ArgTy = LLT::scalar(32);
       break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
+      if (HasFixedDims && ClusterDims.getDims()[0] == 1)
+        return LoadConstant(0);
+      Arg = &ClusterWorkGroupIDX;
+      ArgRC = &AMDGPU::SReg_32RegClass;
+      ArgTy = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
+      if (HasFixedDims && ClusterDims.getDims()[1] == 1)
+        return LoadConstant(0);
+      Arg = &ClusterWorkGroupIDY;
+      ArgRC = &AMDGPU::SReg_32RegClass;
+      ArgTy = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
+      if (HasFixedDims && ClusterDims.getDims()[2] == 1)
+        return LoadConstant(0);
+      Arg = &ClusterWorkGroupIDZ;
+      ArgRC = &AMDGPU::SReg_32RegClass;
+      ArgTy = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
+      if (HasFixedDims)
+        return LoadConstant(ClusterDims.getDims()[0] - 1);
+      Arg = &ClusterWorkGroupMaxIDX;
+      ArgRC = &AMDGPU::SReg_32RegClass;
+      ArgTy = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
+      if (HasFixedDims)
+        return LoadConstant(ClusterDims.getDims()[1] - 1);
+      Arg = &ClusterWorkGroupMaxIDY;
+      ArgRC = &AMDGPU::SReg_32RegClass;
+      ArgTy = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
+      if (HasFixedDims)
+        return LoadConstant(ClusterDims.getDims()[2] - 1);
+      Arg = &ClusterWorkGroupMaxIDZ;
+      ArgRC = &AMDGPU::SReg_32RegClass;
+      ArgTy = LLT::scalar(32);
+      break;
+    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
+      Arg = &ClusterWorkGroupMaxFlatID;
+      ArgRC = &AMDGPU::SReg_32RegClass;
+      ArgTy = LLT::scalar(32);
+      break;
     default:
       break;
     }
@@ -4499,10 +4637,9 @@ bool AMDGPULegalizerInfo::loadInputValue(
 
   if (!Arg) {
     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
-      // The intrinsic may appear when we have a 0 sized kernarg segment, in which
-      // case the pointer argument may be missing and we use null.
-      B.buildConstant(DstReg, 0);
-      return true;
+      // The intrinsic may appear when we have a 0 sized kernarg segment, in
+      // which case the pointer argument may be missing and we use null.
+      return LoadConstant(0);
     }
 
     // It's undefined behavior if a function marked with the amdgpu-no-*
@@ -7415,6 +7552,22 @@ bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeConstHwRegRead(MachineInstr &MI,
+                                                 MachineIRBuilder &B,
+                                                 AMDGPU::Hwreg::Id HwReg,
+                                                 unsigned LowBit,
+                                                 unsigned Width) const {
+  MachineRegisterInfo &MRI = *B.getMRI();
+  Register DstReg = MI.getOperand(0).getReg();
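+  // S_GETREG_B32 defines an SGPR, so give the result a register class if it
+  // does not already have one.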
+  if (!MRI.getRegClassOrNull(DstReg))
+    MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
+  B.buildInstr(AMDGPU::S_GETREG_B32_const)
+      .addDef(DstReg)
+      .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
+  MI.eraseFromParent();
+  return true;
+}
+
 static constexpr unsigned FPEnvModeBitField =
     AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
 
@@ -7577,14 +7730,64 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
   case Intrinsic::amdgcn_workgroup_id_x:
-    return legalizePreloadedArgIntrin(MI, MRI, B,
-                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
+    return legalizeWorkGroupId(
+        MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
+        AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X,
+        AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
   case Intrinsic::amdgcn_workgroup_id_y:
-    return legalizePreloadedArgIntrin(MI, MRI, B,
-                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
+    return legalizeWorkGroupId(
+        MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
+        AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y,
+        AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
   case Intrinsic::amdgcn_workgroup_id_z:
-    return legalizePreloadedArgIntrin(MI, MRI, B,
+    return legalizeWorkGroupId(
+        MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
+        AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z,
+        AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
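+  // The cluster intrinsics are only lowered on targets with GFX1250
+  // instructions; on other targets legalization of these intrinsics fails.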
+  case Intrinsic::amdgcn_cluster_id_x:
+    return ST.hasGFX1250Insts() &&
+           legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
+  case Intrinsic::amdgcn_cluster_id_y:
+    return ST.hasGFX1250Insts() &&
+           legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
+  case Intrinsic::amdgcn_cluster_id_z:
+    return ST.hasGFX1250Insts() &&
+           legalizePreloadedArgIntrin(MI, MRI, B,
                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+  case Intrinsic::amdgcn_cluster_workgroup_id_x:
+    return ST.hasGFX1250Insts() &&
+           legalizePreloadedArgIntrin(
+               MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
+  case Intrinsic::amdgcn_cluster_workgroup_id_y:
+    return ST.hasGFX1250Insts() &&
+           legalizePreloadedArgIntrin(
+               MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
+  case Intrinsic::amdgcn_cluster_workgroup_id_z:
+    return ST.hasGFX1250Insts() &&
+           legalizePreloadedArgIntrin(
+               MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
+  case Intrinsic::amdgcn_cluster_workgroup_flat_id:
+    return AMDGPU::isGFX1250(ST) &&
+           legalizeConstHwRegRead(MI, B, AMDGPU::Hwreg::ID_IB_STS2, 21, 4);
+  case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
+    return ST.hasGFX1250Insts() &&
+           legalizePreloadedArgIntrin(
+               MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X);
+  case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
+    return ST.hasGFX1250Insts() &&
+           legalizePreloadedArgIntrin(
+               MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y);
+  case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
+    return ST.hasGFX1250Insts() &&
+           legalizePreloadedArgIntrin(
+               MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z);
+  case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
+    return ST.hasGFX1250Insts() &&
+           legalizePreloadedArgIntrin(
+               MI, MRI, B,
+               AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID);
   case Intrinsic::amdgcn_wave_id:
     return legalizeWaveID(MI, B);
   case Intrinsic::amdgcn_lds_kernel_id: