[AMDGPU] Extend type support for update_dpp intrinsic #114597
Conversation
This stack of pull requests is managed by Graphite.
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Stanislav Mekhanoshin (rampitec)

Changes: We can split 64-bit DPP as a post-RA pseudo if control values are supported, but cannot handle other types.

Patch is 962.41 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/114597.diff

12 Files Affected:
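Before the diff, a minimal standalone sketch of the split-size decision this patch adds may help. The function name chooseSplitSize and its boolean parameters are illustrative assumptions; the in-tree code computes the same condition inline in legalizeLaneOp and lowerLaneOp via ST.hasDPALU_DPP() and AMDGPU::isLegalDPALU_DPPControl() on the intrinsic's DPP control immediate.

// Sketch only: distills the SplitSize selection added by this patch.
// chooseSplitSize and its parameters are hypothetical; the real code reads
// the dpp_ctrl immediate of the intrinsic and queries the GCN subtarget.
static unsigned chooseSplitSize(unsigned ValSizeInBits, bool IsUpdateDpp,
                                bool HasDPALU_DPP, bool IsLegalDPALUCtrl) {
  // Use 64-bit pieces only for amdgcn.update.dpp when the value size is a
  // multiple of 64, the target supports DPP on 64-bit ALU ops, and the DPP
  // control value is legal for DP ALU DPP; otherwise split into 32-bit pieces.
  if (IsUpdateDpp && (ValSizeInBits % 64 == 0) && HasDPALU_DPP &&
      IsLegalDPALUCtrl)
    return 64;
  return 32;
}

int main() {
  // An i64 update_dpp on a DPALU_DPP target with a legal control stays a
  // single 64-bit piece; the same call on gfx8 splits into two 32-bit pieces.
  unsigned OnDPALUTarget = chooseSplitSize(64, true, true, true);  // 64
  unsigned OnGfx8 = chooseSplitSize(64, true, false, false);       // 32
  return (OnDPALUTarget == 64 && OnGfx8 == 32) ? 0 : 1;
}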
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index d7126132356d2c..410cf98d507d11 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5496,6 +5496,13 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
}
case Intrinsic::amdgcn_mov_dpp8:
return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
+ case Intrinsic::amdgcn_update_dpp:
+ return LaneOp.addUse(Src1)
+ .addImm(MI.getOperand(4).getImm())
+ .addImm(MI.getOperand(5).getImm())
+ .addImm(MI.getOperand(6).getImm())
+ .addImm(MI.getOperand(7).getImm())
+ .getReg(0);
default:
llvm_unreachable("unhandled lane op");
}
@@ -5505,7 +5512,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
Register Src0 = MI.getOperand(2).getReg();
Register Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
- IsSetInactive || IsPermLane16) {
+ IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
Src1 = MI.getOperand(3).getReg();
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
Src2 = MI.getOperand(4).getReg();
@@ -5515,7 +5522,14 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
LLT Ty = MRI.getType(DstReg);
unsigned Size = Ty.getSizeInBits();
- if (Size == 32) {
+ unsigned SplitSize =
+ (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
+ ST.hasDPALU_DPP() &&
+ AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm()))
+ ? 64
+ : 32;
+
+ if (Size == SplitSize) {
// Already legal
return true;
}
@@ -5523,7 +5537,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
if (Size < 32) {
Src0 = B.buildAnyExt(S32, Src0).getReg(0);
- if (IsSetInactive || IsPermLane16)
+ if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
if (IID == Intrinsic::amdgcn_writelane)
@@ -5535,31 +5549,28 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
return true;
}
- if (Size % 32 != 0)
+ if (Size % SplitSize != 0)
return false;
- LLT PartialResTy = S32;
+ LLT PartialResTy = LLT::scalar(SplitSize);
if (Ty.isVector()) {
LLT EltTy = Ty.getElementType();
- switch (EltTy.getSizeInBits()) {
- case 16:
- PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
- break;
- case 32:
+ unsigned EltSize = EltTy.getSizeInBits();
+ if (EltSize == SplitSize)
PartialResTy = EltTy;
- break;
- default:
- // Handle all other cases via S32 pieces;
- break;
- }
+ else if (EltSize == 16 || EltSize == 32)
+ PartialResTy =
+ Ty.changeElementCount(ElementCount::getFixed(SplitSize / EltSize));
+ // Handle all other cases via S32/S64 pieces;
}
- SmallVector<Register, 2> PartialRes;
- unsigned NumParts = Size / 32;
+ SmallVector<Register, 4> PartialRes;
+ unsigned NumParts = Size / SplitSize;
MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
MachineInstrBuilder Src1Parts, Src2Parts;
- if (IsSetInactive || IsPermLane16)
+ if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
Src1Parts = B.buildUnmerge(PartialResTy, Src1);
if (IID == Intrinsic::amdgcn_writelane)
@@ -5568,7 +5579,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
for (unsigned i = 0; i < NumParts; ++i) {
Src0 = Src0Parts.getReg(i);
- if (IsSetInactive || IsPermLane16)
+ if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
Src1 = Src1Parts.getReg(i);
if (IID == Intrinsic::amdgcn_writelane)
@@ -7532,6 +7543,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
case Intrinsic::amdgcn_mov_dpp8:
+ case Intrinsic::amdgcn_update_dpp:
return legalizeLaneOp(Helper, MI, IntrID);
case Intrinsic::amdgcn_s_buffer_prefetch_data:
return legalizeSBufferPrefetch(Helper, MI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d66610ae0a160d..e72a53ee9e81e6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6166,6 +6166,14 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
IID == Intrinsic::amdgcn_set_inactive_chain_arg;
SDLoc SL(N);
MVT IntVT = MVT::getIntegerVT(ValSize);
+ const GCNSubtarget &ST =
+ DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
+ unsigned SplitSize =
+ (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
+ ST.hasDPALU_DPP() &&
+ AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
+ ? 64
+ : 32;
auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
SDValue Src2, MVT ValT) -> SDValue {
@@ -6173,6 +6181,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
switch (IID) {
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
+ case Intrinsic::amdgcn_update_dpp:
Operands.push_back(N->getOperand(6));
Operands.push_back(N->getOperand(5));
Operands.push_back(N->getOperand(4));
@@ -6210,13 +6219,15 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
SDValue Src0 = N->getOperand(1);
SDValue Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
- IID == Intrinsic::amdgcn_mov_dpp8 || IsSetInactive || IsPermLane16) {
+ IID == Intrinsic::amdgcn_mov_dpp8 ||
+ IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
Src1 = N->getOperand(2);
- if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
+ if (IID == Intrinsic::amdgcn_writelane ||
+ IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
Src2 = N->getOperand(3);
}
- if (ValSize == 32) {
+ if (ValSize == SplitSize) {
// Already legal
return SDValue();
}
@@ -6226,7 +6237,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
SL, MVT::i32);
- if (IsSetInactive || IsPermLane16) {
+ if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
SL, MVT::i32);
}
@@ -6241,7 +6252,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
}
- if (ValSize % 32 != 0)
+ if (ValSize % SplitSize != 0)
return SDValue();
auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
@@ -6288,21 +6299,26 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
switch (MVT::SimpleValueType EltTy =
VT.getVectorElementType().getSimpleVT().SimpleTy) {
case MVT::i32:
- case MVT::f32: {
- SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
- return unrollLaneOp(LaneOp.getNode());
- }
+ case MVT::f32:
+ if (SplitSize == 32) {
+ SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
+ return unrollLaneOp(LaneOp.getNode());
+ }
+ [[fallthrough]];
case MVT::i16:
case MVT::f16:
case MVT::bf16: {
- MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
+ unsigned SubVecNumElt =
+ SplitSize / VT.getVectorElementType().getSizeInBits();
+ MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
SmallVector<SDValue, 4> Pieces;
SDValue Src0SubVec, Src1SubVec, Src2SubVec;
- for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
+ for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
DAG.getConstant(EltIdx, SL, MVT::i32));
- if (IsSetInactive || IsPermLane16)
+ if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
+ IsPermLane16)
Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
DAG.getConstant(EltIdx, SL, MVT::i32));
@@ -6311,10 +6327,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
DAG.getConstant(EltIdx, SL, MVT::i32));
Pieces.push_back(
- IsSetInactive || IsPermLane16
+ IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
: createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
- EltIdx += 2;
+ EltIdx += SubVecNumElt;
}
return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
}
@@ -6324,10 +6340,11 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
}
}
- MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
+ MVT VecVT =
+ MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
Src0 = DAG.getBitcast(VecVT, Src0);
- if (IsSetInactive || IsPermLane16)
+ if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
Src1 = DAG.getBitcast(VecVT, Src1);
if (IID == Intrinsic::amdgcn_writelane)
@@ -8837,6 +8854,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
case Intrinsic::amdgcn_mov_dpp8:
+ case Intrinsic::amdgcn_update_dpp:
return lowerLaneOp(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index 4b0f2ef77a9834..336a7767ee0622 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -52,11 +52,11 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
@@ -77,10 +77,10 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
; GFX11-LABEL: update_dppi64_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -106,11 +106,11 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
@@ -131,10 +131,10 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
; GFX11-LABEL: update_dppf64_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -160,11 +160,11 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
@@ -185,10 +185,10 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
; GFX11-LABEL: update_dppv2i32_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -214,11 +214,11 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
@@ -239,10 +239,10 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
; GFX11-LABEL: update_dppv2f32_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -268,11 +268,11 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
@@ -293,10 +293,10 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
; GFX11-LABEL: update_dpp_p0_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 495bfec5454ee8..98d2e71f3975c2 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2819,343 +2819,327 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-LABEL: add_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v6, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v6, exec_hi, v6
+; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[4:5]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[4:5]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, 0
+; ...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
  %tmp0 = call bfloat @llvm.amdgcn.update.dpp.bf16(bfloat %old, bfloat %in, i32 337, i32 1, i32 1, i1 0)
  store bfloat %tmp0, ptr addrspace(1) %out
  ret void
}
Test vector of pointers
Added. Some vectors of pointers cannot be handled by GlobalISel even without the split, such as <2 x p3>; it just cannot select. But that is not a problem of this patch. SelectionDAG works on any combination, though.
Do we still need …?
Yes, we need a single MI so there is something that can be combined with other 64-bit VALU operations on targets that support it. We could probably replace it with V_MOV_B64, but gfx90a has FeatureDPALU_DPP and does not have V_MOV_B64.
Oh, I see. That's annoying. 64-bit DPP is only allowed on F64 operations, so there is nothing that we could easily use as a dummy V_MOV_B64-like instruction.
We can split 64-bit DPP as a post-RA pseudo if control values are
supported, but cannot handle other types.
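As a rough, hedged model of what the splitting does (not the in-tree implementation, which builds unmerge/merge operations in GlobalISel and extract/concat nodes in SelectionDAG), the standalone sketch below cuts a wide value into SplitSize-bit pieces, applies a per-piece operation, and collects the results; splitApplyMerge, Words, and the identity LaneOp are hypothetical names used only for illustration.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Model only: the legalizer unmerges a wide value into SplitSize-bit pieces,
// emits one DPP update per piece (e.g. the per-half v_mov_b32_dpp seen in the
// GFX8 tests above), and re-merges the results. LaneOp stands in for that
// per-piece operation.
static std::vector<uint64_t>
splitApplyMerge(const std::vector<uint32_t> &Words, unsigned SplitSize,
                uint64_t (*LaneOp)(uint64_t)) {
  assert((SplitSize == 32 || SplitSize == 64) && "only 32/64-bit pieces");
  unsigned WordsPerPiece = SplitSize / 32;
  assert(Words.size() % WordsPerPiece == 0 && "value must split evenly");
  std::vector<uint64_t> Pieces;
  for (size_t I = 0; I < Words.size(); I += WordsPerPiece) {
    uint64_t Piece = Words[I];
    if (WordsPerPiece == 2)
      Piece |= uint64_t(Words[I + 1]) << 32; // combine two 32-bit halves
    Pieces.push_back(LaneOp(Piece));
  }
  return Pieces;
}

int main() {
  // A 128-bit value (e.g. v2i64): two 64-bit pieces with a legal DP ALU DPP
  // control on a DPALU_DPP target, four 32-bit pieces otherwise.
  std::vector<uint32_t> Words = {0x1, 0x2, 0x3, 0x4};
  uint64_t (*Identity)(uint64_t) = [](uint64_t V) { return V; };
  std::printf("64-bit split: %zu pieces\n",
              splitApplyMerge(Words, 64, Identity).size());
  std::printf("32-bit split: %zu pieces\n",
              splitApplyMerge(Words, 32, Identity).size());
  return 0;
}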