Skip to content

Commit 826ad1a

Browse files
committed
[AMDGPU] Extend type support for update_dpp intrinsic
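Previously, a 64-bit update_dpp could only be split as a post-RA pseudo when the DPP control value was supported, and other types could not be handled at all. This change routes update_dpp through the common lane-op legalization/lowering so that wider and vector types are split into 32-bit pieces, or into 64-bit pieces when the subtarget has DP ALU DPP and the control value is legal for it.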
1 parent 2bd21b2 commit 826ad1a

12 files changed

+6163
-6105
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 30 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5495,6 +5495,13 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
54955495
}
54965496
case Intrinsic::amdgcn_mov_dpp8:
54975497
return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
5498+
case Intrinsic::amdgcn_update_dpp:
5499+
return LaneOp.addUse(Src1)
5500+
.addImm(MI.getOperand(4).getImm())
5501+
.addImm(MI.getOperand(5).getImm())
5502+
.addImm(MI.getOperand(6).getImm())
5503+
.addImm(MI.getOperand(7).getImm())
5504+
.getReg(0);
54985505
default:
54995506
llvm_unreachable("unhandled lane op");
55005507
}
@@ -5504,7 +5511,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
55045511
Register Src0 = MI.getOperand(2).getReg();
55055512
Register Src1, Src2;
55065513
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5507-
IsSetInactive || IsPermLane16) {
5514+
IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
55085515
Src1 = MI.getOperand(3).getReg();
55095516
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
55105517
Src2 = MI.getOperand(4).getReg();
@@ -5514,15 +5521,22 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
55145521
LLT Ty = MRI.getType(DstReg);
55155522
unsigned Size = Ty.getSizeInBits();
55165523

5517-
if (Size == 32) {
5524+
unsigned SplitSize =
5525+
(IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
5526+
ST.hasDPALU_DPP() &&
5527+
AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm()))
5528+
? 64
5529+
: 32;
5530+
5531+
if (Size == SplitSize) {
55185532
// Already legal
55195533
return true;
55205534
}
55215535

55225536
if (Size < 32) {
55235537
Src0 = B.buildAnyExt(S32, Src0).getReg(0);
55245538

5525-
if (IsSetInactive || IsPermLane16)
5539+
if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
55265540
Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
55275541

55285542
if (IID == Intrinsic::amdgcn_writelane)
@@ -5534,31 +5548,27 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
55345548
return true;
55355549
}
55365550

5537-
if (Size % 32 != 0)
5551+
if (Size % SplitSize != 0)
55385552
return false;
55395553

5540-
LLT PartialResTy = S32;
5554+
LLT PartialResTy = LLT::scalar(SplitSize);
55415555
if (Ty.isVector()) {
55425556
LLT EltTy = Ty.getElementType();
5543-
switch (EltTy.getSizeInBits()) {
5544-
case 16:
5545-
PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
5546-
break;
5547-
case 32:
5557+
unsigned EltSize = EltTy.getSizeInBits();
5558+
if (EltSize == SplitSize)
55485559
PartialResTy = EltTy;
5549-
break;
5550-
default:
5551-
// Handle all other cases via S32 pieces;
5552-
break;
5553-
}
5560+
else if (EltSize == 16 || EltSize == 32)
5561+
PartialResTy =
5562+
Ty.changeElementCount(ElementCount::getFixed(SplitSize / EltSize));
5563+
// Handle all other cases via S32/S64 pieces;
55545564
}
55555565

5556-
SmallVector<Register, 2> PartialRes;
5557-
unsigned NumParts = Size / 32;
5566+
SmallVector<Register, 4> PartialRes;
5567+
unsigned NumParts = Size / SplitSize;
55585568
MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
55595569
MachineInstrBuilder Src1Parts, Src2Parts;
55605570

5561-
if (IsSetInactive || IsPermLane16)
5571+
if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
55625572
Src1Parts = B.buildUnmerge(PartialResTy, Src1);
55635573

55645574
if (IID == Intrinsic::amdgcn_writelane)
@@ -5567,7 +5577,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
55675577
for (unsigned i = 0; i < NumParts; ++i) {
55685578
Src0 = Src0Parts.getReg(i);
55695579

5570-
if (IsSetInactive || IsPermLane16)
5580+
if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
55715581
Src1 = Src1Parts.getReg(i);
55725582

55735583
if (IID == Intrinsic::amdgcn_writelane)
@@ -7555,6 +7565,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
75557565
case Intrinsic::amdgcn_set_inactive:
75567566
case Intrinsic::amdgcn_set_inactive_chain_arg:
75577567
case Intrinsic::amdgcn_mov_dpp8:
7568+
case Intrinsic::amdgcn_update_dpp:
75587569
return legalizeLaneOp(Helper, MI, IntrID);
75597570
case Intrinsic::amdgcn_s_buffer_prefetch_data:
75607571
return legalizeSBufferPrefetch(Helper, MI);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6166,13 +6166,22 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
61666166
IID == Intrinsic::amdgcn_set_inactive_chain_arg;
61676167
SDLoc SL(N);
61686168
MVT IntVT = MVT::getIntegerVT(ValSize);
6169+
const GCNSubtarget &ST =
6170+
DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
6171+
unsigned SplitSize =
6172+
(IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6173+
ST.hasDPALU_DPP() &&
6174+
AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
6175+
? 64
6176+
: 32;
61696177

61706178
auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
61716179
SDValue Src2, MVT ValT) -> SDValue {
61726180
SmallVector<SDValue, 8> Operands;
61736181
switch (IID) {
61746182
case Intrinsic::amdgcn_permlane16:
61756183
case Intrinsic::amdgcn_permlanex16:
6184+
case Intrinsic::amdgcn_update_dpp:
61766185
Operands.push_back(N->getOperand(6));
61776186
Operands.push_back(N->getOperand(5));
61786187
Operands.push_back(N->getOperand(4));
@@ -6210,13 +6219,15 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
62106219
SDValue Src0 = N->getOperand(1);
62116220
SDValue Src1, Src2;
62126221
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6213-
IID == Intrinsic::amdgcn_mov_dpp8 || IsSetInactive || IsPermLane16) {
6222+
IID == Intrinsic::amdgcn_mov_dpp8 ||
6223+
IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
62146224
Src1 = N->getOperand(2);
6215-
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
6225+
if (IID == Intrinsic::amdgcn_writelane ||
6226+
IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
62166227
Src2 = N->getOperand(3);
62176228
}
62186229

6219-
if (ValSize == 32) {
6230+
if (ValSize == SplitSize) {
62206231
// Already legal
62216232
return SDValue();
62226233
}
@@ -6226,7 +6237,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
62266237
Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
62276238
SL, MVT::i32);
62286239

6229-
if (IsSetInactive || IsPermLane16) {
6240+
if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
62306241
Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
62316242
SL, MVT::i32);
62326243
}
@@ -6241,7 +6252,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
62416252
return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
62426253
}
62436254

6244-
if (ValSize % 32 != 0)
6255+
if (ValSize % SplitSize != 0)
62456256
return SDValue();
62466257

62476258
auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
@@ -6288,21 +6299,26 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
62886299
switch (MVT::SimpleValueType EltTy =
62896300
VT.getVectorElementType().getSimpleVT().SimpleTy) {
62906301
case MVT::i32:
6291-
case MVT::f32: {
6292-
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6293-
return unrollLaneOp(LaneOp.getNode());
6294-
}
6302+
case MVT::f32:
6303+
if (SplitSize == 32) {
6304+
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6305+
return unrollLaneOp(LaneOp.getNode());
6306+
}
6307+
[[fallthrough]];
62956308
case MVT::i16:
62966309
case MVT::f16:
62976310
case MVT::bf16: {
6298-
MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
6311+
unsigned SubVecNumElt =
6312+
SplitSize / VT.getVectorElementType().getSizeInBits();
6313+
MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
62996314
SmallVector<SDValue, 4> Pieces;
63006315
SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6301-
for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
6316+
for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
63026317
Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
63036318
DAG.getConstant(EltIdx, SL, MVT::i32));
63046319

6305-
if (IsSetInactive || IsPermLane16)
6320+
if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6321+
IsPermLane16)
63066322
Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
63076323
DAG.getConstant(EltIdx, SL, MVT::i32));
63086324

@@ -6311,10 +6327,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
63116327
DAG.getConstant(EltIdx, SL, MVT::i32));
63126328

63136329
Pieces.push_back(
6314-
IsSetInactive || IsPermLane16
6330+
IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
63156331
? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
63166332
: createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6317-
EltIdx += 2;
6333+
EltIdx += SubVecNumElt;
63186334
}
63196335
return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
63206336
}
@@ -6324,10 +6340,11 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
63246340
}
63256341
}
63266342

6327-
MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
6343+
MVT VecVT =
6344+
MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
63286345
Src0 = DAG.getBitcast(VecVT, Src0);
63296346

6330-
if (IsSetInactive || IsPermLane16)
6347+
if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
63316348
Src1 = DAG.getBitcast(VecVT, Src1);
63326349

63336350
if (IID == Intrinsic::amdgcn_writelane)
@@ -8837,6 +8854,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
88378854
case Intrinsic::amdgcn_set_inactive:
88388855
case Intrinsic::amdgcn_set_inactive_chain_arg:
88398856
case Intrinsic::amdgcn_mov_dpp8:
8857+
case Intrinsic::amdgcn_update_dpp:
88408858
return lowerLaneOp(*this, Op.getNode(), DAG);
88418859
default:
88428860
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,11 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
5252
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
5353
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
5454
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
55-
; GFX8-NEXT: v_mov_b32_e32 v5, s3
5655
; GFX8-NEXT: v_mov_b32_e32 v4, s2
56+
; GFX8-NEXT: v_mov_b32_e32 v5, s3
5757
; GFX8-NEXT: s_waitcnt vmcnt(0)
58-
; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
5958
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
59+
; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
6060
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
6161
; GFX8-NEXT: s_endpgm
6262
;
@@ -77,10 +77,10 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
7777
; GFX11-LABEL: update_dppi64_test:
7878
; GFX11: ; %bb.0:
7979
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
80-
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
8180
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
82-
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
81+
; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
8382
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
83+
; GFX11-NEXT: v_mov_b32_e32 v2, s2
8484
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
8585
; GFX11-NEXT: s_waitcnt vmcnt(0)
8686
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -106,11 +106,11 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
106106
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
107107
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
108108
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
109-
; GFX8-NEXT: v_mov_b32_e32 v5, s3
110109
; GFX8-NEXT: v_mov_b32_e32 v4, s2
110+
; GFX8-NEXT: v_mov_b32_e32 v5, s3
111111
; GFX8-NEXT: s_waitcnt vmcnt(0)
112-
; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
113112
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
113+
; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
114114
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
115115
; GFX8-NEXT: s_endpgm
116116
;
@@ -131,10 +131,10 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
131131
; GFX11-LABEL: update_dppf64_test:
132132
; GFX11: ; %bb.0:
133133
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
134-
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
135134
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
136-
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
135+
; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
137136
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
137+
; GFX11-NEXT: v_mov_b32_e32 v2, s2
138138
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
139139
; GFX11-NEXT: s_waitcnt vmcnt(0)
140140
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -160,11 +160,11 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
160160
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
161161
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
162162
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
163-
; GFX8-NEXT: v_mov_b32_e32 v5, s3
164163
; GFX8-NEXT: v_mov_b32_e32 v4, s2
164+
; GFX8-NEXT: v_mov_b32_e32 v5, s3
165165
; GFX8-NEXT: s_waitcnt vmcnt(0)
166-
; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
167166
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
167+
; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
168168
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
169169
; GFX8-NEXT: s_endpgm
170170
;
@@ -185,10 +185,10 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
185185
; GFX11-LABEL: update_dppv2i32_test:
186186
; GFX11: ; %bb.0:
187187
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
188-
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
189188
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
190-
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
189+
; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
191190
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
191+
; GFX11-NEXT: v_mov_b32_e32 v2, s2
192192
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
193193
; GFX11-NEXT: s_waitcnt vmcnt(0)
194194
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -214,11 +214,11 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
214214
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
215215
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
216216
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
217-
; GFX8-NEXT: v_mov_b32_e32 v5, s3
218217
; GFX8-NEXT: v_mov_b32_e32 v4, s2
218+
; GFX8-NEXT: v_mov_b32_e32 v5, s3
219219
; GFX8-NEXT: s_waitcnt vmcnt(0)
220-
; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
221220
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
221+
; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
222222
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
223223
; GFX8-NEXT: s_endpgm
224224
;
@@ -239,10 +239,10 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
239239
; GFX11-LABEL: update_dppv2f32_test:
240240
; GFX11: ; %bb.0:
241241
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
242-
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
243242
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
244-
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
243+
; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
245244
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
245+
; GFX11-NEXT: v_mov_b32_e32 v2, s2
246246
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
247247
; GFX11-NEXT: s_waitcnt vmcnt(0)
248248
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -268,11 +268,11 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
268268
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
269269
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
270270
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
271-
; GFX8-NEXT: v_mov_b32_e32 v5, s3
272271
; GFX8-NEXT: v_mov_b32_e32 v4, s2
272+
; GFX8-NEXT: v_mov_b32_e32 v5, s3
273273
; GFX8-NEXT: s_waitcnt vmcnt(0)
274-
; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
275274
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
275+
; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
276276
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
277277
; GFX8-NEXT: s_endpgm
278278
;
@@ -293,10 +293,10 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
293293
; GFX11-LABEL: update_dpp_p0_test:
294294
; GFX11: ; %bb.0:
295295
; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
296-
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
297296
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
298-
; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
297+
; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_and_b32 v0, 0x3ff, v0
299298
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
299+
; GFX11-NEXT: v_mov_b32_e32 v2, s2
300300
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
301301
; GFX11-NEXT: s_waitcnt vmcnt(0)
302302
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1

0 commit comments

Comments
 (0)