Skip to content

Commit a96acb5

Browse files
committed
[AMDGPU] Control Flow lowering: add S_CMOV_b32/64_term and S_CSELECT_B32/64_term pseudo instructions
1 parent 1805a17 commit a96acb5

File tree

128 files changed

+14041
-15175
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

128 files changed

+14041
-15175
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1553,7 +1553,8 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
15531553
return true;
15541554
}
15551555

1556-
bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1556+
bool AMDGPUInstructionSelector::selectWaveReconvergeIntrinsic(
1557+
MachineInstr &MI) const {
15571558
// FIXME: Manually selecting to avoid dealing with the SReg_1 trick
15581559
// SelectionDAG uses for wave32 vs wave64.
15591560
MachineBasicBlock *BB = MI.getParent();
@@ -2084,7 +2085,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
20842085
unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
20852086
switch (IntrinsicID) {
20862087
case Intrinsic::amdgcn_wave_reconverge:
2087-
return selectEndCfIntrinsic(I);
2088+
return selectWaveReconvergeIntrinsic(I);
20882089
case Intrinsic::amdgcn_ds_ordered_add:
20892090
case Intrinsic::amdgcn_ds_ordered_swap:
20902091
return selectDSOrderedIntrinsic(I, IntrinsicID);

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
119119
bool selectReturnAddress(MachineInstr &I) const;
120120
bool selectG_INTRINSIC(MachineInstr &I) const;
121121

122-
bool selectEndCfIntrinsic(MachineInstr &MI) const;
122+
bool selectWaveReconvergeIntrinsic(MachineInstr &MI) const;
123123
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
124124
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
125125
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15754,7 +15754,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
1575415754
Next++;
1575515755
}
1575615756

15757-
assert((Next == MBB.end() || !Next->readsRegister(AMDGPU::SCC)) &&
15757+
assert((Next == MBB.end() || !Next->readsRegister(AMDGPU::SCC, TRI)) &&
1575815758
"Malformed CFG detected!\n");
1575915759

1576015760
if (NeedToMove) {

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2103,12 +2103,36 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
21032103
MI.setDesc(get(AMDGPU::S_MOV_B64));
21042104
break;
21052105

2106+
case AMDGPU::S_CMOV_B64_term:
2107+
// This is only a terminator to get the correct spill code placement during
2108+
// register allocation.
2109+
MI.setDesc(get(AMDGPU::S_CMOV_B64));
2110+
break;
2111+
21062112
case AMDGPU::S_MOV_B32_term:
21072113
// This is only a terminator to get the correct spill code placement during
21082114
// register allocation.
21092115
MI.setDesc(get(AMDGPU::S_MOV_B32));
21102116
break;
21112117

2118+
case AMDGPU::S_CMOV_B32_term:
2119+
// This is only a terminator to get the correct spill code placement during
2120+
// register allocation.
2121+
MI.setDesc(get(AMDGPU::S_CMOV_B32));
2122+
break;
2123+
2124+
case AMDGPU::S_CSELECT_B32_term:
2125+
// This is only a terminator to get the correct spill code placement during
2126+
// register allocation.
2127+
MI.setDesc(get(AMDGPU::S_CSELECT_B32));
2128+
break;
2129+
2130+
case AMDGPU::S_CSELECT_B64_term:
2131+
// This is only a terminator to get the correct spill code placement during
2132+
// register allocation.
2133+
MI.setDesc(get(AMDGPU::S_CSELECT_B64));
2134+
break;
2135+
21122136
case AMDGPU::S_XOR_B64_term:
21132137
// This is only a terminator to get the correct spill code placement during
21142138
// register allocation.
@@ -3088,17 +3112,21 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
30883112
while (I != E && !I->isBranch() && !I->isReturn()) {
30893113
switch (I->getOpcode()) {
30903114
case AMDGPU::S_MOV_B64_term:
3115+
case AMDGPU::S_CMOV_B64_term:
30913116
case AMDGPU::S_XOR_B64_term:
30923117
case AMDGPU::S_OR_B64_term:
30933118
case AMDGPU::S_ANDN2_B64_term:
30943119
case AMDGPU::S_AND_B64_term:
30953120
case AMDGPU::S_AND_SAVEEXEC_B64_term:
3121+
case AMDGPU::S_CSELECT_B64_term:
30963122
case AMDGPU::S_MOV_B32_term:
3123+
case AMDGPU::S_CMOV_B32_term:
30973124
case AMDGPU::S_XOR_B32_term:
30983125
case AMDGPU::S_OR_B32_term:
30993126
case AMDGPU::S_ANDN2_B32_term:
31003127
case AMDGPU::S_AND_B32_term:
31013128
case AMDGPU::S_AND_SAVEEXEC_B32_term:
3129+
case AMDGPU::S_CSELECT_B32_term:
31023130
break;
31033131
case AMDGPU::SI_IF:
31043132
case AMDGPU::SI_ELSE:

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,8 @@ class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
350350

351351
let WaveSizePredicate = isWave64 in {
352352
def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
353+
def S_CMOV_B64_term : WrapTerminatorInst<S_CMOV_B64>;
354+
def S_CSELECT_B64_term : WrapTerminatorInst<S_CSELECT_B64>;
353355
def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
354356
def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
355357
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
@@ -359,6 +361,8 @@ def S_AND_SAVEEXEC_B64_term : WrapTerminatorInst<S_AND_SAVEEXEC_B64>;
359361

360362
let WaveSizePredicate = isWave32 in {
361363
def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
364+
def S_CMOV_B32_term : WrapTerminatorInst<S_CMOV_B32>;
365+
def S_CSELECT_B32_term : WrapTerminatorInst<S_CSELECT_B32>;
362366
def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
363367
def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
364368
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;

llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp

Lines changed: 84 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ class SILowerControlFlow : public MachineFunctionPass {
8282
SmallSet<Register, 8> RecomputeRegs;
8383

8484
const TargetRegisterClass *BoolRC = nullptr;
85-
long unsigned TestMask;
85+
uint64_t TestMask;
8686
unsigned Select;
8787
unsigned CmovOpc;
8888
unsigned AndOpc;
@@ -96,12 +96,14 @@ class SILowerControlFlow : public MachineFunctionPass {
9696
unsigned OrSaveExecOpc;
9797
unsigned Exec;
9898

99+
bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End);
100+
99101
void emitIf(MachineInstr &MI);
100102
void emitElse(MachineInstr &MI);
101103
void emitIfBreak(MachineInstr &MI);
102104
void emitLoop(MachineInstr &MI);
103105
void emitWaveDiverge(MachineInstr &MI, Register EnabledLanesMask,
104-
Register DisableLanesMask);
106+
Register DisableLanesMask, bool IsIf);
105107

106108
void emitWaveReconverge(MachineInstr &MI);
107109

@@ -165,6 +167,37 @@ INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
165167

166168
char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
167169

170+
bool SILowerControlFlow::hasKill(const MachineBasicBlock *Begin,
171+
const MachineBasicBlock *End) {
172+
DenseSet<const MachineBasicBlock*> Visited;
173+
SmallVector<MachineBasicBlock *, 4> Worklist(Begin->successors());
174+
175+
while (!Worklist.empty()) {
176+
MachineBasicBlock *MBB = Worklist.pop_back_val();
177+
178+
if (MBB == End || !Visited.insert(MBB).second)
179+
continue;
180+
if (KillBlocks.contains(MBB))
181+
return true;
182+
183+
Worklist.append(MBB->succ_begin(), MBB->succ_end());
184+
}
185+
186+
return false;
187+
}
188+
189+
static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
190+
Register SaveExecReg = MI.getOperand(0).getReg();
191+
auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
192+
193+
if (U == MRI->use_instr_nodbg_end() ||
194+
std::next(U) != MRI->use_instr_nodbg_end() ||
195+
U->getOpcode() != AMDGPU::SI_WAVE_RECONVERGE)
196+
return false;
197+
198+
return true;
199+
}
200+
168201
void SILowerControlFlow::emitIf(MachineInstr &MI) {
169202
MachineBasicBlock &MBB = *MI.getParent();
170203
const DebugLoc &DL = MI.getDebugLoc();
@@ -173,6 +206,9 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
173206
MachineOperand &Cond = MI.getOperand(1);
174207
assert(Cond.getSubReg() == AMDGPU::NoSubRegister);
175208
Register CondReg = Cond.getReg();
209+
MachineInstr *CondRegDef = MRI->getVRegDef(CondReg);
210+
if (CondRegDef && CondRegDef->getParent() == &MBB && TII->isVALU(*CondRegDef))
211+
return emitWaveDiverge(MI, CondReg, MaskElse, true);
176212

177213
Register MaskThen = MRI->createVirtualRegister(BoolRC);
178214
// Get rid of the garbage bits in the Cond register which might be coming from
@@ -184,7 +220,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
184220
if (LV)
185221
LV->replaceKillInstruction(CondReg, MI, *CondFiltered);
186222

187-
emitWaveDiverge(MI, MaskThen, MaskElse);
223+
emitWaveDiverge(MI, MaskThen, MaskElse, true);
188224

189225
if (LIS) {
190226
LIS->InsertMachineInstrInMaps(*CondFiltered);
@@ -195,7 +231,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
195231
void SILowerControlFlow::emitElse(MachineInstr &MI) {
196232
Register InvCondReg = MI.getOperand(0).getReg();
197233
Register CondReg = MI.getOperand(1).getReg();
198-
emitWaveDiverge(MI, CondReg, InvCondReg);
234+
emitWaveDiverge(MI, CondReg, InvCondReg, false);
199235
}
200236

201237
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
@@ -258,24 +294,19 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
258294

259295
Register Cond = MI.getOperand(0).getReg();
260296
Register MaskLoop = MRI->createVirtualRegister(BoolRC);
261-
Register MaskExit = MRI->createVirtualRegister(BoolRC);
262297
Register AndZero = MRI->createVirtualRegister(BoolRC);
263298

264299
MachineInstr *CondLoop = BuildMI(MBB, &MI, DL, TII->get(Andn2Opc), MaskLoop)
265300
.addReg(Exec)
266301
.addReg(Cond);
267302

268-
MachineInstr *ExitExec = BuildMI(MBB, &MI, DL, TII->get(OrOpc), MaskExit)
269-
.addReg(Cond)
270-
.addReg(Exec);
271-
272303
MachineInstr *IfZeroMask = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndZero)
273304
.addReg(MaskLoop)
274305
.addImm(TestMask);
275306

276307
MachineInstr *SetExec= BuildMI(MBB, &MI, DL, TII->get(Select), Exec)
277308
.addReg(MaskLoop)
278-
.addReg(MaskExit);
309+
.addReg(Cond);
279310

280311
if (LV)
281312
LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *SetExec);
@@ -290,10 +321,8 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
290321
LIS->ReplaceMachineInstrInMaps(MI, *SetExec);
291322
LIS->InsertMachineInstrInMaps(*CondLoop);
292323
LIS->InsertMachineInstrInMaps(*IfZeroMask);
293-
LIS->InsertMachineInstrInMaps(*ExitExec);
294324
LIS->InsertMachineInstrInMaps(*Branch);
295325
LIS->createAndComputeVirtRegInterval(MaskLoop);
296-
LIS->createAndComputeVirtRegInterval(MaskExit);
297326
LIS->createAndComputeVirtRegInterval(AndZero);
298327
}
299328

@@ -302,20 +331,49 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
302331

303332
void SILowerControlFlow::emitWaveDiverge(MachineInstr &MI,
304333
Register EnabledLanesMask,
305-
Register DisableLanesMask) {
334+
Register DisableLanesMask, bool IsIf) {
335+
306336
MachineBasicBlock &MBB = *MI.getParent();
307337
const DebugLoc &DL = MI.getDebugLoc();
308338
MachineBasicBlock::iterator I(MI);
309339

310-
MachineInstr *CondInverted =
311-
BuildMI(MBB, I, DL, TII->get(XorOpc), DisableLanesMask)
312-
.addReg(EnabledLanesMask)
313-
.addReg(Exec);
314-
315-
if (LV) {
316-
LV->replaceKillInstruction(DisableLanesMask, MI, *CondInverted);
340+
bool NeedXor = true;
341+
if (IsIf) {
342+
// If there is only one use of save exec register and that use is SI_END_CF,
343+
// we can optimize SI_IF by returning the full saved exec mask instead of
344+
// just cleared bits.
345+
bool SimpleIf = isSimpleIf(MI, MRI);
346+
347+
if (SimpleIf) {
348+
// Check for SI_KILL_*_TERMINATOR on path from if to endif.
349+
// if there is any such terminator simplifications are not safe.
350+
auto UseMI = MRI->use_instr_nodbg_begin(DisableLanesMask);
351+
SimpleIf = !hasKill(MI.getParent(), UseMI->getParent());
352+
}
353+
NeedXor = !SimpleIf;
317354
}
318355

356+
if (NeedXor) {
357+
358+
MachineInstr *CondInverted =
359+
BuildMI(MBB, I, DL, TII->get(XorOpc), DisableLanesMask)
360+
.addReg(EnabledLanesMask)
361+
.addReg(Exec);
362+
363+
if (LV) {
364+
LV->replaceKillInstruction(DisableLanesMask, MI, *CondInverted);
365+
}
366+
367+
if (LIS) {
368+
LIS->InsertMachineInstrInMaps(*CondInverted);
369+
}
370+
} else {
371+
MachineInstr *CopyExec =
372+
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DisableLanesMask)
373+
.addReg(Exec);
374+
if(LIS)
375+
LIS->InsertMachineInstrInMaps(*CopyExec);
376+
}
319377
Register TestResultReg = MRI->createVirtualRegister(BoolRC);
320378
MachineInstr *IfZeroMask =
321379
BuildMI(MBB, I, DL, TII->get(AndOpc), TestResultReg)
@@ -327,7 +385,7 @@ void SILowerControlFlow::emitWaveDiverge(MachineInstr &MI,
327385

328386
MachineBasicBlock *FlowBB = MI.getOperand(2).getMBB();
329387
MachineBasicBlock *TargetBB = nullptr;
330-
// determine target BBs
388+
// determine target BBs
331389
I = skipToUncondBrOrEnd(MBB, I);
332390
if (I != MBB.end()) {
333391
// skipToUncondBrOrEnd returns either unconditional branch or end()
@@ -358,8 +416,7 @@ void SILowerControlFlow::emitWaveDiverge(MachineInstr &MI,
358416
return;
359417
}
360418

361-
LIS->InsertMachineInstrInMaps(*CondInverted);
362-
LIS->InsertMachineInstrInMaps(*IfZeroMask);
419+
LIS->InsertMachineInstrInMaps(*IfZeroMask);
363420
LIS->ReplaceMachineInstrInMaps(MI, *SetExecForSucc);
364421

365422
RecomputeRegs.insert(MI.getOperand(0).getReg());
@@ -607,8 +664,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
607664

608665
if (ST.isWave32()) {
609666
TestMask = 0xffffffff;
610-
Select = AMDGPU::S_CSELECT_B32;
611-
CmovOpc = AMDGPU::S_CMOV_B32;
667+
Select = AMDGPU::S_CSELECT_B32_term;
668+
CmovOpc = AMDGPU::S_CMOV_B32_term;
612669
AndOpc = AMDGPU::S_AND_B32;
613670
Andn2Opc = AMDGPU::S_ANDN2_B32;
614671
OrOpc = AMDGPU::S_OR_B32;
@@ -621,8 +678,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
621678
Exec = AMDGPU::EXEC_LO;
622679
} else {
623680
TestMask = 0xffffffffffffffff;
624-
Select = AMDGPU::S_CSELECT_B64;
625-
CmovOpc = AMDGPU::S_CMOV_B64;
681+
Select = AMDGPU::S_CSELECT_B64_term;
682+
CmovOpc = AMDGPU::S_CMOV_B64_term;
626683
AndOpc = AMDGPU::S_AND_B64;
627684
Andn2Opc = AMDGPU::S_ANDN2_B64;
628685
OrOpc = AMDGPU::S_OR_B64;

llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,9 @@ Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
114114
switch (MI.getOpcode()) {
115115
case AMDGPU::COPY:
116116
case AMDGPU::S_MOV_B64:
117-
case AMDGPU::S_MOV_B32: {
117+
case AMDGPU::S_MOV_B32:
118+
case AMDGPU::S_CMOV_B64:
119+
case AMDGPU::S_CMOV_B32: {
118120
const MachineOperand &Dst = MI.getOperand(0);
119121
if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
120122
return MI.getOperand(1).getReg();

llvm/test/%t

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
warning: <unknown>:0:0: in function func_use_lds_global void (): local memory global used by non-kernel function
2+
3+
warning: <unknown>:0:0: in function func_use_lds_global_constexpr_cast void (): local memory global used by non-kernel function
4+
5+
warning: <unknown>:0:0: in function func_uses_lds_multi void (i1): local memory global used by non-kernel function
6+
7+
warning: <unknown>:0:0: in function func_uses_lds_multi void (i1): local memory global used by non-kernel function
8+
9+
warning: <unknown>:0:0: in function func_uses_lds_multi void (i1): local memory global used by non-kernel function
10+
11+
warning: <unknown>:0:0: in function func_uses_lds_code_after void (ptr addrspace(1)): local memory global used by non-kernel function
12+
13+
warning: <unknown>:0:0: in function func_uses_lds_phi_after i32 (i1, ptr addrspace(1)): local memory global used by non-kernel function
14+

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,8 @@ define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) {
118118
; GFX10-NEXT: s_and_b32 s4, exec_lo, s4
119119
; GFX10-NEXT: s_or_b32 s6, s6, s4
120120
; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5
121-
; GFX10-NEXT: s_or_b32 s7, s5, exec_lo
122-
; GFX10-NEXT: s_and_b32 s8, s4, -1
123-
; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s7
121+
; GFX10-NEXT: s_and_b32 s7, s4, -1
122+
; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5
124123
; GFX10-NEXT: s_cbranch_scc1 .LBB2_1
125124
; GFX10-NEXT: ; %bb.2: ; %exit
126125
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
@@ -166,9 +165,8 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
166165
; GFX10-NEXT: s_and_b32 s7, exec_lo, s4
167166
; GFX10-NEXT: s_or_b32 s6, s6, s7
168167
; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s5
169-
; GFX10-NEXT: s_or_b32 s8, s5, exec_lo
170-
; GFX10-NEXT: s_and_b32 s9, s7, -1
171-
; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s8
168+
; GFX10-NEXT: s_and_b32 s8, s7, -1
169+
; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s5
172170
; GFX10-NEXT: s_cbranch_scc0 .LBB3_6
173171
; GFX10-NEXT: .LBB3_2: ; %loop_start
174172
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1

0 commit comments

Comments (0)