@@ -82,7 +82,7 @@ class SILowerControlFlow : public MachineFunctionPass {
   SmallSet<Register, 8> RecomputeRegs;
 
   const TargetRegisterClass *BoolRC = nullptr;
-  long unsigned TestMask;
+  uint64_t TestMask;
   unsigned Select;
   unsigned CmovOpc;
   unsigned AndOpc;
@@ -96,12 +96,14 @@ class SILowerControlFlow : public MachineFunctionPass {
   unsigned OrSaveExecOpc;
   unsigned Exec;
 
+  bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End);
+
   void emitIf(MachineInstr &MI);
   void emitElse(MachineInstr &MI);
   void emitIfBreak(MachineInstr &MI);
   void emitLoop(MachineInstr &MI);
   void emitWaveDiverge(MachineInstr &MI, Register EnabledLanesMask,
-                       Register DisableLanesMask);
+                       Register DisableLanesMask, bool IsIf);
 
   void emitWaveReconverge(MachineInstr &MI);
 
@@ -165,6 +167,37 @@ INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
 
 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
 
+bool SILowerControlFlow::hasKill(const MachineBasicBlock *Begin,
+                                 const MachineBasicBlock *End) {
+  DenseSet<const MachineBasicBlock *> Visited;
+  SmallVector<MachineBasicBlock *, 4> Worklist(Begin->successors());
+
+  while (!Worklist.empty()) {
+    MachineBasicBlock *MBB = Worklist.pop_back_val();
+
+    if (MBB == End || !Visited.insert(MBB).second)
+      continue;
+    if (KillBlocks.contains(MBB))
+      return true;
+
+    Worklist.append(MBB->succ_begin(), MBB->succ_end());
+  }
+
+  return false;
+}
+
+static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
+  Register SaveExecReg = MI.getOperand(0).getReg();
+  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
+
+  if (U == MRI->use_instr_nodbg_end() ||
+      std::next(U) != MRI->use_instr_nodbg_end() ||
+      U->getOpcode() != AMDGPU::SI_WAVE_RECONVERGE)
+    return false;
+
+  return true;
+}
+
 void SILowerControlFlow::emitIf(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
@@ -173,6 +206,9 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   MachineOperand &Cond = MI.getOperand(1);
   assert(Cond.getSubReg() == AMDGPU::NoSubRegister);
   Register CondReg = Cond.getReg();
+  MachineInstr *CondRegDef = MRI->getVRegDef(CondReg);
+  if (CondRegDef && CondRegDef->getParent() == &MBB && TII->isVALU(*CondRegDef))
+    return emitWaveDiverge(MI, CondReg, MaskElse, true);
 
   Register MaskThen = MRI->createVirtualRegister(BoolRC);
   // Get rid of the garbage bits in the Cond register which might be coming from
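
A minimal standalone sketch (not from the patch) of the lane-mask arithmetic behind emitIf: the S_AND with exec strips condition bits belonging to inactive lanes, and the new early exit skips that filtering when the condition is produced by a VALU compare in the same block, whose result is assumed here to already be zero for inactive lanes. Masks, values, and the file name are illustrative.

// emit_if_mask_sketch.cpp -- illustrative only; models lane masks as 64-bit words.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Exec = 0x00000000ffff0000ULL; // currently active lanes
  uint64_t Cond = 0xdeadbeef00ff0000ULL; // condition bits; high bits are stale
                                         // data from inactive lanes

  // What emitIf builds with the S_AND: keep only condition bits of active lanes.
  uint64_t MaskThen = Cond & Exec;

  // A VALU compare is assumed to write 0 for inactive lanes, so its result
  // already equals Cond & Exec and the extra AND can be skipped.
  uint64_t ValuCmp = Cond & Exec; // stand-in for a V_CMP result
  assert((ValuCmp & ~Exec) == 0 && MaskThen == ValuCmp);

  // The XOR in emitWaveDiverge: lanes that are active but fail the test.
  uint64_t MaskElse = MaskThen ^ Exec;
  assert(MaskElse == (Exec & ~MaskThen));
  return 0;
}
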
@@ -184,7 +220,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   if (LV)
     LV->replaceKillInstruction(CondReg, MI, *CondFiltered);
 
-  emitWaveDiverge(MI, MaskThen, MaskElse);
+  emitWaveDiverge(MI, MaskThen, MaskElse, true);
 
   if (LIS) {
     LIS->InsertMachineInstrInMaps(*CondFiltered);
@@ -195,7 +231,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
 void SILowerControlFlow::emitElse(MachineInstr &MI) {
   Register InvCondReg = MI.getOperand(0).getReg();
   Register CondReg = MI.getOperand(1).getReg();
-  emitWaveDiverge(MI, CondReg, InvCondReg);
+  emitWaveDiverge(MI, CondReg, InvCondReg, false);
 }
 
 void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
@@ -258,24 +294,19 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
 
   Register Cond = MI.getOperand(0).getReg();
   Register MaskLoop = MRI->createVirtualRegister(BoolRC);
-  Register MaskExit = MRI->createVirtualRegister(BoolRC);
   Register AndZero = MRI->createVirtualRegister(BoolRC);
 
   MachineInstr *CondLoop = BuildMI(MBB, &MI, DL, TII->get(Andn2Opc), MaskLoop)
                                .addReg(Exec)
                                .addReg(Cond);
 
-  MachineInstr *ExitExec = BuildMI(MBB, &MI, DL, TII->get(OrOpc), MaskExit)
-                               .addReg(Cond)
-                               .addReg(Exec);
-
   MachineInstr *IfZeroMask = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndZero)
                                  .addReg(MaskLoop)
                                  .addImm(TestMask);
 
   MachineInstr *SetExec = BuildMI(MBB, &MI, DL, TII->get(Select), Exec)
                               .addReg(MaskLoop)
-                              .addReg(MaskExit);
+                              .addReg(Cond);
 
   if (LV)
     LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *SetExec);
@@ -290,10 +321,8 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
     LIS->ReplaceMachineInstrInMaps(MI, *SetExec);
     LIS->InsertMachineInstrInMaps(*CondLoop);
     LIS->InsertMachineInstrInMaps(*IfZeroMask);
-    LIS->InsertMachineInstrInMaps(*ExitExec);
     LIS->InsertMachineInstrInMaps(*Branch);
     LIS->createAndComputeVirtRegInterval(MaskLoop);
-    LIS->createAndComputeVirtRegInterval(MaskExit);
     LIS->createAndComputeVirtRegInterval(AndZero);
   }
 
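
A minimal sketch (not from the patch) of why selecting Cond directly can replace the removed MaskExit = Cond | Exec: the S_CSELECT only picks the exit operand when MaskLoop = Exec & ~Cond is zero, i.e. when every active lane already wants to exit, and in that case Cond | Exec == Cond. Masks below are illustrative.

// loop_exit_sketch.cpp -- illustrative only.
#include <cassert>
#include <cstdint>

int main() {
  // A few {Cond, Exec} pairs where all active lanes want to exit the loop.
  const uint64_t Cases[][2] = {
      {0x00ff00ffULL, 0x000000ffULL},
      {0xffffffffULL, 0x0f0f0f0fULL},
      {0x1ULL, 0x1ULL},
  };
  for (const auto &C : Cases) {
    uint64_t Cond = C[0], Exec = C[1];
    uint64_t MaskLoop = Exec & ~Cond; // lanes that keep looping
    if (MaskLoop == 0) {
      // The select would pick the exit operand here; the old MaskExit and
      // plain Cond agree, so the extra S_OR was redundant.
      assert((Cond | Exec) == Cond);
    }
  }
  return 0;
}
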
@@ -302,20 +331,49 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
 
 void SILowerControlFlow::emitWaveDiverge(MachineInstr &MI,
                                          Register EnabledLanesMask,
-                                         Register DisableLanesMask) {
+                                         Register DisableLanesMask, bool IsIf) {
+
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
   MachineBasicBlock::iterator I(MI);
 
-  MachineInstr *CondInverted =
-      BuildMI(MBB, I, DL, TII->get(XorOpc), DisableLanesMask)
-          .addReg(EnabledLanesMask)
-          .addReg(Exec);
-
-  if (LV) {
-    LV->replaceKillInstruction(DisableLanesMask, MI, *CondInverted);
+  bool NeedXor = true;
+  if (IsIf) {
+    // If there is only one use of the saved exec register and that use is
+    // SI_WAVE_RECONVERGE, we can optimize SI_IF by saving the full exec mask
+    // instead of just the cleared bits.
+    bool SimpleIf = isSimpleIf(MI, MRI);
+
+    if (SimpleIf) {
+      // Check for SI_KILL_*_TERMINATOR on the path from if to endif.
+      // If there is any such terminator, the simplification is not safe.
+      auto UseMI = MRI->use_instr_nodbg_begin(DisableLanesMask);
+      SimpleIf = !hasKill(MI.getParent(), UseMI->getParent());
+    }
+    NeedXor = !SimpleIf;
   }
 
+  if (NeedXor) {
+    MachineInstr *CondInverted =
+        BuildMI(MBB, I, DL, TII->get(XorOpc), DisableLanesMask)
+            .addReg(EnabledLanesMask)
+            .addReg(Exec);
+
+    if (LV) {
+      LV->replaceKillInstruction(DisableLanesMask, MI, *CondInverted);
+    }
+
+    if (LIS) {
+      LIS->InsertMachineInstrInMaps(*CondInverted);
+    }
+  } else {
+    MachineInstr *CopyExec =
+        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DisableLanesMask)
+            .addReg(Exec);
+    if (LIS)
+      LIS->InsertMachineInstrInMaps(*CopyExec);
+  }
   Register TestResultReg = MRI->createVirtualRegister(BoolRC);
   MachineInstr *IfZeroMask =
       BuildMI(MBB, I, DL, TII->get(AndOpc), TestResultReg)
@@ -327,7 +385,7 @@ void SILowerControlFlow::emitWaveDiverge(MachineInstr &MI,
 
   MachineBasicBlock *FlowBB = MI.getOperand(2).getMBB();
   MachineBasicBlock *TargetBB = nullptr;
-  // determine target BBs
+  // Determine the target BBs.
   I = skipToUncondBrOrEnd(MBB, I);
   if (I != MBB.end()) {
     // skipToUncondBrOrEnd returns either unconditional branch or end()
@@ -358,8 +416,7 @@ void SILowerControlFlow::emitWaveDiverge(MachineInstr &MI,
     return;
   }
 
-  LIS->InsertMachineInstrInMaps(*CondInverted);
-  LIS->InsertMachineInstrInMaps(*IfZeroMask);
+  LIS->InsertMachineInstrInMaps(*IfZeroMask);
   LIS->ReplaceMachineInstrInMaps(MI, *SetExecForSucc);
 
   RecomputeRegs.insert(MI.getOperand(0).getReg());
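
A minimal sketch (not from the patch) of the "simple if" shortcut above: when the only user of DisableLanesMask is SI_WAVE_RECONVERGE, the pass saves the whole exec with a COPY instead of the XOR'd complement. The sketch assumes, for illustration only, that the reconverge restores exec by OR-ing the saved mask back in and that the guarded region only clears exec bits; under those assumptions the full copy alone recovers the pre-if exec, while the XOR form recovers it only together with the then-mask.

// reconverge_sketch.cpp -- illustrative only; assumes an OR-style restore.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Exec = 0x00ffff00ULL;     // exec on entry to the if
  uint64_t MaskThen = 0x000fff00ULL; // active lanes taking the then-branch
  assert((MaskThen & ~Exec) == 0);   // MaskThen is a subset of Exec

  // The two things the pass can save into DisableLanesMask:
  uint64_t SavedXor = MaskThen ^ Exec; // general case: cleared bits only
  uint64_t SavedCopy = Exec;           // simple-if case: whole exec

  // Exec at the reconverge point; the region may clear more bits but, by
  // assumption, never sets new ones.
  uint64_t ExecAtReconverge = MaskThen & 0x0000ff00ULL;

  // Assumed OR-style restore.
  uint64_t RestoredXor = ExecAtReconverge | SavedXor;
  uint64_t RestoredCopy = ExecAtReconverge | SavedCopy;
  assert(RestoredCopy == Exec);             // full copy suffices on its own
  assert((RestoredXor | MaskThen) == Exec); // XOR form also needs the then lanes
  return 0;
}
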
@@ -607,8 +664,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
 
   if (ST.isWave32()) {
     TestMask = 0xffffffff;
-    Select = AMDGPU::S_CSELECT_B32;
-    CmovOpc = AMDGPU::S_CMOV_B32;
+    Select = AMDGPU::S_CSELECT_B32_term;
+    CmovOpc = AMDGPU::S_CMOV_B32_term;
     AndOpc = AMDGPU::S_AND_B32;
     Andn2Opc = AMDGPU::S_ANDN2_B32;
     OrOpc = AMDGPU::S_OR_B32;
@@ -621,8 +678,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
     Exec = AMDGPU::EXEC_LO;
   } else {
     TestMask = 0xffffffffffffffff;
-    Select = AMDGPU::S_CSELECT_B64;
-    CmovOpc = AMDGPU::S_CMOV_B64;
+    Select = AMDGPU::S_CSELECT_B64_term;
+    CmovOpc = AMDGPU::S_CMOV_B64_term;
     AndOpc = AMDGPU::S_AND_B64;
     Andn2Opc = AMDGPU::S_ANDN2_B64;
     OrOpc = AMDGPU::S_OR_B64;