@@ -2097,21 +2097,6 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
   }
 }
 
-Register SIInstrInfo::findSetInactiveMask(const MachineInstr &MI) {
-  assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
-         MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64);
-  for (auto &Op : MI.implicit_operands()) {
-    if (Op.isDef())
-      continue;
-    Register OpReg = Op.getReg();
-    if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
-        OpReg == AMDGPU::SCC)
-      continue;
-    return OpReg;
-  }
-  return Register();
-}
-
 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MBB.findDebugLoc(MI);
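
Note: findSetInactiveMask existed only to recover the whole-wave-mode (WWM)
mask from the pseudo's implicit uses, skipping EXEC/EXEC_LO and SCC. After
this change the mask travels as an explicit operand of V_SET_INACTIVE_B32,
so there is nothing left to search for. A minimal sketch of what a former
caller could do instead, assuming the mask sits at operand index 5 as the
new lowering in the next hunk implies (hypothetical helper, not part of
the patch):

    // Hypothetical replacement for findSetInactiveMask(MI): the mask is
    // now an explicit use of the V_SET_INACTIVE_B32 pseudo.
    Register getSetInactiveMask(const MachineInstr &MI) {
      assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32);
      return MI.getOperand(5).getReg();
    }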
@@ -2286,147 +2271,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
     break;
   }
-  case AMDGPU::V_SET_INACTIVE_B32:
-  case AMDGPU::V_SET_INACTIVE_B64: {
-    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
-                           ? AMDGPU::V_MOV_B64_PSEUDO
-                           : AMDGPU::V_MOV_B32_e32;
-    Register ExecReg = RI.getExec();
+  case AMDGPU::V_SET_INACTIVE_B32: {
+    // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
     Register DstReg = MI.getOperand(0).getReg();
-    MachineOperand &ActiveSrc = MI.getOperand(1);
-    MachineOperand &InactiveSrc = MI.getOperand(2);
-
-    // Find implicit register defining lanes active outside WWM.
-    Register ExecSrcReg = findSetInactiveMask(MI);
-    assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region");
-    // Note: default here is set to ExecReg so that functional MIR is still
-    // generated if implicit def is not found and assertions are disabled.
-    if (!ExecSrcReg)
-      ExecSrcReg = ExecReg;
-
-    // Ideally in WWM this operation is lowered to V_CNDMASK; however,
-    // constant bus constraints and the presence of literal constants
-    // present an issue.
-    // Fallback to V_MOV base lowering in all but the common cases.
-    const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
-    MachineFunction *MF = MBB.getParent();
-    MachineRegisterInfo &MRI = MF->getRegInfo();
-    const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
-    const MCInstrDesc &Desc = get(Opcode);
-
-    const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
-    const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
-    const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
-    const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
-    const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
-    const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
-
-    int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
-    int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
-
-    int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
-    int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
-    int ConstantBusUses =
-        1 + // Starts at 1 for ExecSrcReg
-        (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
-        (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
-    int LiteralConstants =
-        ((ActiveSrc.isReg() ||
-          (ActiveSrc.isImm() && isInlineConstant(ActiveImm)))
-             ? 0
-             : 1) +
-        ((InactiveSrc.isReg() ||
-          (InactiveSrc.isImm() && isInlineConstant(InactiveImm)))
-             ? 0
-             : 1);
-
-    bool UseVCndMask =
-        ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
-    if (VMov64 && UseVCndMask) {
-      // Decomposition must not introduce new literals.
-      UseVCndMask &=
-          ActiveSrc.isReg() ||
-          (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
-          (!isInlineConstant(ActiveImm));
-      UseVCndMask &= InactiveSrc.isReg() ||
-                     (isInlineConstant(InactiveImmLo) &&
-                      isInlineConstant(InactiveImmHi)) ||
-                     (!isInlineConstant(InactiveImm));
-    }
-
-    if (UseVCndMask && VMov64) {
-      // Dual V_CNDMASK_B32
-      MachineOperand ActiveLo = buildExtractSubRegOrImm(
-          MI, MRI, ActiveSrc, nullptr, AMDGPU::sub0, nullptr);
-      MachineOperand ActiveHi = buildExtractSubRegOrImm(
-          MI, MRI, ActiveSrc, nullptr, AMDGPU::sub1, nullptr);
-      MachineOperand InactiveLo = buildExtractSubRegOrImm(
-          MI, MRI, InactiveSrc, nullptr, AMDGPU::sub0, nullptr);
-      MachineOperand InactiveHi = buildExtractSubRegOrImm(
-          MI, MRI, InactiveSrc, nullptr, AMDGPU::sub1, nullptr);
-      if (ActiveSrc.isReg())
-        ActiveHi.setIsKill(ActiveSrc.isKill());
-      if (InactiveSrc.isReg())
-        InactiveHi.setIsKill(InactiveSrc.isKill());
-      BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub0))
-          .addImm(0)
-          .add(InactiveLo)
-          .addImm(0)
-          .add(ActiveLo)
-          .addReg(ExecSrcReg)
-          .addReg(DstReg, RegState::ImplicitDefine);
-      BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub1))
-          .addImm(0)
-          .add(InactiveHi)
-          .addImm(0)
-          .add(ActiveHi)
-          .addReg(ExecSrcReg)
-          .addReg(DstReg, RegState::ImplicitDefine);
-    } else if (UseVCndMask) {
-      // Single V_CNDMASK_B32
-      BuildMI(MBB, MI, DL, Desc, DstReg)
-          .addImm(0)
-          .add(InactiveSrc)
-          .addImm(0)
-          .add(ActiveSrc)
-          .addReg(ExecSrcReg);
-    } else {
-      // Fallback V_MOV case.
-      // Avoid unnecessary work if a source VGPR is also the destination.
-      // This can happen if WWM register allocation was efficient.
-      // Note: this assumes WWM execution.
-      bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
-      bool DstIsInactive =
-          InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
-      if (!DstIsInactive) {
-        // Set exec mask to inactive lanes,
-        // but only if active lanes would be overwritten.
-        if (DstIsActive) {
-          BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
-              .addReg(ExecSrcReg)
-              .setOperandDead(3); // Dead scc
-        }
-        // Copy inactive lanes
-        MachineInstr *VMov =
-            BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
-        if (VMov64)
-          expandPostRAPseudo(*VMov);
-      }
-      if (!DstIsActive) {
-        // Set exec mask to active lanes
-        BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
-        // Copy active lanes
-        MachineInstr *VMov =
-            BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
-                .add(ActiveSrc);
-        if (VMov64)
-          expandPostRAPseudo(*VMov);
-      }
-      // Restore WWM
-      BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
-    }
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+        .add(MI.getOperand(3))
+        .add(MI.getOperand(4))
+        .add(MI.getOperand(1))
+        .add(MI.getOperand(2))
+        .add(MI.getOperand(5));
     MI.eraseFromParent();
     break;
   }
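
The replacement lowering maps the pseudo's operands directly onto
V_CNDMASK_B32_e64, whose VOP3 operand order is src0_modifiers, src0,
src1_modifiers, src1, src2: operands 3 and 4 become src0, operands 1 and 2
become src1, and operand 5 supplies the src2 lane mask. Because
V_CNDMASK_B32 selects src1 for lanes whose condition bit is set, lanes that
were active outside WWM keep the pseudo's first source while all other
lanes receive the second (assuming the reworked pseudo keeps the
set.inactive operand order: active value first, then inactive value, then
mask). A sketch of the per-lane effect, with hypothetical names for
illustration only:

    // Per-lane effect of the emitted V_CNDMASK_B32:
    //   dst = maskBit ? activeVal : inactiveVal
    uint32_t setInactiveLane(bool maskBit, uint32_t activeVal,
                             uint32_t inactiveVal) {
      return maskBit ? activeVal : inactiveVal;
    }

With the mask and both sources expressed as ordinary VALU operands, the
constant-bus and literal accounting that the deleted block performed by
hand is presumably handled by the generic operand legalization that runs
before this expansion, which would also be why the exec-manipulating V_MOV
fallback can be dropped.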