1414#include " GCNSubtarget.h"
1515#include " Utils/AMDGPUBaseInfo.h"
1616#include " llvm/Analysis/CycleAnalysis.h"
17+ #include " llvm/Analysis/TargetTransformInfo.h"
18+ #include " llvm/Analysis/UniformityAnalysis.h"
1719#include " llvm/CodeGen/TargetPassConfig.h"
20+ #include " llvm/IR/IRBuilder.h"
1821#include " llvm/IR/IntrinsicsAMDGPU.h"
1922#include " llvm/IR/IntrinsicsR600.h"
2023#include " llvm/InitializePasses.h"
@@ -1295,6 +1298,130 @@ struct AAAMDGPUNoAGPR
12951298
12961299const char AAAMDGPUNoAGPR::ID = 0 ;
12971300
1301+ struct AAAMDGPUUniform : public StateWrapper <BooleanState, AbstractAttribute> {
1302+ using Base = StateWrapper<BooleanState, AbstractAttribute>;
1303+ AAAMDGPUUniform (const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1304+
1305+ // / Create an abstract attribute view for the position \p IRP.
1306+ static AAAMDGPUUniform &createForPosition (const IRPosition &IRP,
1307+ Attributor &A);
1308+
1309+ // / See AbstractAttribute::getName()
1310+ StringRef getName () const override { return " AAAMDGPUUniform" ; }
1311+
1312+ const std::string getAsStr (Attributor *A) const override {
1313+ return getAssumed () ? " inreg" : " non-inreg" ;
1314+ }
1315+
1316+ void trackStatistics () const override {}
1317+
1318+ // / See AbstractAttribute::getIdAddr()
1319+ const char *getIdAddr () const override { return &ID; }
1320+
1321+ // / This function should return true if the type of the \p AA is
1322+ // / AAAMDGPUUniform
1323+ static bool classof (const AbstractAttribute *AA) {
1324+ return (AA->getIdAddr () == &ID);
1325+ }
1326+
1327+ // / Unique ID (due to the unique address)
1328+ static const char ID;
1329+ };
1330+
1331+ const char AAAMDGPUUniform::ID = 0 ;
1332+
1333+ struct AAAMDGPUUniformArgument : public AAAMDGPUUniform {
1334+ AAAMDGPUUniformArgument (const IRPosition &IRP, Attributor &A)
1335+ : AAAMDGPUUniform(IRP, A) {}
1336+
1337+ void initialize (Attributor &A) override {
1338+ Argument *Arg = getAssociatedArgument ();
1339+ CallingConv::ID CC = Arg->getParent ()->getCallingConv ();
1340+ if (Arg->hasAttribute (Attribute::InReg)) {
1341+ indicateOptimisticFixpoint ();
1342+ return ;
1343+ }
1344+ if (AMDGPU::isEntryFunctionCC (CC)) {
1345+ // We only use isArgPassedInSGPR on kernel entry function argument, so the
1346+ // potential i1 argument change will not affect this.
1347+ if (AMDGPU::isArgPassedInSGPR (Arg))
1348+ indicateOptimisticFixpoint ();
1349+ else
1350+ indicatePessimisticFixpoint ();
1351+ }
1352+ }
1353+
1354+ ChangeStatus updateImpl (Attributor &A) override {
1355+ unsigned ArgNo = getAssociatedArgument ()->getArgNo ();
1356+
1357+ auto isUniform = [&](AbstractCallSite ACS) -> bool {
1358+ CallBase *CB = ACS.getInstruction ();
1359+ Value *V = CB->getArgOperandUse (ArgNo);
1360+ if (isa<Constant>(V))
1361+ return true ;
1362+ Function *F = nullptr ;
1363+ if (auto *Arg = dyn_cast<Argument>(V)) {
1364+ auto *AA =
1365+ A.getOrCreateAAFor <AAAMDGPUUniform>(IRPosition::argument (*Arg));
1366+ if (AA)
1367+ return AA->isValidState ();
1368+ F = Arg->getParent ();
1369+ } else if (auto *I = dyn_cast<Instruction>(V)) {
1370+ F = I->getFunction ();
1371+ }
1372+
1373+ if (F) {
1374+ auto *UA =
1375+ A.getInfoCache ()
1376+ .getAnalysisResultForFunction <UniformityInfoAnalysis>(*F);
1377+ return UA && UA->isUniform (V);
1378+ }
1379+
1380+ // What else can it be here?
1381+ return false ;
1382+ };
1383+
1384+ bool UsedAssumedInformation = true ;
1385+ if (!A.checkForAllCallSites (isUniform, *this , /* RequireAllCallSites=*/ true ,
1386+ UsedAssumedInformation))
1387+ return indicatePessimisticFixpoint ();
1388+
1389+ if (!UsedAssumedInformation)
1390+ return indicateOptimisticFixpoint ();
1391+
1392+ return ChangeStatus::UNCHANGED;
1393+ }
1394+
1395+ ChangeStatus manifest (Attributor &A) override {
1396+ Argument *Arg = getAssociatedArgument ();
1397+ if (AMDGPU::isEntryFunctionCC (Arg->getParent ()->getCallingConv ()))
1398+ return ChangeStatus::UNCHANGED;
1399+ // If the argument already has inreg attribute, we will not do anything
1400+ // about it.
1401+ if (Arg->hasAttribute (Attribute::InReg))
1402+ return ChangeStatus::UNCHANGED;
1403+ // Add both inreg and "uniform" attribute to the argument. We will emit a
1404+ // readfirstlane at each call site for inreg uniform argument, and the
1405+ // "uniform" attribute will be removed later.
1406+ LLVMContext &Ctx = Arg->getContext ();
1407+ return A.manifestAttrs (getIRPosition (),
1408+ {Attribute::get (Ctx, Attribute::InReg),
1409+ Attribute::get (Ctx, " uniform" )});
1410+ }
1411+ };
1412+
1413+ AAAMDGPUUniform &AAAMDGPUUniform::createForPosition (const IRPosition &IRP,
1414+ Attributor &A) {
1415+ switch (IRP.getPositionKind ()) {
1416+ case IRPosition::IRP_ARGUMENT:
1417+ return *new (A.Allocator ) AAAMDGPUUniformArgument (IRP, A);
1418+ // TODO: Since inreg is also allowed for return value, maybe we need to add
1419+ // AAAMDGPUUniformCallSiteReturned?
1420+ default :
1421+ llvm_unreachable (" not a valid position for AAAMDGPUUniform" );
1422+ }
1423+ }
1424+
12981425// / Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
12991426// / based on the finalized 'amdgpu-flat-work-group-size' attribute.
13001427// / Both attributes start with narrow ranges that expand during iteration.
@@ -1363,6 +1490,59 @@ static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
13631490 return Changed;
13641491}
13651492
1493+ // / Emit the readfirstlane intrinsic for all inreg uniform function arguments at
1494+ // / each call site. The inreg uniform attribute combination is set by
1495+ // / AAAMDGPUUniform. This function provides a workaround for a downstream issue
1496+ // / where failing to emit a waterfall loop for 'inreg' arguments may result in
1497+ // / an invalid VGPR-to-SGPR copy. However, we intentionally avoid a waterfall
1498+ // / loop for inreg uniform arguments here, because the 'inreg' attribute set by
1499+ // / AAAMDGPUUniform guarantees uniformity, making the readfirstlane intrinsic
1500+ // / appropriate.
1501+ static bool emitReadFirstLaneForInregUniformArgs (Module &M) {
1502+ std::vector<std::pair<CallBase *, unsigned >> WorkList;
1503+
1504+ for (Function &F : M) {
1505+ if (F.isDeclaration ())
1506+ continue ;
1507+ for (Argument &Arg : F.args ()) {
1508+ if (!Arg.hasAttribute (Attribute::InReg) || !Arg.hasAttribute (" uniform" ))
1509+ continue ;
1510+ unsigned ArgNo = Arg.getArgNo ();
1511+ for (Use &U : F.uses ()) {
1512+ auto *CB = dyn_cast<CallBase>(U.getUser ());
1513+ if (!CB)
1514+ continue ;
1515+ // We will skip the call site argument when itself is an inreg argument.
1516+ // In this case, it will already be in SGPR.
1517+ if (auto *CSArg = dyn_cast<Argument>(CB->getArgOperand (ArgNo))) {
1518+ if (CSArg->hasAttribute (Attribute::InReg))
1519+ continue ;
1520+ }
1521+ WorkList.emplace_back (CB, ArgNo);
1522+ }
1523+ // We don't count this as changed since it just stays within this pass.
1524+ Arg.removeAttr (" uniform" );
1525+ }
1526+ }
1527+
1528+ if (WorkList.empty ())
1529+ return false ;
1530+
1531+ for (auto &[CB, ArgNo] : WorkList) {
1532+ Value *V = CB->getArgOperand (ArgNo);
1533+ IRBuilder<> Builder (CB);
1534+ Value *NewV = Builder.CreateIntrinsic (V->getType (),
1535+ Intrinsic::amdgcn_readfirstlane, {V});
1536+ CB->setArgOperand (ArgNo, NewV);
1537+ if (auto *I = dyn_cast<Instruction>(V)) {
1538+ if (I->use_empty ())
1539+ I->eraseFromParent ();
1540+ }
1541+ }
1542+
1543+ return true ;
1544+ }
1545+
13661546static bool runImpl (Module &M, AnalysisGetter &AG, TargetMachine &TM,
13671547 AMDGPUAttributorOptions Options,
13681548 ThinOrFullLTOPhase LTOPhase) {
@@ -1381,7 +1561,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13811561 &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
13821562 &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
13831563 &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
1384- &AAInstanceInfo::ID});
1564+ &AAInstanceInfo::ID, &AAAMDGPUUniform::ID });
13851565
13861566 AttributorConfig AC (CGUpdater);
13871567 AC.IsClosedWorldModule = Options.IsClosedWorld ;
@@ -1434,11 +1614,17 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
14341614 IRPosition::value (*CmpX->getPointerOperand ()));
14351615 }
14361616 }
1617+
1618+ if (!AMDGPU::isEntryFunctionCC (F->getCallingConv ())) {
1619+ for (auto &Arg : F->args ())
1620+ A.getOrCreateAAFor <AAAMDGPUUniform>(IRPosition::argument (Arg));
1621+ }
14371622 }
14381623
14391624 bool Changed = A.run () == ChangeStatus::CHANGED;
14401625
14411626 Changed |= updateWavesPerEU (M, TM);
1627+ Changed |= emitReadFirstLaneForInregUniformArgs (M);
14421628
14431629 return Changed;
14441630}
0 commit comments