1414#include " GCNSubtarget.h"
1515#include " Utils/AMDGPUBaseInfo.h"
1616#include " llvm/Analysis/CycleAnalysis.h"
17+ #include " llvm/Analysis/TargetTransformInfo.h"
18+ #include " llvm/Analysis/UniformityAnalysis.h"
1719#include " llvm/CodeGen/TargetPassConfig.h"
20+ #include " llvm/IR/IRBuilder.h"
1821#include " llvm/IR/IntrinsicsAMDGPU.h"
1922#include " llvm/IR/IntrinsicsR600.h"
2023#include " llvm/InitializePasses.h"
@@ -1295,6 +1298,134 @@ struct AAAMDGPUNoAGPR
12951298
12961299const char AAAMDGPUNoAGPR::ID = 0 ;
12971300
1301+ struct AAAMDGPUUniform : public StateWrapper <BooleanState, AbstractAttribute> {
1302+ using Base = StateWrapper<BooleanState, AbstractAttribute>;
1303+ AAAMDGPUUniform (const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1304+
1305+ // / Create an abstract attribute view for the position \p IRP.
1306+ static AAAMDGPUUniform &createForPosition (const IRPosition &IRP,
1307+ Attributor &A);
1308+
1309+ // / See AbstractAttribute::getName()
1310+ StringRef getName () const override { return " AAAMDGPUUniform" ; }
1311+
1312+ const std::string getAsStr (Attributor *A) const override {
1313+ return getAssumed () ? " uniform" : " divergent" ;
1314+ }
1315+
1316+ void trackStatistics () const override {}
1317+
1318+ // / See AbstractAttribute::getIdAddr()
1319+ const char *getIdAddr () const override { return &ID; }
1320+
1321+ // / This function should return true if the type of the \p AA is
1322+ // / AAAMDGPUUniform
1323+ static bool classof (const AbstractAttribute *AA) {
1324+ return (AA->getIdAddr () == &ID);
1325+ }
1326+
1327+ // / Unique ID (due to the unique address)
1328+ static const char ID;
1329+ };
1330+
1331+ const char AAAMDGPUUniform::ID = 0 ;
1332+
1333+ // / This AA is to infer the inreg attribute for a function argument.
1334+ struct AAAMDGPUUniformArgument : public AAAMDGPUUniform {
1335+ AAAMDGPUUniformArgument (const IRPosition &IRP, Attributor &A)
1336+ : AAAMDGPUUniform(IRP, A) {}
1337+
1338+ void initialize (Attributor &A) override {
1339+ Argument *Arg = getAssociatedArgument ();
1340+ CallingConv::ID CC = Arg->getParent ()->getCallingConv ();
1341+ if (Arg->hasAttribute (Attribute::InReg)) {
1342+ indicateOptimisticFixpoint ();
1343+ return ;
1344+ }
1345+ if (AMDGPU::isEntryFunctionCC (CC)) {
1346+ // We only use isArgPassedInSGPR on kernel entry function argument, so
1347+ // even if we will use VPGR for inreg i1 argument passing, it will not
1348+ // affect this.
1349+ if (AMDGPU::isArgPassedInSGPR (Arg))
1350+ indicateOptimisticFixpoint ();
1351+ else
1352+ indicatePessimisticFixpoint ();
1353+ }
1354+ }
1355+
1356+ ChangeStatus updateImpl (Attributor &A) override {
1357+ unsigned ArgNo = getAssociatedArgument ()->getArgNo ();
1358+
1359+ auto isUniform = [&](AbstractCallSite ACS) -> bool {
1360+ CallBase *CB = ACS.getInstruction ();
1361+ Value *V = CB->getArgOperandUse (ArgNo);
1362+ if (isa<Constant>(V))
1363+ return true ;
1364+ Function *F = nullptr ;
1365+ if (auto *Arg = dyn_cast<Argument>(V)) {
1366+ auto *AA =
1367+ A.getOrCreateAAFor <AAAMDGPUUniform>(IRPosition::argument (*Arg));
1368+ if (AA)
1369+ return AA->isValidState ();
1370+ F = Arg->getParent ();
1371+ } else if (auto *I = dyn_cast<Instruction>(V)) {
1372+ F = I->getFunction ();
1373+ }
1374+
1375+ if (F) {
1376+ auto *UA =
1377+ A.getInfoCache ()
1378+ .getAnalysisResultForFunction <UniformityInfoAnalysis>(*F);
1379+ return UA && UA->isUniform (V);
1380+ }
1381+
1382+ return false ;
1383+ };
1384+
1385+ bool UsedAssumedInformation = true ;
1386+ if (!A.checkForAllCallSites (isUniform, *this , /* RequireAllCallSites=*/ true ,
1387+ UsedAssumedInformation))
1388+ return indicatePessimisticFixpoint ();
1389+
1390+ if (!UsedAssumedInformation)
1391+ return indicateOptimisticFixpoint ();
1392+
1393+ return ChangeStatus::UNCHANGED;
1394+ }
1395+
1396+ ChangeStatus manifest (Attributor &A) override {
1397+ Argument *Arg = getAssociatedArgument ();
1398+ // If the argument already has inreg attribute, we will not do anything
1399+ // about it.
1400+ if (Arg->hasAttribute (Attribute::InReg))
1401+ return ChangeStatus::UNCHANGED;
1402+ if (AMDGPU::isEntryFunctionCC (Arg->getParent ()->getCallingConv ()))
1403+ return ChangeStatus::UNCHANGED;
1404+ // We don't directly emit readfirstlane here because it will cause multiple
1405+ // replacements of a single use in the manifest map, which is not supported
1406+ // at this moment.
1407+ // Add both inreg and "uniform" attribute to the argument. We will emit a
1408+ // readfirstlane at each call site for inreg uniform argument, and the
1409+ // "uniform" attribute will be removed later.
1410+ LLVMContext &Ctx = Arg->getContext ();
1411+ return A.manifestAttrs (getIRPosition (),
1412+ {Attribute::get (Ctx, Attribute::InReg),
1413+ Attribute::get (Ctx, " uniform" )});
1414+ }
1415+ };
1416+
1417+ AAAMDGPUUniform &AAAMDGPUUniform::createForPosition (const IRPosition &IRP,
1418+ Attributor &A) {
1419+ switch (IRP.getPositionKind ()) {
1420+ case IRPosition::IRP_ARGUMENT:
1421+ return *new (A.Allocator ) AAAMDGPUUniformArgument (IRP, A);
1422+ // TODO: Since inreg is also allowed for return value, maybe we need to add
1423+ // AAAMDGPUUniformCallSiteReturned?
1424+ default :
1425+ llvm_unreachable (" not a valid position for AAAMDGPUUniform" );
1426+ }
1427+ }
1428+
12981429// / Performs the final check and updates the 'amdgpu-waves-per-eu' attribute
12991430// / based on the finalized 'amdgpu-flat-work-group-size' attribute.
13001431// / Both attributes start with narrow ranges that expand during iteration.
@@ -1363,6 +1494,64 @@ static bool updateWavesPerEU(Module &M, TargetMachine &TM) {
13631494 return Changed;
13641495}
13651496
1497+ // / Emit the readfirstlane intrinsic for all inreg uniform function arguments at
1498+ // / each call site. The inreg uniform attribute combination is set by
1499+ // / AAAMDGPUUniform. This function provides a workaround for a downstream issue
1500+ // / where failing to emit a waterfall loop for 'inreg' arguments may result in
1501+ // / an invalid VGPR-to-SGPR copy. However, we intentionally avoid a waterfall
1502+ // / loop for inreg uniform arguments here, because the 'inreg' attribute set by
1503+ // / AAAMDGPUUniform guarantees uniformity, making the readfirstlane intrinsic
1504+ // / appropriate.
1505+ static bool emitReadFirstLaneForInregUniformArgs (Module &M) {
1506+ bool Changed = false ;
1507+ std::vector<std::pair<CallBase *, unsigned >> WorkList;
1508+
1509+ for (Function &F : M) {
1510+ if (F.isDeclaration ())
1511+ continue ;
1512+ for (Argument &Arg : F.args ()) {
1513+ if (!Arg.hasAttribute (Attribute::InReg) || !Arg.hasAttribute (" uniform" ))
1514+ continue ;
1515+ unsigned ArgNo = Arg.getArgNo ();
1516+ for (Use &U : F.uses ()) {
1517+ auto *CB = dyn_cast<CallBase>(U.getUser ());
1518+ if (!CB)
1519+ continue ;
1520+ Value *CSArg = CB->getArgOperand (ArgNo);
1521+ // We don't need readfirstvalue for a global value.
1522+ if (isa<GlobalValue>(CSArg))
1523+ continue ;
1524+ // We will skip the call site argument when itself is an inreg argument.
1525+ // In this case, it will already be in SGPR.
1526+ if (auto *CSArgArg = dyn_cast<Argument>(CSArg)) {
1527+ if (CSArgArg->hasAttribute (Attribute::InReg))
1528+ continue ;
1529+ }
1530+ WorkList.emplace_back (CB, ArgNo);
1531+ }
1532+ Arg.removeAttr (" uniform" );
1533+ Changed = true ;
1534+ }
1535+ }
1536+
1537+ if (WorkList.empty ())
1538+ return Changed;
1539+
1540+ for (auto &[CB, ArgNo] : WorkList) {
1541+ Value *V = CB->getArgOperand (ArgNo);
1542+ IRBuilder<> Builder (CB);
1543+ Value *NewV = Builder.CreateIntrinsic (V->getType (),
1544+ Intrinsic::amdgcn_readfirstlane, {V});
1545+ CB->setArgOperand (ArgNo, NewV);
1546+ if (auto *I = dyn_cast<Instruction>(V)) {
1547+ if (I->use_empty ())
1548+ I->eraseFromParent ();
1549+ }
1550+ }
1551+
1552+ return true ;
1553+ }
1554+
13661555static bool runImpl (Module &M, AnalysisGetter &AG, TargetMachine &TM,
13671556 AMDGPUAttributorOptions Options,
13681557 ThinOrFullLTOPhase LTOPhase) {
@@ -1381,7 +1570,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13811570 &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
13821571 &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
13831572 &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
1384- &AAInstanceInfo::ID});
1573+ &AAInstanceInfo::ID, &AAAMDGPUUniform::ID });
13851574
13861575 AttributorConfig AC (CGUpdater);
13871576 AC.IsClosedWorldModule = Options.IsClosedWorld ;
@@ -1434,11 +1623,17 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
14341623 IRPosition::value (*CmpX->getPointerOperand ()));
14351624 }
14361625 }
1626+
1627+ if (!AMDGPU::isEntryFunctionCC (F->getCallingConv ())) {
1628+ for (auto &Arg : F->args ())
1629+ A.getOrCreateAAFor <AAAMDGPUUniform>(IRPosition::argument (Arg));
1630+ }
14371631 }
14381632
14391633 bool Changed = A.run () == ChangeStatus::CHANGED;
14401634
14411635 Changed |= updateWavesPerEU (M, TM);
1636+ Changed |= emitReadFirstLaneForInregUniformArgs (M);
14421637
14431638 return Changed;
14441639}
0 commit comments