1414#include " GCNSubtarget.h"
1515#include " Utils/AMDGPUBaseInfo.h"
1616#include " llvm/Analysis/CycleAnalysis.h"
17+ #include " llvm/Analysis/ValueTracking.h"
1718#include " llvm/CodeGen/TargetPassConfig.h"
19+ #include " llvm/IR/IRBuilder.h"
1820#include " llvm/IR/IntrinsicsAMDGPU.h"
1921#include " llvm/IR/IntrinsicsR600.h"
2022#include " llvm/Target/TargetMachine.h"
@@ -144,6 +146,213 @@ static bool funcRequiresHostcallPtr(const Function &F) {
144146}
145147
146148namespace {
149+
150+ class PreloadKernelArgInfo {
151+ private:
152+ Function &F;
153+ const GCNSubtarget &ST;
154+ unsigned NumFreeUserSGPRs;
155+
156+ enum HiddenArg : unsigned {
157+ HIDDEN_BLOCK_COUNT_X,
158+ HIDDEN_BLOCK_COUNT_Y,
159+ HIDDEN_BLOCK_COUNT_Z,
160+ HIDDEN_GROUP_SIZE_X,
161+ HIDDEN_GROUP_SIZE_Y,
162+ HIDDEN_GROUP_SIZE_Z,
163+ HIDDEN_REMAINDER_X,
164+ HIDDEN_REMAINDER_Y,
165+ HIDDEN_REMAINDER_Z,
166+ END_HIDDEN_ARGS
167+ };
168+
169+ // Stores information about a specific hidden argument.
170+ struct HiddenArgInfo {
171+ // Offset in bytes from the location in the kernearg segment pointed to by
172+ // the implicitarg pointer.
173+ uint8_t Offset;
174+ // The size of the hidden argument in bytes.
175+ uint8_t Size;
176+ // The name of the hidden argument in the kernel signature.
177+ const char *Name;
178+ };
179+
180+ static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
181+ {0 , 4 , " _hidden_block_count_x" }, {4 , 4 , " _hidden_block_count_y" },
182+ {8 , 4 , " _hidden_block_count_z" }, {12 , 2 , " _hidden_group_size_x" },
183+ {14 , 2 , " _hidden_group_size_y" }, {16 , 2 , " _hidden_group_size_z" },
184+ {18 , 2 , " _hidden_remainder_x" }, {20 , 2 , " _hidden_remainder_y" },
185+ {22 , 2 , " _hidden_remainder_z" }};
186+
187+ static HiddenArg getHiddenArgFromOffset (unsigned Offset) {
188+ for (unsigned I = 0 ; I < END_HIDDEN_ARGS; ++I)
189+ if (HiddenArgs[I].Offset == Offset)
190+ return static_cast <HiddenArg>(I);
191+
192+ return END_HIDDEN_ARGS;
193+ }
194+
195+ static Type *getHiddenArgType (LLVMContext &Ctx, HiddenArg HA) {
196+ if (HA < END_HIDDEN_ARGS)
197+ return Type::getIntNTy (Ctx, HiddenArgs[HA].Size * 8 );
198+
199+ llvm_unreachable (" Unexpected hidden argument." );
200+ }
201+
202+ static const char *getHiddenArgName (HiddenArg HA) {
203+ if (HA < END_HIDDEN_ARGS) {
204+ return HiddenArgs[HA].Name ;
205+ }
206+ llvm_unreachable (" Unexpected hidden argument." );
207+ }
208+
209+ // Clones the function after adding implicit arguments to the argument list
210+ // and returns the new updated function. Preloaded implicit arguments are
211+ // added up to and including the last one that will be preloaded, indicated by
212+ // LastPreloadIndex. Currently preloading is only performed on the totality of
213+ // sequential data from the kernarg segment including implicit (hidden)
214+ // arguments. This means that all arguments up to the last preloaded argument
215+ // will also be preloaded even if that data is unused.
216+ Function *cloneFunctionWithPreloadImplicitArgs (unsigned LastPreloadIndex) {
217+ FunctionType *FT = F.getFunctionType ();
218+ LLVMContext &Ctx = F.getParent ()->getContext ();
219+ SmallVector<Type *, 16 > FTypes (FT->param_begin (), FT->param_end ());
220+ for (unsigned I = 0 ; I <= LastPreloadIndex; ++I)
221+ FTypes.push_back (getHiddenArgType (Ctx, HiddenArg (I)));
222+
223+ FunctionType *NFT =
224+ FunctionType::get (FT->getReturnType (), FTypes, FT->isVarArg ());
225+ Function *NF =
226+ Function::Create (NFT, F.getLinkage (), F.getAddressSpace (), F.getName ());
227+
228+ NF->copyAttributesFrom (&F);
229+ NF->copyMetadata (&F, 0 );
230+ NF->setIsNewDbgInfoFormat (F.IsNewDbgInfoFormat );
231+
232+ F.getParent ()->getFunctionList ().insert (F.getIterator (), NF);
233+ NF->takeName (&F);
234+ NF->splice (NF->begin (), &F);
235+
236+ Function::arg_iterator NFArg = NF->arg_begin ();
237+ for (Argument &Arg : F.args ()) {
238+ Arg.replaceAllUsesWith (&*NFArg);
239+ NFArg->takeName (&Arg);
240+ ++NFArg;
241+ }
242+
243+ AttrBuilder AB (Ctx);
244+ AB.addAttribute (Attribute::InReg);
245+ AB.addAttribute (" amdgpu-hidden-argument" );
246+ AttributeList AL = NF->getAttributes ();
247+ for (unsigned I = 0 ; I <= LastPreloadIndex; ++I) {
248+ AL = AL.addParamAttributes (Ctx, NFArg->getArgNo (), AB);
249+ NFArg++->setName (getHiddenArgName (HiddenArg (I)));
250+ }
251+
252+ NF->setAttributes (AL);
253+ F.replaceAllUsesWith (NF);
254+
255+ return NF;
256+ }
257+
258+ public:
259+ PreloadKernelArgInfo (Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
260+ setInitialFreeUserSGPRsCount ();
261+ }
262+
263+ // Returns the maximum number of user SGPRs that we have available to preload
264+ // arguments.
265+ void setInitialFreeUserSGPRsCount () {
266+ GCNUserSGPRUsageInfo UserSGPRInfo (F, ST);
267+ NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs ();
268+ }
269+
270+ bool canPreloadKernArgAtOffset (uint64_t ExplicitArgOffset) {
271+ return ExplicitArgOffset <= NumFreeUserSGPRs * 4 ;
272+ }
273+
274+ // Try to allocate SGPRs to preload hidden kernel arguments.
275+ void
276+ tryAllocHiddenArgPreloadSGPRs (uint64_t ImplicitArgsBaseOffset,
277+ SmallVectorImpl<Function *> &FunctionsToErase) {
278+ Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists (
279+ F.getParent (), Intrinsic::amdgcn_implicitarg_ptr);
280+ if (!ImplicitArgPtr)
281+ return ;
282+
283+ const DataLayout &DL = F.getParent ()->getDataLayout ();
284+ // Pair is the load and the load offset.
285+ SmallVector<std::pair<LoadInst *, unsigned >, 4 > ImplicitArgLoads;
286+ for (auto *U : ImplicitArgPtr->users ()) {
287+ Instruction *CI = dyn_cast<Instruction>(U);
288+ if (!CI || CI->getParent ()->getParent () != &F)
289+ continue ;
290+
291+ for (auto *U : CI->users ()) {
292+ int64_t Offset = 0 ;
293+ auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
294+ if (!Load) {
295+ if (GetPointerBaseWithConstantOffset (U, Offset, DL) != CI)
296+ continue ;
297+
298+ Load = dyn_cast<LoadInst>(*U->user_begin ()); // Load from GEP?
299+ }
300+
301+ if (!Load || !Load->isSimple ())
302+ continue ;
303+
304+ // FIXME: Expand handle merged loads.
305+ LLVMContext &Ctx = F.getParent ()->getContext ();
306+ Type *LoadTy = Load->getType ();
307+ HiddenArg HA = getHiddenArgFromOffset (Offset);
308+ if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType (Ctx, HA))
309+ continue ;
310+
311+ ImplicitArgLoads.push_back (std::make_pair (Load, Offset));
312+ }
313+ }
314+
315+ if (ImplicitArgLoads.empty ())
316+ return ;
317+
318+ // Allocate loads in order of offset. We need to be sure that the implicit
319+ // argument can actually be preloaded.
320+ std::sort (ImplicitArgLoads.begin (), ImplicitArgLoads.end (), less_second ());
321+
322+ // If we fail to preload any implicit argument we know we don't have SGPRs
323+ // to preload any subsequent ones with larger offsets. Find the first
324+ // argument that we cannot preload.
325+ auto *PreloadEnd =
326+ std::find_if (ImplicitArgLoads.begin (), ImplicitArgLoads.end (),
327+ [&](const std::pair<LoadInst *, unsigned > &Load) {
328+ unsigned LoadSize =
329+ DL.getTypeStoreSize (Load.first ->getType ());
330+ unsigned LoadOffset = Load.second ;
331+ if (!canPreloadKernArgAtOffset (LoadOffset + LoadSize +
332+ ImplicitArgsBaseOffset))
333+ return true ;
334+
335+ return false ;
336+ });
337+
338+ if (PreloadEnd == ImplicitArgLoads.begin ())
339+ return ;
340+
341+ unsigned LastHiddenArgIndex = getHiddenArgFromOffset (PreloadEnd[-1 ].second );
342+ Function *NF = cloneFunctionWithPreloadImplicitArgs (LastHiddenArgIndex);
343+ assert (NF);
344+ FunctionsToErase.push_back (&F);
345+ for (const auto *I = ImplicitArgLoads.begin (); I != PreloadEnd; ++I) {
346+ LoadInst *LoadInst = I->first ;
347+ unsigned LoadOffset = I->second ;
348+ unsigned HiddenArgIndex = getHiddenArgFromOffset (LoadOffset);
349+ unsigned Index = NF->arg_size () - LastHiddenArgIndex + HiddenArgIndex - 1 ;
350+ Argument *Arg = NF->getArg (Index);
351+ LoadInst->replaceAllUsesWith (Arg);
352+ }
353+ }
354+ };
355+
147356class AMDGPUInformationCache : public InformationCache {
148357public:
149358 AMDGPUInformationCache (const Module &M, AnalysisGetter &AG,
@@ -1314,19 +1523,64 @@ struct AAAMDGPUNoAGPR
13141523
13151524const char AAAMDGPUNoAGPR::ID = 0 ;
13161525
1317- static void addPreloadKernArgHint (Function &F, TargetMachine &TM) {
1318- const GCNSubtarget &ST = TM.getSubtarget <GCNSubtarget>(F);
1319- for (unsigned I = 0 ;
1320- I < F.arg_size () &&
1321- I < std::min (KernargPreloadCount.getValue (), ST.getMaxNumUserSGPRs ());
1322- ++I) {
1323- Argument &Arg = *F.getArg (I);
1324- // Check for incompatible attributes.
1325- if (Arg.hasByRefAttr () || Arg.hasNestAttr ())
1326- break ;
1526+ static void markKernelArgsAsInreg (SetVector<Function *> &Functions,
1527+ TargetMachine &TM) {
1528+ SmallVector<Function *, 4 > FunctionsToErase;
1529+ for (auto *F : Functions) {
1530+ const GCNSubtarget &ST = TM.getSubtarget <GCNSubtarget>(*F);
1531+ if (!ST.hasKernargPreload () ||
1532+ F->getCallingConv () != CallingConv::AMDGPU_KERNEL || F->arg_empty ())
1533+ continue ;
1534+
1535+ PreloadKernelArgInfo PreloadInfo (*F, ST);
1536+ uint64_t ExplicitArgOffset = 0 ;
1537+ const DataLayout &DL = F->getDataLayout ();
1538+ const uint64_t BaseOffset = ST.getExplicitKernelArgOffset ();
1539+ unsigned NumPreloadsRequested = KernargPreloadCount;
1540+ unsigned NumPreloadedExplicitArgs = 0 ;
1541+ for (Argument &Arg : F->args ()) {
1542+ // Avoid incompatible attributes and guard against running this pass
1543+ // twice.
1544+ if (Arg.hasByRefAttr () || Arg.hasNestAttr () ||
1545+ Arg.hasAttribute (" amdgpu-hidden-argument" ))
1546+ break ;
1547+
1548+ // Inreg may be pre-existing on some arguments, try to preload these.
1549+ if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr ())
1550+ break ;
1551+
1552+ // FIXME: Preload aggregates.
1553+ if (Arg.getType ()->isAggregateType ())
1554+ break ;
1555+
1556+ Type *ArgTy = Arg.getType ();
1557+ Align ABITypeAlign = DL.getABITypeAlign (ArgTy);
1558+ uint64_t AllocSize = DL.getTypeAllocSize (ArgTy);
1559+ ExplicitArgOffset = alignTo (ExplicitArgOffset, ABITypeAlign) + AllocSize;
1560+ if (!PreloadInfo.canPreloadKernArgAtOffset (ExplicitArgOffset))
1561+ break ;
1562+
1563+ Arg.addAttr (Attribute::InReg);
1564+ NumPreloadedExplicitArgs++;
1565+ if (NumPreloadsRequested > 0 )
1566+ NumPreloadsRequested--;
1567+ }
13271568
1328- Arg.addAttr (Attribute::InReg);
1569+ // Only try preloading hidden arguments if we can successfully preload the
1570+ // last explicit argument.
1571+ if (NumPreloadedExplicitArgs == F->arg_size ()) {
1572+ uint64_t ImplicitArgsBaseOffset =
1573+ alignTo (ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr ()) +
1574+ BaseOffset;
1575+ PreloadInfo.tryAllocHiddenArgPreloadSGPRs (ImplicitArgsBaseOffset,
1576+ FunctionsToErase);
1577+ }
13291578 }
1579+
1580+ // Erase cloned functions if we needed to update the kernel signature to
1581+ // support preloading hidden kernel arguments.
1582+ for (auto *F : FunctionsToErase)
1583+ F->eraseFromParent ();
13301584}
13311585
13321586static bool runImpl (Module &M, AnalysisGetter &AG, TargetMachine &TM,
@@ -1378,8 +1632,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13781632 if (!AMDGPU::isEntryFunctionCC (CC)) {
13791633 A.getOrCreateAAFor <AAAMDFlatWorkGroupSize>(IRPosition::function (*F));
13801634 A.getOrCreateAAFor <AAAMDWavesPerEU>(IRPosition::function (*F));
1381- } else if (CC == CallingConv::AMDGPU_KERNEL) {
1382- addPreloadKernArgHint (*F, TM);
13831635 }
13841636
13851637 for (auto &I : instructions (F)) {
@@ -1400,6 +1652,11 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
14001652 }
14011653
14021654 ChangeStatus Change = A.run ();
1655+
1656+ // Mark kernel arguments with 'inreg' attribute to indicate that they should
1657+ // be preloaded into SGPRs.
1658+ markKernelArgsAsInreg (Functions, TM);
1659+
14031660 return Change == ChangeStatus::CHANGED;
14041661}
14051662
0 commit comments