  //
  // Performs general IR level optimizations on SVE intrinsics.
  //
- // The main goal of this pass is to remove unnecessary reinterpret
- // intrinsics (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
+ // This pass performs the following optimizations:
  //
- //   %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
- //   %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+ // - removes unnecessary reinterpret intrinsics
+ //   (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
+ //     %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+ //     %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
  //
- // This pass also looks for ptest intrinsics & phi instructions where the
- // operands are being needlessly converted to and from svbool_t.
+ // - removes unnecessary ptrue intrinsics (llvm.aarch64.sve.ptrue), e.g:
+ //     %1 = @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ //     %2 = @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+ //     ; (%1 can be replaced with a reinterpret of %2)
+ //
+ // - optimizes ptest intrinsics and phi instructions where the operands are
+ //   being needlessly converted to and from svbool_t.
  //
  //===----------------------------------------------------------------------===//

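As a quick illustration of the first bullet (a sketch, not code taken from this patch): when the element type of the to/from pair matches, the round trip is a no-op, so every use of %2 can be rewritten to use %a directly.

    %1 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
    %2 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
    ; after the fold, all uses of %2 refer to %a
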
@@ -56,8 +62,17 @@ struct SVEIntrinsicOpts : public ModulePass {
  private:
    static IntrinsicInst *isReinterpretToSVBool(Value *V);

+   bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
+                                    SmallSetVector<IntrinsicInst *, 4> &PTrues);
+   bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
+
+   /// Operates at the instruction scope, i.e. optimizations are applied
+   /// locally to individual instructions.
    static bool optimizeIntrinsic(Instruction *I);
+   bool optimizeIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);

+   /// Operates at the function scope, i.e. optimizations are applied locally
+   /// to the functions themselves.
    bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);

    static bool optimizeConvertFromSVBool(IntrinsicInst *I);
@@ -95,6 +110,188 @@ IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) {
    return I;
  }

+ /// Checks if a ptrue intrinsic call is promoted. The act of promoting a
+ /// ptrue will introduce zeroing. For example:
+ ///
+ ///     %1 = <vscale x 4 x i1> call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ ///     %2 = <vscale x 16 x i1> call @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+ ///     %3 = <vscale x 8 x i1> call @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
+ ///
+ /// %1 is promoted, because it is converted:
+ ///
+ ///     <vscale x 4 x i1> => <vscale x 16 x i1> => <vscale x 8 x i1>
+ ///
+ /// via a sequence of the SVE reinterpret intrinsics convert.{to,from}.svbool.
+ bool isPTruePromoted(IntrinsicInst *PTrue) {
+   // Find all users of this intrinsic that are calls to convert-to-svbool
+   // reinterpret intrinsics.
+   SmallVector<IntrinsicInst *, 4> ConvertToUses;
+   for (User *User : PTrue->users()) {
+     if (match(User, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>())) {
+       ConvertToUses.push_back(cast<IntrinsicInst>(User));
+     }
+   }
+
+   // If no such calls were found, this ptrue is not promoted.
+   if (ConvertToUses.empty())
+     return false;
+
+   // Otherwise, try to find users of the convert-to-svbool intrinsics that are
+   // calls to the convert-from-svbool intrinsic, and would result in some
+   // lanes being zeroed.
+   const auto *PTrueVTy = cast<ScalableVectorType>(PTrue->getType());
+   for (IntrinsicInst *ConvertToUse : ConvertToUses) {
+     for (User *User : ConvertToUse->users()) {
+       auto *IntrUser = dyn_cast<IntrinsicInst>(User);
+       if (IntrUser && IntrUser->getIntrinsicID() ==
+                           Intrinsic::aarch64_sve_convert_from_svbool) {
+         const auto *IntrUserVTy = cast<ScalableVectorType>(IntrUser->getType());
+
+         // Would some lanes become zeroed by the conversion?
+         if (IntrUserVTy->getElementCount().getKnownMinValue() >
+             PTrueVTy->getElementCount().getKnownMinValue())
+           // This is a promoted ptrue.
+           return true;
+       }
+     }
+   }
+
+   // If no matching calls were found, this is not a promoted ptrue.
+   return false;
+ }
+
+ /// Attempts to coalesce ptrues in a basic block.
+ bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls(
+     BasicBlock &BB, SmallSetVector<IntrinsicInst *, 4> &PTrues) {
+   if (PTrues.size() <= 1)
+     return false;
+
+   // Find the ptrue with the most lanes.
+   auto *MostEncompassingPTrue = *std::max_element(
+       PTrues.begin(), PTrues.end(), [](auto *PTrue1, auto *PTrue2) {
+         auto *PTrue1VTy = cast<ScalableVectorType>(PTrue1->getType());
+         auto *PTrue2VTy = cast<ScalableVectorType>(PTrue2->getType());
+         return PTrue1VTy->getElementCount().getKnownMinValue() <
+                PTrue2VTy->getElementCount().getKnownMinValue();
+       });
+
+   // Remove the most encompassing ptrue, as well as any promoted ptrues,
+   // leaving behind only the ptrues to be coalesced.
+   PTrues.remove(MostEncompassingPTrue);
+   PTrues.remove_if([](auto *PTrue) { return isPTruePromoted(PTrue); });
+
+   // Hoist MostEncompassingPTrue to the start of the basic block. This is
+   // always safe: the only operand of a ptrue intrinsic call is an immediate
+   // pattern, so hoisting cannot move the call above a definition it uses.
+   MostEncompassingPTrue->moveBefore(BB, BB.getFirstInsertionPt());
+
+   LLVMContext &Ctx = BB.getContext();
+   IRBuilder<> Builder(Ctx);
+   Builder.SetInsertPoint(&BB, ++MostEncompassingPTrue->getIterator());
+
+   auto *MostEncompassingPTrueVTy =
+       cast<VectorType>(MostEncompassingPTrue->getType());
+   auto *ConvertToSVBool = Builder.CreateIntrinsic(
+       Intrinsic::aarch64_sve_convert_to_svbool, {MostEncompassingPTrueVTy},
+       {MostEncompassingPTrue});
+
+   for (auto *PTrue : PTrues) {
+     auto *PTrueVTy = cast<VectorType>(PTrue->getType());
+
+     Builder.SetInsertPoint(&BB, ++ConvertToSVBool->getIterator());
+     auto *ConvertFromSVBool =
+         Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
+                                 {PTrueVTy}, {ConvertToSVBool});
+     PTrue->replaceAllUsesWith(ConvertFromSVBool);
+     PTrue->eraseFromParent();
+   }
+
+   return true;
+ }
+
+ /// The goal of this function is to remove redundant calls to the SVE ptrue
+ /// intrinsic in each basic block within the given functions.
+ ///
+ /// SVE ptrues have two representations in LLVM IR:
+ /// - a logical representation -- an arbitrary-width scalable vector of i1s,
+ ///   i.e. <vscale x N x i1>.
+ /// - a physical representation (svbool) -- a 16-element scalable vector of
+ ///   i1s, i.e. <vscale x 16 x i1>.
+ ///
+ /// The SVE ptrue intrinsic is used to create a logical representation of an
+ /// SVE predicate. Suppose that we have two SVE ptrue intrinsic calls: P1 and
+ /// P2. If P1 creates a logical SVE predicate that is at least as wide as the
+ /// logical SVE predicate created by P2, then all of the bits that are true in
+ /// the physical representation of P2 are necessarily also true in the
+ /// physical representation of P1. P1 'encompasses' P2; therefore, the
+ /// intrinsic call to P2 is redundant and can be replaced by an SVE
+ /// reinterpret of P1 via convert.{to,from}.svbool.
+ ///
+ /// Currently, this pass only coalesces calls to SVE ptrue intrinsics
+ /// if they match the following conditions:
+ ///
+ /// - the call to the intrinsic uses either the SV_ALL or SV_POW2 patterns.
+ ///   SV_ALL indicates that all bits of the predicate vector are to be set to
+ ///   true. SV_POW2 indicates that all bits of the predicate vector up to the
+ ///   largest power-of-two are to be set to true.
+ /// - the result of the call to the intrinsic is not promoted to a wider
+ ///   predicate. In this case, keeping the extra ptrue leads to better codegen
+ ///   -- coalescing here would create an irreducible chain of SVE reinterprets
+ ///   via convert.{to,from}.svbool.
+ ///
+ /// EXAMPLE:
+ ///
+ ///     %1 = <vscale x 8 x i1> ptrue(i32 SV_ALL)
+ ///     ; Logical:  <1, 1, 1, 1, 1, 1, 1, 1>
+ ///     ; Physical: <1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0>
+ ///     ...
+ ///
+ ///     %2 = <vscale x 4 x i1> ptrue(i32 SV_ALL)
+ ///     ; Logical:  <1, 1, 1, 1>
+ ///     ; Physical: <1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0>
+ ///     ...
+ ///
+ /// Here, %2 can be replaced by an SVE reinterpret of %1, giving, for instance:
+ ///
+ ///     %1 = <vscale x 8 x i1> ptrue(i32 SV_ALL)
+ ///     %2 = <vscale x 16 x i1> convert.to.svbool(<vscale x 8 x i1> %1)
+ ///     %3 = <vscale x 4 x i1> convert.from.svbool(<vscale x 16 x i1> %2)
+ ///
+ bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls(
+     SmallSetVector<Function *, 4> &Functions) {
+   bool Changed = false;
+
+   for (auto *F : Functions) {
+     for (auto &BB : *F) {
+       SmallSetVector<IntrinsicInst *, 4> SVAllPTrues;
+       SmallSetVector<IntrinsicInst *, 4> SVPow2PTrues;
+
+       // For each basic block, collect the used ptrues and try to coalesce
+       // them.
+       for (Instruction &I : BB) {
+         if (I.use_empty())
+           continue;
+
+         auto *IntrI = dyn_cast<IntrinsicInst>(&I);
+         if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+           continue;
+
+         const auto PTruePattern =
+             cast<ConstantInt>(IntrI->getOperand(0))->getZExtValue();
+
+         if (PTruePattern == AArch64SVEPredPattern::all)
+           SVAllPTrues.insert(IntrI);
+         if (PTruePattern == AArch64SVEPredPattern::pow2)
+           SVPow2PTrues.insert(IntrI);
+       }
+
+       Changed |= coalescePTrueIntrinsicCalls(BB, SVAllPTrues);
+       Changed |= coalescePTrueIntrinsicCalls(BB, SVPow2PTrues);
+     }
+   }
+
+   return Changed;
+ }
+
  /// Removes redundant reinterpret casts in the presence of control flow.
  bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) {
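To make the new coalescing concrete, here is a sketch of the rewrite coalescePTrueIntrinsicCalls performs, assuming two unpromoted SV_ALL ptrues in one basic block (value names are illustrative, not taken from the patch):

    ; before
    %a = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
    %b = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)

    ; after: the widest ptrue is hoisted and kept; %b is rebuilt from it
    %a = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
    %1 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
    %2 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
    ; all former uses of %b now use %2
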
@@ -243,7 +440,7 @@ bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
    return true;
  }

- bool SVEIntrinsicOpts::optimizeFunctions(
+ bool SVEIntrinsicOpts::optimizeIntrinsicCalls(
      SmallSetVector<Function *, 4> &Functions) {
    bool Changed = false;
    for (auto *F : Functions) {
@@ -260,6 +457,16 @@ bool SVEIntrinsicOpts::optimizeFunctions(
    return Changed;
  }

+ bool SVEIntrinsicOpts::optimizeFunctions(
+     SmallSetVector<Function *, 4> &Functions) {
+   bool Changed = false;
+
+   Changed |= optimizePTrueIntrinsicCalls(Functions);
+   Changed |= optimizeIntrinsicCalls(Functions);
+
+   return Changed;
+ }
+
  bool SVEIntrinsicOpts::runOnModule(Module &M) {
    bool Changed = false;
    SmallSetVector<Function *, 4> Functions;
@@ -276,6 +483,7 @@ bool SVEIntrinsicOpts::runOnModule(Module &M) {
      case Intrinsic::aarch64_sve_ptest_any:
      case Intrinsic::aarch64_sve_ptest_first:
      case Intrinsic::aarch64_sve_ptest_last:
+     case Intrinsic::aarch64_sve_ptrue:
        for (User *U : F.users())
          Functions.insert(cast<Instruction>(U)->getFunction());
        break;
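For exercising the new ptrue handling, an IR regression test would presumably be driven through opt with the pass's legacy pass-manager flag; the flag name below is assumed to match the existing SVE reinterpret tests rather than being shown anywhere in this diff:

    ; RUN: opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu < %s | FileCheck %s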