Skip to content

Commit 7689b17

Browse files
stefan-iligcbot
authored andcommitted
Add call merger pass
Merges mutually exclusive function calls, in case they are too large to inline. This will enable subroutine inliner to inline them as only one call will remain.
1 parent 26870fd commit 7689b17

File tree

7 files changed

+417
-1
lines changed

7 files changed

+417
-1
lines changed

IGC/Compiler/CISACodeGen/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ set(IGC_BUILD__SRC__CISACodeGen_Common
2020
"${CMAKE_CURRENT_SOURCE_DIR}/BlockMemOpAddrScalarizationPass.cpp"
2121
"${CMAKE_CURRENT_SOURCE_DIR}/CapLoopIterationsPass.cpp"
2222
"${CMAKE_CURRENT_SOURCE_DIR}/CastToGASAnalysis.cpp"
23+
"${CMAKE_CURRENT_SOURCE_DIR}/CallMergerPass.cpp"
2324
"${CMAKE_CURRENT_SOURCE_DIR}/CISABuilder.cpp"
2425
"${CMAKE_CURRENT_SOURCE_DIR}/CShader.cpp"
2526
"${CMAKE_CURRENT_SOURCE_DIR}/CShaderProgram.cpp"
@@ -126,6 +127,7 @@ set(IGC_BUILD__HDR__CISACodeGen_Common
126127
"${CMAKE_CURRENT_SOURCE_DIR}/BlockMemOpAddrScalarizationPass.hpp"
127128
"${CMAKE_CURRENT_SOURCE_DIR}/CapLoopIterationsPass.h"
128129
"${CMAKE_CURRENT_SOURCE_DIR}/CastToGASAnalysis.h"
130+
"${CMAKE_CURRENT_SOURCE_DIR}/CallMergerPass.hpp"
129131
"${CMAKE_CURRENT_SOURCE_DIR}/CISABuilder.hpp"
130132
"${CMAKE_CURRENT_SOURCE_DIR}/CISACodeGen.h"
131133
"${CMAKE_CURRENT_SOURCE_DIR}/CVariable.hpp"
Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
/*========================== begin_copyright_notice ============================
2+
3+
Copyright (C) 2025 Intel Corporation
4+
5+
SPDX-License-Identifier: MIT
6+
7+
============================= end_copyright_notice ===========================*/
8+
9+
#include "CallMergerPass.hpp"
10+
11+
#include "CodeGenPublic.h"
12+
#include "Compiler/CISACodeGen/helper.h"
13+
#include "Compiler/IGCPassSupport.h"
14+
#include "llvmWrapper/IR/BasicBlock.h"
15+
16+
#include "Probe/Assertion.h"
17+
#include "common/LLVMWarningsPush.hpp"
18+
#include <llvm/ADT/SmallPtrSet.h>
19+
#include <llvm/IR/Function.h>
20+
#include <llvm/IR/Instruction.h>
21+
#include <llvm/IR/Instructions.h>
22+
#include <llvm/IR/Use.h>
23+
#include <llvm/Pass.h>
24+
#include <llvm/ADT/DenseMap.h>
25+
#include <llvm/ADT/SmallVector.h>
26+
#include "common/LLVMWarningsPop.hpp"
27+
28+
using namespace IGC;
29+
using namespace llvm;
30+
31+
// Register pass to igc-opt
32+
namespace IGC
33+
{
34+
#define PASS_FLAG "call-merger-pass"
35+
#define PASS_DESCRIPTION \
36+
"Merge mutually exclusive calls to enable further inlining."
37+
#define PASS_CFG_ONLY false
38+
#define PASS_ANALYSIS false
39+
IGC_INITIALIZE_PASS_BEGIN(CallMerger, PASS_FLAG, PASS_DESCRIPTION,
40+
PASS_CFG_ONLY, PASS_ANALYSIS)
41+
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
42+
IGC_INITIALIZE_PASS_DEPENDENCY(EstimateFunctionSize)
43+
IGC_INITIALIZE_PASS_END(CallMerger, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY,
44+
PASS_ANALYSIS)
45+
} // namespace IGC
46+
47+
using CallSiteMap = DenseMap<Function *, SmallVector<CallInst *, 2>>;
48+
49+
namespace {
50+
// Walks every instruction of F and groups its call instructions by callee.
// Calls for which getCalledFunction() returns null, and calls to intrinsics,
// are not recorded.
CallSiteMap collectAllCallSites(Function &F) {
  CallSiteMap sitesByCallee;
  for (BasicBlock &block : F) {
    for (Instruction &inst : block) {
      auto *site = dyn_cast<CallInst>(&inst);
      if (!site) {
        continue;
      }
      Function *callee = site->getCalledFunction();
      if (!callee || callee->isIntrinsic()) {
        continue;
      }
      sitesByCallee[callee].push_back(site);
    }
  }
  return sitesByCallee;
}
64+
65+
// Erases the current terminator of oldBB and pushes an unconditional branch
// to newBB onto the block in its place.
void setNewTerminator(BasicBlock *oldBB, BasicBlock *newBB) {
  oldBB->getTerminator()->eraseFromParent();
  BranchInst *jumpToNew = BranchInst::Create(newBB);
  IGCLLVM::pushBackInstruction(oldBB, jumpToNew);
}
70+
71+
// We assume that BBs with call instructions have terminator with single successor
72+
// and that the list of uses of both calls is the same.
73+
void mergeCalls(Function* F, CallInst *call1, CallInst *call2) {
74+
auto* parentBB1 = call1->getParent();
75+
auto* parentBB2 = call2->getParent();
76+
auto* successorBB = parentBB1->getSingleSuccessor();
77+
78+
auto* newBB = llvm::BasicBlock::Create(F->getContext(), "mergedCallsBB", F, successorBB);
79+
llvm::IRBuilder<> Builder(newBB);
80+
81+
IGC_ASSERT(call1->arg_size() == call2->arg_size());
82+
83+
SmallVector<Value*, 4> args;
84+
for (unsigned i = 0; i < call1->arg_size(); ++i) {
85+
auto *arg1 = call1->getArgOperand(i);
86+
auto *arg2 = call2->getArgOperand(i);
87+
if (arg1 == arg2) {
88+
args.push_back(arg1);
89+
continue;
90+
}
91+
auto *PN = Builder.CreatePHI(arg1->getType(), 2);
92+
PN->addIncoming(arg1, parentBB1);
93+
PN->addIncoming(arg2, parentBB2);
94+
args.push_back(PN);
95+
}
96+
auto* newCall = Builder.CreateCall(call1->getCalledFunction(), args);
97+
newCall->setCallingConv(call1->getCallingConv());
98+
newCall->setAttributes(call1->getAttributes());
99+
newCall->setTailCall(call1->isTailCall());
100+
Builder.CreateBr(successorBB);
101+
102+
setNewTerminator(parentBB1, newBB);
103+
setNewTerminator(parentBB2, newBB);
104+
for (auto& u: call1->uses()) {
105+
auto *userI = cast<Instruction>(u.getUser());
106+
userI->replaceUsesOfWith(call1, newCall);
107+
}
108+
for (auto& u: call2->uses()) {
109+
auto *userI = cast<Instruction>(u.getUser());
110+
userI->replaceUsesOfWith(call2, newCall);
111+
}
112+
call1->eraseFromParent();
113+
call2->eraseFromParent();
114+
}
115+
116+
// Returns true when the two calls sit in two *different* basic blocks that
// each have a single successor, and that successor is the same block.
//
// Calls located in the same block are rejected: they trivially share a
// successor, but they are both executed, so they are not mutually exclusive
// and must not be folded into one call.
bool haveSingleCommonSuccessor(CallInst *call1, CallInst *call2) {
  auto *parentBB1 = call1->getParent();
  auto *parentBB2 = call2->getParent();
  if (parentBB1 == parentBB2) {
    return false;
  }

  auto *successor1 = parentBB1->getSingleSuccessor();
  auto *successor2 = parentBB2->getSingleSuccessor();
  return successor1 != nullptr && successor1 == successor2;
}
124+
125+
// Scans the parent block of inst1 from its first instruction. Returns true if
// inst2 is met before inst1 (i.e. inst1 comes after inst2). Returns false if
// inst1 is met first - which includes inst1 == inst2 - or if inst2 is not
// found in that block at all.
bool isAfterInstInBB(Instruction* inst1, Instruction* inst2){
  BasicBlock *block = inst1->getParent();
  for (auto it = block->begin(), end = block->end(); it != end; ++it) {
    Instruction *current = &*it;
    if (current == inst1) {
      return false;
    }
    if (current == inst2) {
      return true;
    }
  }
  return false;
}
136+
137+
// Returns true when the block containing `call` also contains either
//  * a user of the call's result, or
//  * an instruction other than the call itself that uses one of the call's
//    pointer-typed arguments and is not located before the call.
bool hasUsesInCurrentBB(CallInst *call) {
  BasicBlock *const callBlock = call->getParent();

  // Users of the returned value inside the call's own block.
  for (User *resultUser : call->users()) {
    if (cast<Instruction>(resultUser)->getParent() == callBlock) {
      return true;
    }
  }

  // Users of pointer arguments inside the call's own block that do not
  // precede the call. Non-pointer arguments are ignored.
  for (Use &operand : call->args()) {
    Value *argValue = operand.get();
    if (!argValue->getType()->isPointerTy()) {
      continue;
    }
    for (User *argUser : argValue->users()) {
      auto *userInst = dyn_cast<Instruction>(argUser);
      if (!userInst) {
        continue;
      }
      const bool isOtherInstInBlock =
          userInst != call && userInst->getParent() == callBlock;
      // isAfterInstInBB(call, userInst) is true when userInst precedes call.
      if (isOtherInstInBlock && !isAfterInstInBB(call, userInst)) {
        return true;
      }
    }
  }
  return false;
}
168+
169+
// Returns false when the two calls have a different number of uses, or when
// some use of call1 has no equal counterpart among the uses of call2.
//
// NOTE(review): two llvm::Use objects are compared here through Use's
// conversion to Value*, i.e. by the value being used. For uses of call1 vs.
// uses of call2 that looks like comparing call1 with call2, so the check
// presumably only succeeds when both calls have no uses at all. If the intent
// was to compare the *users*, confirm and revisit together with mergeCalls.
bool hasSameUsesAs(CallInst *call1, CallInst *call2) {
  if (call1->getNumUses() != call2->getNumUses()) {
    return false;
  }

  for (const Use &lhs : call1->uses()) {
    bool foundCounterpart = false;
    for (const Use &rhs : call2->uses()) {
      if (lhs.get() == rhs.get()) {
        foundCounterpart = true;
        break;
      }
    }
    if (!foundCounterpart) {
      return false;
    }
  }
  return true;
}
188+
189+
void filterCallSites(CallSiteMap &callSites, EstimateFunctionSize *EFS) {
190+
SmallVector<Function*, 4> elementsToErase;
191+
size_t PerFuncThreshold = IGC_GET_FLAG_VALUE(SubroutineInlinerThreshold);
192+
193+
for (const auto&[calledFunc, callInsts] : callSites) {
194+
if (callInsts.size() != 2) {
195+
elementsToErase.push_back(calledFunc);
196+
continue;
197+
}
198+
199+
// We don't need to process function that can't get inlined
200+
if (calledFunc->hasFnAttribute(llvm::Attribute::NoInline) ||
201+
calledFunc->hasFnAttribute("igc-force-stackcall") ||
202+
calledFunc->hasFnAttribute("KMPLOCK")){
203+
elementsToErase.push_back(calledFunc);
204+
continue;
205+
}
206+
207+
// We can skip functions that are small enough to be inlined.
208+
if (EFS->getExpandedSize(calledFunc) <= PerFuncThreshold) {
209+
elementsToErase.push_back(calledFunc);
210+
continue;
211+
}
212+
213+
// We can merge calls with common successor, without result or args having uses in
214+
// call block. We also only merge function calls with same use list.
215+
if (!haveSingleCommonSuccessor(callInsts[0], callInsts[1]) ||
216+
hasUsesInCurrentBB(callInsts[0]) ||
217+
hasUsesInCurrentBB(callInsts[1]) ||
218+
!hasSameUsesAs(callInsts[0], callInsts[1])) {
219+
elementsToErase.push_back(calledFunc);
220+
continue;
221+
}
222+
}
223+
224+
for (auto *calledFunc : elementsToErase) {
225+
callSites.erase(calledFunc);
226+
}
227+
}
228+
} // anonymous namespace
229+
230+
char CallMerger::ID = 0;
231+
232+
CallMerger::CallMerger() : ModulePass(ID) {
233+
initializeCallMergerPass(*PassRegistry::getPassRegistry());
234+
}
235+
236+
void CallMerger::getAnalysisUsage(llvm::AnalysisUsage &AU) const {
237+
AU.addRequired<CodeGenContextWrapper>();
238+
AU.addRequired<EstimateFunctionSize>();
239+
}
240+
241+
// Collects the call sites of F, drops the ones that are not merge candidates
// and merges each remaining pair. Returns true if at least one pair was
// merged.
bool CallMerger::runOnFunction(Function& F) {
  CallSiteMap candidates = collectAllCallSites(F);
  filterCallSites(candidates, EFS);

  for (auto &entry : candidates) {
    auto &sitePair = entry.second;
    mergeCalls(&F, sitePair[0], sitePair[1]);
  }

  return !candidates.empty();
}
255+
256+
// Pass entry point. Fetches the required analyses, bails out when merging
// cannot help (everything is force-inlined, subroutines are disabled,
// function control is not the default, or EstimateFunctionSize reports that
// subroutines should not be enabled) and otherwise processes every function
// that has a body, is not an intrinsic and is not optnone.
// Returns true if any function was modified.
bool CallMerger::runOnModule(Module &M) {
  EFS = &getAnalysis<EstimateFunctionSize>();
  CTX = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();

  // Guard clauses are evaluated in the same order as a single short-circuit
  // condition would evaluate them.
  if (IGC::ForceAlwaysInline(CTX)) {
    return false;
  }
  if (CTX->m_enableSubroutine == false) {
    return false;
  }
  if (getFunctionControl(CTX) != FLAG_FCALL_DEFAULT) {
    return false;
  }
  if (!EFS->shouldEnableSubroutine()) {
    return false;
  }

  bool anyFunctionChanged = false;
  for (Function &func : M) {
    const bool skip = func.isDeclaration() || func.isIntrinsic() || func.hasOptNone();
    if (skip) {
      continue;
    }
    if (runOnFunction(func)) {
      anyFunctionChanged = true;
    }
  }
  return anyFunctionChanged;
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*========================== begin_copyright_notice ============================
2+
3+
Copyright (C) 2025 Intel Corporation
4+
5+
SPDX-License-Identifier: MIT
6+
7+
============================= end_copyright_notice ===========================*/
8+
9+
//===----------------------------------------------------------------------===//
10+
//
11+
// Utility pass that merges exclusive calls prior to inlining pass in case their
12+
// size would be too great to inline otherwise. In this way functions can be
13+
// inlined as they are called only once.
14+
//===----------------------------------------------------------------------===//
15+
16+
#pragma once
17+
18+
#include "Compiler/CISACodeGen/EstimateFunctionSize.h"
19+
#include "Compiler/CodeGenContextWrapper.hpp"
20+
21+
#include "common/LLVMWarningsPush.hpp"
22+
#include <llvm/Pass.h>
23+
#include "common/LLVMWarningsPop.hpp"
24+
25+
namespace llvm
26+
{
27+
class PassRegistry;
28+
} // namespace llvm
29+
30+
namespace IGC {
31+
32+
class CallMerger : public llvm::ModulePass {
33+
private:
34+
CodeGenContext *CTX = nullptr;
35+
EstimateFunctionSize *EFS = nullptr;
36+
37+
public:
38+
static char ID;
39+
40+
CallMerger();
41+
42+
bool runOnModule(llvm::Module &F) override;
43+
44+
llvm::StringRef getPassName() const override { return "CallMergerPass"; }
45+
46+
void getAnalysisUsage(llvm::AnalysisUsage &AU) const override;
47+
48+
private:
49+
bool runOnFunction(llvm::Function &F);
50+
};
51+
52+
void initializeCallMergerPass(llvm::PassRegistry&);
53+
} // namespace IGC

IGC/Compiler/CISACodeGen/GenCodeGenModule.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ SPDX-License-Identifier: MIT
88

99
#pragma once
1010
#include "Compiler/MetaDataApi/MetaDataApi.h"
11-
#include "Compiler/CISACodeGen/helper.h"
1211
#include "common/LLVMWarningsPush.hpp"
1312
#include "llvm/IR/ValueHandle.h"
1413
#include "llvm/Pass.h"

IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ SPDX-License-Identifier: MIT
6969
#include "Compiler/MSAAInsertDiscard.hpp"
7070
#include "Compiler/CISACodeGen/PromoteInt8Type.hpp"
7171
#include "Compiler/CISACodeGen/PrepareLoadsStoresPass.h"
72+
#include "Compiler/CISACodeGen/CallMergerPass.hpp"
7273
#include "Compiler/CISACodeGen/EvaluateFreeze.hpp"
7374
#include "Compiler/CISACodeGen/DpasScan.hpp"
7475
#include "Compiler/CISACodeGen/FPRoundingModeCoalescing.hpp"
@@ -1825,6 +1826,11 @@ void OptimizeIR(CodeGenContext* const pContext)
18251826
if (pContext->m_enableSubroutine &&
18261827
getFunctionControl(pContext) == FLAG_FCALL_DEFAULT)
18271828
{
1829+
mpm.add(createEstimateFunctionSizePass(EstimateFunctionSize::AL_Kernel));
1830+
if (IGC_IS_FLAG_ENABLED(EnableLargeFunctionCallMerging))
1831+
{
1832+
mpm.add(new CallMerger());
1833+
}
18281834
mpm.add(createEstimateFunctionSizePass(EstimateFunctionSize::AL_Kernel));
18291835
mpm.add(createSubroutineInlinerPass());
18301836
}

IGC/common/igc_flags.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -721,6 +721,7 @@ DECLARE_IGC_REGKEY(bool, EnableThreadCombiningWithNoSLM, false, "Enable thread c
721721
DECLARE_IGC_REGKEY(DWORD, PrintFunctionSizeAnalysis, 0, "Print analysis data of function sizes", true)
722722
DECLARE_IGC_REGKEY(DWORD, SubroutineThreshold, 110000, "Minimal kernel size to enable subroutines", false)
723723
DECLARE_IGC_REGKEY(DWORD, SubroutineInlinerThreshold, 3000, "Subroutine inliner threshold", false)
724+
DECLARE_IGC_REGKEY(bool, EnableLargeFunctionCallMerging, true, "Merge mutually exclusive calls to large functions to enable inlining", false)
724725
DECLARE_IGC_REGKEY(bool, ControlKernelTotalSize, true, "Control kernel total size", true)
725726
DECLARE_IGC_REGKEY(bool, StaticProfileGuidedTrimming, false, "Enable static analysis in the kernel trimming", true)
726727
DECLARE_IGC_REGKEY(debugString, SelectiveTrimming, 0, "Choose a specific function to trim", true)

0 commit comments

Comments
 (0)