@@ -91,18 +91,16 @@ class AMDGPULibCalls {
9191 // sqrt
9292 bool fold_sqrt (FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
9393
94+ bool insertSinCos (CallInst *Sin, CallInst *Cos, IRBuilder<> &B,
95+ const FuncInfo &FInfo);
96+
9497 // sin/cos
9598 bool fold_sincos (FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
9699
97100 // __read_pipe/__write_pipe
98101 bool fold_read_write_pipe (CallInst *CI, IRBuilder<> &B,
99102 const FuncInfo &FInfo);
100103
101- // Get insertion point at entry.
102- BasicBlock::iterator getEntryIns (CallInst * UI);
103- // Insert an Alloc instruction.
104- AllocaInst* insertAlloca (CallInst * UI, IRBuilder<> &B, const char *prefix);
105-
106104 // Get a scalar native builtin single argument FP function
107105 FunctionCallee getNativeFunction (Module *M, const FuncInfo &FInfo);
108106
@@ -1153,6 +1151,71 @@ bool AMDGPULibCalls::fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B,
11531151 return false ;
11541152}
11551153
1154+ bool AMDGPULibCalls::insertSinCos (CallInst *Sin, CallInst *Cos, IRBuilder<> &B,
1155+ const FuncInfo &fInfo ) {
1156+ Value *Arg = Sin->getOperand (0 );
1157+ assert (Arg == Cos->getOperand (0 ));
1158+ // Both calls must take the same argument; it is reused as the sincos input.
1159+ Function *F = B.GetInsertBlock ()->getParent ();
1160+ Module *M = F->getParent ();
1161+ // Merge the sin and cos calls into a single sincos call: the call result
1162+ // replaces Sin, and a reload of the slot passed as its pointer replaces Cos.
1163+ // For OpenCL 2.0 only the generic implementation of sincos is assumed to
1164+ // exist, so the variant taking a flat address space pointer is requested.
1165+ // FIXME: This is not true anymore
1166+ AMDGPULibFunc nf (AMDGPULibFunc::EI_SINCOS, fInfo );
1167+ nf.getLeads ()[0 ].PtrKind =
1168+ AMDGPULibFunc::getEPtrKindFromAddrSpace (AMDGPUAS::FLAT_ADDRESS);
1169+ FunctionCallee Fsincos = getFunction (M, nf);
1170+ if (!Fsincos)
1171+ return false ;
1172+ // Sin and Cos are left untouched when the sincos callee is unavailable.
1173+ B.SetInsertPointPastAllocas (F);
1174+
1175+ DILocation *MergedDebugLoc =
1176+ DILocation::getMergedLocation (Sin->getDebugLoc (), Cos->getDebugLoc ());
1177+ B.SetCurrentDebugLocation (MergedDebugLoc);
1178+ // Slot of Sin's type whose address is handed to sincos and reloaded for Cos.
1179+ AllocaInst *Alloc = B.CreateAlloca (Sin->getType (), nullptr , " __sincos_" );
1180+
1181+ if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
1182+ // If the argument is an instruction, it must dominate all uses so put our
1183+ // sincos call there. Otherwise, right after the allocas works well enough
1184+ // if it's an argument or constant.
1185+ // NOTE(review): assumes the spot right after ArgInst is a legal insertion point (not a PHI/terminator) - confirm.
1186+ B.SetInsertPoint (ArgInst->getParent (), ++ArgInst->getIterator ());
1187+ B.SetCurrentDebugLocation (MergedDebugLoc); // presumably re-applied because SetInsertPoint may change it
1188+ }
1189+
1190+ Value *P = Alloc;
1191+ Type *PTy = Fsincos.getFunctionType ()->getParamType (1 );
1192+ // The alloca allocates its memory in the private address space. The pointer
1193+ // needs an address space cast when the sincos pointer parameter lives elsewhere.
1194+ // In OpenCL 2.0 this is generic, while in 1.2 that is private.
1195+ if (PTy->getPointerAddressSpace () != AMDGPUAS::PRIVATE_ADDRESS)
1196+ P = B.CreateAddrSpaceCast (Alloc, PTy);
1197+ // Intersect the two sets of fast-math flags and install the result on the
1198+ // builder before the merged call and the reload are created.
1199+ FastMathFlags FMF = cast<FPMathOperator>(Sin)->getFastMathFlags ();
1200+ FMF &= cast<FPMathOperator>(Cos)->getFastMathFlags ();
1201+ B.setFastMathFlags (FMF);
1202+ // NOTE(review): builder flags, insert point and debug loc are not restored afterwards - confirm callers cope.
1203+ CallInst *Call = CreateCallEx2 (B, Fsincos, Arg, P);
1204+ LoadInst *Reload = B.CreateLoad (Alloc->getAllocatedType (), Alloc);
1205+ Reload->setDebugLoc (Cos->getDebugLoc ());
1206+
1207+ LLVM_DEBUG (errs () << " AMDIC: fold_sincos (" << *Sin << " , " << *Cos
1208+ << " ) with " << *Call << ' \n ' );
1209+ // The sincos call stands in for Sin; the reloaded slot stands in for Cos.
1210+ Sin->replaceAllUsesWith (Call);
1211+ Sin->eraseFromParent ();
1212+
1213+ Cos->replaceAllUsesWith (Reload);
1214+ Cos->eraseFromParent ();
1215+
1216+ return true ;
1217+ }
1218+
11561219// fold sin, cos -> sincos.
11571220bool AMDGPULibCalls::fold_sincos (FPMathOperator *FPOp, IRBuilder<> &B,
11581221 const FuncInfo &fInfo ) {
@@ -1168,106 +1231,33 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
11681231
11691232 Value *CArgVal = FPOp->getOperand (0 );
11701233 CallInst *CI = cast<CallInst>(FPOp);
1171- BasicBlock * const CBB = CI->getParent ();
1172-
1173- int const MaxScan = 30 ;
11741234 bool Changed = false ;
11751235
1176- Module *M = CI->getModule ();
11771236 FuncInfo PartnerInfo (isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
11781237 fInfo );
11791238 const std::string PairName = PartnerInfo.mangle ();
11801239
11811240 CallInst *UI = nullptr ;
1241+
1242+ // TODO: Handle repeated uses, the generic implementation does.
11821243 for (User* U : CArgVal->users ()) {
1183- CallInst *XI = dyn_cast_or_null <CallInst>(U);
1184- if (!XI || XI == CI || XI-> getParent () != CBB )
1244+ CallInst *XI = dyn_cast <CallInst>(U);
1245+ if (!XI || XI-> isNoBuiltin () )
11851246 continue ;
11861247
11871248 Function *UCallee = XI->getCalledFunction ();
1188- if (!UCallee || !UCallee->getName ().equals (PairName))
1189- continue ;
1190-
1191- BasicBlock::iterator BBI = CI->getIterator ();
1192- if (BBI == CI->getParent ()->begin ())
1193- break ;
1194- --BBI;
1195- for (int I = MaxScan; I > 0 && BBI != CBB->begin (); --BBI, --I) {
1196- if (cast<Instruction>(BBI) == XI) {
1197- UI = XI;
1198- break ;
1199- }
1200- }
1201- if (UI) break ;
1249+ if (UCallee && UCallee->getName ().equals (PairName))
1250+ UI = XI;
1251+ else if (UI)
1252+ return Changed;
12021253 }
12031254
12041255 if (!UI)
12051256 return Changed;
12061257
1207- // Merge the sin and cos.
1208-
1209- // for OpenCL 2.0 we have only generic implementation of sincos
1210- // function.
1211- AMDGPULibFunc nf (AMDGPULibFunc::EI_SINCOS, fInfo );
1212- nf.getLeads ()[0 ].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace (AMDGPUAS::FLAT_ADDRESS);
1213- FunctionCallee Fsincos = getFunction (M, nf);
1214- if (!Fsincos)
1215- return Changed;
1216-
1217- BasicBlock::iterator ItOld = B.GetInsertPoint ();
1218- AllocaInst *Alloc = insertAlloca (UI, B, " __sincos_" );
1219- B.SetInsertPoint (UI);
1220-
1221- Value *P = Alloc;
1222- Type *PTy = Fsincos.getFunctionType ()->getParamType (1 );
1223- // The allocaInst allocates the memory in private address space. This need
1224- // to be bitcasted to point to the address space of cos pointer type.
1225- // In OpenCL 2.0 this is generic, while in 1.2 that is private.
1226- if (PTy->getPointerAddressSpace () != AMDGPUAS::PRIVATE_ADDRESS)
1227- P = B.CreateAddrSpaceCast (Alloc, PTy);
1228- CallInst *Call = CreateCallEx2 (B, Fsincos, UI->getArgOperand (0 ), P);
1229-
1230- LLVM_DEBUG (errs () << " AMDIC: fold_sincos (" << *CI << " , " << *UI << " ) with "
1231- << *Call << " \n " );
1232-
1233- if (!isSin) { // CI->cos, UI->sin
1234- B.SetInsertPoint (&*ItOld);
1235- UI->replaceAllUsesWith (&*Call);
1236- Instruction *Reload = B.CreateLoad (Alloc->getAllocatedType (), Alloc);
1237- CI->replaceAllUsesWith (Reload);
1238- UI->eraseFromParent ();
1239- CI->eraseFromParent ();
1240- } else { // CI->sin, UI->cos
1241- Instruction *Reload = B.CreateLoad (Alloc->getAllocatedType (), Alloc);
1242- UI->replaceAllUsesWith (Reload);
1243- CI->replaceAllUsesWith (Call);
1244- UI->eraseFromParent ();
1245- CI->eraseFromParent ();
1246- }
1247- return true ;
1248- }
1249-
1250- // Get insertion point at entry.
1251- BasicBlock::iterator AMDGPULibCalls::getEntryIns (CallInst * UI) {
1252- Function * Func = UI->getParent ()->getParent ();
1253- BasicBlock * BB = &Func->getEntryBlock ();
1254- assert (BB && " Entry block not found!" );
1255- BasicBlock::iterator ItNew = BB->begin ();
1256- return ItNew;
1257- }
1258-
1259- // Insert a AllocsInst at the beginning of function entry block.
1260- AllocaInst* AMDGPULibCalls::insertAlloca (CallInst *UI, IRBuilder<> &B,
1261- const char *prefix) {
1262- BasicBlock::iterator ItNew = getEntryIns (UI);
1263- Function *UCallee = UI->getCalledFunction ();
1264- Type *RetType = UCallee->getReturnType ();
1265- B.SetInsertPoint (&*ItNew);
1266- AllocaInst *Alloc =
1267- B.CreateAlloca (RetType, nullptr , std::string (prefix) + UI->getName ());
1268- Alloc->setAlignment (
1269- Align (UCallee->getParent ()->getDataLayout ().getTypeAllocSize (RetType)));
1270- return Alloc;
1258+ CallInst *Sin = isSin ? CI : UI;
1259+ CallInst *Cos = isSin ? UI : CI;
1260+ return insertSinCos (Sin, Cos, B, fInfo ) || Changed;
12711261}
12721262
12731263bool AMDGPULibCalls::evaluateScalarMathFunc (const FuncInfo &FInfo,
0 commit comments